In [None]:

# A box plot, also known as a box-and-whisker plot, is a type of graphical representation that displays the distribution 
# of a dataset along with key statistical measures. It provides a concise summary of the dataset's central tendency, 
# variability, and potential outliers.

# Here are the components of a box plot:

#  Box:
#  The box represents the interquartile range (IQR), which is the middle 50% of the data.
#  The bottom edge of the box corresponds to the first quartile (Q1), which is the value below which 25% of the data falls.
#  The top edge of the box corresponds to the third quartile (Q3), which is the value below which 75% of the data falls.
#  The length of the box (Q3 - Q1) indicates the spread or variability of the middle 50% of the data.

#  Median:
#  The line inside the box represents the median, which is the middle value of the dataset when it is ordered from smallest 
#  to largest.
#  The median divides the dataset into two halves, with 50% of the data falling below it and 50% above it.

#  Whiskers:
#  The whiskers extend from the edges of the box to the smallest and largest data points that still lie within a 
#  specified distance of the box (usually 1.5 times the IQR below Q1 and above Q3).
#  They represent the range of the dataset excluding potential outliers.

#  Outliers:
#  Individual data points that fall outside the whiskers are considered outliers and are plotted individually as points.
#  Outliers are typically defined as values that are significantly higher or lower than the rest of the data; under the 
#  common 1.5 * IQR rule, that means values below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
        

In [12]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
import seaborn as sb
import scipy
from scipy.stats import spearmanr
from sklearn import datasets
import statsmodels.api as sm
import numpy as np
import pandas as pd

# Fetch the mtcars table from the R "datasets" collection through statsmodels
# and keep the resulting DataFrame under its own name.
mtcars = sm.datasets.get_rdataset("mtcars", "datasets", cache=True).data

# Build the working frame in one step: wrap the table in a new DataFrame and
# move its index into a regular column (shown as `rownames` in the output
# below), leaving a plain 0..n-1 index.
df = pd.DataFrame(mtcars).reset_index()

# Preview the first rows.
df.head()


Unnamed: 0,rownames,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [18]:

# Reshape from wide to long: `rownames` stays as the identifier and the eleven
# listed columns are stacked into `variable` / `value` pairs.
df_melt = df.melt(
    id_vars=['rownames'],
    value_vars=[
        'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt',
        'qsec', 'vs', 'am', 'gear', 'carb',
    ],
)
print(df_melt.head(10))


            rownames variable  value
0          Mazda RX4      mpg   21.0
1      Mazda RX4 Wag      mpg   21.0
2         Datsun 710      mpg   22.8
3     Hornet 4 Drive      mpg   21.4
4  Hornet Sportabout      mpg   18.7
5            Valiant      mpg   18.1
6         Duster 360      mpg   14.3
7          Merc 240D      mpg   24.4
8           Merc 230      mpg   22.8
9           Merc 280      mpg   19.2


In [28]:

import plotly.express as px

# Draw one box per melted variable, then set how the quartiles are computed.
# quartilemethod accepts "linear" (the default), "exclusive" or "inclusive";
# this notebook uses "inclusive".
fig = px.box(df_melt, x="variable", y="value").update_traces(quartilemethod="inclusive")
fig.show()


In [31]:

# df_melt is the long-format frame that was handed to px.box, so the raw values
# behind each box can be read straight from it.

# Keep only the two columns the plot uses. This selects columns; no rows are dropped.
box_data = df_melt.loc[:, ['variable', 'value']]

# To inspect the values behind a single box, narrow the rows to one variable, e.g.:
# box_data_filtered = box_data[box_data['variable'] == 'your_variable']

# Show the selected data.
print(box_data)


    variable  value
0        mpg   21.0
1        mpg   21.0
2        mpg   22.8
3        mpg   21.4
4        mpg   18.7
..       ...    ...
347     carb    2.0
348     carb    4.0
349     carb    6.0
350     carb    8.0
351     carb    2.0

[352 rows x 2 columns]
