In [1]:
# this notebook illustrates use of descriptive statistics to explore a dataset

# this is a popular dataset so you can learn from others who have posted similar exploratory steps online
# examples:
# https://www.kaggle.com/code/gauravsharma99/eda-on-mpg-data
# https://medium.com/analytics-vidhya/eda-for-beginners-using-python-4fcd2b57d1f7

topics in this notebook

- loading a dataset into a dataframe (df) — here seaborn's built-in `mpg` dataset via `sns.load_dataset`
- displaying the shape of the df
- check the data type of columns in the df
- measures of central tendency
- measures of dispersion
- measures of location

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# mpg - miles per gallon

In [2]:
# Load seaborn's example "mpg" dataset into a pandas DataFrame.
# NOTE(review): per the seaborn docs, load_dataset fetches the file from seaborn's
# online data repository (and caches it), so a first run needs internet access — confirm.
mpg_data = sns.load_dataset("mpg")

In [3]:
# shape of dataset, returned as a (rows, columns) tuple
mpg_data.shape

(398, 9)

In [4]:
# column types in dataset; the text columns (origin, name) are reported as `object`
mpg_data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin           object
name             object
dtype: object

In [5]:
# Display the first 5 rows of the dataset
mpg_data.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [34]:
# Display the last 2 rows of the dataset
mpg_data.tail(2)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger
397,31.0,4,119.0,82.0,2720,19.4,82,usa,chevy s-10


In [6]:
# keep only the numeric columns (the text columns origin and name are left out)
mpg_data.select_dtypes(include='number')

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
0,18.0,8,307.0,130.0,3504,12.0,70
1,15.0,8,350.0,165.0,3693,11.5,70
2,18.0,8,318.0,150.0,3436,11.0,70
3,16.0,8,304.0,150.0,3433,12.0,70
4,17.0,8,302.0,140.0,3449,10.5,70
...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82
394,44.0,4,97.0,52.0,2130,24.6,82
395,32.0,4,135.0,84.0,2295,11.6,82
396,28.0,4,120.0,79.0,2625,18.6,82


In [7]:
# keep only the object (text) columns: origin and name
mpg_data.select_dtypes(include='object')

Unnamed: 0,origin,name
0,usa,chevrolet chevelle malibu
1,usa,buick skylark 320
2,usa,plymouth satellite
3,usa,amc rebel sst
4,usa,ford torino
...,...,...
393,usa,ford mustang gl
394,europe,vw pickup
395,usa,dodge rampage
396,usa,ford ranger


# measures of central tendency

In [8]:
# summary statistics of numeric columns: count, mean, std, min, quartiles and max
# NOTE: a count below the number of rows signals missing values in that column
mpg_data.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [10]:
# a single column can be accessed as an attribute of the dataframe
mpg_data.weight

0      3504
1      3693
2      3436
3      3433
4      3449
       ... 
393    2790
394    2130
395    2295
396    2625
397    2720
Name: weight, Length: 398, dtype: int64

In [39]:
# mean weight of the cars in the dataset
mpg_wt_mean = mpg_data.weight.mean()
# a bare assignment displays nothing; end the cell with the name so the value is shown
mpg_wt_mean

2970.424623115578

In [11]:
# Central tendency of the mpg column: mean, median and mode
mean_mpg = mpg_data.mpg.mean()
median_mpg = mpg_data.mpg.median()
# mode() returns all most-frequent values; keep the first one
mode_mpg = mpg_data.mpg.mode().iloc[0]

for label, value in (
    ("Mean MPG:", mean_mpg),
    ("Median MPG:", median_mpg),
    ("Mode MPG:", mode_mpg),
):
    print(label, value)

Mean MPG: 23.514572864321607
Median MPG: 23.0
Mode MPG: 13.0


In [11]:
# check counts of various cylinders (number of cars per cylinder count, most frequent first)
mpg_data.cylinders.value_counts()

cylinders
4    204
8    103
6     84
3      4
5      3
Name: count, dtype: int64

In [12]:
# check counts of model_years (number of cars per model year, most frequent first)
mpg_data.model_year.value_counts()

model_year
73    40
78    36
76    34
82    31
75    30
70    29
79    29
80    29
81    29
71    28
72    28
77    28
74    27
Name: count, dtype: int64

In [13]:
# similar check for origin of car (number of cars per region of origin)
mpg_data.origin.value_counts()

origin
usa       249
japan      79
europe     70
Name: count, dtype: int64

# measures of dispersion

In [17]:
# largest and smallest weight, displayed together as a (max, min) tuple
mpg_data.weight.max(), mpg_data.weight.min()

(5140, 1613)

In [14]:
# range = max - min

# for a column: spread between the heaviest and the lightest car

mpg_data.weight.max() - mpg_data.weight.min()

3527

In [40]:
# numeric-only subset of the data, reused by the column-wise statistics in the cells below
mpg_data_num = mpg_data.select_dtypes(include='number')

In [41]:
# preview the numeric-only frame (the display is truncated to the first and last rows)
mpg_data_num

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
0,18.0,8,307.0,130.0,3504,12.0,70
1,15.0,8,350.0,165.0,3693,11.5,70
2,18.0,8,318.0,150.0,3436,11.0,70
3,16.0,8,304.0,150.0,3433,12.0,70
4,17.0,8,302.0,140.0,3449,10.5,70
...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82
394,44.0,4,97.0,52.0,2130,24.6,82
395,32.0,4,135.0,84.0,2295,11.6,82
396,28.0,4,120.0,79.0,2625,18.6,82


In [20]:
# for all num columns together: max() and min() work column by column,
# so the subtraction yields one range per column

mpg_data_num.max() - mpg_data_num.min()

mpg               37.6
cylinders          5.0
displacement     387.0
horsepower       184.0
weight          3527.0
acceleration      16.8
model_year        12.0
dtype: float64

In [21]:
# variance of every numeric column (pandas defaults to the sample variance, ddof=1)
mpg_data_num.var()

mpg                 61.089611
cylinders            2.893415
displacement     10872.199152
horsepower        1481.569393
weight          717140.990526
acceleration         7.604848
model_year          13.672443
dtype: float64

In [22]:
# variance of individual column (sample variance, pandas default ddof=1)
mpg_data["mpg"].var()

61.089610774274405

In [None]:
# using numpy library
# np.var defaults to the population variance (ddof=0); pass ddof=1 to get the
# sample variance, which is what pandas' .var() reports
np.var(mpg_data['mpg'], ddof=1)

In [None]:
# also, with attribute access to the column
# ddof=1 makes numpy return the sample variance (its default, ddof=0, is the
# population variance and would not match pandas' .var())
np.var(mpg_data.mpg, ddof=1)

In [23]:
# compute variance of all num columns together

# mpg_data_num already holds only the numeric columns, so var() can reduce the whole frame.
# NOTE(review): on the full mpg_data, recent pandas versions need var(numeric_only=True)
# because of the text columns — confirm against the installed pandas version.
mpg_data_num.var()

mpg                 61.089611
cylinders            2.893415
displacement     10872.199152
horsepower        1481.569393
weight          717140.990526
acceleration         7.604848
model_year          13.672443
dtype: float64

In [44]:
# the variance is the square of the standard deviation: both values should agree
mpg_data.mpg.var(), mpg_data.mpg.std()**2 

(61.089610774274405, 61.089610774274405)

In [25]:
# smallest value in each numeric column
mpg_data_num.min()

mpg                9.0
cylinders          3.0
displacement      68.0
horsepower        46.0
weight          1613.0
acceleration       8.0
model_year        70.0
dtype: float64

In [26]:
# Dispersion of the mpg column: range (max - min), variance and standard deviation
range_mpg = mpg_data.mpg.max() - mpg_data.mpg.min()
variance_mpg = mpg_data.mpg.var()
std_dev_mpg = mpg_data.mpg.std()

for label, value in (
    ("Range of MPG:", range_mpg),
    ("Variance of MPG:", variance_mpg),
    ("Standard Deviation of MPG:", std_dev_mpg),
):
    print(label, value)

Range of MPG: 37.6
Variance of MPG: 61.089610774274405
Standard Deviation of MPG: 7.815984312565782


In [None]:
# first quartile (Q1) of mpg: the 0.25 quantile
np.quantile(mpg_data["mpg"],0.25)

# measures of location

In [47]:
# median
# Q1,Q2,Q3
# Percentiles
# Deciles
# quantiles

# the 0.5 quantile is the median, so both expressions should give the same value
np.quantile(mpg_data.mpg,0.5), mpg_data.mpg.median()

(23.0, 23.0)

In [48]:
# quartiles Q1, Q2 (median) and Q3 of mpg
np.quantile(mpg_data.mpg,0.25), np.quantile(mpg_data.mpg,0.5), np.quantile(mpg_data.mpg,0.75)

(17.5, 23.0, 29.0)

In [49]:
# selected deciles of mpg: 1st, 2nd, 8th and 9th
np.quantile(mpg_data.mpg,0.1), np.quantile(mpg_data.mpg,0.2),np.quantile(mpg_data.mpg,0.8), np.quantile(mpg_data.mpg,0.9)

(14.0, 16.0, 31.0, 34.33)

In [51]:
# np.quantile takes a fraction (0-1), np.percentile a percentage (0-100):
# the 0.01 quantile and the 1st percentile are the same statistic
np.quantile(mpg_data.mpg,0.01), np.percentile(mpg_data["mpg"],1)

(11.0, 11.0)

In [33]:
# Quartiles of mpg via np.percentile, which takes percentages on a 0-100 scale
percentiles = np.percentile(mpg_data["mpg"], [25, 50, 75])

for label, value in zip(
    (
        "25th Percentile of MPG:",
        "Median (50th Percentile) of MPG:",
        "75th Percentile of MPG:",
    ),
    percentiles,
):
    print(label, value)


# The same statistics via np.quantile, which takes fractions between 0 and 1.
# Any quantile can be requested this way: Q1, Q2, Q3, deciles or arbitrary percentiles.
Q1_mpg, median_mpg, Q3_mpg = np.quantile(mpg_data["mpg"], [0.25, 0.5, 0.75])

# interquartile range: spread of the middle half of the values
IQR_mpg = Q3_mpg - Q1_mpg

# an arbitrary quantile, to show that any fraction is accepted
a_quantile_mpg = np.quantile(mpg_data["mpg"], 0.3145)

for label, value in (
    ("first quartile:", Q1_mpg),
    ("third quartile:", Q3_mpg),
    ("IQR:", IQR_mpg),
    ("median or Q2:", median_mpg),
):
    print(label, value)

25th Percentile of MPG: 17.5
Median (50th Percentile) of MPG: 23.0
75th Percentile of MPG: 29.0
first quartile: 17.5
third quartile: 29.0
IQR: 11.5
median or Q2: 23.0


In [28]:
# the 25th percentile and the 0.25 quantile are the same statistic (Q1)
np.percentile(mpg_data.mpg,25), np.quantile(mpg_data.mpg,0.25)

(17.5, 17.5)