# Mean, Median, Variance, Standard deviation, Minimum, Maximum

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

In [2]:
# load the dataset tips from the seaborn library

tips_df  = sns.load_dataset('tips')
tips_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
# Mean 

tips_df['tip'].mean()

2.99827868852459

In [4]:
# Median 

tips_df['tip'].median()

2.9

In [5]:
# Variance 

tips_df['tip'].var()

1.9144546380624725

In [6]:
# Standard deviation 

tips_df['tip'].std()

1.3836381890011826

In [7]:
# Min 

tips_df['tip'].min()

1.0

In [8]:
# Max

tips_df['tip'].max()

10.0

In [9]:
# Spread 

tips_df['tip'].max() - tips_df['tip'].min()

9.0

In [10]:
# Let's check if the Standard deviation is effectively the square root of the variance

tips_df['tip'].std() == (tips_df['tip'].var()) ** 0.5

True

In [11]:
# Distribution of the variable 'time'

tips_df['time'].value_counts()

Dinner    176
Lunch      68
Name: time, dtype: int64

In [12]:
# Table of descriptive statistics

tips_df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [13]:
# Quartile of the variable 'total_bill' : method .quantile()
# 1st quartile, 2nd quartile, 3rd quartile

tips_df['total_bill'].quantile([0.25, 0.5, 0.75])

0.25    13.3475
0.50    17.7950
0.75    24.1275
Name: total_bill, dtype: float64

In [14]:
# Quartile of the variable 'total_bill' : function quantile of numpy

np.quantile(tips_df['total_bill'], [0.25, 0.5, 0.75])

array([13.3475, 17.795 , 24.1275])

In [15]:
# Quartile of the variable 'total_bill' : function percentile of numpy

np.percentile(tips_df['total_bill'], [25, 50, 75])

array([13.3475, 17.795 , 24.1275])

In [16]:
# Calcul of quintiles

np.quantile(tips_df['total_bill'], [0.2, 0.4, 0.6, 0.8, 1])

array([12.636, 16.222, 19.818, 26.098, 50.81 ])

In [17]:
# IQR of the variable 'total_bill' (interquartile difference)

np.quantile(tips_df['total_bill'], 0.75) - np.quantile(tips_df['total_bill'], 0.25)

10.779999999999998

In [18]:
# IQR de la variable 'total_bill' avec le module stats
from scipy import stats

stats.iqr(tips_df['total_bill'])

10.779999999999998

In [19]:
# Dataframe with the clients of saturday and sunday (filtration)
tips_sat_sun = tips_df[tips_df['day'].isin(['Sat', 'Sun'])]
tips_sat_sun

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2


In [20]:
# Mean and median of the consumption for the clients of saturday and sunday

tips_sat_sun[tips_sat_sun['day'] == 'Sun']['total_bill'].mean()

21.41

In [24]:
tips_sat_sun.groupby('day')['total_bill'].agg(['mean', 'median'])

Unnamed: 0_level_0,mean,median
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Thur,,
Fri,,
Sat,20.441379,18.24
Sun,21.41,19.63
