# PARAMETRES STATISTIQUES DE DESCRIPTION D'UN ENSEMBLE DE DONNEES

## Importation des packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

In [4]:
# Données se trouvent dans seaborn
sns.get_dataset_names()


['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'taxis',
 'tips',
 'titanic']

In [5]:
# Importation des données

df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.3 KB


# Paramètres statiques usels

In [7]:
# Pourboire moyen

df['tip'].mean()

2.9982786885245902

In [9]:
# Pourboire median

df['tip'].median()

2.9

In [10]:
# Pourbois minimum

df.tip.min()

1.0

In [11]:
# Pourbois 

df.tip.max()

10.0

In [14]:
# Distribution de fréquence de la variable 'tip'

df.tip.value_counts()

2.00    33
3.00    23
4.00    12
5.00    10
2.50    10
        ..
2.83     1
1.58     1
3.71     1
3.35     1
2.18     1
Name: tip, Length: 123, dtype: int64

In [17]:
# Distribution de fréquence de la variable 'time' 

df.time.value_counts()

Dinner    176
Lunch      68
Name: time, dtype: int64

In [18]:
# Distribution de fréquence de la variable 'time'  en pourcentage

df.time.value_counts(normalize=True)

Dinner    0.721311
Lunch     0.278689
Name: time, dtype: float64

In [19]:
# Etendue de la variable 'tip'

df.tip.max() - df.tip.min()

9.0

In [20]:
# Variance de la variable 'tip'

df.tip.var()

1.9144546380624725

In [21]:
# Equart-type  de la variable 'tip'

df.tip.std()

1.3836381890011826

In [23]:
# Vérifion que l'equart-type est belle et bien égale à la racine carrée de la variance

df.tip.std() == df.tip.var()**0.5

True

In [43]:
# Vérifion que l'equart-type est belle et bien égale à la racine carrée de la variance

round(df.tip.std()**2, 3)==round(df.tip.var(), 3)

True

In [44]:
# Table de statistique descriptives

df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [45]:
df.describe(include='all')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
count,244.0,244.0,244,244,244,244,244.0
unique,,,2,2,4,2,
top,,,Male,No,Sat,Dinner,
freq,,,157,151,87,176,
mean,19.785943,2.998279,,,,,2.569672
std,8.902412,1.383638,,,,,0.9511
min,3.07,1.0,,,,,1.0
25%,13.3475,2.0,,,,,2.0
50%,17.795,2.9,,,,,2.0
75%,24.1275,3.5625,,,,,3.0


In [48]:
# Quartile de la variable 'total_bill'
print('1er quartile: {}'.format(df.total_bill.quantile(0.25)))

1er quartile: 13.3475


In [49]:
# Quartile de la variable 'total_bill'
print('2ème quartile: {}'.format(df.total_bill.quantile(0.5)))

2ème quartile: 17.795


In [50]:
# Quartile de la variable 'total_bill'
print('3ème quartile: {}'.format(df.total_bill.quantile(0.75)))

3ème quartile: 24.127499999999998


In [52]:
# Quartile de la variable 'total_bill'
print('1er quartile: {}'.format(np.quantile(df.total_bill, 0.25)))
# Quartile de la variable 'total_bill'
print('2ème quartile: {}'.format(np.quantile(df.total_bill, 0.5)))
# Quartile de la variable 'total_bill'
print('3ème quartile: {}'.format(np.quantile(df.total_bill, 0.75)))

1er quartile: 13.3475
2ème quartile: 17.795
3ème quartile: 24.127499999999998


In [53]:
# Quartile de la variable 'total_bill'
print('1er quartile: {}'.format(np.percentile(df.total_bill, 25)))
# Quartile de la variable 'total_bill'
print('2ème quartile: {}'.format(np.percentile(df.total_bill, 50)))
# Quartile de la variable 'total_bill'
print('3ème quartile: {}'.format(np.percentile(df.total_bill, 75)))

1er quartile: 13.3475
2ème quartile: 17.795
3ème quartile: 24.127499999999998


In [56]:
# Tous les quartiles en une seule lignes de code

np.quantile(df.total_bill, [0.25, 0.5, 0.75])

array([13.3475, 17.795 , 24.1275])

In [57]:
# les quintiles
np.quantile(df.total_bill, [.2, .4, .6, .8, 1])

array([12.636, 16.222, 19.818, 26.098, 50.81 ])

In [59]:
# IQR de la variable 'total_bill'

df.total_bill.quantile(.75) - df.total_bill.quantile(0.25)

10.779999999999998

In [60]:
from scipy import stats

stats.iqr(df.total_bill)

10.779999999999998

In [83]:
sat_sun_df = df[(df.day=='Sat') | (df.day == 'Sun')]
sat_sun_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2


In [84]:
## Consommation moyenne et median des clients du weekend
sat_sun_df.groupby('day')['total_bill'].agg(['mean', 'median'])

Unnamed: 0_level_0,mean,median
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Thur,,
Fri,,
Sat,20.441379,18.24
Sun,21.41,19.63


In [82]:
sat_sun_df.index

Int64Index([  4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
            ...
            233, 234, 235, 236, 237, 238, 239, 240, 241, 242],
           dtype='int64', length=159)