#### Statistics
- **Prerequisite**: you should know the definition of **mean, median, mode, standard deviation, outliers, percentile, correlation coefficient** between variables
- Pandas dataframe allows us to  compute like mean, median, variance, standard deviation, minimum, maximum, etc

In [2]:
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

data_file = 'data_tips.csv' # make sure you have this file

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Dataframe

In [3]:
# important ones
df = pd.read_csv('data_tips.csv')

print(df)

# Lets find how many columns/rows in df
print(df.shape) # (rows,columns)

# TODO: Add description of tips

     total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner     2
1         10.34  1.66    Male     No   Sun  Dinner     3
2         21.01  3.50    Male     No   Sun  Dinner     3
3         23.68  3.31    Male     No   Sun  Dinner     2
4         24.59  3.61  Female     No   Sun  Dinner     4
..          ...   ...     ...    ...   ...     ...   ...
239       29.03  5.92    Male     No   Sat  Dinner     3
240       27.18  2.00  Female    Yes   Sat  Dinner     2
241       22.67  2.00    Male    Yes   Sat  Dinner     2
242       17.82  1.75    Male     No   Sat  Dinner     2
243       18.78  3.00  Female     No  Thur  Dinner     2

[244 rows x 7 columns]
(244, 7)


In [4]:
# give info about columns
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB
None


In [5]:
# give me the stats on each numerics columns
columns = ['total_bill', 'tip', 'size'] # numeric columns
print(df[columns].describe()) 

       total_bill         tip        size
count  244.000000  244.000000  244.000000
mean    19.785943    2.998279    2.569672
std      8.902412    1.383638    0.951100
min      3.070000    1.000000    1.000000
25%     13.347500    2.000000    2.000000
50%     17.795000    2.900000    2.000000
75%     24.127500    3.562500    3.000000
max     50.810000   10.000000    6.000000


In [6]:
# Lets do Column-wise Statistics:

# step1: lets do for 1 numeric column total_bill 

print( df['total_bill'].mean()   )
print( df['total_bill'].median() )
print( df['total_bill'].std()    )
print( df['total_bill'].min()    )
print( df['total_bill'].max()    )
print( df['total_bill'].sum()    )

#####################
# lets find the most common ocurring tip size for column tip
print( df['tip'].mode().iloc[0]  )

19.78594262295082
17.795
8.902411954856856
3.07
50.81
4827.77
2.0


In [7]:
# step2: lets do for multiple numeric columns = ['total_bill', 'tip', 'size'] by using for loop
columns = ['total_bill', 'tip', 'size']
for col in columns:
    print(f"stats for {col}")
    print( df[col].mean()   )
    print( df[col].median() )
    print( df[col].std()    )
    print( df[col].min()    )
    print( df[col].max()    )
    print( df[col].sum()    )
    print(f"---DONE---\n")


stats for total_bill
19.78594262295082
17.795
8.902411954856856
3.07
50.81
4827.77
---DONE---

stats for tip
2.99827868852459
2.9
1.3836381890011822
1.0
10.0
731.5799999999999
---DONE---

stats for size
2.569672131147541
2.0
0.9510998047322344
1
6
627
---DONE---



In [8]:
# show me the unique counts and the values for columns = ['sex', 'smoker', 'day', 'time', 'size']

# step1: The unique counts
columns = ['sex', 'smoker', 'day', 'time', 'size']
unique_counts = df[columns].nunique()
print(unique_counts)
print("###########################")

# step2: The unique values
for col in columns:
    print(f"{col}:")
    print(df[col].unique())
    print("--------\n")
print("###########################")

# step3: Now show me the individual value counts
for col in columns:
    print(df[col].value_counts())
    print("***************\n")

sex       2
smoker    2
day       4
time      2
size      6
dtype: int64
###########################
sex:
['Female' 'Male']
--------

smoker:
['No' 'Yes']
--------

day:
['Sun' 'Sat' 'Thur' 'Fri']
--------

time:
['Dinner' 'Lunch']
--------

size:
[2 3 4 1 6 5]
--------

###########################
sex
Male      157
Female     87
Name: count, dtype: int64
***************

smoker
No     151
Yes     93
Name: count, dtype: int64
***************

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64
***************

time
Dinner    176
Lunch      68
Name: count, dtype: int64
***************

size
2    156
3     38
4     37
5      5
1      4
6      4
Name: count, dtype: int64
***************



In [9]:
# Now I want to find Q1, Q2, Q3, IQR and outliers.

# Inter - Quantile Range (IQR - For Total Bill)

Q1 = df['total_bill'].quantile(0.25)
Q2 = df['total_bill'].quantile(0.50)
Q3 = df['total_bill'].quantile(0.75)
print(f"Q1: {Q1}")
print(f"Q2: {Q2}")
print(f"Q3: {Q3}")


IQR = Q3 - Q1
print(f"IQR: {IQR}")

# Define the bounds for outliers
lower_bound = Q1 - 1.5 * IQR 
upper_bound = Q3 + 1.5 * IQR 
print(f"lower bound: {lower_bound}")
print(f"upper bound: {upper_bound}")

# Find the outliers
mask = (df['total_bill'] < lower_bound) | (df['total_bill'] > upper_bound)
df_outliers = df[mask]

# Display the outliers
print(f"Total number of outliers: {len(df_outliers.index)}")
print(f"Outlier Data Points:\n{df_outliers}")

Q1: 13.3475
Q2: 17.795
Q3: 24.127499999999998
IQR: 10.779999999999998
lower bound: -2.8224999999999945
upper bound: 40.29749999999999
Total number of outliers: 9
Outlier Data Points:
     total_bill    tip     sex smoker   day    time  size
59        48.27   6.73    Male     No   Sat  Dinner     4
102       44.30   2.50  Female    Yes   Sat  Dinner     3
142       41.19   5.00    Male     No  Thur   Lunch     5
156       48.17   5.00    Male     No   Sun  Dinner     6
170       50.81  10.00    Male    Yes   Sat  Dinner     3
182       45.35   3.50    Male    Yes   Sun  Dinner     3
184       40.55   3.00    Male    Yes   Sun  Dinner     2
197       43.11   5.00  Female    Yes  Thur   Lunch     4
212       48.33   9.00    Male     No   Sat  Dinner     4


In [None]:
# Now I want to find Q1, Q2, Q3, IQR and outliers for each column: total_bill and tip

# Inter - Quantile Range (IQR - For Total Bill)
col = 'tip' # total_bill
Q1 = df[col].quantile(0.25)
Q2 = df[col].quantile(0.50)
Q3 = df[col].quantile(0.75)
print(f"Q1: {Q1}")
print(f"Q2: {Q2}")
print(f"Q3: {Q3}")


IQR = Q3 - Q1
print(f"IQR: {IQR}")

# Define the bounds for outliers
lower_bound = Q1 - 1.5 * IQR 
upper_bound = Q3 + 1.5 * IQR 
print(f"lower bound: {lower_bound}")
print(f"upper bound: {upper_bound}")

# Find the outliers
mask = (df[col] < lower_bound) | (df[col] > upper_bound)
df_outliers = df[mask]

# Display the outliers
print(f"Total number of outliers: {len(df_outliers.index)}")
print(f"Outlier Data Points:\n{df_outliers}")

In [11]:
# Checking for Null Values
print(df.isnull().sum())

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64


In [12]:
# I want to find correlation coeff. between features in tips dataset: -1 to 0 to +1

# Select numeric columns
numeric_cols = ['tip', 'total_bill', 'size']

correlation_matrix = df[numeric_cols].corr()
print("Correlation matrix of numeric columns:\n",correlation_matrix)

Correlation matrix of numeric columns:
                  tip  total_bill      size
tip         1.000000    0.675734  0.489299
total_bill  0.675734    1.000000  0.598315
size        0.489299    0.598315  1.000000


### OPTIONAL