Processing the Titanic dataset.

In [34]:
import pandas as pd
import numpy as np
import os
import seaborn as sns

In [35]:
# Load seaborn's "titanic" example dataset into `df`; the Titanic cells that
# follow read and modify this frame.
df = sns.load_dataset("titanic")

In [36]:
# Show the column labels available in the frame.
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [37]:
# Count how many rows fall under each value of `sex`.
df["sex"].value_counts()

male      577
female    314
Name: sex, dtype: int64

In [38]:
# Number of distinct values per column (a quick cardinality check).
df.nunique()

survived         2
pclass           3
sex              2
age             88
sibsp            7
parch            7
fare           248
embarked         3
class            3
who              3
adult_male       2
deck             7
embark_town      3
alive            2
alone            2
dtype: int64

In [39]:
# Print the number of distinct passenger classes, then the row count per class,
# each on its own line.
print(df["pclass"].nunique(), df["pclass"].value_counts(), sep="\n")

3
3    491
1    216
2    184
Name: pclass, dtype: int64


In [40]:
# Distinct-value counts restricted to the `pclass` and `parch` columns.
df.loc[:, ["pclass", "parch"]].nunique()

pclass    3
parch     7
dtype: int64

In [41]:
# Look up the storage dtype of `embarked` before converting it.
df.dtypes["embarked"]

dtype('O')

In [42]:
# Re-encode `embarked` as a categorical column; the categories are inferred
# from the values present in the data.
df["embarked"] = df["embarked"].astype(pd.CategoricalDtype())

In [43]:
# Summarize the converted column: non-null count, dtype and memory usage.
df["embarked"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 891 entries, 0 to 890
Series name: embarked
Non-Null Count  Dtype   
--------------  -----   
889 non-null    category
dtypes: category(1)
memory usage: 1.1 KB


In [44]:
# Keep only the rows whose `embarked` value is "C".
df[df["embarked"].eq("C")]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
19,1,3,female,,0,0,7.2250,C,Third,woman,False,,Cherbourg,yes,True
26,0,3,male,,0,0,7.2250,C,Third,man,True,,Cherbourg,no,True
30,0,1,male,40.0,0,0,27.7208,C,First,man,True,,Cherbourg,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
866,1,2,female,27.0,1,0,13.8583,C,Second,woman,False,,Cherbourg,yes,False
874,1,2,female,28.0,1,0,24.0000,C,Second,woman,False,,Cherbourg,yes,False
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False


In [45]:
# Keep every row whose `embarked` value is not "S".
# NOTE: `ne` (like `!=`) evaluates to True for missing values, so rows with a
# null `embarked` are selected as well.
df[df["embarked"].ne("S")]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
16,0,3,male,2.0,4,1,29.1250,Q,Third,child,False,,Queenstown,no,False
19,1,3,female,,0,0,7.2250,C,Third,woman,False,,Cherbourg,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [46]:
# Rows for female passengers younger than 30 (both conditions must hold).
df[df["age"].lt(30) & df["sex"].eq("female")]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
14,0,3,female,14.0,0,0,7.8542,S,Third,child,False,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
874,1,2,female,28.0,1,0,24.0000,C,Second,woman,False,,Cherbourg,yes,False
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
882,0,3,female,22.0,0,0,10.5167,S,Third,woman,False,,Southampton,no,True


In [47]:
# Rows where the fare exceeds 500 or the age exceeds 70 (either condition).
df[df["fare"].gt(500) | df["age"].gt(70)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
493,0,1,male,71.0,0,0,49.5042,C,First,man,True,,Cherbourg,no,True
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
679,1,1,male,36.0,0,1,512.3292,C,First,man,True,B,Cherbourg,yes,False
737,1,1,male,35.0,0,0,512.3292,C,First,man,True,B,Cherbourg,yes,True
851,0,3,male,74.0,0,0,7.775,S,Third,man,True,,Southampton,no,True


In [48]:
# Count the missing values in each column.
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [49]:
# Remove the `who` column from the frame.
# `errors="ignore"` keeps the cell idempotent: re-running it once `who` is
# already gone is a no-op instead of raising KeyError.
df = df.drop(columns="who", errors="ignore")

In [50]:
# Preview the first rows to confirm that `who` is no longer among the columns.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,True,,Southampton,no,True


In [51]:
# Distinct `deck` values (the missing-value marker included) before imputation.
df["deck"].unique()

[NaN, 'C', 'E', 'G', 'D', 'A', 'B', 'F']
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

In [52]:
# Most frequent deck; `mode()` can return several values, so take the first.
mode = df["deck"].mode().iloc[0]
print(mode)

C


In [53]:
# Replace every missing `deck` entry with the most frequent deck.
df["deck"] = df["deck"].fillna(mode)

In [54]:
# Re-check the distinct `deck` values after the fill; NaN should no longer appear.
df["deck"].unique()

['C', 'E', 'G', 'D', 'A', 'B', 'F']
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

In [56]:
# Median of `age`, kept in `median` so the missing ages can be filled with it.
median = df["age"].median()
print(median)

28.0


In [57]:
# Replace every missing `age` entry with the median age.
df["age"] = df["age"].fillna(median)

In [61]:
# Verify the fill: the number of missing ages should now be zero.
df["age"].isna().sum()

0

In [60]:
# Preview the frame after `deck` and `age` have been imputed.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,True,C,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,False,C,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,True,C,Southampton,no,True


In [63]:
# Survival rate (mean), survivor count (sum) and group size (count) for every
# passenger-class / sex combination.
(
    df.groupby(["pclass", "sex"])[["survived"]]
    .agg(["mean", "sum", "count"])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,survived,survived,survived
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,sum,count
pclass,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,female,0.968085,91,94
1,male,0.368852,45,122
2,female,0.921053,70,76
2,male,0.157407,17,108
3,female,0.5,72,144
3,male,0.135447,47,347


In [64]:
# Binary indicator: 1 for passengers younger than 30, 0 otherwise.
# The vectorized comparison replaces the per-row `apply(lambda ...)`; a missing
# age compares False and therefore maps to 0, exactly as the lambda did.
# The explicit int64 cast keeps the dtype stable across platforms.
df["age_flag"] = (df["age"] < 30).astype("int64")

In [66]:
# Show `age` next to the derived `age_flag` to eyeball the mapping.
df.loc[:, ["age", "age_flag"]]

Unnamed: 0,age,age_flag
0,22.0,1
1,38.0,0
2,26.0,1
3,35.0,0
4,35.0,0
...,...,...
886,27.0,1
887,19.0,1
888,28.0,1
889,26.0,1


Processing the Tips dataset.

In [67]:
# Load seaborn's "tips" example dataset.
# NOTE: this rebinds `df`, so the Titanic frame is no longer reachable under
# that name from here on.
df = sns.load_dataset("tips")

In [68]:
# Show the column labels of the tips frame.
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [69]:
# Distinct values of `time`.
df["time"].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [75]:
# Minimum, maximum and mean `total_bill` for each `time` value.
# `time` is categorical; `observed` is passed explicitly because grouping on a
# categorical without it triggers a FutureWarning in recent pandas and its
# default is changing. `observed=False` keeps the historical behavior.
df.groupby("time", observed=False).agg({"total_bill": ["min", "max", "mean"]})

Unnamed: 0_level_0,total_bill,total_bill,total_bill
Unnamed: 0_level_1,min,max,mean
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Lunch,7.51,43.11,17.168676
Dinner,3.07,50.81,20.797159


In [77]:
# `total_bill` statistics for every `time` / `day` combination.
# Both keys are categorical. `observed=False` is pinned explicitly so that
# combinations with no rows (e.g. Lunch on Sat/Sun) keep appearing as NaN/0.0
# rows regardless of the pandas default; use `observed=True` to drop them.
df.groupby(["time", "day"], observed=False).agg(
    {"total_bill": ["min", "max", "mean", "sum"]}
)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,sum
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Lunch,Thur,7.51,43.11,17.664754,1077.55
Lunch,Fri,8.58,16.27,12.845714,89.92
Lunch,Sat,,,,0.0
Lunch,Sun,,,,0.0
Dinner,Thur,18.78,18.78,18.78,18.78
Dinner,Fri,5.75,40.17,19.663333,235.96
Dinner,Sat,3.07,50.81,20.441379,1778.4
Dinner,Sun,7.25,48.17,21.41,1627.16


In [78]:
# Preview the first rows of the tips frame.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [79]:
# Bill and tip statistics per `day`, restricted to lunches paid by women.
# Columns are accessed with brackets rather than attributes, matching the rest
# of the notebook and avoiding clashes with DataFrame attributes.
# `observed=False` is pinned so that days with no matching rows stay in the
# result irrespective of the pandas default for categorical groupers.
(
    df.loc[(df["time"] == "Lunch") & (df["sex"] == "Female")]
    .groupby("day", observed=False)
    .agg(
        {
            "total_bill": ["mean", "sum", "max", "min"],
            "tip": ["mean", "sum", "max", "min"],
        }
    )
)

Unnamed: 0_level_0,total_bill,total_bill,total_bill,total_bill,tip,tip,tip,tip
Unnamed: 0_level_1,mean,sum,max,min,mean,sum,max,min
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Thur,16.64871,516.11,43.11,8.35,2.561935,79.42,5.17,1.25
Fri,13.94,55.76,16.27,10.09,2.745,10.98,3.48,2.0
Sat,,0.0,,,,0.0,,
Sun,,0.0,,,,0.0,,


In [80]:
# Preview the first rows of the tips frame again.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [82]:
# Mean `total_bill` over rows with a party size below 3 and a bill above 10.
# Selecting with a one-element list keeps the result a labelled Series.
df.loc[df["size"].lt(3) & df["total_bill"].gt(10), ["total_bill"]].mean()

total_bill    17.184965
dtype: float64

In [83]:
# New column: the bill plus the tip for each row.
df["total_bill_tip_sum"] = df["total_bill"].add(df["tip"])

In [84]:
# Order rows by bill-plus-tip, largest first, and keep the leading 30.
new_df = df.sort_values(by="total_bill_tip_sum", ascending=False).iloc[:30]

In [85]:
# Display the rows held in `new_df`.
new_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_bill_tip_sum
170,50.81,10.0,Male,Yes,Sat,Dinner,3,60.81
212,48.33,9.0,Male,No,Sat,Dinner,4,57.33
59,48.27,6.73,Male,No,Sat,Dinner,4,55.0
156,48.17,5.0,Male,No,Sun,Dinner,6,53.17
182,45.35,3.5,Male,Yes,Sun,Dinner,3,48.85
197,43.11,5.0,Female,Yes,Thur,Lunch,4,48.11
23,39.42,7.58,Male,No,Sat,Dinner,4,47.0
102,44.3,2.5,Female,Yes,Sat,Dinner,3,46.8
142,41.19,5.0,Male,No,Thur,Lunch,5,46.19
95,40.17,4.73,Male,Yes,Fri,Dinner,4,44.9


In [86]:
# Summarize `new_df`: index range, per-column non-null counts, dtypes, memory.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 170 to 155
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   total_bill          30 non-null     float64 
 1   tip                 30 non-null     float64 
 2   sex                 30 non-null     category
 3   smoker              30 non-null     category
 4   day                 30 non-null     category
 5   time                30 non-null     category
 6   size                30 non-null     int64   
 7   total_bill_tip_sum  30 non-null     float64 
dtypes: category(4), float64(3), int64(1)
memory usage: 1.9 KB


In [88]:
# Preview the first rows of `new_df`.
new_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_bill_tip_sum
170,50.81,10.0,Male,Yes,Sat,Dinner,3,60.81
212,48.33,9.0,Male,No,Sat,Dinner,4,57.33
59,48.27,6.73,Male,No,Sat,Dinner,4,55.0
156,48.17,5.0,Male,No,Sun,Dinner,6,53.17
182,45.35,3.5,Male,Yes,Sun,Dinner,3,48.85
