In [1]:
import pandas as pd
import numpy as np

In [2]:
import seaborn as sns
data = sns.load_dataset('titanic')
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
data['who'].unique()

array(['man', 'woman', 'child'], dtype=object)

In [4]:
data['sex'].unique()

array(['male', 'female'], dtype=object)

## Drop all duplicate columns

In [5]:
cols = ['alone', 'alive', 'adult_male', 'embarked', 'pclass']

data = data.drop(columns = cols, axis = 1)

data.head()

Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,who,deck,embark_town
0,0,male,22.0,1,0,7.25,Third,man,,Southampton
1,1,female,38.0,1,0,71.2833,First,woman,C,Cherbourg
2,1,female,26.0,0,0,7.925,Third,woman,,Southampton
3,1,female,35.0,1,0,53.1,First,woman,C,Southampton
4,0,male,35.0,0,0,8.05,Third,man,,Southampton


## Total passengers before dropping duplicate rows

In [6]:
data.shape

(891, 10)

## Dropping Duplicate rows


In [7]:
## Checking duplicated values

data[data.duplicated(keep = 'first')]

Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,who,deck,embark_town
47,1,female,,0,0,7.7500,Third,woman,,Queenstown
76,0,male,,0,0,7.8958,Third,man,,Southampton
77,0,male,,0,0,8.0500,Third,man,,Southampton
87,0,male,,0,0,8.0500,Third,man,,Southampton
95,0,male,,0,0,8.0500,Third,man,,Southampton
...,...,...,...,...,...,...,...,...,...,...
870,0,male,26.0,0,0,7.8958,Third,man,,Southampton
877,0,male,19.0,0,0,7.8958,Third,man,,Southampton
878,0,male,,0,0,7.8958,Third,man,,Southampton
884,0,male,25.0,0,0,7.0500,Third,man,,Southampton


In [8]:
## Checking the total duplicated values in the data

data.duplicated().sum()

107

In [9]:
## Dropping duplicated values

# data = data[data.duplicated(subset = None ,keep = 'first')].drop()
# data.head()

data = data.drop_duplicates()

In [10]:
data.shape

(784, 10)

In [11]:
data.head()

Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,who,deck,embark_town
0,0,male,22.0,1,0,7.25,Third,man,,Southampton
1,1,female,38.0,1,0,71.2833,First,woman,C,Cherbourg
2,1,female,26.0,0,0,7.925,Third,woman,,Southampton
3,1,female,35.0,1,0,53.1,First,woman,C,Southampton
4,0,male,35.0,0,0,8.05,Third,man,,Southampton


In [12]:
data.columns  = data.columns.str.capitalize()
data.head()

Unnamed: 0,Survived,Sex,Age,Sibsp,Parch,Fare,Class,Who,Deck,Embark_town
0,0,male,22.0,1,0,7.25,Third,man,,Southampton
1,1,female,38.0,1,0,71.2833,First,woman,C,Cherbourg
2,1,female,26.0,0,0,7.925,Third,woman,,Southampton
3,1,female,35.0,1,0,53.1,First,woman,C,Southampton
4,0,male,35.0,0,0,8.05,Third,man,,Southampton


In [13]:
col_rename_dict = {'Sibsp' : 'No. of Siblings/Spouse',
                  'Parch' : 'No. of parents/children',
                  'Class' : 'Passenger Class',
                  'Who' : 'Dependent Gender'}

data = data.rename(col_rename_dict, axis = 1)

In [14]:
data.head()

Unnamed: 0,Survived,Sex,Age,No. of Siblings/Spouse,No. of parents/children,Fare,Passenger Class,Dependent Gender,Deck,Embark_town
0,0,male,22.0,1,0,7.25,Third,man,,Southampton
1,1,female,38.0,1,0,71.2833,First,woman,C,Cherbourg
2,1,female,26.0,0,0,7.925,Third,woman,,Southampton
3,1,female,35.0,1,0,53.1,First,woman,C,Southampton
4,0,male,35.0,0,0,8.05,Third,man,,Southampton


## Replace the values of survived column

In [15]:
dic = {0 : 'No', 1 : 'Yes'}

data['Survived'] = data['Survived'].replace(dic)
data.head()

Unnamed: 0,Survived,Sex,Age,No. of Siblings/Spouse,No. of parents/children,Fare,Passenger Class,Dependent Gender,Deck,Embark_town
0,No,male,22.0,1,0,7.25,Third,man,,Southampton
1,Yes,female,38.0,1,0,71.2833,First,woman,C,Cherbourg
2,Yes,female,26.0,0,0,7.925,Third,woman,,Southampton
3,Yes,female,35.0,1,0,53.1,First,woman,C,Southampton
4,No,male,35.0,0,0,8.05,Third,man,,Southampton


In [16]:
data['Sex'] = data['Sex'].str.capitalize()
data['Dependent Gender'] = data['Dependent Gender'].str.capitalize()
data.head()

Unnamed: 0,Survived,Sex,Age,No. of Siblings/Spouse,No. of parents/children,Fare,Passenger Class,Dependent Gender,Deck,Embark_town
0,No,Male,22.0,1,0,7.25,Third,Man,,Southampton
1,Yes,Female,38.0,1,0,71.2833,First,Woman,C,Cherbourg
2,Yes,Female,26.0,0,0,7.925,Third,Woman,,Southampton
3,Yes,Female,35.0,1,0,53.1,First,Woman,C,Southampton
4,No,Male,35.0,0,0,8.05,Third,Man,,Southampton


In [17]:
data['Fare'] = data['Fare'].round(2)

In [18]:
data.head()

Unnamed: 0,Survived,Sex,Age,No. of Siblings/Spouse,No. of parents/children,Fare,Passenger Class,Dependent Gender,Deck,Embark_town
0,No,Male,22.0,1,0,7.25,Third,Man,,Southampton
1,Yes,Female,38.0,1,0,71.28,First,Woman,C,Cherbourg
2,Yes,Female,26.0,0,0,7.92,Third,Woman,,Southampton
3,Yes,Female,35.0,1,0,53.1,First,Woman,C,Southampton
4,No,Male,35.0,0,0,8.05,Third,Man,,Southampton


In [19]:
data['Age'].unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [20]:
data.head()

Unnamed: 0,Survived,Sex,Age,No. of Siblings/Spouse,No. of parents/children,Fare,Passenger Class,Dependent Gender,Deck,Embark_town
0,No,Male,22.0,1,0,7.25,Third,Man,,Southampton
1,Yes,Female,38.0,1,0,71.28,First,Woman,C,Cherbourg
2,Yes,Female,26.0,0,0,7.92,Third,Woman,,Southampton
3,Yes,Female,35.0,1,0,53.1,First,Woman,C,Southampton
4,No,Male,35.0,0,0,8.05,Third,Man,,Southampton


In [21]:
data = data[['Sex', 'Age', 'Fare', 'No. of Siblings/Spouse', 'No. of parents/children', 'Passenger Class', 'Dependent Gender', 'Deck','Embark_town', 'Survived']]
data.head()

Unnamed: 0,Sex,Age,Fare,No. of Siblings/Spouse,No. of parents/children,Passenger Class,Dependent Gender,Deck,Embark_town,Survived
0,Male,22.0,7.25,1,0,Third,Man,,Southampton,No
1,Female,38.0,71.28,1,0,First,Woman,C,Cherbourg,Yes
2,Female,26.0,7.92,0,0,Third,Woman,,Southampton,Yes
3,Female,35.0,53.1,1,0,First,Woman,C,Southampton,Yes
4,Male,35.0,8.05,0,0,Third,Man,,Southampton,No


In [24]:
data.isnull().sum() / len(data) * 100

Sex                         0.000000
Age                        13.520408
Fare                        0.000000
No. of Siblings/Spouse      0.000000
No. of parents/children     0.000000
Passenger Class             0.000000
Dependent Gender            0.000000
Deck                       74.234694
Embark_town                 0.255102
Survived                    0.000000
dtype: float64

In [25]:
data = data.drop('Deck', axis = 1)
data.head()

Unnamed: 0,Sex,Age,Fare,No. of Siblings/Spouse,No. of parents/children,Passenger Class,Dependent Gender,Embark_town,Survived
0,Male,22.0,7.25,1,0,Third,Man,Southampton,No
1,Female,38.0,71.28,1,0,First,Woman,Cherbourg,Yes
2,Female,26.0,7.92,0,0,Third,Woman,Southampton,Yes
3,Female,35.0,53.1,1,0,First,Woman,Southampton,Yes
4,Male,35.0,8.05,0,0,Third,Man,Southampton,No


In [32]:
data['Age Group'] = pd.cut(data['Age'], 3, labels = ['Young', 'Adult', 'Old'])
data.head()

Unnamed: 0,Sex,Age,Fare,No. of Siblings/Spouse,No. of parents/children,Passenger Class,Dependent Gender,Embark_town,Survived,Age Group
0,Male,22.0,7.25,1,0,Third,Man,Southampton,No,Young
1,Female,38.0,71.28,1,0,First,Woman,Cherbourg,Yes,Adult
2,Female,26.0,7.92,0,0,Third,Woman,Southampton,Yes,Young
3,Female,35.0,53.1,1,0,First,Woman,Southampton,Yes,Adult
4,Male,35.0,8.05,0,0,Third,Man,Southampton,No,Adult


In [33]:
data.to_csv('titanic_cleaned.csv', index = False)

In [34]:
data = pd.read_csv('MOCK_DATA(1).csv')
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'MOCK_DATA(1).csv'