# Titanic Analysis (Practice for Data Cleaning)  
Data source: https://www.kaggle.com/competitions/titanic/data

<u> 1. Import packages and read data. </u>

In [1]:
import numpy as np
import pandas as pd

# Read the Kaggle Titanic training set from the local titanic_data folder.
data = pd.read_csv(filepath_or_buffer='titanic_data//train.csv')

<u> 2. Useful operations for data cleaning. </u>

In [2]:
# Preview the first five rows to check that the file loaded as expected.
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Split "Name" at the first comma into two new columns.
# `n` is passed by keyword: pandas 2.0 made every argument of str.split after
# `pat` keyword-only, so the positional form split(',', 1, ...) raises TypeError.
# NOTE(review): in the rows displayed below, the text before the comma is the
# surname (e.g. "Braund") yet it lands in 'FirstName' — the two labels look
# swapped. They are kept unchanged here so the column names stay stable; confirm
# the intended naming.
data[['FirstName', 'FamilyName']] = data['Name'].str.split(',', n=1, expand=True)
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstName,FamilyName
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Braund,Mr. Owen Harris
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen,Miss. Laina


In [4]:
# Give the original name column a more explicit label: "Name" -> "FullName".
data.rename({'Name': 'FullName'}, axis='columns', inplace=True)
data.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,FullName,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstName,FamilyName
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Braund,Mr. Owen Harris
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen,Miss. Laina


In [5]:
# Drop every column that has too many null values: a column is kept only when
# at least 80% of its rows are non-null.
min_non_null = 0.8 * len(data)
data.dropna(axis='columns', thresh=min_non_null, inplace=True)
data.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,FullName,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FirstName,FamilyName
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Braund,Mr. Owen Harris
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Heikkinen,Miss. Laina


In [6]:
# Encode "Sex" numerically: 'male' -> 0, 'female' -> 1.
# The result is assigned back to the column instead of calling
# data['Sex'].replace(..., inplace=True). That in-place call on a selected
# column is chained assignment: pandas 2.x warns about it, and with
# Copy-on-Write enabled it modifies a temporary and leaves `data` unchanged.
# A single mapping also does both substitutions in one pass.
data['Sex'] = data['Sex'].replace({'male': 0, 'female': 1})
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,FullName,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FirstName,FamilyName
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,S,Braund,Mr. Owen Harris
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,S,Heikkinen,Miss. Laina


In [7]:
# Replace null values in numeric columns with the column median.
# numeric_only=True is required: the frame still holds string columns
# (e.g. Ticket, Embarked), and since pandas 2.0 DataFrame.median() no longer
# skips them silently — it raises TypeError instead.
data.fillna(data.median(numeric_only=True), inplace=True)
# Replace the remaining null values (non-numeric columns) with the column mode.
# mode() returns one row per tied value, so .iloc[0] picks the first mode of
# each column.
data.fillna(data.mode().iloc[0], inplace=True)
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,FullName,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FirstName,FamilyName
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,S,Braund,Mr. Owen Harris
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,S,Heikkinen,Miss. Laina


In [8]:
# Remove the full-name column; its two parts were already split into
# separate columns earlier in the notebook.
data.drop(columns=['FullName'], inplace=True)
data.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FirstName,FamilyName
0,1,0,3,0,22.0,1,0,A/5 21171,7.25,S,Braund,Mr. Owen Harris
1,2,1,1,1,38.0,1,0,PC 17599,71.2833,C,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,3,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,S,Heikkinen,Miss. Laina
