In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read the training data from train.csv and preview the first rows.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Some information about the dataset

In [61]:
# Dimensions of the frame as (rows, columns).
df.shape

(891, 12)

In [62]:
# Per-column non-null counts and dtypes, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


### Check whether the dataset contains null values

In [4]:
# Total number of missing cells across the whole frame:
# count the nulls per column first, then add the per-column counts together.
null_counts_per_column = df.isna().sum()
null_counts_per_column.sum()

866

In [63]:
# Number of missing values in the Age column.
age_is_missing = df['Age'].isna()
age_is_missing.sum()

177

In [5]:
# Fill the missing ages. Two strategies were considered:
#   1. the column median (used here), or
#   2. random ages drawn from within one standard deviation of the mean (not used).
#
# The filled column is assigned back to df instead of calling
# fillna(..., inplace=True) on the df['Age'] selection: that chained in-place
# form emits a FutureWarning in pandas 2.x and leaves df unchanged once
# Copy-on-Write is active.
median = df['Age'].median()
df['Age'] = df['Age'].fillna(median)

In [6]:
# Number of missing values in the Embarked column.
embarked_is_missing = df['Embarked'].isna()
embarked_is_missing.sum()

2

In [7]:
# Embarked has only two missing values, so forward-fill them from the
# preceding row. The result is assigned back rather than calling
# ffill(inplace=True) on the df['Embarked'] selection, which warns in
# pandas 2.x and does not modify df once Copy-on-Write is active.
df['Embarked'] = df['Embarked'].ffill()

## Second, we should select the features

In [66]:
# PassengerId is just a row identifier and should not influence the model,
# so remove that column from the frame.
df.drop(columns='PassengerId', inplace=True)

In [8]:
# Preview the first rows again after the missing-value handling above.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Now let's look at the data type of each column

In [9]:
# Data type of every column.
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

### Look at the relationship between some columns and the average survival rate

In [10]:
# Mean survival rate per passenger class, highest first.
survival_by_pclass = (
    df.groupby('Pclass', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)
survival_by_pclass

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [11]:
# Mean survival rate per sex, highest first.
survival_by_sex = (
    df.groupby('Sex', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)
survival_by_sex

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [12]:
# SibSp counts siblings/spouses (brother, sister, ...) for each passenger.
# Mean survival rate per SibSp value, highest first.
survival_by_sibsp = (
    df.groupby('SibSp', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)
survival_by_sibsp

Unnamed: 0,SibSp,Survived
1,1,0.535885
2,2,0.464286
0,0,0.345395
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


In [13]:
# Mean survival rate per Parch value, highest first.
survival_by_parch = (
    df.groupby('Parch', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)
survival_by_parch

Unnamed: 0,Parch,Survived
3,3,0.6
1,1,0.550847
2,2,0.5
0,0,0.343658
5,5,0.2
4,4,0.0
6,6,0.0


## Select and Manage Features

In [16]:
# Cast Age to integer (its missing values were filled earlier).
df['Age'] = df['Age'].astype(int)

# Encode the port of embarkation as an integer in a new column,
# then drop the original text column.
port_codes = {'S': 0, 'C': 1, 'Q': 2}
df['Port'] = df['Embarked'].map(port_codes).astype(int)
df.drop('Embarked', axis=1, inplace=True)

# Has_Cabin: 1 when a cabin is recorded for the passenger, 0 when it is missing.
# notna() expresses that directly instead of testing for the float type of NaN
# row by row.
df['Has_Cabin'] = df['Cabin'].notna().astype(int)
df.drop('Cabin', axis=1, inplace=True)

# FamilySize: parents/children + siblings/spouses + the passenger.
df['FamilySize'] = df['Parch'] + df['SibSp'] + 1

# isAlone: 1 when the passenger travels without family, else 0.
df['isAlone'] = (df['FamilySize'] == 1).astype(int)

# Title: the word directly before the first '.' in the name (e.g. "Mr", "Mrs").
# The pattern is a raw string so that \w and \. reach the regex engine as-is
# instead of being treated as (invalid) Python string escapes.
df['Title'] = df['Name'].str.extract(r' (\w+)\.', expand=False)

# Collapse infrequent titles into a single 'Rare' bucket ...
rare_titles = ['Lady', 'Countess', 'Capt', 'Col',
               'Don', 'Dr', 'Major', 'Rev', 'Sir',
               'Jonkheer', 'Dona']
df['Title'] = df['Title'].replace(rare_titles, 'Rare')
# ... and fold the French/alternative spellings into their common equivalents.
df['Title'] = df['Title'].replace({'Mlle':'Miss','Ms':'Miss','Mme':'Mrs'})

In [18]:
# Derive "FamilySizeGroup" from FamilySize:
#   'Alone' for a family of 1, 'Big' for 5 or more, 'Small' for everything else.
family_size = df['FamilySize']
travels_alone = family_size == 1
has_big_family = family_size >= 5

df['FamilySizeGroup'] = 'Small'
df.loc[travels_alone, 'FamilySizeGroup'] = 'Alone'
df.loc[has_big_family, 'FamilySizeGroup'] = 'Big'

In [21]:
# Sanity check: mean survival rate per FamilySizeGroup, highest first.
survival_by_family_group = (
    df.groupby('FamilySizeGroup', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)
survival_by_family_group

Unnamed: 0,FamilySizeGroup,Survived
2,Small,0.578767
0,Alone,0.303538
1,Big,0.16129


In [22]:
# Encode Sex numerically: female -> 1, male -> 0.
sex_codes = {'female':1,'male':0}
encoded_sex = df['Sex'].map(sex_codes)
df['Sex'] = encoded_sex.astype(int)

In [28]:
# List the distinct ages in ascending order, as a guide for splitting
# them into 5 groups.
np.unique(df['Age'].to_numpy())

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 70,
       71, 74, 80], dtype=int64)

In [29]:
# Replace each age with the code of its age group (upper edges inclusive):
#   1: <= 14   2: 15-32   3: 33-48   4: 49-64   5: > 64
age_bin_edges = [-np.inf, 14, 32, 48, 64, np.inf]
age_group_codes = [1, 2, 3, 4, 5]
age_groups = pd.cut(df['Age'], bins=age_bin_edges, labels=age_group_codes)
df['Age'] = age_groups.astype(int)

In [36]:
# Summary statistics for Fare; the quartile values shown here are used as the
# cut points for splitting Fare into 3 groups.
df['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [37]:
# Replace each fare with the code of its fare group (upper edges inclusive):
#   0: <= 7.9104   1: (7.9104, 14.4542]   2: > 14.4542
# The column stays floating point at this stage.
fare_bin_edges = [-np.inf, 7.9104, 14.4542, np.inf]
fare_group_codes = [0, 1, 2]
fare_groups = pd.cut(df['Fare'], bins=fare_bin_edges, labels=fare_group_codes)
df['Fare'] = fare_groups.astype(float)

In [39]:
# Store the Fare group codes as integers instead of floats.
fare_as_int = df['Fare'].astype(int)
df['Fare'] = fare_as_int

In [43]:
# Show the distinct values of Title and FamilySizeGroup so they can be
# mapped to numeric codes.
for column_name in ('Title', 'FamilySizeGroup'):
    print(df[column_name].unique())

['Mr' 'Mrs' 'Miss' 'Master' 'Rare']
['Small' 'Alone' 'Big']


In [44]:
# Numeric codes for the two engineered categorical columns.
FamilySizeGroup_map = {'Small':0, 'Alone':1, 'Big':2}
Title_map = {'Mr':0, 'Mrs':1, 'Miss':2, 'Master':3, 'Rare':4}

# Apply each code table to its column.
code_tables = (
    ('FamilySizeGroup', FamilySizeGroup_map),
    ('Title', Title_map),
)
for column_name, code_table in code_tables:
    df[column_name] = df[column_name].map(code_table)

In [46]:
# Engineer a new feature: IsChildandRich is 1 for children travelling in
# first or second class, 0 otherwise.
#
# At this point Age holds age-group codes 1-5 (1 = 14 years and under), so the
# child test has to compare against group 1; a comparison with 0 never matches
# and would leave the flag at 0 for every passenger.
CHILD_AGE_GROUP = 1
is_child = df['Age'] == CHILD_AGE_GROUP
is_upper_class = df['Pclass'].isin([1, 2])
df['IsChildandRich'] = (is_child & is_upper_class).astype(int)

In [48]:
# Drop the columns that are not needed for modelling: identifiers, free-text
# fields, and the raw inputs of the engineered features.
#
# errors='ignore' skips any column that is already gone. PassengerId is dropped
# by an earlier cell, so deleting it unconditionally here raises KeyError when
# the notebook is run top to bottom; it also keeps this cell safe to re-run.
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch',
                   'FamilySize', 'Port']
df = df.drop(columns=columns_to_drop, errors='ignore')

# Dataset after cleaning and feature engineering

In [49]:
# Preview the first rows of the frame after cleaning and feature engineering.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Title,Has_Cabin,isAlone,FamilySizeGroup,IsChildandRich
0,0,3,0,2,0,0,0,0,0,0
1,1,1,1,3,2,1,1,0,0,0
2,1,3,1,2,1,2,0,1,1,0
3,1,1,1,3,2,1,1,0,0,0
4,0,3,0,3,1,0,0,1,1,0


# Now let us start to build our model 'machine learning part'