1. Frame the problem

In this challenge, we ask you to build a predictive model that answers the question: “what sorts of people were more likely to survive?” using passenger data (ie name, age, gender, socio-economic class, etc).

In [832]:
''''
survival	Survival	0 = No, 1 = Yes
pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
sex	Sex	
Age	Age in years	
sibsp	# of siblings / spouses aboard the Titanic	
parch	# of parents / children aboard the Titanic	
ticket	Ticket number	
fare	Passenger fare	# أجرة
cabin	Cabin number	
embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton
'''

"'\nsurvival\tSurvival\t0 = No, 1 = Yes\npclass\tTicket class\t1 = 1st, 2 = 2nd, 3 = 3rd\nsex\tSex\t\nAge\tAge in years\t\nsibsp\t# of siblings / spouses aboard the Titanic\t\nparch\t# of parents / children aboard the Titanic\t\nticket\tTicket number\t\nfare\tPassenger fare\t# أجرة\ncabin\tCabin number\t\nembarked\tPort of Embarkation\tC = Cherbourg, Q = Queenstown, S = Southampton\n"

# Import the important libraries

In [833]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score, cross_val_predict

# Read the data

In [834]:
training_data = pd.read_csv(r'D:\AI\End_to_end_projects\Titanic-Machine Learning From Disaster\train.csv')
x_test = pd.read_csv(r'D:\AI\End_to_end_projects\Titanic-Machine Learning From Disaster\test.csv')
y_test = pd.read_csv(r'D:\AI\End_to_end_projects\Titanic-Machine Learning From Disaster\gender_submission.csv')

# Split the data to training and target

In [835]:
x_train = training_data.drop(columns = ['Survived'])
y_train = pd.DataFrame(training_data['Survived'])

# Rename some columns to make the things easier

In [836]:
x_train.rename(columns={'Parch' : 'Parents', 'SibSp':'Siblings'}, inplace=True)
x_test.rename(columns={'Parch' : 'Parents', 'SibSp':'Siblings'}, inplace=True)
x_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Siblings,Parents,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# diplay the information about the data

In [837]:
print(x_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   Siblings     891 non-null    int64  
 6   Parents      891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB
None


In [838]:
print(x_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   Siblings     418 non-null    int64  
 6   Parents      418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB
None


# Checking for null values in training and testing dataframes

In [839]:
print(x_train.isna().sum())
print('=======')
print(x_test.isna().sum())

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
Siblings         0
Parents          0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
Siblings         0
Parents          0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


# Drop unnecessary column in y_test

In [840]:
y_test.drop(columns = ['PassengerId'], inplace=True)

In [841]:
print(x_train['Embarked'].value_counts())
print(x_train['Cabin'].value_counts())

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64
Cabin
B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: count, Length: 147, dtype: int64


# Drop unnecessary columns

In [842]:
x_train.drop(columns=['Name', 'Ticket', 'PassengerId'], inplace=True)
x_test.drop(columns=['Name', 'Ticket', 'PassengerId'], inplace=True)

# Handling null values

In [843]:
nan_rows_train = x_train[x_train['Embarked'].isna()].index
nan_rows_test = x_test[x_test['Fare'].isna()].index
print(nan_rows_train)
print(nan_rows_test)


Index([61, 829], dtype='int64')
Index([152], dtype='int64')


In [844]:
x_train.dropna(subset=['Embarked'], axis = 0, inplace=True)
y_train.drop(nan_rows_train, axis=0,inplace=True)
x_test.dropna(subset=['Fare'], axis = 0, inplace=True)
y_test.drop(nan_rows_test, axis=0, inplace=True)
x_train.drop(columns=['Cabin'], inplace=True)
x_test.drop(columns=['Cabin'], inplace=True)


In [845]:
print(x_train.shape)
print(y_train.shape)

(889, 7)
(889, 1)


In [846]:
x_train['Sex'].unique()

array(['male', 'female'], dtype=object)

In [847]:
imputer = KNNImputer()
x_train['Age'] = imputer.fit_transform(x_train[['Age']])
x_test['Age'] = imputer.fit_transform(x_test[['Age']])

In [848]:
print(x_train.isna().sum().sum())
print(x_test.isna().sum().sum())

0
0


In [849]:
print(x_train.shape)
print(y_train.shape)

(889, 7)
(889, 1)


# Encoding categorical columns

In [850]:
encoder = OneHotEncoder()
x_train_encoded = pd.DataFrame(encoder.fit_transform(x_train[['Embarked', 'Sex']]).toarray(), columns=encoder.get_feature_names_out())
x_test_encoded = pd.DataFrame(encoder.transform(x_test[['Embarked', 'Sex']]).toarray(), columns=encoder.get_feature_names_out())

x_train = x_train.drop(['Embarked', 'Sex'], axis=1).join(x_train_encoded.add_prefix('encoded_'))
x_test = x_test.drop(['Embarked', 'Sex'], axis=1).join(x_test_encoded.add_prefix('encoded_'))

In [851]:
x_train.shape

(889, 10)

In [852]:
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(889, 10) (889, 1) (417, 10) (417, 1)


In [853]:
nan_train = x_train[x_train.isna().any(axis=1)].index
nan_test = x_test[x_test.isna().any(axis=1)].index

x_train.dropna(axis=0, inplace=True)
y_train.drop(nan_train, axis=0, inplace=True)

x_test.dropna(axis=0, inplace=True)
y_test.drop(nan_test, axis=0, inplace=True)

print(y_train.shape)

(887, 1)


In [854]:
x_train.head()

Unnamed: 0,Pclass,Age,Siblings,Parents,Fare,encoded_Embarked_C,encoded_Embarked_Q,encoded_Embarked_S,encoded_Sex_female,encoded_Sex_male
0,3,22.0,1,0,7.25,0.0,0.0,1.0,0.0,1.0
1,1,38.0,1,0,71.2833,1.0,0.0,0.0,1.0,0.0
2,3,26.0,0,0,7.925,0.0,0.0,1.0,1.0,0.0
3,1,35.0,1,0,53.1,0.0,0.0,1.0,1.0,0.0
4,3,35.0,0,0,8.05,0.0,0.0,1.0,0.0,1.0


# Correlations

In [855]:
corr_matrix = x_train.corr()
corr_matrix

Unnamed: 0,Pclass,Age,Siblings,Parents,Fare,encoded_Embarked_C,encoded_Embarked_Q,encoded_Embarked_S,encoded_Sex_female,encoded_Sex_male
Pclass,1.0,-0.329527,0.081421,0.016458,-0.548862,-0.080982,0.033399,0.049936,-0.032381,0.032381
Age,-0.329527,1.0,-0.232035,-0.178123,0.088884,0.031068,0.02081,-0.04033,0.002364,-0.002364
Siblings,0.081421,-0.232035,1.0,0.414244,0.160664,0.007137,-0.07418,0.040457,-0.027617,0.027617
Parents,0.016458,-0.178123,0.414244,1.0,0.217332,-0.001415,-0.072021,0.046592,-0.045622,0.045622
Fare,-0.548862,0.088884,0.160664,0.217332,1.0,0.053277,-0.021442,-0.033187,0.02444,-0.02444
encoded_Embarked_C,-0.080982,0.031068,0.007137,-0.001415,0.053277,1.0,-0.149037,-0.782483,0.083793,-0.083793
encoded_Embarked_Q,0.033399,0.02081,-0.07418,-0.072021,-0.021442,-0.149037,1.0,-0.4991,0.07476,-0.07476
encoded_Embarked_S,0.049936,-0.04033,0.040457,0.046592,-0.033187,-0.782483,-0.4991,1.0,-0.120507,0.120507
encoded_Sex_female,-0.032381,0.002364,-0.027617,-0.045622,0.02444,0.083793,0.07476,-0.120507,1.0,-1.0
encoded_Sex_male,0.032381,-0.002364,0.027617,0.045622,-0.02444,-0.083793,-0.07476,0.120507,-1.0,1.0


# Scaling

In [856]:
#x_train.isna().sum()

In [857]:
# scaler = StandardScaler()
# x_train_scaled = scaler.fit_transform(x_train)
# x_test_scaled = scaler.transform(x_test)

In [858]:

# x_train_df  = pd.DataFrame(x_train_scaled, columns=x_train.columns)
# x_test_df  = pd.DataFrame(x_test_scaled, columns=x_test.columns)

In [859]:
# x_train_df.head()

# Model

In [860]:
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)

  return fit_method(estimator, *args, **kwargs)


In [861]:
cross_val_score(random_forest, x_train, y_train, cv=5, scoring='f1_macro')

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


array([0.62190229, 0.64799732, 0.65675501, 0.6991571 , 0.64004208])

In [862]:
roc_auc_score(y_test, y_pred)

0.5914074960127592