# **import libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# **read dataset**

In [None]:
# Location of the raw CSV, relative to the notebook's working directory.
DATASET_PATH = "Titanic-Dataset.csv"

# Load the raw passenger table into a DataFrame.
dataset = pd.read_csv(DATASET_PATH)

# **dataset overview**

In [None]:
# Preview the first rows to sanity-check that the file loaded as expected.
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Preview the last rows as well, to check the end of the file.
dataset.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [None]:
# Column dtypes and non-null counts; shows which columns contain missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
dataset.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
# Count missing values per column and list the worst offenders first.
missing_per_column = dataset.isna().sum()
missing_per_column.sort_values(ascending=False)

Unnamed: 0,0
Cabin,687
Age,177
Embarked,2
PassengerId,0
Name,0
Pclass,0
Survived,0
Sex,0
Parch,0
SibSp,0


# **prepare features and labels**


In [None]:
# Name of the column the model should predict.
TARGET_COLUMN = "Survived"

# Labels: the target column on its own.
y = dataset[TARGET_COLUMN]

# Features: everything except the row identifier, the target itself and Cabin.
X = dataset.drop(columns=["PassengerId", TARGET_COLUMN, "Cabin"])

In [None]:
# Confirm that the dropped columns are gone from the feature frame.
X.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [None]:
# Peek at the label series.
y.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


# **splitting data into train and test sets**

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the labels
# so both parts keep the same class balance, and the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=40, stratify=y
)

# Later cells assign into columns of these frames (imputation, encoding).
# Take explicit copies so those writes never target a slice of `X`, which can
# raise SettingWithCopyWarning depending on the pandas/scikit-learn versions.
X_train = X_train.copy()
X_test = X_test.copy()

In [None]:
# (rows, columns) of the training features.
X_train.shape

(712, 9)

In [None]:
# (rows, columns) of the test features.
X_test.shape

(179, 9)

In [None]:
# Number of training labels; should match the row count of X_train.
y_train.shape

(712,)

In [None]:
# Number of test labels; should match the row count of X_test.
y_test.shape

(179,)

# **take care of missing values**

In [None]:
from sklearn.impute import SimpleImputer

# Replace missing ages with the median age. The median is learned from the
# training split only and then reused for the test split, so no information
# from the test rows leaks into preprocessing.
imputer = SimpleImputer(strategy="median", missing_values=np.nan)
imputer.fit(X_train[['Age']])
X_train['Age'] = imputer.transform(X_train[['Age']])
X_test['Age'] = imputer.transform(X_test[['Age']])

In [None]:
# Replace missing Embarked values with the most frequent value seen in the
# training split; the same fitted value is reused for the test split.
embarked_imputer = SimpleImputer(strategy="most_frequent", missing_values=np.nan)
embarked_imputer.fit(X_train[['Embarked']])
X_train[['Embarked']] = embarked_imputer.transform(X_train[['Embarked']])
X_test[['Embarked']] = embarked_imputer.transform(X_test[['Embarked']])

In [None]:
# Verify that no missing values remain in the training features.
X_train.isnull().sum()

Unnamed: 0,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0
Embarked,0


In [None]:
# Show the training features after imputation (rows keep their original index).
X_train

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
482,3,"Rouse, Mr. Richard Henry",male,50.0,0,0,A/5 3594,8.0500,S
338,3,"Dahl, Mr. Karl Edwart",male,45.0,0,0,7598,8.0500,S
378,3,"Betros, Mr. Tannous",male,20.0,0,0,2648,4.0125,C
262,1,"Taussig, Mr. Emil",male,52.0,1,1,110413,79.6500,S
261,3,"Asplund, Master. Edvin Rojj Felix",male,3.0,4,2,347077,31.3875,S
...,...,...,...,...,...,...,...,...,...
877,3,"Petroff, Mr. Nedelio",male,19.0,0,0,349212,7.8958,S
704,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,S
735,3,"Williams, Mr. Leslie",male,28.5,0,0,54636,16.1000,S
192,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19.0,1,0,350046,7.8542,S


 # **encoding categorical data**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the Sex strings to integer codes. The mapping is learned on the training
# split and reused on the test split so both use identical codes.
le = LabelEncoder()
le.fit(X_train['Sex'])
X_train['Sex'] = le.transform(X_train['Sex'])
X_test['Sex'] = le.transform(X_test['Sex'])

In [None]:
# Inspect the encoded Sex column; it now holds integer codes instead of strings.
X_train['Sex']

Unnamed: 0,Sex
482,1
338,1
378,1
262,1
261,1
...,...
877,1
704,1
735,1
192,0


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode Embarked and pass every other column through unchanged.
# The encoded columns come first in the output, followed by the remainder.
ct = ColumnTransformer(
    transformers=[('encode', OneHotEncoder(), ['Embarked'])],
    remainder='passthrough',
)

# Fit on the training split only; reuse the fitted categories for the test split.
X_train_array = ct.fit_transform(X_train)
X_test_array = ct.transform(X_test)

# Output column names, e.g. 'encode__Embarked_C', 'remainder__Pclass', ...
features_name = ct.get_feature_names_out()

# Wrap the arrays back into labelled frames. The original code referenced an
# undefined name (`feature_name`) here, which raises NameError on a fresh
# kernel; use the variable assigned just above.
# Note: the new frames get a fresh 0..n-1 index, so they no longer share the
# index of y_train / y_test (the row order is unchanged).
X_train = pd.DataFrame(X_train_array, columns=features_name)
X_test = pd.DataFrame(X_test_array, columns=features_name)

In [None]:

# Training features after one-hot encoding Embarked.
X_train

Unnamed: 0,encode__Embarked_C,encode__Embarked_Q,encode__Embarked_S,remainder__Pclass,remainder__Name,remainder__Sex,remainder__Age,remainder__SibSp,remainder__Parch,remainder__Ticket,remainder__Fare
0,0.0,0.0,1.0,3,"Rouse, Mr. Richard Henry",1,50.0,0,0,A/5 3594,8.05
1,0.0,0.0,1.0,3,"Dahl, Mr. Karl Edwart",1,45.0,0,0,7598,8.05
2,1.0,0.0,0.0,3,"Betros, Mr. Tannous",1,20.0,0,0,2648,4.0125
3,0.0,0.0,1.0,1,"Taussig, Mr. Emil",1,52.0,1,1,110413,79.65
4,0.0,0.0,1.0,3,"Asplund, Master. Edvin Rojj Felix",1,3.0,4,2,347077,31.3875
...,...,...,...,...,...,...,...,...,...,...,...
707,0.0,0.0,1.0,3,"Petroff, Mr. Nedelio",1,19.0,0,0,349212,7.8958
708,0.0,0.0,1.0,3,"Hansen, Mr. Henrik Juul",1,26.0,1,0,350025,7.8542
709,0.0,0.0,1.0,3,"Williams, Mr. Leslie",1,28.5,0,0,54636,16.1
710,0.0,0.0,1.0,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",0,19.0,1,0,350046,7.8542


In [None]:
# Test features after one-hot encoding Embarked.
X_test

Unnamed: 0,encode__Embarked_C,encode__Embarked_Q,encode__Embarked_S,remainder__Pclass,remainder__Name,remainder__Sex,remainder__Age,remainder__SibSp,remainder__Parch,remainder__Ticket,remainder__Fare
0,0.0,0.0,1.0,1,"Baumann, Mr. John D",1,28.0,0,0,PC 17318,25.925
1,1.0,0.0,0.0,1,"Widener, Mr. Harry Elkins",1,27.0,0,2,113503,211.5
2,0.0,0.0,1.0,1,"Bradley, Mr. George (""George Arthur Brayton"")",1,28.0,0,0,111427,26.55
3,0.0,0.0,1.0,3,"Andersson, Miss. Ebba Iris Alfrida",0,6.0,4,2,347082,31.275
4,0.0,0.0,1.0,3,"Somerton, Mr. Francis William",1,30.0,0,0,A.5. 18509,8.05
...,...,...,...,...,...,...,...,...,...,...,...
174,0.0,0.0,1.0,1,"Andrews, Mr. Thomas Jr",1,39.0,0,0,112050,0.0
175,0.0,0.0,1.0,3,"Jensen, Mr. Niels Peder",1,48.0,0,0,350047,7.8542
176,0.0,1.0,0.0,3,"Madigan, Miss. Margaret ""Maggie""",0,28.0,0,0,370370,7.75
177,1.0,0.0,0.0,3,"Baclini, Miss. Marie Catherine",0,5.0,2,1,2666,19.2583


# **feature scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

# Locate the Age column by name instead of hard-coding its position (6), so
# the cell keeps working if the column layout of the encoded array changes.
age_idx = list(features_name).index('remainder__Age')
age_col = slice(age_idx, age_idx + 1)  # 2-D column slice, as the scaler expects

# Standardise Age: statistics are learned on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train_array[:, age_col] = scaler.fit_transform(X_train_array[:, age_col])
X_test_array[:, age_col] = scaler.transform(X_test_array[:, age_col])

# Rebuild the frames from the scaled arrays. Previously X_train / X_test only
# showed the scaled values if the DataFrame happened to share memory with the
# array, which is not guaranteed (pandas 3 copies NumPy input by default).
X_train = pd.DataFrame(X_train_array, columns=features_name)
X_test = pd.DataFrame(X_test_array, columns=features_name)

In [None]:
# Locate the Fare column by name rather than relying on it being the last
# column of the encoded array.
fare_idx = list(features_name).index('remainder__Fare')
fare_col = slice(fare_idx, fare_idx + 1)  # 2-D column slice, as the scaler expects

# Standardise Fare with its own scaler: fitted on the training split only,
# then applied unchanged to the test split.
fareScaler = StandardScaler()
X_train_array[:, fare_col] = fareScaler.fit_transform(X_train_array[:, fare_col])
X_test_array[:, fare_col] = fareScaler.transform(X_test_array[:, fare_col])

# Rebuild the frames from the scaled arrays so they reflect the new values
# without relying on the DataFrame sharing memory with the array (pandas 3
# copies NumPy input by default).
X_train = pd.DataFrame(X_train_array, columns=features_name)
X_test = pd.DataFrame(X_test_array, columns=features_name)

In [None]:
# Training features after scaling; Age and Fare should now be standardised.
X_train

Unnamed: 0,encode__Embarked_C,encode__Embarked_Q,encode__Embarked_S,remainder__Pclass,remainder__Name,remainder__Sex,remainder__Age,remainder__SibSp,remainder__Parch,remainder__Ticket,remainder__Fare
0,0.0,0.0,1.0,3,"Rouse, Mr. Richard Henry",1,1.59939,0,0,A/5 3594,-0.479693
1,0.0,0.0,1.0,3,"Dahl, Mr. Karl Edwart",1,1.214329,0,0,7598,-0.479693
2,1.0,0.0,0.0,3,"Betros, Mr. Tannous",1,-0.710975,0,0,2648,-0.562886
3,0.0,0.0,1.0,1,"Taussig, Mr. Emil",1,1.753414,1,1,110413,0.995628
4,0.0,0.0,1.0,3,"Asplund, Master. Edvin Rojj Felix",1,-2.020181,4,2,347077,0.001177
...,...,...,...,...,...,...,...,...,...,...,...
707,0.0,0.0,1.0,3,"Petroff, Mr. Nedelio",1,-0.787987,0,0,349212,-0.482871
708,0.0,0.0,1.0,3,"Hansen, Mr. Henrik Juul",1,-0.248902,1,0,350025,-0.483728
709,0.0,0.0,1.0,3,"Williams, Mr. Leslie",1,-0.056371,0,0,54636,-0.313823
710,0.0,0.0,1.0,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",0,-0.787987,1,0,350046,-0.483728


In [None]:
# Test features after applying the scalers fitted on the training split.
X_test

Unnamed: 0,encode__Embarked_C,encode__Embarked_Q,encode__Embarked_S,remainder__Pclass,remainder__Name,remainder__Sex,remainder__Age,remainder__SibSp,remainder__Parch,remainder__Ticket,remainder__Fare
0,0.0,0.0,1.0,1,"Baumann, Mr. John D",1,-0.094877,0,0,PC 17318,-0.111378
1,1.0,0.0,0.0,1,"Widener, Mr. Harry Elkins",1,-0.17189,0,2,113503,3.712405
2,0.0,0.0,1.0,1,"Bradley, Mr. George (""George Arthur Brayton"")",1,-0.094877,0,0,111427,-0.0985
3,0.0,0.0,1.0,3,"Andersson, Miss. Ebba Iris Alfrida",0,-1.789145,4,2,347082,-0.001141
4,0.0,0.0,1.0,3,"Somerton, Mr. Francis William",1,0.059147,0,0,A.5. 18509,-0.479693
...,...,...,...,...,...,...,...,...,...,...,...
174,0.0,0.0,1.0,1,"Andrews, Mr. Thomas Jr",1,0.752256,0,0,112050,-0.645564
175,0.0,0.0,1.0,3,"Jensen, Mr. Niels Peder",1,1.445366,0,0,350047,-0.483728
176,0.0,1.0,0.0,3,"Madigan, Miss. Margaret ""Maggie""",0,-0.094877,0,0,370370,-0.485875
177,1.0,0.0,0.0,3,"Baclini, Miss. Marie Catherine",0,-1.866157,2,1,2666,-0.248746
