In [84]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier

In [85]:
df = pd.read_csv("train.csv")

In [86]:
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [87]:
df.drop(columns = ['PassengerId','Ticket','Cabin','Name'],inplace = True)

In [88]:
df.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C


In [89]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Survived']),
                                                    df['Survived'],
                                                    test_size=0.2,
                                                    random_state=42)

In [90]:
X_train.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S


In [91]:
X_test.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
709,3,male,,1,1,15.2458,C
439,2,male,31.0,0,0,10.5,S


In [92]:
y_train.head(2)

331    0
733    0
Name: Survived, dtype: int64

In [93]:
y_test.head(2)

709    1
439    0
Name: Survived, dtype: int64

In [94]:
#applying imputation on age and embarked column as they both conatin some null value

In [95]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [96]:
#applying imputation 

SimpleImputerAge = SimpleImputer()
SimpleImputerEmbarked = SimpleImputer(strategy = 'most_frequent')

In [97]:
X_train_Age = SimpleImputerAge.fit_transform(X_train[['Age']])
X_train_Embarked = SimpleImputerEmbarked.fit_transform(X_train[['Embarked']])

In [98]:
X_test_Age = SimpleImputerAge.transform(X_test[['Age']])
X_test_Embarked = SimpleImputerEmbarked.transform(X_test[['Embarked']])

In [99]:
X_test_Embarked

array([['C'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
      

In [100]:
X_test_Age

array([[29.49884615],
       [31.        ],
       [20.        ],
       [ 6.        ],
       [14.        ],
       [26.        ],
       [29.49884615],
       [16.        ],
       [16.        ],
       [19.        ],
       [37.        ],
       [44.        ],
       [29.49884615],
       [30.        ],
       [36.        ],
       [16.        ],
       [42.        ],
       [29.49884615],
       [27.        ],
       [47.        ],
       [24.        ],
       [34.        ],
       [19.        ],
       [20.        ],
       [29.49884615],
       [10.        ],
       [40.        ],
       [31.        ],
       [ 4.        ],
       [31.        ],
       [19.        ],
       [22.        ],
       [29.49884615],
       [29.49884615],
       [18.        ],
       [27.        ],
       [28.        ],
       [29.49884615],
       [30.        ],
       [29.49884615],
       [21.        ],
       [29.        ],
       [29.49884615],
       [29.49884615],
       [45.        ],
       [16

In [101]:
#one hot encoding 

OneHotencoding_Sex = OneHotEncoder(sparse = False , handle_unknown = 'ignore')
OneHotencoding_Embarked = OneHotEncoder(sparse = False , handle_unknown = 'ignore')

X_train_Sex = OneHotencoding_Sex.fit_transform(X_train[['Sex']])
X_train_Embarked = OneHotencoding_Embarked.fit_transform(X_train_Embarked)

X_test_Sex = OneHotencoding_Sex.transform(X_test[['Sex']])
X_test_Embarked = OneHotencoding_Embarked.transform(X_test_Embarked)

In [102]:
#X_train_Embarked

In [103]:
#X_train_Sex

In [104]:
X_train_rem = X_train.drop(columns = ['Sex','Age','Embarked'])
X_test_rem = X_test.drop(columns = ['Sex','Age','Embarked'])

In [105]:
X_train_transformed = np.concatenate((X_train_rem,X_train_Age,X_train_Sex,X_train_Embarked,),axis = 1)
X_test_transformed = np.concatenate((X_test_rem,X_test_Age,X_test_Sex,X_test_Embarked,),axis = 1)

In [106]:
X_train_transformed.shape

(712, 10)

In [107]:
clf = DecisionTreeClassifier()
clf.fit(X_train_transformed,y_train)

In [113]:
y_pred = clf.predict(X_test_transformed)

In [114]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.776536312849162

In [116]:
import pickle

In [119]:
pickle.dump(OneHotencoding_Sex,open('OneHotencoding_Sex.pkl','wb'))
pickle.dump(OneHotencoding_Embarked,open('OneHotencoding_Embarked.pkl','wb'))
pickle.dump(clf,open('clf.pkl','wb'))