In [34]:
import pickle

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [35]:
# Location of the training CSV; kept in one place so it is easy to repoint.
TRAIN_CSV = '/content/train.csv'
df = pd.read_csv(TRAIN_CSV)

In [36]:
# Show the loaded frame; pandas elides the middle rows in the rendered table.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [37]:
# Preview the first five rows to see the raw columns and their value formats.
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [38]:
# My reading of the columns: PassengerId, Name, Ticket and Cabin are not useful as features.
# Survived is the target variable; it is already numeric, so no LabelEncoder is needed.
# Sex - nominal categorical variable, so we will use OneHotEncoder.
# Age - we might convert it into a categorical variable by binning.
# Embarked - nominal categorical variable, so OneHotEncoder will be used.
# Pclass - ordinal categorical variable - so are we going to use OrdinalEncoder here? (It is already stored as the integers 1-3.)

In [39]:
# Identifier-like / free-text columns that are not used as model features.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
df = df.drop(columns=unused_columns)

In [40]:
# Check that only the target and the candidate feature columns remain.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


Step 1 : Train Test Split


In [41]:
# Separate the predictors from the target, then hold out 20% of the rows for testing.
# random_state pins the shuffle so the same split is produced on every run.
features = df.drop(columns=['Survived'])
target = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [42]:
# Display both feature splits; the bare tuple is rendered as two plain-text frames.
X_train,X_test

(     Pclass     Sex   Age  SibSp  Parch      Fare Embarked
 331       1    male  45.5      0      0   28.5000        S
 733       2    male  23.0      0      0   13.0000        S
 382       3    male  32.0      0      0    7.9250        S
 704       3    male  26.0      1      0    7.8542        S
 813       3  female   6.0      4      2   31.2750        S
 ..      ...     ...   ...    ...    ...       ...      ...
 106       3  female  21.0      0      0    7.6500        S
 270       1    male   NaN      0      0   31.0000        S
 860       3    male  41.0      2      0   14.1083        S
 435       1  female  14.0      1      2  120.0000        S
 102       1    male  21.0      0      1   77.2875        S
 
 [712 rows x 7 columns],
      Pclass     Sex   Age  SibSp  Parch     Fare Embarked
 709       3    male   NaN      1      1  15.2458        C
 439       2    male  31.0      0      0  10.5000        S
 840       3    male  20.0      0      0   7.9250        S
 720       2  fem

In [43]:
# first step is to check if there are null values in the dataset or not

In [44]:
# Count missing values per column across the whole (pre-split) frame.
df.isna().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


In [45]:
# for treating missing values we will use SimpleImputer

In [46]:
# Apply imputation.
# Each imputer learns its fill value from the training split only and is then
# applied unchanged to the test split, so nothing from the test data leaks into
# the preprocessing.

si_age = SimpleImputer()  # default strategy: fill with the column mean
si_embarked = SimpleImputer(strategy = 'most_frequent')

X_train_age = si_age.fit_transform(X_train[['Age']])
X_train_embarked = si_embarked.fit_transform(X_train[['Embarked']])

X_test_age = si_age.transform(X_test[['Age']])
# transform (not fit_transform): re-fitting here would replace the category
# learned from the training split with the test split's own most frequent value.
X_test_embarked = si_embarked.transform(X_test[['Embarked']])

In [47]:
# One-hot encode Sex and Embarked.
# Both encoders return dense arrays and encode categories unseen during fit as all zeros.
encoder_options = dict(sparse_output = False, handle_unknown = 'ignore')
ohe_sex = OneHotEncoder(**encoder_options)
ohe_embarked = OneHotEncoder(**encoder_options)

# Sex: fit on the training split, reuse the fitted encoder on the test split.
X_train_sex = ohe_sex.fit_transform(X_train[['Sex']])
X_test_sex = ohe_sex.transform(X_test[['Sex']])

# Embarked: the inputs are the imputed arrays, which are overwritten here by their
# encoded versions - re-run the imputation cell before re-running this one.
X_train_embarked = ohe_embarked.fit_transform(X_train_embarked)
X_test_embarked = ohe_embarked.transform(X_test_embarked)


In [48]:
# We did not drop one of the one-hot columns here because we are going to fit a decision tree, which is not a linear model, so multicollinearity between the dummy columns won't affect it much.

In [49]:
# handle_unknown='ignore' means that if a completely new category shows up in the test data in the future, its one-hot columns will all be set to zero.

In [50]:
# now we will do concatenation

In [51]:
# Keep the columns that were neither imputed nor encoded; Sex, Embarked and Age
# are re-attached from their transformed arrays when the matrix is assembled.
X_train_rem = X_train.drop(['Sex', 'Embarked', 'Age'], axis = 'columns')

In [52]:
# Same column selection for the test split, so both splits line up column-for-column.
X_test_rem = X_test.drop(['Sex', 'Embarked', 'Age'], axis = 'columns')

In [54]:
# Assemble the final feature matrices column-wise.
# Column order: untouched columns, imputed Age, Sex one-hot columns, Embarked one-hot columns.
# Any input prepared for the exported model must follow this same order.
train_parts = (X_train_rem, X_train_age, X_train_sex, X_train_embarked)
test_parts = (X_test_rem, X_test_age, X_test_sex, X_test_embarked)

X_train_transformed = np.concatenate(train_parts, axis=1)
X_test_transformed = np.concatenate(test_parts, axis=1)


In [55]:
# Fit a decision tree on the preprocessed training matrix.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features when searching for splits, so an unseeded fit can pick
# different splits - and give a different score - from one run to the next.
clf = DecisionTreeClassifier(random_state = 42)
clf.fit(X_train_transformed,y_train)

In [58]:
# Predict labels for the held-out rows using the fitted tree.
y_pred = clf.predict(X_test_transformed)

In [59]:
# Fraction of held-out rows whose label was predicted correctly.
# accuracy_score is imported with the other dependencies in the first cell.
accuracy_score(y_test,y_pred)

0.7877094972067039

In [None]:
# model deployment case

In [60]:
import pickle

In [62]:
# Export the fitted objects needed at prediction time.
# Each file is opened in a `with` block so the handle is flushed and closed
# before the file is downloaded.
artifacts = {
    'ohe_sex.pkl': ohe_sex,
    'ohe_embarked.pkl': ohe_embarked,
    'clf.pkl': clf,
}
for artifact_path, fitted_object in artifacts.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(fitted_object, artifact_file)

# The Sex and Embarked encoders are exported along with the model because the
# user will supply raw category values, which must be converted to numbers
# before the model can use them.
# The SimpleImputers are not exported because the user is expected to supply
# complete (non-null) values.

In [63]:
# Colab-only helper, imported here so the rest of the notebook runs outside Colab.
from google.colab import files

# Trigger a browser download for each exported artifact.
for artifact_name in ("clf.pkl", "ohe_sex.pkl", "ohe_embarked.pkl"):
    files.download(artifact_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>