In [11]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import pickle

In [12]:
# Load the passenger table from the relative path "titanic.csv".
df_titanic  = pd.read_csv("titanic.csv")


In [13]:
# Preview the first two rows of the loaded frame.
df_titanic.head(2)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [14]:
def Missing_Values(data):
    """Build a per-column summary of missing and unique values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the fields ``Variable``,
        ``Total_Value`` (row count), ``Total_Missing_Value`` (null count),
        ``Missing_Value_Rate`` (null count / row count, rounded to 3 places),
        ``Data_Type``, ``Unique_Value`` (array of distinct values, nulls
        included) and ``Total_Unique_Value`` (length of that array).
        Rows are sorted by ``Missing_Value_Rate`` in descending order.
    """
    names = list(data.columns)
    series_by_column = [data[name] for name in names]

    # Compute each statistic once per column and reuse it below.
    row_counts = [series.shape[0] for series in series_by_column]
    null_counts = [series.isnull().sum() for series in series_by_column]
    distinct_values = [series.unique() for series in series_by_column]

    summary = pd.DataFrame(
        {
            "Variable": names,
            "Total_Value": row_counts,
            "Total_Missing_Value": null_counts,
            "Missing_Value_Rate": [
                round(nulls / rows, 3)
                for nulls, rows in zip(null_counts, row_counts)
            ],
            "Data_Type": [series.dtype for series in series_by_column],
            "Unique_Value": distinct_values,
            "Total_Unique_Value": [len(values) for values in distinct_values],
        }
    )
    return summary.sort_values("Missing_Value_Rate", ascending=False)

In [15]:
# Per-column missing-value report, sorted with the highest missing rate first.
Missing_Values(df_titanic)

Unnamed: 0,Variable,Total_Value,Total_Missing_Value,Missing_Value_Rate,Data_Type,Unique_Value,Total_Unique_Value
12,body,1310,1189,0.908,float64,"[nan, 135.0, 22.0, 124.0, 148.0, 208.0, 172.0,...",122
9,cabin,1310,1015,0.775,object,"[B5, C22 C26, E12, D7, A36, C101, nan, C62 C64...",187
11,boat,1310,824,0.629,object,"[2, 11, nan, 3, 10, D, 4, 9, 6, B, 8, A, 5, 7,...",28
13,home.dest,1310,565,0.431,object,"[St Louis, MO, Montreal, PQ / Chesterville, ON...",370
4,age,1310,264,0.202,float64,"[29.0, 0.9167, 2.0, 30.0, 25.0, 48.0, 63.0, 39...",99
8,fare,1310,2,0.002,float64,"[211.3375, 151.55, 26.55, 77.9583, 0.0, 51.479...",282
10,embarked,1310,3,0.002,object,"[S, C, nan, Q]",4
0,pclass,1310,1,0.001,float64,"[1.0, 2.0, 3.0, nan]",4
1,survived,1310,1,0.001,float64,"[1.0, 0.0, nan]",3
2,name,1310,1,0.001,object,"[Allen, Miss. Elisabeth Walton, Allison, Maste...",1308


Let's drop the columns we will not use, fill the missing ages with the mean age, and drop the few remaining rows that still contain NaN values.

In [16]:
# Drop the columns that will not be used as model features.
df_titanic = df_titanic.drop(
    columns=["cabin", "name", "ticket", "home.dest", "boat", "body"]
)

# Impute missing ages with the column mean. The result is assigned back to
# the column: calling fillna(inplace=True) on the df_titanic["age"] selection
# is a chained in-place write, which pandas warns about and which no longer
# updates the parent frame once Copy-on-Write is active.
df_titanic["age"] = df_titanic["age"].fillna(df_titanic["age"].mean())

# Remove every remaining row that still holds a missing value.
df_titanic = df_titanic.dropna()

In [17]:
# Cast pclass to string labels ("1.0", "2.0", "3.0"); the encoding cell
# further down matches on exactly these strings.
df_titanic = df_titanic.astype({"pclass": str})

In [41]:
"""    Pclass is a proxy for socio-economic status (SES)
1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower      """

"""Sibling: Brother, Sister, Stepbrother, or Stepsister of Passenger Aboard Titanic
Spouse: Husband or Wife of Passenger Aboard Titanic (Mistresses and Fiances
Ignored)
Parent: Mother or Father of Passenger Aboard Titanic
Child: Son, Daughter, Stepson, or Stepdaughter of Passenger Aboard Titanic"""


# Re-run the per-column report on the current state of df_titanic.
Missing_Values(df_titanic)

Unnamed: 0,Variable,Total_Value,Total_Missing_Value,Missing_Value_Rate,Data_Type,Unique_Value,Total_Unique_Value
0,pclass,1306,0,0.0,object,"[1.0, 2.0, 3.0]",3
1,survived,1306,0,0.0,float64,"[1.0, 0.0]",2
2,sex,1306,0,0.0,object,"[female, male]",2
3,age,1306,0,0.0,float64,"[29.0, 0.9167, 2.0, 30.0, 25.0, 48.0, 63.0, 39...",98
4,sibsp,1306,0,0.0,float64,"[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 8.0]",7
5,parch,1306,0,0.0,float64,"[0.0, 2.0, 1.0, 4.0, 3.0, 5.0, 6.0, 9.0]",8
6,fare,1306,0,0.0,float64,"[211.3375, 151.55, 26.55, 77.9583, 0.0, 51.479...",280
7,embarked,1306,0,0.0,object,"[S, C, Q]",3


In [42]:
# Encode the three categorical columns as integer codes using explicit lookup
# tables. Series.map + astype replaces the list-based Series.replace, whose
# implicit object -> int downcast is deprecated in recent pandas releases.
# A value missing from a table becomes NaN, so the int cast fails loudly
# instead of leaving an unencoded category in the features.
# Run this cell once per data load: the columns hold codes afterwards.
PCLASS_CODES = {"1.0": 0, "2.0": 1, "3.0": 2}
SEX_CODES = {"male": 0, "female": 1}
EMBARKED_CODES = {"S": 0, "C": 1, "Q": 2}

df_titanic["pclass"] = df_titanic["pclass"].map(PCLASS_CODES).astype("int64")
df_titanic["sex"] = df_titanic["sex"].map(SEX_CODES).astype("int64")
df_titanic["embarked"] = df_titanic["embarked"].map(EMBARKED_CODES).astype("int64")

In [43]:
# Re-run the per-column report after encoding pclass, sex and embarked.
Missing_Values(df_titanic)

Unnamed: 0,Variable,Total_Value,Total_Missing_Value,Missing_Value_Rate,Data_Type,Unique_Value,Total_Unique_Value
0,pclass,1306,0,0.0,int64,"[0, 1, 2]",3
1,survived,1306,0,0.0,float64,"[1.0, 0.0]",2
2,sex,1306,0,0.0,int64,"[1, 0]",2
3,age,1306,0,0.0,float64,"[29.0, 0.9167, 2.0, 30.0, 25.0, 48.0, 63.0, 39...",98
4,sibsp,1306,0,0.0,float64,"[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 8.0]",7
5,parch,1306,0,0.0,float64,"[0.0, 2.0, 1.0, 4.0, 3.0, 5.0, 6.0, 9.0]",8
6,fare,1306,0,0.0,float64,"[211.3375, 151.55, 26.55, 77.9583, 0.0, 51.479...",280
7,embarked,1306,0,0.0,int64,"[0, 1, 2]",3


In [51]:
# Largest fare in the data. Series.max() is vectorised and skips NaN, whereas
# the builtin max() iterates row by row and is order-dependent if NaN appears.
df_titanic["fare"].max()

512.3292

In [44]:
# Target vector: the survived column.
y = df_titanic["survived"]
# Feature matrix: every remaining column.
X = df_titanic.drop(columns=["survived"])

In [45]:
# Hold out 25% of the rows for testing; random_state is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.25, random_state= 42)


In [46]:
# Report how many rows ended up in each split ("lenght" typo fixed).
print(f"Train length: {len(X_train)}\nTest length: {len(X_test)}")

Train lenght: 979
Test lenght: 327


In [47]:
# Fit a random forest on the training split and predict the held-out rows.
# random_state is fixed so the fitted forest, and therefore the metrics printed
# below, are reproducible after a kernel restart; 42 matches the seed used for
# the train/test split.
rf = RandomForestClassifier(random_state=42)
rf_model = rf.fit(X_train, y_train)  # fit() returns the estimator itself
y_pred = rf_model.predict(X_test)

# Per-class precision / recall / F1, followed by overall accuracy.
print("classification_report\n",classification_report(y_test, y_pred))
print("Accuracy Score for ",accuracy_score(y_test, y_pred))

classification_report
               precision    recall  f1-score   support

         0.0       0.83      0.82      0.82       198
         1.0       0.73      0.74      0.74       129

    accuracy                           0.79       327
   macro avg       0.78      0.78      0.78       327
weighted avg       0.79      0.79      0.79       327

Accuracy Score for  0.7889908256880734


In [48]:
# Persist the fitted model. The with-block guarantees the file is flushed and
# closed; the bare open() call left the handle open.
filename = 'titanic_ml_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(rf_model, model_file)

In [49]:
# Reload the saved model and score it on the test split as a round-trip check.
# NOTE: pickle.load can execute arbitrary code, so only load files that were
# written by a trusted source such as this notebook.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)

print(result)

0.7889908256880734
