In [724]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

##### Read CSV

In [725]:
# Load the raw passenger records from the CSV file.
df = pd.read_csv('titanic-train.csv')
print(df)
# Drop the columns that are not used as model features.
filtered_df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

##### Data Normalisation (impute missing Age values with the mean)

In [726]:
print('\n Before normalization... \n', filtered_df.isna().sum())
# Replace every missing Age with the column mean, rounded to a whole number.
age_mean = filtered_df['Age'].mean()
filtered_df = filtered_df.fillna(value={'Age': round(age_mean)})
print('\n After normalization... \n', filtered_df.isna().sum())



 Before normalization... 
 Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
dtype: int64

 After normalization... 
 Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64


In [727]:
# Show filtered_df as this cell's output.
filtered_df


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.2500
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.9250
3,1,1,female,35.0,1,0,53.1000
4,0,3,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000
887,1,1,female,19.0,0,0,30.0000
888,0,3,female,30.0,1,2,23.4500
889,1,1,male,26.0,0,0,30.0000


In [728]:
label_encoder = LabelEncoder()
# Fit on the raw 'Sex' strings from `df` (same rows, same order as filtered_df)
# rather than on filtered_df['Sex'], so re-running this cell never re-fits the
# encoder on already-encoded integers and `label_encoder.classes_` keeps the
# original string labels.
filtered_df['Sex'] = label_encoder.fit_transform(df['Sex'])

filtered_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.2500
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.9250
3,1,1,0,35.0,1,0,53.1000
4,0,3,1,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000
887,1,1,0,19.0,0,0,30.0000
888,0,3,0,30.0,1,2,23.4500
889,1,1,1,26.0,0,0,30.0000


In [729]:
# Target: the Survived column. Features: every remaining column.
target_df = filtered_df['Survived']
input_df = filtered_df.drop('Survived', axis=1)
print(input_df)
target_df

     Pclass  Sex   Age  SibSp  Parch     Fare
0         3    1  22.0      1      0   7.2500
1         1    0  38.0      1      0  71.2833
2         3    0  26.0      0      0   7.9250
3         1    0  35.0      1      0  53.1000
4         3    1  35.0      0      0   8.0500
..      ...  ...   ...    ...    ...      ...
886       2    1  27.0      0      0  13.0000
887       1    0  19.0      0      0  30.0000
888       3    0  30.0      1      2  23.4500
889       1    1  26.0      0      0  30.0000
890       3    1  32.0      0      0   7.7500

[891 rows x 6 columns]


0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [730]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    input_df,
    target_df,
    test_size=0.3,
    random_state=1,
)

In [731]:
# Train a Decision Tree model
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print('DecissonTree model\'s accuracy score: ', dt_accuracy)

DecissonTree model's accuracy score:  0.7425373134328358


In [732]:
# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print('Randomforest model\'s accuracy score: ', rf_accuracy)

Randomforest model's accuracy score:  0.7761194029850746


In [733]:
def predic_survival(model, test_input, sex_encoder=None):
    """Predict whether a single passenger survived.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict``.
    test_input : dict
        One passenger's features keyed by column name. ``'Sex'`` holds the raw
        string label (for example ``'female'``).
    sex_encoder : optional
        Already-fitted encoder exposing ``transform``. When omitted, the
        notebook-level ``label_encoder`` fitted on the training data is used.

    Returns
    -------
    str
        ``'Passenger survived'`` when the predicted class is 1, otherwise
        ``"Passenger didn't survived"``.
    """
    if sex_encoder is None:
        # Fitting a brand-new LabelEncoder on a single row maps whatever label
        # it sees to 0, so 'male' would be scored as 'female'. Reuse the
        # encoder fitted on the training data instead.
        sex_encoder = label_encoder

    passenger_df = pd.DataFrame([test_input])
    passenger_df['Sex'] = sex_encoder.transform(passenger_df['Sex'])

    # scikit-learn estimators fitted on a DataFrame record the training column
    # order in `feature_names_in_`. When the same columns are present, present
    # them in that order so the dict's key order does not matter. Any real
    # mismatch is left for the model's own validation to report.
    trained_columns = getattr(model, 'feature_names_in_', None)
    if trained_columns is not None and set(trained_columns) == set(passenger_df.columns):
        passenger_df = passenger_df[list(trained_columns)]

    prediction = model.predict(passenger_df)

    print(prediction)

    return 'Passenger survived' if prediction[0] == 1 else 'Passenger didn\'t survived'
    
# One hypothetical passenger, keyed by the feature names the models were trained on.
# Named `sample_passenger` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
sample_passenger = {
    'Pclass': 3,
    'Sex': 'female',
    'Age': 88,
    'SibSp': 0,
    'Parch': 1,
    'Fare': 6
}

print(predic_survival(rf_model, sample_passenger))

[0]
Passenger didn't survived
