In [1]:
import pandas as pd 
import numpy as np 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
from xgboost import XGBClassifier
import xgboost
import pickle

In [29]:
df = pd.read_csv('data/titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [30]:
df = df[['Pclass','Sex','Age','Embarked','Survived']]
df.head()

Unnamed: 0,Pclass,Sex,Age,Embarked,Survived
0,3,male,22.0,S,0
1,1,female,38.0,C,1
2,3,female,26.0,S,1
3,1,female,35.0,S,1
4,3,male,35.0,S,0


In [31]:
df.Embarked.value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [32]:
df.Sex.replace(['male', 'female'], [0, 1], inplace=True)
df.Embarked.replace(['S', 'C', 'Q'], [0, 1, 2], inplace=True)
df.head()

Unnamed: 0,Pclass,Sex,Age,Embarked,Survived
0,3,0,22.0,0.0,0
1,1,1,38.0,1.0,1
2,3,1,26.0,0.0,1
3,1,1,35.0,0.0,1
4,3,0,35.0,0.0,0


In [33]:
df.isna().sum()

Pclass        0
Sex           0
Age         177
Embarked      2
Survived      0
dtype: int64

In [34]:
df.Age.fillna(df.Age.mean(), inplace=True)
df.Embarked.fillna(df.Embarked.mode()[0], inplace=True)

In [35]:
x = df[['Pclass','Sex','Age','Embarked']]
y = df['Survived']

In [36]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [37]:
x_train.shape, y_train.shape

((712, 4), (712,))

In [38]:
x_test.shape, y_test.shape

((179, 4), (179,))

In [39]:
# normalizer = StandardScaler()
# x_train = normalizer.fit_transform(x_train)
# x_test = normalizer.transform(x_test)

In [40]:
lr_model = LogisticRegression( max_iter=1000, C=1, solver='lbfgs')
lr_model.fit(x_train, y_train)

In [41]:
y_pred = lr_model.predict(x_test)

In [47]:
lr_model.predict([[1,1,35,2]])



array([1])

In [63]:
with open('model2.pkl', 'wb') as file:
    pickle.dump(lr_model, file)

In [42]:
print(accuracy_score(y_test, y_pred))
print(10*'-')
print(confusion_matrix(y_test, y_pred))
print(10*'-')
print(f'F1 Score: {f1_score(y_test, y_pred)}')

0.7932960893854749
----------
[[88 17]
 [20 54]]
----------
F1 Score: 0.7448275862068966


In [43]:
model = XGBClassifier(iterations=1000 )
model.fit(x_train, y_train)

Parameters: { "iterations" } are not used.



In [44]:
y_pred = model.predict(x_test)

In [45]:
print(accuracy_score(y_test, y_pred))
print(10*'-')
print(confusion_matrix(y_test, y_pred))
print(10*'-')
print(f'F1 Score: {f1_score(y_test, y_pred)}')

0.8044692737430168
----------
[[89 16]
 [19 55]]
----------
F1 Score: 0.7586206896551724


In [48]:
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)