In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the passenger data from a CSV file given by a relative path and
# preview the first ten rows.
DATA_PATH = "Titanic-Dataset.csv"
df = pd.read_csv(DATA_PATH)
df.head(10)

In [None]:
# Summarise column dtypes and non-null counts to spot columns with missing data.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

1. Since Age has about 177 null values and Embarked has about 2 null values, let's drop the rows that contain them.

In [None]:
# Remove every row that is missing Age or Embarked, modifying df in place.
# NOTE(review): Age is excluded from the feature matrix built later in this
# notebook, so rows removed here only because Age is missing may be discarded
# needlessly — confirm whether Age should be a feature or these rows kept.
df.dropna(subset=['Age', 'Embarked'], inplace = True)

In [None]:
# Sanity check: print the remaining null count for Age, then for Embarked.
for column in ('Age', 'Embarked'):
    print(df[column].isna().sum())

In [None]:
# Display the Embarked column before it is encoded.
df.Embarked

## Encoding:
1. Label encoding: Sex
2. One-hot encoding: Embarked

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the Sex labels with integer codes. The fitted encoder stays available
# as `le`, so the label-to-code mapping can be inspected via le.classes_.
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])

In [None]:
# Re-inspect dtypes and non-null counts after dropping rows and encoding Sex.
df.info()

In [None]:
# Convert Embarked to pandas' categorical dtype ahead of one-hot encoding.
df['Embarked'] = df['Embarked'].astype('category')

In [None]:
# One-hot encode Embarked: one 0/1 integer indicator column per category.
emb_dummy = pd.get_dummies(df['Embarked']).astype(int)

# Give both frames a fresh 0..n-1 index so the column-wise concat pairs rows
# one-to-one (rows were dropped earlier, leaving gaps in the original index).
for frame in (df, emb_dummy):
    frame.reset_index(drop=True, inplace=True)

# Append the indicator columns and retire the original Embarked column.
# Note: this cell consumes Embarked, so it cannot be re-run on its own without
# re-running the notebook from the data-loading cell.
df = pd.concat([df, emb_dummy], axis=1).drop(columns=['Embarked'])

2. The feature Cabin has about 687 null values, which is a significant number, so instead of dropping those rows we will first check how strongly it is correlated with the target, and based on that we will replace the missing values with either the mean or the median.

3. For correlation we will use a heatmap.

In [517]:
import seaborn as sns
import matplotlib.pyplot as plt
# Frame used for the correlation analysis: everything except Name, Ticket and Cabin.
# NOTE(review): the preceding markdown says Cabin's correlation with the target
# will be checked, but Cabin is dropped here before any correlation is computed — confirm intent.
CORR_EXCLUDED_COLUMNS = ['Name', 'Ticket', 'Cabin']
r_df = df.drop(columns=CORR_EXCLUDED_COLUMNS)

In [None]:
# Pairwise correlation matrix of the remaining columns, drawn as an annotated heatmap.
r = r_df.corr()

fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(r, annot=True, cmap=plt.cm.CMRmap_r, ax=ax)
plt.show()

From the above analysis we get:
1. Survived (the target) is highly correlated with Pclass and Sex.
2. Pclass and Fare are highly correlated, so we could use either of them, but since our dataset is small we'll consider both.
3. SibSp and Parch are significantly correlated, so we could use either of them, but since our dataset is small we'll consider both.

In [None]:
# Preview the frame after encoding, before selecting features.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Feature matrix: every column except the target and the columns listed below.
# NOTE(review): Age is excluded here even though rows with a missing Age were
# dropped earlier — confirm whether Age should be used as a feature.
NON_FEATURE_COLUMNS = ['Survived', 'PassengerId', 'Name', 'Age', 'Ticket', 'Cabin']
X = df.drop(columns=NON_FEATURE_COLUMNS)

# Target vector: the Survived column.
y = df['Survived']

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target vector.
y

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of the training set, then the test set.
for split in (X_train, X_test):
    print(len(split))

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics leak into training.
# Caution: X_train / X_test are overwritten with the scaled arrays, so re-running
# this cell alone would refit the scaler on already-scaled data; re-run the
# split cell first.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Display the scaled training features (the output of scaler.fit_transform).
X_train

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier using the 7 closest training samples,
# fitted on the scaled training split.
N_NEIGHBORS = 7
model = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split; y_pred is reused by the evaluation cells below.
y_pred = model.predict(X_test)
# Mean accuracy of the classifier on the held-out split (displayed as the cell output).
model.score(X_test, y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# Print the per-class precision/recall/F1 report, then the confusion matrix.
for summarize in (classification_report, confusion_matrix):
    print(summarize(y_test, y_pred))

In [None]:
# Side-by-side table of true and predicted labels for the held-out rows.
# y_test is a Series, so its index becomes the table's index.
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Boolean mask of the rows the model got right; its complement marks the misses.
is_correct = results_df['Actual'].eq(results_df['Predicted'])
correct_predictions = is_correct.sum()
wrong_predictions = (~is_correct).sum()

print("Total Values", len(y_test))
print("Total number of correct predictions:", correct_predictions)
print("Total number of wrong predictions:", wrong_predictions)
results_df.head()

In [None]:
# Recompute and print the hit/miss totals from results_df.
# NOTE(review): this cell repeats the summary printed by the previous cell
# (only the "Total Values:" label differs) — consider keeping just one of them.
matches = results_df['Actual'].eq(results_df['Predicted'])
correct_predictions = matches.sum()
wrong_predictions = (~matches).sum()

print("Total Values:", len(y_test))
print("Total number of correct predictions:", correct_predictions)
print("Total number of wrong predictions:", wrong_predictions)
results_df.head()

In [None]:
import joblib

# Persist the fitted classifier and the fitted scaler so they can be reloaded
# together for inference (new inputs must be scaled with the same scaler).
# NOTE(review): the model file name says "diabetes" although this notebook
# trains on the Titanic dataset — likely a copy-paste leftover. It is kept
# unchanged because external loaders may depend on it; confirm before renaming.
MODEL_PATH = 'diabetesModel.pkl'
SCALER_PATH = 'scaler.pkl'

joblib.dump(model, MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)