# Titanic Prediction Model Using K-Nearest Neighbours

_by Viviana Toledo_

In [132]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

pd.options.mode.chained_assignment = None  # default='warn'

df = pd.read_csv("../data/raw/titanic/train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Training Data

### Establishing the Model Parameters

In [133]:
X = df[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]
y = df["Survived"]

### Replacing NaN Values

In [134]:
X.fillna({"Age": X["Age"].median()}, inplace=True)
X.fillna({"Fare": X["Fare"].median()}, inplace=True)
X.fillna({"Embarked": X["Embarked"].mode()[0]}, inplace=True)

# Replace categorical variables to numerical
X = pd.get_dummies(X,columns=["Sex", "Embarked"])

# Checking that there are no more NaNs
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      891 non-null    int64  
 1   Age         891 non-null    float64
 2   SibSp       891 non-null    int64  
 3   Parch       891 non-null    int64  
 4   Fare        891 non-null    float64
 5   Sex_female  891 non-null    bool   
 6   Sex_male    891 non-null    bool   
 7   Embarked_C  891 non-null    bool   
 8   Embarked_Q  891 non-null    bool   
 9   Embarked_S  891 non-null    bool   
dtypes: bool(5), float64(2), int64(3)
memory usage: 39.3 KB


### Modeling

In [135]:
# Standarizing the features, since they have different scales
# If the feature distribution is normal, we use Standard Scaler
# If unknown, we use a min max scaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [136]:
# KNN Model
knn = KNeighborsClassifier(n_neighbors=80)
knn.fit(X, y)

# Predicting y test
y_pred = knn.predict(X)

# Evaluating the model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

ValueError: Found input variables with inconsistent numbers of samples: [624, 891]

## Test Data

### Preparing the Test Data

In [None]:
df_test = pd.read_csv("../data/raw/titanic/test.csv") 
x_test = df_test[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]

In [None]:
x_test.fillna({"Age": df["Age"].median()}, inplace=True)
x_test.fillna({"Fare": df["Fare"].median()}, inplace=True)
x_test.fillna({"Embarked": df["Embarked"].mode()[0]}, inplace=True)

# Replace categorical variables to numerical
x_test = pd.get_dummies(x_test,columns=["Sex", "Embarked"])

# Checking that there are no more NaNs
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      418 non-null    int64  
 1   Age         418 non-null    float64
 2   SibSp       418 non-null    int64  
 3   Parch       418 non-null    int64  
 4   Fare        418 non-null    float64
 5   Sex_female  418 non-null    bool   
 6   Sex_male    418 non-null    bool   
 7   Embarked_C  418 non-null    bool   
 8   Embarked_Q  418 non-null    bool   
 9   Embarked_S  418 non-null    bool   
dtypes: bool(5), float64(2), int64(3)
memory usage: 18.5 KB


## Training the Data

In [None]:
# Predict y
x_test = scaler.transform(x_test)
y_pred = knn.predict(x_test)

In [None]:
test_predictions = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_pred})
test_predictions.to_csv(path_or_buf="../data/processed/titanic/test_predictions.csv", index=False)

# 