# 

In [1]:
import numpy as np
import pandas as pd
import os

# Locations of the raw CSV files, relative to the notebook's working directory.
data_train, data_test = 'input/train.csv', 'input/test.csv'

# Read the training and test splits into separate DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (data_train, data_test)
)

# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# clean data helper function
from sklearn.preprocessing import MinMaxScaler

def clean_data(df, scaler=None):
    """Prepare a raw passenger DataFrame for modelling, modifying it in place.

    Steps, in order:

    1. Fill missing 'Age' and 'Fare' with the column median and missing
       'Embarked' with the column's most frequent value (all computed from
       ``df`` itself).
    2. Encode 'Sex' as 0 (male) / 1 (female); one-hot encode 'Embarked' and
       'Pclass' with the first level dropped, then remove the two original
       columns.
    3. Scale 'Age', 'Fare', 'SibSp' and 'Parch'.
    4. Drop 'Ticket', 'Cabin' and 'Name'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named above. It is mutated in place.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler. When supplied it is applied as-is, so a
        held-out frame can be scaled with the ranges learned from the
        training frame instead of its own. When omitted, a new
        ``MinMaxScaler`` is fitted on ``df`` (the original behaviour).

    Returns
    -------
    The scaler that was applied, so it can be passed to a later call, e.g.
    ``fitted = clean_data(train); clean_data(test, scaler=fitted)``.

    Notes
    -----
    Not idempotent: a second call on the same frame fails because the
    'Embarked' column has already been removed.
    """
    # --- 1. Handle Missing Values ---
    # Each fill value is derived from its own column, so one call suffices.
    df.fillna(
        {
            'Age': df['Age'].median(),
            'Embarked': df['Embarked'].mode()[0],
            'Fare': df['Fare'].median(),
        },
        inplace=True,
    )

    # --- 2. Create Dummy Variables for Categorical Features ---
    # Convert 'Sex' to a numerical format (0 for male, 1 for female)
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    # 'Pclass' is numeric in the raw data; casting to category makes
    # get_dummies one-hot encode it alongside 'Embarked'.
    df['Pclass'] = df['Pclass'].astype('category')
    df_dummies = pd.get_dummies(df[['Embarked', 'Pclass']], drop_first=True, dtype=int)
    for col in df_dummies.columns:
        df[col] = df_dummies[col]
    df.drop(columns=['Embarked', 'Pclass'], inplace=True)

    # --- 3. Normalize Numerical Columns ---
    numerical_cols = ['Age', 'Fare', 'SibSp', 'Parch']
    if scaler is None:
        # No scaler supplied: learn the ranges from this frame.
        scaler = MinMaxScaler()
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    else:
        # Reuse previously learned ranges without refitting.
        df[numerical_cols] = scaler.transform(df[numerical_cols])

    # --- 4. Final Data Preparation ---
    # Drop columns that are not needed for the model
    df.drop(columns=['Ticket', 'Cabin', 'Name'], inplace=True)

    return scaler

In [3]:
# Preprocess both splits; clean_data modifies each frame in place.
for split_frame in (train_data, test_data):
    clean_data(split_frame)

In [9]:
# Use KNN to do predictions
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
#from sklearn.metrics import confusion_matrix, classification_report
#import seaborn as sns
#import matplotlib.pyplot as plt

# Settings for the hold-out split and the classifier.
VAL_FRACTION = 0.2
SPLIT_SEED = 42
N_NEIGHBORS = 4

# Separate predictors from the target; PassengerId is not used as a predictor.
features = train_data.drop(columns=['Survived', 'PassengerId'])
target = train_data['Survived']

# Hold out a fraction of the labelled rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=VAL_FRACTION, random_state=SPLIT_SEED
)

# Predictors for the unlabelled test split.
X_test = test_data.drop(columns=['PassengerId'])

# Fit the nearest-neighbours classifier on the training portion.
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS).fit(X_train, y_train)

# Score the held-out rows and report accuracy.
y_pred_val = knn.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_val)
print(f"\nModel Accuracy: {accuracy:.4f}")




Model Accuracy: 0.8045
