In [23]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [45]:
ds_path = '../datasets/titanic/'

def readData(ds_path):
    tr_set = pd.read_csv(ds_path + 'train.csv')
    te_set = pd.read_csv(ds_path + 'test.csv')
    y_te = pd.read_csv(ds_path + 'gender_submission.csv')['Survived'].values
    te_set['Survived'] = y_te
    df = pd.concat([tr_set, te_set])
    return df

def splitData(df, te_size, rnd_state):
    tr_set, te_set = train_test_split(df, test_size = te_size, random_state = rnd_state)
    return tr_set, te_set,


def preProcessData(train_set, test_set, with_min_max = False):
    tr_set = train_set.copy()
    te_set = test_set.copy()
    
    drop_columns = ['PassengerId', 'Name', 'SibSp', 'Ticket', 'Cabin', 'Parch', 'Fare', 'Embarked']

    tr_set.drop(drop_columns, axis=1, inplace=True)
    te_set.drop(drop_columns, axis=1, inplace=True)

    tr_set['Age'] = tr_set['Age'].fillna(29)
    te_set['Age'] = te_set['Age'].fillna(29) 
    sx = {'male': 0, 'female':10}

    tr_set['Sex'] = tr_set['Sex'].apply(lambda x: sx[x])
    te_set['Sex'] = te_set['Sex'].apply(lambda x: sx[x])

    X = tr_set.drop(['Survived'], axis=1).values
    y = tr_set['Survived'].values        
    
    te_y = te_set['Survived'].values
    te_set.drop('Survived', axis=1, inplace=True)
    te_X = te_set.values
    
    if with_min_max:
        scaler = MinMaxScaler()
        scaler.fit(X)
        X = scaler.transform(X)
        te_X = scaler.transform(te_X)
    
    return X, y, te_X, te_y

    

0.8269720101781171

In [47]:
df = readData(ds_path)

results_ordinary = []
for i in range(0, 5):
    tr_set, te_set = splitData(df, .3, i)
    X, y, te_X, te_y = preProcessData(tr_set, te_set)

    classifier = KNeighborsClassifier(n_neighbors=4)
    classifier.fit(X, y)

    y_p = classifier.predict(te_X)

    results_ordinary.append(metrics.accuracy_score(te_y, y_p))
    
results_scaler = []
for i in range(0, 10):
    tr_set, te_set = splitData(df, .3, i)
    X, y, te_X, te_y = preProcessData(tr_set, te_set, True)

    classifier = KNeighborsClassifier(n_neighbors=4)
    classifier.fit(X, y)

    y_p = classifier.predict(te_X)

    results_scaler.append(metrics.accuracy_score(te_y, y_p))    
    
print(np.mean(results_ordinary))
print(np.mean(results_scaler))

0.8386768447837148
0.8424936386768447
