In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

data_train.sample(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
845,846,0,3,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,,S
496,497,1,1,"Eustis, Miss. Elizabeth Mussey",female,54.0,1,0,36947,78.2667,D20,C
298,299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S


### Transforming Features

1. Aside from 'Sex', the 'Age' feature is second in importance. To avoid overfitting, I'm grouping people into logical human age groups.
2. Each Cabin starts with a letter. I bet this letter is much more important than the number that follows, let's slice it off.
3. Fare is another continuous value that should be simplified. I ran data_train.Fare.describe() to get the distribution of the feature, then placed them into quartile bins accordingly.
4. Extract information from the 'Name' feature. Rather than use the full name, I extracted the last name and name prefix (Mr. Mrs. Etc.), then appended them as their own features.
5. Lastly, drop useless features. (Ticket and Name)

In [2]:
def simplify_ages(df):
    df.Age = df.Age.fillna(-0.5)
    bins = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    group_names = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
    categories = pd.cut(df.Age, bins, labels=group_names)
    df.Age = categories
    return df

def simplify_cabins(df):
    df.Cabin = df.Cabin.fillna('N')
    df.Cabin = df.Cabin.apply(lambda x: x[0])
    return df

def simplify_fares(df):
    df.Fare = df.Fare.fillna(-0.5)
    bins = (-1, 0, 8, 15, 31, 1000)
    group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
    categories = pd.cut(df.Fare, bins, labels=group_names)
    df.Fare = categories
    return df

def format_name(df):
    df['Lname'] = df.Name.apply(lambda x: x.split(' ')[0])
    df['NamePrefix'] = df.Name.apply(lambda x: x.split(' ')[1])
    return df    
    
def drop_features(df):
    return df.drop(['Ticket', 'Name', 'Embarked'], axis=1)

def transform_features(df):
    df = simplify_ages(df)
    df = simplify_cabins(df)
    df = simplify_fares(df)
    df = format_name(df)
    df = drop_features(df)
    return df

data_train = transform_features(data_train)
data_test = transform_features(data_test)
data_train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Lname,NamePrefix
0,1,0,3,male,Student,1,0,1_quartile,N,"Braund,",Mr.
1,2,1,1,female,Adult,1,0,4_quartile,C,"Cumings,",Mrs.
2,3,1,3,female,Young Adult,0,0,1_quartile,N,"Heikkinen,",Miss.
3,4,1,1,female,Young Adult,1,0,4_quartile,C,"Futrelle,",Mrs.
4,5,0,3,male,Young Adult,0,0,2_quartile,N,"Allen,",Mr.
5,6,0,3,male,Unknown,0,0,2_quartile,N,"Moran,",Mr.
6,7,0,1,male,Adult,0,0,4_quartile,E,"McCarthy,",Mr.
7,8,0,3,male,Baby,3,1,3_quartile,N,"Palsson,",Master.
8,9,1,3,female,Young Adult,0,2,2_quartile,N,"Johnson,",Mrs.
9,10,1,2,female,Teenager,1,0,3_quartile,N,"Nasser,",Mrs.


In [3]:
from sklearn import preprocessing
def encode_features(df_train, df_test):
    features = ['Fare', 'Cabin', 'Age', 'Sex', 'Lname', 'NamePrefix']
    df_combined = pd.concat([df_train[features], df_test[features]])
    
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return df_train, df_test
    
data_train, data_test = encode_features(data_train, data_test)
data_train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Lname,NamePrefix
0,1,0,3,1,4,1,0,0,7,100,19
1,2,1,1,0,0,1,0,3,2,182,20
2,3,1,3,0,7,0,0,0,7,329,16
3,4,1,1,0,7,1,0,3,2,267,20
4,5,0,3,1,7,0,0,1,7,15,19
5,6,0,3,1,6,0,0,1,7,538,19
6,7,0,1,1,0,0,0,3,4,500,19
7,8,0,3,1,1,3,1,2,7,608,13
8,9,1,3,0,7,0,2,1,7,382,20
9,10,1,2,0,5,1,0,2,7,559,20


In [4]:
data_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Lname,NamePrefix
0,892,3,1,7,0,0,0,7,401,19
1,893,3,0,0,1,0,0,7,843,20
2,894,2,1,3,0,0,1,7,552,19
3,895,3,1,7,0,0,1,7,851,19
4,896,3,0,4,1,1,1,7,342,20


In [5]:
from sklearn.model_selection import train_test_split

X_all = data_train.drop(['Survived', 'PassengerId'], axis=1)
y_all = data_train['Survived']

num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)

### In the MKNN algorithm, every training sample must be validated at the first step. The validity of each point is computed according to its neighbors. The validation process is performed for all train samples once. After assigning the validity of each train sample, it is used at the second step as impact or weight of the points in the ensembles of neighbors which the point is selected to attend. To validate a sample point in the train set, the H nearest neighbors of the point is considered. Among the H nearest neighbors of a train sample x, validity(x) counts the number of points with the same label to the label of x. Img. 1 is the formula which is proposed to compute the validity of every  points in train set.

<img src="img1.png">

### Where H is the number of considered neighbors and lbl(x) returns the true class label of the sample x. also, Ni(x) stands for the ith nearest neighbor of the point x. The function S takes into account the similarity between the point x and the ith nearest neighbor. Img. 2 defines this function. 

<img src="img2.png">

### In the MKNN method, first the weight of each neighbor is computed using the 1/(de+ ), where is a smoothing regulator and here is selected to 0.5. Then, the validity of that training sample is multiplied on its raw weight which is based on the Euclidian distance. In the MKNN method, the weight of each neighbor sample is derived according to Img. 3.

<img src="img3.png">

### ● Implement your validity function (5 pts)

In [10]:
def validity_function(X_train, y_train, H):
    #########################################
    ## Aqui coloque su código
    array_val=[]
    for indexTrain1, rowTrain1 in X_train.iterrows():
        list=[]
        for indexTrain2, rowTrain2 in X_train.iterrows():
            if indexTrain1!=indexTrain2:
                list.append((dist_eucli(rowTrain1, rowTrain2),y_train[indexTrain2],indexTrain2))
        list.sort()
        validity=0
        lblx=y_train[indexTrain1]
        for ind in range(H):
            if lblx==list[ind][1]:
                validity+=1       
        validity/=float(H)
        array_val.append(validity)   
    return array_val

### ● Please implement your euclidean function (3 pts)

In [7]:
import math
def dist_eucli(x_elem, y_elem):
    #########################################
    ## Aqui coloque su código
    dist=0
    for ind in range(9):
        dist+=(x_elem[ind]-y_elem[ind])*(x_elem[ind]-y_elem[ind])
    dist=math.sqrt(dist)
    
    return(dist)

### ● Please implement your MKNN function. This function has four parameters: (8 pts)

In [17]:
import random
def my_Modified_KNN(K, X_validity, X_train, y_train, X_test, alpha = 0.5):
    #########################################
    ## Aqui coloque su código
    y_predict=[]
    for indexTest, rowTest in X_test.iterrows():
        list=[]
        i=0
        for indexTrain, rowTrain in X_train.iterrows():
            list.append((X_validity[i]*(-1.0/(alpha+dist_eucli(rowTest, rowTrain))),y_train[indexTrain]))
            i+=1
        list.sort()
        survived=0
        notsurvived=0
        for ind in range(K):
            if list[ind][1]==0:
                notsurvived+=1
            else:
                survived+=1
        if survived>notsurvived:
            y_predict.append(1)
        elif survived<notsurvived:
            y_predict.append(0)
        else:
            y_predict.append(random.randint(0, 1))
    #print y_predict
    return(y_predict)  

In [12]:
#X_validity = validity_function(X_train, y_train,3)
y_pred = my_Modified_KNN(3, X_validity, X_train, y_train, X_test)

[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0]


### ● Please find the best parameter for k, test k from 2 to 10. Use F-measure (3 pts) 

In [16]:
from sklearn.metrics import f1_score

In [None]:
#########################################
## Aqui coloque su código
bestk=0
maxf1=0
for k in range(2,11):
    y_pred = my_Modified_KNN(k, X_validity, X_train, y_train, X_test)
    currentf1=f1_score(y_test, y_pred)
    if currentf1>maxf1:
        bestk=k
        maxf1=currentf1
    
print bestk

### ● Use the confusion matrix for the best k (1 pt)

In [None]:
from sklearn.metrics import confusion_matrix
y_pred = my_Modified_KNN(5, X_validity, X_train, y_train, X_test)
confusion_matrix(y_test, y_pred)