In [2]:
import pandas as pd
import numpy as np

In [3]:
input_df = pd.read_csv('./data/train.csv').set_index('id')
input_df.head()

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.106643,3.59437,132.804,3.18428,0.081971,1.18859,3.73238,2.26627,2.09959,0.01233,...,1.09862,0.013331,-0.011715,0.052759,0.0654,4.21125,1.97877,0.085974,0.240496,0
1,0.125021,1.67336,76.5336,3.37825,0.0994,5.09366,1.27562,-0.471318,4.54594,0.037706,...,3.46017,0.017054,0.124863,0.154064,0.606848,-0.267928,2.57786,-0.020877,0.024719,0
2,0.03633,1.49747,233.546,2.19435,0.026914,3.12694,5.05687,3.84946,1.80187,0.056995,...,4.883,0.085222,0.032396,0.116092,-0.001688,-0.520069,2.14112,0.124464,0.148209,0
3,-0.014077,0.246,779.967,1.89064,0.006948,1.53112,2.698,4.51733,4.50332,0.123494,...,3.47439,-0.017103,-0.0081,0.062013,0.041193,0.511657,1.9686,0.040017,0.044873,0
4,-0.003259,3.71542,156.128,2.14772,0.018284,2.09859,4.15492,-0.038236,3.37145,0.034166,...,1.91059,-0.042943,0.105616,0.125072,0.037509,1.04379,1.07481,-0.012819,0.072798,1


In [6]:
input_df.shape[1]

101

In [100]:
def _euclidean_distance(point1, point2):
    '''
    This function computes the euclidean distance between 2 data points. To perform vectorization for faster 
    computation, these data points are converted to arrays.
    
    NOTE: point1 and point2 must correspond to single rows of the train/test dataframe.
    
    Returns: Euclidean distance between 2 points
    '''
    array1 = np.array(point1)
    array2 = np.array(point2)
    euclidean_dist = np.sqrt(np.sum((array1 - array2)**2))
    return euclidean_dist

def _compute_distances(point, train_df, y_name):
    '''
    This function computes the distabces between a given point and every row of a training_data set.
    It returns a copy of the training_data with distance column appended and sort by ascending distance.
    '''
    training_df = train_df.copy(deep=True)
    
    #Compute distance between point and every other point of dataframe. Axis = 1 applies function along columns, 
    #such that the distance is computed between rows.
    distance_df =  training_df.drop(columns = [f'{y_name}'])\
                    .apply(lambda row: _euclidean_distance(row, point), axis = 1)
    #Sort training_df by descending distance:
    training_df['euclidean_distance'] = distance_df
    training_df.sort_values(by = 'euclidean_distance', ascending = True)
    return training_df

def _knn(point, train_df, k, y_name):
    '''
    This function takes in a given datapoint and scans the entire training datatset to compute the distance between
    every point in the training set and the given point. Finally, it sorts the distances in descending order to find
    the k-nearest neighbors. Finally it computes the most common value of target variable from the data.
    
    NOTE: training_data must contain both x and y variables.
    '''
    #Compute distance between point and every other point of dataframe and sort in ascending order of distance:
    training_df = _compute_distances(point, train_df, y_name)
    
    #Subset the k-nearest neighbors
    knn_df = training_df.loc[:k-1]
    y_pred = knn_df.target.mode()[0]
    return y_pred

def _model(train_df, test_df, k, y_name):
    '''
    This function runs the entire K-Nearest Neighbors algorithm from start to finish by assumbling all the pieces
    together. The output is a dictionary containing all the model parameters are results.
    '''
    #Iterate over each row of the test_data to compute the knn prediction and append back to the dataframe:
    for index, row in test_df.drop(columns = [f'{y_name}']).iterrows():
        if index%10==0:
            print(index)
        test_df.loc[index,'y_prediction'] = _knn(pd.DataFrame(row), train_df, k, y_name)
    
    #Calculate Accuracy if we know the actual y values:
    if y_name in test_df.columns:
        accuracy = 100*test_df.loc[lambda x: f'x.{y_name}' == x.y_prediction].shape[0]/test_df.shape[0]
    print(f'Accuracy = {accuracy}')
    
    d = {'k': k,
         'accuracy': accuracy,
         'test_data': test_df}
    
    return d

In [101]:
#Split dataset into training and test data:
#First shuffle the input_data df and then split into 599,900 and 100
input_df_shuffled = input_df.iloc[np.random.permutation(input_df.shape[0])].reset_index(drop = True)
train_data = input_df_shuffled.loc[:50000].reset_index(drop = True)
test_data = input_df_shuffled.loc[599900:].reset_index(drop = True)
print(test_data.shape)

# for i in np.arange(100,1100,100):
_model(train_data, test_data, k = 100, y_name = 'target')

(100, 101)
0
10
20
30
40
50
60
70
80
90
Accuracy = 0.0


{'k': 100,
 'accuracy': 0.0,
 'test_data':           f0        f1          f2       f3        f4        f5        f6  \
 0   2.505590  2.390020    8.619680  1.19007  0.187324  3.961150  4.559560   
 1   0.457133  1.410940   -2.893830  2.38832  0.060315  3.968330  4.279030   
 2   0.321053  3.565550   71.456000  4.46200  0.010020  2.882220  2.004430   
 3   0.112592  1.726240  235.621000  3.45170  0.016496  4.964160  1.616450   
 4   0.106847  3.406420   19.161200  4.62525  0.046394  3.385430  1.515560   
 ..       ...       ...         ...      ...       ...       ...       ...   
 95  0.131950  2.842040   41.397100  3.97226  0.120247 -0.219181  2.936390   
 96 -0.059866  0.112736  570.802000  2.00449  0.113889  5.027430  2.208520   
 97  0.068890  4.068560  236.379000  1.74781  0.017118  2.636900  1.974470   
 98  0.701729  4.063340   -0.469054  3.25912  0.070857  0.689488  4.155180   
 99  0.516356  0.598477  -92.639800  0.48396  0.081476  2.010170  0.968244   
 
           f7       

In [79]:
test1 = pd.DataFrame([[1, 2, 5]], columns = ['c1','c2', 'c3'])
test2 = pd.DataFrame([[2, 3, 6, 1], [1,3,5, 0]], columns = ['c1','c2', 'c3', 'target'])
# _euclidean_distance(test1, test2)
# _compute_distances(test1, test2, y_name = 'target')
_knn(test1, test2, k=1, y_name='target')[0]

check1
check2
chck3


1