In [1]:
%matplotlib inline
from knn import KNN, RNN, LOO
import pandas as pd
import numpy as np

In [2]:
# Load both datasets from CSV files located in the working directory.
# cancer keeps pandas' inferred dtypes; every spam column is parsed as float.
cancer = pd.read_csv('cancer.csv')
spam = pd.read_csv('spam.csv', dtype=float)

In [3]:
# Encode the binary string labels as a 0/1 integer column.
# get_dummies returns a DataFrame (with bool columns by default on pandas >= 2.0),
# so request an int dtype explicitly and keep the single indicator column that
# remains after drop_first as a plain Series.
cancer['label'] = pd.get_dummies(cancer['label'], drop_first=True, dtype=int).iloc[:, 0]

In [4]:

def solution(cls, data, norm=False):
    """
    Sweep one hyper-parameter of a classifier and report the best value.

    For every candidate value a ``LOO(param, cls)`` object is created and its
    ``calculate(X, y)`` result is printed; the parameter with the lowest
    score is reported at the end (the score is presumably an error rate --
    confirm against the ``knn`` module).

    Args:
        cls (class): classifier class exposing a ``name`` attribute that is
            either 'KNN' (parameter = number of neighbours, 1..10) or
            'RNN' (parameter = radius, 10..10000, or 0.1..1 when ``norm``)
        data (pd.DataFrame): feature columns plus a target column named 'label'
        norm (bool): min-max scale every feature column to 0 - 1 first

    Returns:
        None: all results are printed, nothing is returned.

    Raises:
        ValueError: if ``cls.name`` is neither 'KNN' nor 'RNN'.
    """

    # candidate grid as (start, stop, count) for np.linspace
    if cls.name == 'KNN':
        # 1 to 10 neighbours
        start, stop, count = 1, 10, 10
    elif cls.name == 'RNN':
        # radius 10 to 10000 on raw features, 0.1 to 1 on scaled features
        start, stop, count = (0.1, 1, 10) if norm else (10, 10000, 10)
    else:
        # fail early with a clear message instead of an UnboundLocalError below
        raise ValueError(
            f"unsupported classifier {cls.name!r}: expected 'KNN' or 'RNN'"
        )

    # splitting feature data and target variable
    X, y = data.drop(['label'], axis=1), data['label']

    # if scaling should be made to 0 - 1
    if norm:
        low = X.min()
        span = X.max() - low
        # a constant column has zero range; dividing by it would fill the
        # column with NaN, so divide by 1 instead (the column becomes all 0)
        span = span.replace(0, 1)
        X = (X - low) / span

    print(f'{cls.name}')

    # all scores will be kept here, keyed by parameter value
    scores = {}

    # for each parameter
    for param in np.linspace(start, stop, count):

        # calculating LOO score for this parameter
        score = LOO(param, cls).calculate(X, y)

        # saving parameter with a score
        scores[param] = score
        print(f'\t{param}: {score:.4f}')

    # choosing the best parameter: the one with the lowest score
    best = min(scores, key=scores.get)
    print(f'Best parameter is {best} with score {scores[best]:.3f}')
    

In [5]:
# Parameter sweep for KNN on the spam data, features left unscaled
solution(cls=KNN, data=spam)

KNN
	1.0: 0.1693
	2.0: 0.1908
	3.0: 0.1856
	4.0: 0.1930
	5.0: 0.1858
	6.0: 0.2004
	7.0: 0.1958
	8.0: 0.2010
	9.0: 0.2026
	10.0: 0.2021
Best parameter is 1.0 with score 0.169


In [6]:
# Parameter sweep for RNN on the spam data, features left unscaled
solution(cls=RNN, data=spam)

RNN
	10.0: 0.2997
	1120.0: 0.3641
	2230.0: 0.3849
	3340.0: 0.3921
	4450.0: 0.3945
	5560.0: 0.3936
	6670.0: 0.3934
	7780.0: 0.3932
	8890.0: 0.3930
	10000.0: 0.3936
Best parameter is 10.0 with score 0.300


In [7]:
# Parameter sweep for KNN on the spam data, features min-max scaled to 0 - 1
solution(cls=KNN, data=spam, norm=True)

KNN
	1.0: 0.0876
	2.0: 0.1011
	3.0: 0.0948
	4.0: 0.1015
	5.0: 0.0952
	6.0: 0.1028
	7.0: 0.0993
	8.0: 0.1022
	9.0: 0.1019
	10.0: 0.1052
Best parameter is 1.0 with score 0.088


In [8]:
# Parameter sweep for RNN on the spam data, features min-max scaled to 0 - 1
solution(cls=RNN, data=spam, norm=True)

RNN
	0.1: 0.4784
	0.2: 0.2639
	0.30000000000000004: 0.2217
	0.4: 0.2634
	0.5: 0.3193
	0.6: 0.3625
	0.7000000000000001: 0.3808
	0.8: 0.3886
	0.9: 0.3932
	1.0: 0.3962
Best parameter is 0.30000000000000004 with score 0.222


In [10]:
# Parameter sweep for KNN on the cancer data, features left unscaled
solution(cls=KNN, data=cancer)

KNN
	1.0: 0.0844
	2.0: 0.0773
	3.0: 0.0738
	4.0: 0.0738
	5.0: 0.0668
	6.0: 0.0703
	7.0: 0.0685
	8.0: 0.0703
	9.0: 0.0668
	10.0: 0.0703
Best parameter is 5.0 with score 0.067


In [11]:
# Parameter sweep for RNN on the cancer data, features left unscaled
solution(cls=RNN, data=cancer)

RNN
	10.0: 0.8330
	1120.0: 0.2214
	2230.0: 0.3497
	3340.0: 0.3708
	4450.0: 0.3726
	5560.0: 0.3726
	6670.0: 0.3726
	7780.0: 0.3726
	8890.0: 0.3726
	10000.0: 0.3726
Best parameter is 1120.0 with score 0.221


In [12]:
# Parameter sweep for KNN on the cancer data, features min-max scaled to 0 - 1
solution(cls=KNN, data=cancer, norm=True)

KNN
	1.0: 0.0475
	2.0: 0.0387
	3.0: 0.0299
	4.0: 0.0281
	5.0: 0.0334
	6.0: 0.0299
	7.0: 0.0299
	8.0: 0.0316
	9.0: 0.0299
	10.0: 0.0299
Best parameter is 4.0 with score 0.028


In [13]:
# Parameter sweep for RNN on the cancer data, features min-max scaled to 0 - 1
solution(cls=RNN, data=cancer, norm=True)

RNN
	0.1: 1.0000
	0.2: 0.8699
	0.30000000000000004: 0.4569
	0.4: 0.2232
	0.5: 0.1459
	0.6: 0.1037
	0.7000000000000001: 0.0914
	0.8: 0.0967
	0.9: 0.1107
	1.0: 0.1318
Best parameter is 0.7000000000000001 with score 0.091
