# KNN

In [36]:
# Locations of the labelled training and validation splits.
train_path = "data_train.csv"
validation_path = "data_validation.csv"

import pandas as pd
import numpy as np

df_train = pd.read_csv(train_path)
df_validation = pd.read_csv(validation_path)

# Feature columns passed to the classifier as categorical (nominal) columns.
nominal_columns = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']

# Remaining feature columns, treated as numeric (ratio-scaled).
ratio_columns = [
    'battery_power', 'clock_speed', 'fc', 'int_memory', 'm_dep', 'mobile_wt',
    'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time',
]

columns = [*nominal_columns, *ratio_columns]

# Split each frame into the feature matrix and the 'price_range' label.
y_train = df_train['price_range'].apply(int)
x_train = df_train.drop(columns=['price_range'])

y_validation = df_validation['price_range']
x_validation = df_validation.drop(columns=['price_range'])


In [61]:
from pandas import DataFrame, Series
from typing import List, Dict
import json

class KNNClassifier:
    '''
    k-nearest-neighbours classifier operating directly on pandas objects.

    The prediction for a row is the rounded mean of the labels of its
    ``n_neighbors`` closest training rows, so the labels must be numeric.
    Distances are Euclidean; optionally every squared column difference is
    weighted by ``2 ** corr(column, target)``.
    '''
    n_neighbors: int
    features: DataFrame = None
    target: Series = None
    nominal_columns: List[str]
    numeric_columns: List[str]
    column_correlations : Dict
    standardize : bool
    multiply_correlations : bool
    # Mean / standard deviation of the numeric training columns, learned in fit().
    # None means "unknown": predict() then falls back to the statistics of the
    # data it is given (the behaviour of models saved without these values).
    _scaler_mean: Series = None
    _scaler_std: Series = None

    def __init__(self, standardize : bool = False, n_neighbors=5, multiply_correlations : bool = True):
        '''
        Store the model configuration; no data is touched here.

            Parameters:
                standardize (bool) : Whether numeric columns are standardized (z-score)
                n_neighbors (int) : Number of neighbours used for a prediction
                multiply_correlations (bool) : Whether each column's correlation with the target weights the distance
        '''
        self.n_neighbors = n_neighbors
        self.standardize = standardize
        self.multiply_correlations = multiply_correlations

    def fit(self, x: DataFrame, y: Series, categorical_column: List[str] = None):
        '''
        Train the model on the given data using the instance configuration.

        Prints the computed column weights (``column_correlations``).

            Parameters:
                x (Dataframe) : Training data without the label
                y (Series) : Labels of the training data (target), sharing the index of x
                categorical_column (List[str]) : Columns of x that are categorical; None means no categorical columns
        '''
        # A fresh list per call: a literal [] default would be shared across calls.
        nominal = [] if categorical_column is None else list(categorical_column)
        nominal_lookup = set(nominal)

        self.features = x
        self.target = y
        self.nominal_columns = nominal
        # Keep the DataFrame's column order so results are reproducible
        # (a set difference yields a different order from run to run).
        self.numeric_columns = [column for column in x.columns if column not in nominal_lookup]
        self.column_correlations = dict()
        self._scaler_mean = None
        self._scaler_std = None

        if self.standardize:
            numeric_part = x[self.numeric_columns]
            self._scaler_mean = numeric_part.mean()
            self._scaler_std = numeric_part.std()
            self.features = self._standardize_data(
                x, self.numeric_columns, self._scaler_mean, self._scaler_std
            )
        self._get_correlation()
        print(self.column_correlations)

    @staticmethod
    def _stats_to_dict(stats):
        '''
        Convert a Series of per-column statistics into a JSON-serialisable dict (None stays None).
        '''
        if stats is None:
            return None
        return {column: float(value) for column, value in stats.items()}

    def save_model(self, feature_path = 'features.csv', target_path = 'target.csv', conf_path = 'conf.json'):
        '''
        Save the model to the given files.

        The features are written as stored on the instance (already
        standardized when ``standardize`` is True), so the training mean and
        standard deviation are written to the configuration as well; they are
        needed to scale new data the same way after loading.
        '''
        self.features.to_csv(feature_path)
        self.target.to_csv(target_path)
        conf = {
            "nominal_columns" : self.nominal_columns,
            "standardize" : self.standardize,
            "multiply_correlations" : self.multiply_correlations,
            "n_neighbors" : self.n_neighbors,
            "scaler_mean" : self._stats_to_dict(self._scaler_mean),
            "scaler_std" : self._stats_to_dict(self._scaler_std),
        }
        with open(conf_path, "w") as f:
            json.dump(conf, f, indent=4)

    def load_model(self,feature_path = 'features.csv', target_path = 'target.csv', conf_path = 'conf.json'):
        '''
        Load the model from the given files (as written by save_model).

        The configuration stored in conf_path replaces the configuration the
        instance was constructed with.
        '''
        x = pd.read_csv(feature_path, index_col=0)
        # Restore the saved index so the target stays aligned with the features.
        y = pd.read_csv(target_path, index_col=0).iloc[:, 0]
        with open(conf_path, "r") as f:
            conf_json = json.load(f)

        self.n_neighbors = conf_json["n_neighbors"]
        self.standardize = conf_json["standardize"]
        self.multiply_correlations = conf_json["multiply_correlations"]

        self.fit(x, y, conf_json["nominal_columns"])

        if self.standardize:
            # fit() just derived statistics from features that were already
            # standardized when saved; those do not describe the raw data.
            saved_mean = conf_json.get("scaler_mean")
            saved_std = conf_json.get("scaler_std")
            if saved_mean is not None and saved_std is not None:
                self._scaler_mean = pd.Series(saved_mean, dtype=float)
                self._scaler_std = pd.Series(saved_std, dtype=float)
            else:
                # File written without statistics: scale new data by its own statistics.
                self._scaler_mean = None
                self._scaler_std = None

    def _get_correlation(self):
        '''
        Compute the weight of every training column and store it in column_correlations.

        The weight is 2 ** r, where r is the correlation between the column and the target.
        '''
        for column in self.numeric_columns + self.nominal_columns:
            correlation = self.features[column].corr(self.target)
            self.column_correlations[column] = 2**correlation

    def _standardize_data(self, df: DataFrame, columns: List[str], means=None, stds=None):
        '''
        Standardize (z-score) the given columns of df and return the result.

        A column whose standard deviation is 0 or NaN becomes NaN/inf.

        Parameters:
                df (Dataframe) : Data to standardize
                columns (List[str]) : Columns of df to standardize
                means : Optional per-column means (indexable by column name); None uses the means of df
                stds : Optional per-column standard deviations; None uses those of df

        Returns:
                df_standardized (Dataframe): copy of df with the given columns standardized
        '''
        df_standardized = df.copy()

        for col in columns:
            mean = df[col].mean() if means is None else means[col]
            std = df[col].std() if stds is None else stds[col]
            df_standardized[col] = (df[col] - mean)/std

        return df_standardized

    def _column_weights(self, columns) -> Series:
        '''
        Return column_correlations as a Series ordered like the given columns (NaN where a column has no weight).
        '''
        return pd.Series(self.column_correlations, dtype=float).reindex(columns)

    def _calculate_distance(self, row: Series) -> Series:
        '''
        Compute the Euclidean distance between the given row and every training row.

        Numeric and categorical columns are treated identically.
        If multiply_correlations is True, each squared column difference is
        multiplied by that column's weight before summing.
        NaN terms are skipped by the sum.

        Parameters:
                row (Series) : Row to measure against, indexed by column name
        Returns:
                result (Series): Distance to each training row, indexed like the training data
        '''
        squared = (self.features - row)**2
        if self.multiply_correlations:
            squared = squared.mul(self._column_weights(squared.columns))

        return squared.sum(axis=1).pow(1/2)

    def multiplied(self):
        '''
        Return the training features with every column multiplied by its weight.
        '''
        return self.features.mul(self._column_weights(self.features.columns))

    def _predict_row(self, row: Series):
        '''
        Predict the label of a single row.

            Parameters:
                row (Series) : Row whose label is to be predicted
            Returns:
                result (int): Rounded mean of the labels of the n_neighbors nearest training rows
        '''
        neighbours = pd.DataFrame(data={
            "distance": self._calculate_distance(row),
            "target": self.target
        }).sort_values(by=['distance'], ascending=True)

        # Rounded mean rather than majority vote.
        return round(neighbours['target'].iloc[:self.n_neighbors].mean())

    def predict(self, to_predict: DataFrame):
        '''
        Predict the label of every row of the given data.

        When standardize is True the data is scaled with the statistics learned
        from the training data, so the result for a row does not depend on the
        other rows passed in the same call.

            Parameters:
                to_predict (Dataframe) : Data whose labels are to be predicted
            Returns:
                result (list of int): Predicted labels, in the row order of the data
        '''
        if self.standardize :
            to_predict = self._standardize_data(
                to_predict, self.numeric_columns, self._scaler_mean, self._scaler_std
            )

        return [self._predict_row(row) for _, row in to_predict.iterrows()]
        


In [42]:
def accuracy_score(y_real, y_test):
    """Return the fraction of positions where the two label sequences agree.

    Elements are compared pairwise in iteration order, so a pandas Series is
    compared by position and its index labels are ignored.

    Parameters:
        y_real: Sized iterable of true labels.
        y_test: Sized iterable of predicted labels, same length as y_real.

    Returns:
        float: Number of matching positions divided by the number of labels.

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    total = len(y_real)
    if total != len(y_test):
        raise ValueError(
            f"y_real and y_test must have the same length, got {total} and {len(y_test)}"
        )
    if total == 0:
        raise ValueError("accuracy is undefined for empty input")

    correct = sum(1 for real, predicted in zip(y_real, y_test) if real == predicted)
    return correct / total

In [63]:
# Every model in this comparison uses the same neighbourhood size.
neighbours_used = 17

# One instance per combination of (standardize, multiply_correlations).
model1 = KNNClassifier(standardize=True, n_neighbors=neighbours_used, multiply_correlations=True)
model2 = KNNClassifier(standardize=False, n_neighbors=neighbours_used, multiply_correlations=True)
model3 = KNNClassifier(standardize=True, n_neighbors=neighbours_used, multiply_correlations=False)
model4 = KNNClassifier(standardize=False, n_neighbors=neighbours_used, multiply_correlations=False)

# Only model2 is trained in this cell; model1 and model4 are left unfitted.
model2.fit(x_train, y_train, nominal_columns)

# Round trip through the default files: model3 takes over the data and the
# configuration that model2 saved, replacing its own constructor settings.
model2.save_model()
model3.load_model()

print(
    "Correlation accounted, not standardized, accuracy :",
    accuracy_score(y_validation, model2.predict(x_validation)),
)
print(
    "Correlation accounted, not standardized, accuracy :",
    accuracy_score(y_validation, model3.predict(x_validation)),
)

{'mobile_wt': 0.9494943031484544, 'fc': 0.9973404642422978, 'int_memory': 1.0183092128319688, 'talk_time': 1.0077325012473515, 'px_width': 1.131873469281212, 'ram': 1.889912224013501, 'battery_power': 1.1366601162478973, 'sc_w': 1.013897421568552, 'n_cores': 0.9995964406186718, 'px_height': 1.1163835221700165, 'sc_h': 1.0084565201720506, 'pc': 0.9963921561713885, 'm_dep': 1.0008355343936008, 'clock_speed': 1.0097731732765796, 'blue': 1.0295021826306285, 'dual_sim': 0.9925721195086483, 'four_g': 1.0003818919673215, 'three_g': 1.018960148121211, 'touch_screen': 0.9795272409131484, 'wifi': 1.0240806645050748}
{'mobile_wt': 0.9494943031484544, 'fc': 0.9973404642422978, 'int_memory': 1.0183092128319688, 'talk_time': 1.0077325012473515, 'px_width': 1.131873469281212, 'ram': 1.889912224013501, 'battery_power': 1.1366601162478973, 'sc_w': 1.013897421568552, 'n_cores': 0.9995964406186718, 'px_height': 1.1163835221700165, 'sc_h': 1.0084565201720506, 'pc': 0.9963921561713885, 'm_dep': 1.000835534

In [8]:
# Competition test set; the 'id' column is dropped before prediction.
kaggle_test = pd.read_csv("kaggle_test.csv").drop(columns=['id'])

In [9]:
# Submission model: raw (non-standardized) features, 17 neighbours,
# correlation-weighted distance.
model_submission = KNNClassifier(
    standardize=False,
    n_neighbors=17,
    multiply_correlations=True,
)
model_submission.fit(x_train, y_train, nominal_columns)

result = model_submission.predict(kaggle_test)

# Predictions and training labels as single-column frames.
result_df = pd.DataFrame(data={"price_range": result})
train_df_processed = pd.DataFrame(data={"price_range": y_train})

In [10]:
# Number the predictions 0..n-1 and expose that numbering as the 'id' column.
result_df.index = pd.RangeIndex(len(result_df))
result_df['id'] = result_df.index

# Write 'id' first, then the prediction; the index itself is not written.
result_df.to_csv(
    "submission_mean.csv",
    columns=['id', 'price_range'],
    index=False,
)

# Display the frame as the cell output.
result_df

Unnamed: 0,price_range,id
0,0,0
1,3,1
2,3,2
3,2,3
4,0,4
...,...,...
1995,2,1995
1996,0,1996
1997,3,1997
1998,0,1998
