In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler


class KNN:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    Call ``fit`` to store the training set, then ``predict`` (many samples)
    or ``predict_single`` (one sample) to label new data.
    """

    def __init__(self, k):
        """Store the number of neighbours that vote on each prediction.

        Args:
            k: number of nearest training samples consulted per prediction.

        Raises:
            ValueError: if ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        self.k = k
        # Filled in by fit(); None means "not fitted yet".
        self.x_train = None
        self.y_train = None

    def fit(self, x_train, y_train):
        """Memorise the training samples and their labels.

        Inputs are converted to NumPy arrays so that later lookups are purely
        positional: a pandas Series with a non-default index would otherwise
        be indexed by label, and iterating a DataFrame yields column names
        rather than rows.

        Args:
            x_train: array-like with one training sample per row.
            y_train: array-like with one label per training sample.

        Raises:
            ValueError: if the training set is empty or the number of samples
                and labels differ.
        """
        x_train = np.asarray(x_train)
        y_train = np.asarray(y_train)
        if len(x_train) == 0:
            raise ValueError("x_train must contain at least one sample")
        if len(x_train) != len(y_train):
            raise ValueError(
                f"x_train has {len(x_train)} samples but y_train has "
                f"{len(y_train)} labels"
            )
        self.x_train = x_train
        self.y_train = y_train

    def euclidean_distance(self, x1, x2):
        """Return the Euclidean (L2) distance between two points."""
        return np.sqrt(np.sum((x1 - x2)**2))

    def predict_single(self, x_sample):
        """Return the majority label among the ``k`` nearest training samples.

        If fewer than ``k`` training samples exist, all of them vote.

        Args:
            x_sample: one sample with the same features as the training rows.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        """
        if self.x_train is None:
            raise RuntimeError("KNN.fit must be called before predicting")

        x_sample = np.asarray(x_sample)
        distances = [
            self.euclidean_distance(x_sample, x_train_sample)
            for x_train_sample in self.x_train
        ]
        # A stable sort keeps equally distant neighbours in training order.
        nearest = np.argsort(distances, kind="stable")[:self.k]
        # Counter remembers insertion order, so a tied vote is won by the
        # label that appears first, i.e. the one with the closest neighbour.
        votes = Counter(self.y_train[nearest])
        return votes.most_common(1)[0][0]

    def predict(self, x_test):
        """Return an array with one predicted label per row of ``x_test``."""
        return np.array([self.predict_single(x) for x in np.asarray(x_test)])

# Main working from here
#
# End-to-end walkthrough on the iris data set: inspect and clean the data,
# select features by their Pearson correlation with the target, scale them,
# then train and evaluate the KNN class defined above.

import matplotlib.pyplot as plt
import seaborn as sns
# Only referenced by the illustrative encoder notes further down.
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Keep the sklearn Bunch and the DataFrame under different names: a Bunch has
# no DataFrame methods such as isnull().
iris = load_iris()
x = iris.data
y = iris.target
data = pd.DataFrame(x, columns=iris.feature_names)
data['target'] = y
print(x)
print(y)

print(data.isnull().sum())
print(data.duplicated().sum())
print(data)

print(data.select_dtypes(include=["object"]).columns.tolist())

# handle missing values -- features only: mean-imputing the label column
# would turn the class ids into floats.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

feature_frame = data.drop(columns='target')
num_columns = feature_frame.select_dtypes(include=['number']).columns
cat_columns = feature_frame.select_dtypes(include=['object']).columns

# SimpleImputer rejects an empty column selection (iris has no categorical
# columns), so only impute the groups that actually contain columns.
if len(num_columns) > 0:
    data[num_columns] = num_imputer.fit_transform(data[num_columns])
if len(cat_columns) > 0:
    data[cat_columns] = cat_imputer.fit_transform(data[cat_columns])

# handle duplicates: drop them simply
data = data.drop_duplicates().reset_index(drop=True)
# OR, to treat rows as duplicates when only some columns match:
#   data = data.drop_duplicates(subset=['column_name1', 'column_name2'])

# encoding the categorical features (a no-op when every column is numeric):
data = pd.get_dummies(data, drop_first=True)
# The other methods to encode categorical columns are shown as notes because
# the column names are placeholders that do not exist in iris:
# 1) Label encoder: one integer code per class
#      le = LabelEncoder()
#      data['column_name'] = le.fit_transform(data['column_name'])
# 2) One-hot encoder: order does not matter; it needs 2-D input (note the
#    double brackets) and returns a matrix with one column per kept class
#      ohe1 = OneHotEncoder(drop='first')      # same idea as dummy encoding
#      ohe2 = OneHotEncoder(drop='if_binary')  # drops a column only for two-class features
#      encoded = ohe1.fit_transform(data[['column_name2']])
# 3) Mapping: you assign each class a priority yourself
#      mapping = {'High School': 1, 'bachelors': 2, 'masters': 3}
#      data['column_name3'] = data['column_name3'].map(mapping)

# calculating the pearson correlation and selecting features with threshold 0.5
corr_matrix = data.corr(method='pearson')
print(corr_matrix)
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation matrix")
plt.show()
# drop(columns=...) removes the label column by name; passing the Series of
# label values would be interpreted as a list of column names to drop.
correlations = data.drop(columns='target').corrwith(data['target'])
selected_features = correlations[correlations.abs() > 0.5].index
if len(selected_features) == 0:
    raise ValueError("no feature has |correlation| > 0.5 with the target")

# scale only the selected features from the correlation calculation
# NOTE(review): the scalers are fitted on all rows before the split, so test
# statistics leak into training; fit on x_train only for a stricter evaluation.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(data[selected_features])
# OR I can even use MinMaxScaler
scaler2 = MinMaxScaler()
x_scaled_again = scaler2.fit_transform(data[selected_features])

# Re-read the labels from the cleaned frame: duplicate rows were dropped, so
# the original iris target array no longer lines up with x_scaled.
y = data['target'].to_numpy()

# train-test split:
x_train_full, x_test, y_train_full, y_test = train_test_split(
    x_scaled, y, test_size=0.3, random_state=42
)

# train-validation split:
x_train, x_val, y_train, y_val = train_test_split(
    x_train_full, y_train_full, test_size=0.2, random_state=42
)
model = KNN(k=3)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
y_pred_train = model.predict(x_train)
y_pred_val = model.predict(x_val)

# Calculating different accuracies
# the first parameter is always the actual y, the second one is the model's prediction
train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred)
val_acc = accuracy_score(y_val, y_pred_val)
print(f"train accuracy: {train_acc:.3f}")
print(f"validation accuracy: {val_acc:.3f}")
print(f"test accuracy: {test_acc:.3f}")

# confusion_matrix:
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred)
cm_val = confusion_matrix(y_val, y_pred_val)
print(cm_train)
print(cm_val)
print(cm_test)


[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

AttributeError: isnull