#### Auxliar Structures

In [104]:
def readFile(fileName):
    f = open(fileName, "r")
    return f.read()

In [105]:
class DataFrame:
    def __init__(self, matrix):
        self.columns = matrix[0]
        self.data = matrix[1:]
        self.column_indexes = {}

        for i in range(len(self.columns)):
            self.column_indexes[self.columns[i]] = i

    def __getitem__(self, columns):
        """ use: df[[col1, col2, col3]] """
        newMatrix = [columns]

        for i in range(len(self.data)):
            oldEntry = self.data[i]
            newEntry = []

            for col in columns:
                newEntry.append(oldEntry[self.column_indexes[col]])
            newMatrix.append(newEntry)
        
        return DataFrame(newMatrix)

    def head(self, n):
        print(self.columns)
        for i in range(n):
            print(self.data[i])
    
    def copy(self):
        return self[self.columns]

    def remove_lines_with_value(self, empty_value):
        newMatrix = [self.columns]

        for i in range(len(self.data)):
            if not empty_value in self.data[i]:
                newMatrix.append([elem for elem in self.data[i]]) # deep copy

        return DataFrame(newMatrix)


        
class DataFrameLoader:
    def get_dataframe_from_file(self, csvFile) -> DataFrame:
        raw_file = readFile(csvFile)
        matrix = self.__get_matrix_from_csv_data(raw_file)
        return DataFrame(matrix)

    def __get_matrix_from_csv_data(self, csvData):
        matrix = []
        lines = csvData.split("\n")
        
        for line in lines:
            if ',' in line:
                line = line.replace(", ", ". ")
                matrix.append(line.split(","))

        return matrix

#### Open and process data


In [106]:
df = DataFrameLoader().get_dataframe_from_file("penguins.csv")
df = df[['Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)', 'Body Mass (g)', 'Species']]
df = df.remove_lines_with_value("NA")

X = df[['Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)', 'Body Mass (g)']]
y = df[['Species']]

### With pandas

### KNN

In [118]:
def distance(p1, p2):
    d = 0
    for i in range(len(p1)):
        d += pow(p1[i] - p2[i], 2)
    return d

def most_frequent(List):
    return max(set(List), key = List.count)

class KNN:    
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_predict, k = 1):
        predictions = []
        
        for x_pred in X_predict:
            closest_points = self.__get_closest_points(x_pred, k)
            prediction = self.__get_prediction(closest_points)
            predictions.append(prediction)

        return predictions


    # Brute force

    def __get_closest_points(self, x_pred, k):
        closest_points = []

        for i in range(len(self.X_train)):
            x_train = self.X_train[i]
            dist = distance(x_pred, x_train)
            closest_points = self.__add_point_to_closest_points(i, dist, k, closest_points)
        
        return closest_points


    def __add_point_to_closest_points(self, point_index, dist, k, closest_points):
        for i in range(k):
            if i == len(closest_points):
                closest_points.append((point_index, dist))
                break
            else:
                if dist < closest_points[i][1]:
                    closest_points.insert(i, (point_index, dist))
                    break
        return closest_points
            
    def __get_prediction(self, closest_points):
        classes = []

        for point in closest_points:
            classes.append(self.y_train[point[0]][0])
        
        return most_frequent(classes)



    

In [161]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

df = pd.read_csv("penguins.csv")
df = df[['Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)', 'Body Mass (g)', 'Species']]
df = df.dropna()

X = df[['Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)', 'Body Mass (g)']]
y = df[['Species']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

x_train_list = X_train.values.tolist()
y_train_list = y_train.values.tolist()
X_test_list = X_test.values.tolist()

knn = KNN()

knn.fit(x_train_list, y_train_list)
y_pred = knn.predict(X_test_list)

print(classification_report(y_test, y_pred))

                                           precision    recall  f1-score   support

      Adelie Penguin (Pygoscelis adeliae)       0.80      0.88      0.84        50
Chinstrap penguin (Pygoscelis antarctica)       0.55      0.35      0.43        17
        Gentoo penguin (Pygoscelis papua)       0.97      1.00      0.99        36

                                 accuracy                           0.83       103
                                macro avg       0.77      0.74      0.75       103
                             weighted avg       0.82      0.83      0.82       103

