# Trabalho A1 - Tiago Barradas

## Part 1 - Data loading and pre-processing

Primeiramente, vamos baixar o dataset e começar a explorá-lo em busca de problemas.

In [1]:
import os
import urllib
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=11)
mpl.rc('xtick', labelsize=13)
mpl.rc('ytick', labelsize=13)
mpl.rc('grid', alpha=0)
import seaborn as sns



# Baixa o dataset
URL = "https://raw.githubusercontent.com/tidyverse/ggplot2/main/data-raw/diamonds.csv"
DATASET_PATH = os.path.join("dataset", "diamonds.csv")
os.makedirs("dataset", exist_ok=True)
urllib.request.urlretrieve(URL, DATASET_PATH)

diamonds = pd.read_csv(DATASET_PATH)

In [2]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


A princípo, não há nenhum datapoint com features faltantes, o que é um ótimo começo. Vamos agora checar se há datapoints repetidos:

In [3]:
diamonds.duplicated().value_counts()

False    53794
True       146
dtype: int64

In [4]:
diamonds = diamonds[~diamonds.duplicated()]
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53794 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53794 non-null  float64
 1   cut      53794 non-null  object 
 2   color    53794 non-null  object 
 3   clarity  53794 non-null  object 
 4   depth    53794 non-null  float64
 5   table    53794 non-null  float64
 6   price    53794 non-null  int64  
 7   x        53794 non-null  float64
 8   y        53794 non-null  float64
 9   z        53794 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.5+ MB


Após remover os datapoints repetidos usando o método .duplicated(), temos um dataset com 53794 entradas. Para evitar quaisquer problemas com índices faltantes, irei resetar os índices para que eles possuam um range adequado. Depois disso, vamos checar as features "cut", "color", e "clarity" para nos certificar que todos possuem valores adequados, conforme a [referência indicada](https://ggplot2.tidyverse.org/reference/diamonds.html).

In [5]:
diamonds.reset_index(inplace=True)
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53794 entries, 0 to 53793
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   index    53794 non-null  int64  
 1   carat    53794 non-null  float64
 2   cut      53794 non-null  object 
 3   color    53794 non-null  object 
 4   clarity  53794 non-null  object 
 5   depth    53794 non-null  float64
 6   table    53794 non-null  float64
 7   price    53794 non-null  int64  
 8   x        53794 non-null  float64
 9   y        53794 non-null  float64
 10  z        53794 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 4.5+ MB


In [6]:
diamonds["cut"].value_counts()


Ideal        21488
Premium      13748
Very Good    12069
Good          4891
Fair          1598
Name: cut, dtype: int64

In [7]:
diamonds["color"].value_counts()


G    11262
E     9776
F     9520
H     8272
D     6755
I     5407
J     2802
Name: color, dtype: int64

In [8]:
diamonds["clarity"].value_counts()

SI1     13032
VS2     12229
SI2      9150
VS1      8156
VVS2     5056
VVS1     3647
IF       1784
I1        740
Name: clarity, dtype: int64

Agora que nos certificamos que o dataset está limpo, vamos trabalhar nos histogramas:

In [9]:
classes = diamonds["cut"].value_counts().index # Cria lista com todos os possíveis valores de cut


for key in diamonds.keys():
    fig, axs = plt.subplots(2, figsize=(11, 10))
    fig.suptitle(f"Histograma de {key}", fontsize=24)
    sns.histplot(ax=axs[0], data=diamonds, x=diamonds[key], 
                 stat='density', bins=50, common_norm=True)

    sns.histplot(ax=axs[1], data=diamonds, x=diamonds[key], 
                 stat='density', bins=50, common_norm=True, hue="cut")
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.show()

Com os histogramas feitos, vamos agora fazer nossa matriz de figuras, contendo histogramas e scatterplots que mostram a correlação entre as diversas features:

In [10]:
def correlation(x, y, **kws):
    corr = np.corrcoef(diamonds[x.name], diamonds[y.name])
    ax = plt.gca()
    ax.annotate(f"Correlação: {round(corr[0, 1], 2)}", 
                xy=(0.25, 0.45), xycoords=ax.transAxes)
    plt.axis('off')


plot_matrix = sns.PairGrid(diamonds, hue='cut', diag_sharey=False)
plot_matrix.map_diag(sns.histplot, stat='density', bins=50, common_norm=True)
plot_matrix.map_lower(sns.scatterplot)
plot_matrix.map_upper(correlation)

Com isso, a primeira parte está concluída!

## Part 2 - K-nearest-neighbour classifier

Primeiramente, vamos dividir nosso dataset em training, validation e testing sets. Antes disso, vamos primeiro tratar o dataset para transformar as variáveis categóricas em variáveis quantitativas. Como as variáveis categóricas em questão (a cor, clareza e corte) possuem todas uma ordem de melhor para pior, é razoável fazer uma conversão numérica direta para cada uma delas. Para isso, usamos

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# As listas precisam estar na ordem correta, começando de 0 (pior valor)
cut_cats = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
color_cats = ["J", "I", "H", "G", "F", "E", "D"]
clarity_cats = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]

ordinal_encoder = OrdinalEncoder(categories=[cut_cats, color_cats, clarity_cats])

diamonds_encoded = ordinal_encoder.fit_transform(diamonds[["cut", 
                                                           "color", 
                                                           "clarity"]])
diamonds_encoded = pd.DataFrame(diamonds_encoded) 
diamonds_encoded.columns = ["cut_ord", "color_ord", "clarity_ord"]

diamonds_ordinal = pd.concat([diamonds, diamonds_encoded], axis=1)

diamonds_ordinal[["cut", "cut_ord", "color", "color_ord", "clarity", "clarity_ord"]].head()


Unnamed: 0,cut,cut_ord,color,color_ord,clarity,clarity_ord
0,Ideal,4.0,E,5.0,SI2,1.0
1,Premium,3.0,E,5.0,SI1,2.0
2,Good,1.0,E,5.0,VS1,4.0
3,Premium,3.0,I,1.0,VS2,3.0
4,Good,1.0,J,0.0,SI2,1.0


A correlação entre os números e seus valores categóricos está funcionando. Agora, vamos realizar nosso split no dataset, removendo as features categóricas originais:

In [12]:
X_diamonds = diamonds_ordinal.drop(["cut", "color", "clarity"], axis=1)
y_diamonds = diamonds_ordinal["cut"]

X_train, X_test, y_train, y_test = train_test_split(X_diamonds, y_diamonds, test_size=0.2, random_state=42, stratify=y_diamonds)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.2, random_state=42, stratify=y_test) 

Agora, vamos implementar nosso próprio estimator para realizar a classificação K-Nearest-Neighbours:

In [13]:
from sklearn.base import BaseEstimator

class KNearestNeighbours(BaseEstimator):
    def __init__(self, k=5, distances='euclidean'):
        self.k = k  # Number of nearest neighbours to consider
        self.distances = distances  # Can also be a matrix with precalculated distances
    
    def fit(self, X, y):
        self.X_train = X  # Training data (in a dataframe)
        self.y_train = pd.DataFrame(y)  # Training labels (in a series that gets converted)
        return self

    def predict(self, X):
        labels = []
        X_arr = np.array(X)

        if self.distances=='euclidean':
            # For each new datapoint, calculate and sort by the distance to each training point,
            # and select the most common label in the k nearest neighbours
            for datapoint in X_arr:  
                point_dist = np.linalg.norm(self.X_train - datapoint, axis=1)
                sorted_dist = np.argsort(point_dist)[:self.k]
                predicted_label = self.y_train.iloc[sorted_dist, :].value_counts().index[0][0] 
                labels.append(predicted_label)
        else:
            # For each new datapoint, sort the previously calculated distances and 
            # select the most common label in the k nearest neighbours
            if self.distances.shape[1] != self.X_train.shape[0]:
                raise ValueError('Distances matrix must have the same number of columns as the number of datapoints in the training data')
            for datapoint in self.distances:
                sorted_dist = np.argsort(datapoint)[:self.k]
                predicted_label = self.y_train.iloc[sorted_dist, :].value_counts().index[0][0] 
                labels.append(predicted_label)
        return np.array(labels)


In [15]:
from sklearn.metrics import accuracy_score

knn_dim = KNearestNeighbours(k=3)
knn_dim.fit(X_train, y_train)
y_pred = knn_dim.predict(X_test)


accuracy_score(y_test, y_pred)

0.49622400371790404

In [16]:
from sklearn.neighbors import KNeighborsClassifier
skknn = KNeighborsClassifier(n_neighbors=3)
skknn.fit(X_train, y_train)
sky_pred = skknn.predict(X_test)

accuracy_score(y_test, sky_pred)

0.49622400371790404