In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

: 

# Outils

## Définition des distances

In [None]:
def euclid_distance(x, y):
    """Row-wise inner product of two (n_samples, n_features) arrays.

    Returns a 1-D array whose i-th entry is sum_j x[i, j] * y[i, j]. When
    called with x == y (for example x = y = X - mu) this is the squared
    Euclidean norm of each row.
    """
    # The previous version multiplied x by itself and silently ignored y.
    return np.sum(x * y, axis=1)


def mahalonobis_distance(x, y, sigma):
    """Row-wise bilinear form x_i^T @ sigma @ y_i.

    Returns a 1-D array with one value per row. To obtain a squared
    Mahalanobis distance, pass the same centred samples as x and y and the
    INVERSE covariance matrix as sigma.
    """
    return np.sum((x @ sigma) * y, axis=1)

: 

# Chargement des jeux de données et extraction des caractéristiques des classes

In [None]:
# Display the kernel's current working directory: the relative data path
# passed to pd.read_csv in the following cell is resolved against it.
import os 
os.getcwd()

: 

In [None]:
# Read the training set; column names are supplied explicitly through `names`.
df = pd.read_csv("tp9_data/tp9_data_train.txt", names=["x1", "x2", "y"])

# Feature matrix (columns x1, x2) and label vector (column y) as NumPy arrays.
X_train = df.loc[:, ["x1", "x2"]].to_numpy()
y_train = df.loc[:, "y"].to_numpy()

: 

## Moyenne des classes

In [None]:
# Distinct class labels found in column y, in ascending order.
classes = np.sort(df.y.unique())

# Row k holds the mean (x1, x2) of the samples whose label is classes[k].
classes_mean = np.array([
    df.loc[df.y == classe, ["x1", "x2"]].to_numpy().mean(axis=0)
    for classe in classes
])
classes_mean

: 

## Matrices de covariance des classes

In [None]:
# Sanity check on class 0: print the covariance of (x1, x2) computed by pandas
# and by NumPy so the two results can be compared.
print(df.loc[df.y == 0, ["x1", "x2"]].cov())
print(np.cov(df.loc[df.y == 0, ["x1", "x2"]].to_numpy(), rowvar=False))

: 

In [None]:

# One 2x2 covariance matrix of (x1, x2) per class, stacked in the order of `classes`.
classes_sigma = np.array([
    np.cov(df.loc[df.y == classe, ["x1", "x2"]].to_numpy(), rowvar=False)
    for classe in classes
])

# Matching inverse covariance matrices, same ordering.
classes_inv_sigma = np.array([np.linalg.inv(sigma) for sigma in classes_sigma])


: 

## Probabilités a priori des classes

In [None]:
# Empirical prior of each class: fraction of the rows of df carrying that label.
p_classes = np.array([(df.y == classe).sum() / len(df) for classe in classes])
p_classes

: 

# Visualisation des données

## Histogrammes des caractéristiques

In [None]:
def show(df):
    """Draw per-class histograms of x1, then of x2, each in its own figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose the columns ``x1``, ``x2`` and the label column ``y``.

    For each feature a new 18x8 figure is opened, one semi-transparent
    20-bin histogram is overlaid per class label (labels taken in ascending
    order), and the figure is displayed with ``plt.show()``.
    """
    labels = sorted(df.y.unique())
    for column in ("x1", "x2"):
        plt.figure(figsize=(18, 8))
        for label in labels:
            plt.hist(df[df.y == label][column], bins=20, alpha=0.5, label=f"classe {label}")
        plt.xlabel(f'Valeurs de {column}')
        plt.ylabel('Fréquence')
        plt.title(f'Histogrammes des valeurs de {column} pour chaque classe')
        plt.legend()
        plt.show()

show(df)


: 

## Nuage de points des variables

In [None]:
def scatter_df(df):
    """Scatter-plot x2 against x1 with one series of '+' markers per class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose the columns ``x1``, ``x2`` and the label column ``y``.

    A new 18x8 figure is opened and left current: ``plt.show()`` is not
    called here, so later plotting calls draw on the same figure.
    """
    plt.figure(figsize=(18, 8))

    for label in sorted(df.y.unique()):
        members = df[df.y == label]
        plt.scatter(members['x1'], members['x2'], marker="+", label=f"Classe {label}")

    plt.axis("equal")
    plt.title('Nuage de points des classes ')
    plt.xlabel('Caractéristique x1')
    plt.ylabel('Caractéristique x2')
    plt.legend()

scatter_df(df)

: 

In [None]:

# One annotated heatmap per class. Labels and covariance matrices are paired by
# position (classes_sigma is built in the order of `classes`) rather than by
# using the label value as an index, which only works for labels 0..K-1.
for classe, sigma in zip(classes, classes_sigma):
    plt.figure(figsize=(5, 5))
    plt.title(f"covariance des variable dans la classe {classe}")
    sns.heatmap(sigma, annot=True)

: 

# Entraînement avec la distance de Mahalanobis

In [None]:
## Définition des fonctions de prédiction

: 

In [None]:
def get_euclidian_predictor(mu, classes):
    """Build a nearest-class-mean classifier using the squared Euclidean distance.

    Parameters
    ----------
    mu : array-like, shape (n_classes, n_features)
        Class means; row k is the mean of the class ``classes[k]``.
    classes : array-like, shape (n_classes,)
        Class labels, in the same order as the rows of ``mu``.

    Returns
    -------
    prediction : callable
        ``prediction(X)`` takes an array-like of shape (n_samples, n_features)
        and returns a 1-D array holding, for each sample, the label of the
        closest class mean.
    """
    centers = np.asarray(mu, dtype=float)
    labels = np.asarray(classes)

    def prediction(X):
        samples = np.atleast_2d(np.asarray(X, dtype=float))
        # Column k holds the squared distance of every sample to centers[k].
        # Means are addressed by position, so labels need not be 0..K-1.
        sq_dist = np.stack(
            [np.sum((samples - centers[k]) ** 2, axis=1) for k in range(len(labels))],
            axis=1,
        )
        # Map the winning position back to the actual class label.
        return labels[sq_dist.argmin(axis=1)]

    return prediction

: 

In [None]:
def get_mal_predictor(mu, classes_sigma, classes_p, classes):
    """Build a Gaussian (quadratic) Bayes classifier based on the Mahalanobis distance.

    For each class k the score of a sample x is

        (x - mu_k)^T Sigma_k^{-1} (x - mu_k) + log|Sigma_k| - 2 log p_k

    and the predicted class is the one with the smallest score.

    Parameters
    ----------
    mu : array-like, shape (n_classes, n_features)
        Class means; row k belongs to ``classes[k]``.
    classes_sigma : array-like, shape (n_classes, n_features, n_features)
        Class covariance matrices, same ordering. Each must be invertible.
    classes_p : array-like, shape (n_classes,)
        Class prior probabilities, same ordering.
    classes : array-like, shape (n_classes,)
        Class labels.

    Returns
    -------
    prediction : callable
        ``prediction(X)`` takes an array-like of shape (n_samples, n_features)
        and returns a 1-D array with the predicted label of each sample.

    Raises
    ------
    numpy.linalg.LinAlgError
        If one of the covariance matrices is singular.
    """
    centers = np.asarray(mu, dtype=float)
    covariances = np.asarray(classes_sigma, dtype=float)
    priors = np.asarray(classes_p, dtype=float)
    labels = np.asarray(classes)
    n_classes = len(labels)

    # Everything is addressed by position so labels need not be 0..K-1.
    sigma_inv = [np.linalg.inv(covariances[k]) for k in range(n_classes)]

    # slogdet returns (sign, log|det|); it avoids the overflow/underflow that
    # log(det(.)) can suffer from.
    log_det = [np.linalg.slogdet(covariances[k])[1] for k in range(n_classes)]

    # Class-dependent constant of the score: log|Sigma_k| - 2 log p_k.
    bias = [log_det[k] - 2.0 * np.log(priors[k]) for k in range(n_classes)]

    def prediction(X):
        samples = np.atleast_2d(np.asarray(X, dtype=float))
        scores = []
        for k in range(n_classes):
            offset = samples - centers[k]
            # Squared Mahalanobis distance of every sample to class k. The
            # previous version used the plain Euclidean distance here and
            # never used sigma_inv.
            maha = np.sum((offset @ sigma_inv[k]) * offset, axis=1)
            scores.append(maha + bias[k])
        return labels[np.stack(scores, axis=1).argmin(axis=1)]

    return prediction


: 

## Outils d'affichage des frontières de décision

In [None]:
def plot_decision_multi(x1_min, x1_max, x2_min, x2_max, prediction, sample = 300):
    """Fill the decision regions of a classifier on the current Matplotlib axes.

    The rectangle [x1_min, x1_max] x [x2_min, x2_max] is sampled on a regular
    ``sample`` x ``sample`` grid, every grid point is classified, and the
    resulting label map is drawn with ``plt.contourf`` using the colours of
    the active Matplotlib property cycle.

    Parameters
    ----------
    x1_min : float
        Minimum value for the first feature
    x1_max : float
        Maximum value for the first feature
    x2_min : float
        Minimum value for the second feature
    x2_max : float
        Maximum value for the second feature
    prediction : callable
        Called once per grid point with an array of shape (1, 2); the first
        element of what it returns is taken as the label of that point.
    sample : int, optional
        Number of samples on each feature (default is 300)
    """
    x1_axis = np.linspace(x1_min, x1_max, sample)
    x2_axis = np.linspace(x2_min, x2_max, sample)

    # label_grid[i][j] is the label predicted at (x1_axis[j], x2_axis[i]).
    label_grid = []
    for x2 in x2_axis:
        row = []
        for x1 in x1_axis:
            point = np.array([[x1, x2]])
            row.append(prediction(point)[0])
        label_grid.append(row)

    # Number of contour levels requested: distinct predicted labels minus one.
    n_levels = np.shape(np.unique(label_grid))[0] - 1
    palette = plt.rcParams['axes.prop_cycle'].by_key()['color']
    plt.contourf(x1_axis, x2_axis, label_grid, levels=n_levels, colors=palette, alpha=0.35)


: 

## Création des modèles de prédiction bayésienne

In [None]:
# Nearest-mean classifier: only needs the class means.
euclidian_predictor = get_euclidian_predictor(mu=classes_mean, classes=classes)

# Classifier built from the class means, covariance matrices and priors.
mahalonobis_predictor = get_mal_predictor(
    mu=classes_mean,
    classes_sigma=classes_sigma,
    classes_p=p_classes,
    classes=classes,
)

: 

In [None]:
# Quick check: classify the single point (x1=2, x2=4) and display the result.
euclidian_predictor([[2, 4]])

: 

In [None]:
## Résultats avec Mahalanobis

: 

In [None]:
# Draw the training points first (scatter_df opens the figure and does not call
# plt.show()), then overlay the decision regions of the Mahalanobis predictor
# over the bounding box of the data.
scatter_df(df)
plot_decision_multi(df['x1'].min(), df['x1'].max(), df['x2'].min(), df['x2'].max(), mahalonobis_predictor)

: 

In [None]:
# Same figure layout as the previous cell, this time with the decision regions
# of the Euclidean (nearest class mean) predictor.
scatter_df(df)
plot_decision_multi(df['x1'].min(), df['x1'].max(), df['x2'].min(), df['x2'].max(), euclidian_predictor)

: 