Integrantes:
- Fisher, Agustin
- Ruiz, Lucia Ines

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from collections import Counter
from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import multivariate_normal

In [None]:
# Read the pre-split train / test CSV files into DataFrames.
train_path = "data/alturas-pesos-mils-train.csv"
test_path = "data/alturas-pesos-mils-test.csv"
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Show the training DataFrame as this cell's output.
train_df

Separamos los datos de train y de test. `x` contiene las features e `y` contiene las clases.

In [None]:
# Column names used as model inputs and as the label.
features = ['Peso', 'Altura']
target = 'Genero'

# Feature matrices (x_*) and label vectors (y_*) as plain arrays for both splits.
x_train, y_train = train_df[features].values, train_df[target].values
x_test, y_test = test_df[features].values, test_df[target].values

Obtenemos las clases, el número de clases y el número de features.

In [None]:
# Distinct labels present in the training set (np.unique returns them sorted).
classes = np.unique(y_train)
num_classes = classes.size
# x_train is indexed as [sample, feature] throughout this notebook.
n_features = x_train.shape[-1]

print(classes)

Computamos las probabilidades a priori y medias

In [None]:
# Per-class prior (relative frequency in the training set) and per-class feature mean.
priors, means = {}, {}
n_train = len(x_train)
for c in classes:
    X_c = x_train[y_train == c]  # rows of the training set labelled c
    priors[c] = len(X_c) / n_train
    means[c] = X_c.mean(axis=0)

In [None]:
# Show the estimated prior probability of each class.
priors

In [None]:
# Show the estimated mean feature vector of each class.
means

In [None]:
# Mean of each feature column over the whole training set (all classes together).
np.mean(x_train, axis=0)

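Calculamos la matriz de covarianza compartida por ambas clases, a partir de los datos centrados con la media de su propia clase.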

In [None]:
# Training rows of each class, restricted to the two feature columns.
is_male = train_df['Genero'] == 'Hombre'
is_female = train_df['Genero'] == 'Mujer'
x_train_male = train_df.loc[is_male, ['Peso', 'Altura']].values
x_train_female = train_df.loc[is_female, ['Peso', 'Altura']].values

# Centre every sample on its own class mean, stack both classes, and estimate a
# single covariance matrix from the stacked residuals (rows are observations).
centered = np.vstack([
    x_train_male - means['Hombre'],
    x_train_female - means['Mujer'],
])
cov = np.cov(centered, rowvar=False)

In [None]:
# Print the shared covariance matrix estimated from the class-centred training data.
print(f"Covariance: {cov}")

In [None]:
def get_gauss_prob(data, means, priors, cov):
    """Posterior class probabilities under Gaussian likelihoods with a shared covariance.

    Parameters
    ----------
    data : array-like
        Points to evaluate; the last axis holds the features (anything accepted
        by ``multivariate_normal.logpdf``, e.g. ``(n, d)`` or a ``(h, w, d)`` grid).
    means : dict
        Maps each class label to its mean vector. The returned dict has one
        entry per key of this dict, in the same order.
    priors : dict
        Maps each class label to its prior probability.
    cov : array-like
        Covariance matrix shared by all classes.

    Returns
    -------
    dict
        ``{label: P(label | x)}`` with the same shape that
        ``multivariate_normal.pdf`` would return for ``data``.
    """
    # Work with log p(x | c) + log P(c). Dividing raw densities yields 0 / 0 = NaN
    # for points far from every class mean, because each pdf underflows to 0.
    with np.errstate(divide='ignore'):  # a zero prior maps to log(0) = -inf on purpose
        log_joint = {
            c: multivariate_normal.logpdf(data, means[c], cov) + np.log(priors[c])
            for c in means
        }

    # Subtract the per-point maximum before exponentiating so the largest term
    # is exactly 1 and the normaliser can never be 0.
    log_max = np.maximum.reduce(list(log_joint.values()))
    scaled = {c: np.exp(lj - log_max) for c, lj in log_joint.items()}
    total = sum(scaled.values())

    return {c: s / total for c, s in scaled.items()}


In [None]:
def get_acc_gauss(p_class, data):
    """Fraction of rows whose most probable class matches the ``'Genero'`` column.

    ``p_class`` maps ``'Hombre'`` and ``'Mujer'`` to per-row posterior values and
    ``data`` is a DataFrame whose ``'Genero'`` column holds the true labels, in
    the same row order.
    """
    predicted_male = p_class['Hombre'] > p_class['Mujer']
    actual_male = data['Genero'] == 'Hombre'
    n_correct = (predicted_male == actual_male).sum()
    return n_correct / len(p_class['Hombre'])

Calculamos accuracy de training y test

In [None]:
# Posterior probabilities for every training row, then the share classified correctly.
p_class = get_gauss_prob(x_train, means, priors, cov)
acc_train = get_acc_gauss(p_class, train_df)
print(f"Train Accuracy: {acc_train*100} %")

In [None]:
# Same evaluation on the held-out test rows; note that p_class is overwritten here.
p_class = get_gauss_prob(x_test, means, priors, cov)
acc_test = get_acc_gauss(p_class, test_df)
print(f"Test Accuracy: {acc_test*100} %")

Graficamos en 3D

In [None]:
# Regular N x N grid covering the observed weight / height range of the training set.
N = 300
X = np.linspace(train_df['Peso'].min(), train_df['Peso'].max(), N)
Y = np.linspace(train_df['Altura'].min(), train_df['Altura'].max(), N)
X, Y = np.meshgrid(X, Y)

# Pack X and Y into a single 3-dimensional array: pos[i, j] == (X[i, j], Y[i, j]).
pos = np.dstack((X, Y))

# Class-conditional densities evaluated on the grid.
Z_H_LDA = multivariate_normal.pdf(pos, means['Hombre'], cov)
Z_M_LDA = multivariate_normal.pdf(pos, means['Mujer'], cov)

# Posteriors evaluated on the same grid (previously computed on x_train, which
# does not line up with X / Y), using a single call instead of two.
posterior_grid = get_gauss_prob(pos, means, priors, cov)
Z_LDA = posterior_grid['Hombre'], posterior_grid['Mujer']

In [None]:
%matplotlib qt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
# Draw both class-conditional densities as filled contours on one 3D axes.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(projection='3d')
cm = plt.cm.RdBu
for density in (Z_H_LDA, Z_M_LDA):
    ax.contourf(X, Y, density, 256)
# Camera: elevation 70 degrees, azimuth -90 degrees.
ax.view_init(70, -90)
ax.set_xlabel('Pesos [kgs]')
ax.set_ylabel('Alturas [cms]')
plt.show()

Ahora, la implementación con scikit-learn

In [None]:
# sigo teniendo las mismas features y x_train, etc...

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score

def sk_lda_full(x_train, y_train, x_test, y_test):
    """Fit scikit-learn's LDA on the training split and score it on the test split.

    Returns a ``(fitted_model, test_accuracy)`` pair. ``priors=None`` is passed
    explicitly, so the constructor is not given any class priors.
    """
    model = LinearDiscriminantAnalysis(priors=None).fit(x_train, y_train)
    test_accuracy = accuracy_score(y_test, model.predict(x_test))
    return model, test_accuracy



In [None]:
# Fit scikit-learn's LDA on the two raw features and keep its test accuracy.
lda_normal, acc_normal = sk_lda_full(x_train, y_train, x_test, y_test)

In [None]:
# Fitted attributes of the scikit-learn model; reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html
print(f"Means: {lda_normal.means_}")
print(f"Priors: {lda_normal.priors_}")


In [None]:
# Regular N x N grid covering the observed weight / height range of the training set.
N = 300
peso_axis = np.linspace(train_df['Peso'].min(), train_df['Peso'].max(), N)
altura_axis = np.linspace(train_df['Altura'].min(), train_df['Altura'].max(), N)
X, Y = np.meshgrid(peso_axis, altura_axis)

# pos[i, j] == (X[i, j], Y[i, j])
pos = np.dstack((X, Y))

# Class-conditional densities and the posterior of 'Hombre' on the grid.
Z_H_nb = multivariate_normal.pdf(pos, means['Hombre'], cov)
Z_M_nb = multivariate_normal.pdf(pos, means['Mujer'], cov)
Z_nb = get_gauss_prob(pos, means, priors, cov)['Hombre']

fig, ax = plt.subplots(figsize=(20, 10))
cm = plt.cm.RdBu
# Filled background: posterior of 'Hombre', colour scale fixed to [0, 1].
cf = ax.contourf(X, Y, Z_nb, 256, alpha=.8, vmin=0., vmax=1., cmap=cm)
fig.colorbar(cf, ax=ax)
# Iso-density lines of each class-conditional Gaussian.
z_levels = np.logspace(-5, -2, 10) / 4
for density in (Z_H_nb, Z_M_nb):
    ax.contour(X, Y, density, z_levels)
# Decision boundary: the points where the posterior equals 0.5.
ax.contour(X, Y, Z_nb, (0.5,), colors='k', linewidths=1)
ax.set_xlabel('Pesos [kgs]')
ax.set_ylabel('Alturas [cms]')
plt.show()

In [None]:
# Single-feature column vectors of shape (n, 1): column 0 -> *_p, column 1 -> *_a.
x_train_p, x_train_a = x_train[:, 0:1], x_train[:, 1:2]
x_test_p, x_test_a = x_test[:, 0:1], x_test[:, 1:2]

Reentrenamos el modelo con la extensión cuadrática de las features: $p$, $a$, $p^2$, $p \cdot a$, $a^2$.

In [None]:
from sklearn.preprocessing import PolynomialFeatures
import seaborn as sns

# Quadratic feature expansion. For input columns (p, a), degree=2 without the bias
# term produces the columns in the order [p, a, p^2, p*a, a^2].
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_extended = poly.fit_transform(x_train)
x_test_extended = poly.transform(x_test)

# Fit the model once and reuse both the model and the transformed test set, so
# the printed weights and the printed accuracies describe the same estimator.
lda_ext, acc_ext = sk_lda_full(X_train_extended, y_train, x_test_extended, y_test)
lda = lda_ext  # alias kept: the decision-boundary cell refers to this model as `lda`

acc_train = lda_ext.score(X_train_extended, y_train)
acc_test = acc_ext  # accuracy on the transformed test set, already computed by sk_lda_full

print(f"weights: {lda_ext.coef_}")
print(f"intercept: {lda_ext.intercept_}")
print(f"means: {lda_ext.means_}")
print(f"train accuracy: {acc_train}")
print(f"validation accuracy: {acc_test}")

In [None]:
# Plot window: the training range padded by one unit on every side.
x_min = x_train[:, 0].min() - 1
x_max = x_train[:, 0].max() + 1
y_min = x_train[:, 1].min() - 1
y_max = x_train[:, 1].max() + 1

xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300),
                     np.linspace(y_min, y_max, 300))

# Evaluate the model on the grid after applying the same quadratic expansion
# that was used for training.
grid = np.c_[xx.ravel(), yy.ravel()]
grid_extended = poly.transform(grid)

Z = lda.decision_function(grid_extended)
Z = Z.reshape(xx.shape)

# decision_function returns a signed score whose sign selects the class, so the
# boundary is the zero level set (0.5 would only be right for probabilities).
plt.contour(xx, yy, Z, levels=[0], colors='black', linewidths=2)

color_map = {'Hombre': 'blue', 'Mujer': 'red'}
point_colors = [color_map[label] for label in y_train]

# Explicit colour names are passed, so no colormap is involved.
plt.scatter(x_train[:, 0], x_train[:, 1], c=point_colors, edgecolor='k', s=30)
plt.xlabel("Pesos [kg]")
plt.ylabel("Alturas [cm]")
plt.title("Decision Boundary in Original 2D Space")
plt.show()

**Observaciones:**
- Ahora el modelo tiene 5 features de entrada, lo que se refleja en que los weights son un arreglo de 5 elementos.
- El accuracy se mantuvo alrededor del 92%.
- Con 5 features, el umbral de decisión sigue siendo lineal, pero en el espacio 5D extendido. En el espacio 2D original deja de ser lineal: es una curva cuadrática.

LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

# C is the inverse regularisation strength; 1e12 makes the penalty negligible.
lr = LogisticRegression(C=1e12).fit(x_train, y_train)
y_pred = lr.predict(x_test)
acc_lr = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc_lr*100} %")

# Fitted model parameters
print(f"Coeficientes: {lr.coef_}")
print(f"Intercept: {lr.intercept_}")

In [None]:
# Plot window: the training range padded by one unit on every side.
x_min = x_train[:, 0].min() - 1
x_max = x_train[:, 0].max() + 1
y_min = x_train[:, 1].min() - 1
y_max = x_train[:, 1].max() + 1

xx, yy = np.meshgrid(np.linspace(x_min, x_max, 500),
                     np.linspace(y_min, y_max, 500))

grid = np.c_[xx.ravel(), yy.ravel()]

probs = lr.predict_proba(grid)
# predict_proba columns follow lr.classes_; look up the 'Hombre' column by label
# instead of assuming a position (column 1 is 'Mujer' when labels are sorted).
male_col = list(lr.classes_).index('Hombre')
male_probs = probs[:, male_col].reshape(xx.shape)

# Fine, fixed levels over [0, 1] so the background shows a smooth probability
# gradient. RdBu maps high P('Hombre') to blue, matching the scatter colours.
levels = np.linspace(0, 1, 100)
plt.contourf(xx, yy, male_probs, levels=levels, alpha=0.5, cmap=plt.cm.RdBu)

# Decision boundary: both classes are equally likely.
plt.contour(xx, yy, male_probs, levels=[0.5], colors='black', linewidths=2)

color_map = {'Mujer': 'red', 'Hombre': 'blue'}
point_colors = [color_map[label] for label in y_train]

plt.scatter(x_train[:, 0], x_train[:, 1], c=point_colors, edgecolors='k', s=30)

plt.xlabel("Pesos [kg]")
plt.ylabel("Altura [cm]")
plt.title("Logistic Regression Decision Boundary")
plt.show()