In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from collections import Counter
from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import multivariate_normal


In [2]:
train_df = pd.read_csv("data/alturas-pesos-mils-train.csv")
test_df = pd.read_csv("data/alturas-pesos-mils-test.csv")

In [3]:
train_df

Unnamed: 0.1,Unnamed: 0,Genero,Peso,Altura
0,7620,Mujer,61.235,162.402
1,1837,Hombre,97.432,181.908
2,3311,Hombre,73.324,172.459
3,9478,Mujer,55.193,157.748
4,7805,Mujer,56.886,151.798
...,...,...,...,...
7995,6395,Mujer,62.215,160.083
7996,2714,Hombre,83.840,177.866
7997,3476,Hombre,75.461,170.910
7998,7084,Mujer,65.461,166.137


In [4]:
features = ['Peso', 'Altura']
target = 'Genero'

x_train = train_df[features].values
y_train = train_df[target].values

x_test = test_df[features].values
y_test = test_df[target].values

In [5]:
classes = np.unique(y_train)
num_classes = len(classes)
n_features = x_train.shape[1]

print(classes)

['Hombre' 'Mujer']


In [6]:
# Compute class priors and means for each class
priors = {}
means = {}
for c in classes:
    X_c = x_train[y_train == c]
    priors[c] = X_c.shape[0] / x_train.shape[0]
    means[c] = np.mean(X_c, axis=0)

In [7]:
priors

{'Hombre': 0.50025, 'Mujer': 0.49975}

In [8]:
means

{'Hombre': array([ 84.45539805, 174.91533033]),
 'Mujer': array([ 61.13789495, 161.30471261])}

In [9]:
# Compute the pooled covariance matrix
pooled_cov = np.zeros((n_features, n_features))
for c in classes:
    x_c = x_train[y_train == c]
    # np.cov with rowvar=False returns the covariance matrix for features (normalized by N-1)
    cov_c = np.cov(x_c, rowvar=False)
    pooled_cov += (x_c.shape[0] - 1) * cov_c

pooled_cov = pooled_cov / (x_train.shape[0] - num_classes)

# Inverse of the pooled covariance matrix
inv_cov = np.linalg.inv(pooled_cov)

In [10]:
np.mean(x_train, axis=0)

array([ 72.80247587, 168.11342412])

In [11]:
(x_train - np.mean(x_train)).T

array([[-59.22295, -23.02595, -47.13395, ..., -44.99695, -54.99695,
        -40.13495],
       [ 41.94405,  61.45005,  52.00105, ...,  50.45205,  45.67905,
         51.11705]], shape=(2, 8000))

In [12]:
x_train_male = train_df.loc[train_df['Genero']=='Hombre'][['Peso', 'Altura']].values
x_train_female = train_df.loc[train_df['Genero']=='Mujer'][['Peso', 'Altura']].values

cov = np.cov(np.vstack([x_train_male - means['Hombre'], x_train_female - means['Mujer']]).T)

In [13]:
print(f"pooled_cov: {pooled_cov}\n non_pooled_cov: {cov}")

pooled_cov: [[77.94810827 53.53142632]
 [53.53142632 50.29787332]]
 non_pooled_cov: [[77.93836353 53.52473405]
 [53.52473405 50.2915853 ]]


In [14]:
def get_gauss_prob(data, means, priors, cov):
    data_np = data
    p_class = {}
    
    L_male = multivariate_normal.pdf(data_np, means['Hombre'], cov)
    L_female = multivariate_normal.pdf(data_np, means['Mujer'], cov)
    p_total = L_male * priors['Hombre'] + L_female * priors['Mujer']
    p_class['Hombre'] = L_male * priors['Hombre'] / p_total
    p_class['Mujer'] = L_female * priors['Mujer'] / p_total

    return p_class


In [31]:
def get_acc_gauss(p_class, data):
    return (((p_class['Hombre'] > p_class['Mujer']) == (data['Genero'] == 'Hombre')).sum() / len(p_class['Hombre']))

In [36]:
p_class = get_gauss_prob(x_train, means, priors, cov)
acc_train = get_acc_gauss(p_class, train_df)
print(f"Train Accuracy: {acc_train*100} %")

Train Accuracy: 91.675 %


In [37]:
p_class = get_gauss_prob(x_test, means, priors, cov)
acc_test = get_acc_gauss(p_class, test_df)
print(f"Test Accuracy: {acc_test*100} %")

Test Accuracy: 92.25 %


In [39]:
N = 300
X = np.linspace(train_df.min()['Peso'], train_df.max()['Peso'], N)
Y = np.linspace(train_df.min()['Altura'], train_df.max()['Altura'], N)
X, Y = np.meshgrid(X, Y)

# Pack X and Y into a single 3-dimensional array
pos = np.empty(X.shape + (2,))
pos[:, :, 0] = X
pos[:, :, 1] = Y

# The distribution on the variables X, Y packed into pos.
Z_H_LDA = multivariate_normal.pdf(pos, means['Hombre'], cov)
Z_M_LDA = multivariate_normal.pdf(pos, means['Mujer'], cov)
Z_LDA, _ = get_gauss_prob(x_train, means, priors, cov)

In [41]:
%matplotlib qt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
fig = plt.figure(figsize=(20,10))
ax = fig.gca(projection='3d')
cm = plt.cm.RdBu
#cf = ax.contourf(X, Y, Z, 256, alpha=.8, vmin=0., vmax=1., cmap=cm)
#plt.colorbar(cf, ax=ax)
ax.contourf(X, Y, Z_H_LDA, 256)
ax.contourf(X, Y, Z_M_LDA, 256)
ax.view_init(70, -90)
ax.set_ylabel('Alturas [cms]')
ax.set_xlabel('Pesos [kgs]')
plt.show()

ImportError: Failed to import any of the following Qt binding modules: PyQt6, PySide6, PyQt5, PySide2