# Question 2 (a)
First we will calculate the LDA vector w by hand, using only NumPy and pandas and no machine learning libraries (e.g. sklearn).

In [None]:
import pandas as pd
import numpy as np
# Load the training set and select the four numeric predictors used for LDA.
data = pd.read_csv('DiabetesTraining.csv')

features = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
X = data[features]
y = data['diabetes']

# Rescale every feature to unit standard deviation. No centering is applied;
# the class means are subtracted below, so a constant shift would cancel out.
X_standardized = X / X.std()

# Per-class mean vectors of the rescaled features.
mean_0 = X_standardized[y == 0].mean()
mean_1 = X_standardized[y == 1].mean()

# Within-class scatter: S_w = sum_i (x_i - m_class(i)) (x_i - m_class(i))^T.
# Computed as a single matrix product instead of a Python loop over rows.
# Picking the class mean through a boolean mask is positional, so it does not
# depend on the index of `y` being the default 0..n-1 range (as `y[i]` would).
is_class_0 = (y == 0).to_numpy()[:, None]
class_means = np.where(is_class_0, mean_0.to_numpy(), mean_1.to_numpy())
centered = X_standardized.to_numpy() - class_means
S_w = centered.T @ centered

# Between-class scatter built from the difference of the two class means.
# Its scalar factor only rescales the eigenvalues, not the eigenvectors.
mean_diff = (mean_1 - mean_0).values.reshape(len(features), 1)
S_b = (y[y == 0].shape[0] * mean_diff @ mean_diff.T)

# Eigen-decomposition of S_w^{-1} S_b. solve(S_w, S_b) yields the same matrix
# as inv(S_w) @ S_b without forming the explicit inverse.
eigvals, eigvecs = np.linalg.eig(np.linalg.solve(S_w, S_b))

# The eigenvector corresponding to the largest eigenvalue is the LDA vector w.
# eig may return a complex dtype for a non-symmetric matrix; the leading
# eigenvalue here is real, so only the real parts are used.
w = eigvecs[:, np.argmax(eigvals.real)].real
print("LDA vector w:", w)

LDA vector w: [0.21704954 0.26374165 0.72404695 0.59923773]


Now we will calculate vector w by using sklearn

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Fit sklearn's LDA on the same rescaled features; fit() returns the fitted
# estimator, so construction and fitting are chained.
lda = LinearDiscriminantAnalysis().fit(X_standardized, y)

# coef_ holds the learned weight vector as a 2-D array (one row here).
w_sklearn = lda.coef_
print("LDA vector w (using sklearn):", w_sklearn)

LDA vector w (using sklearn): [[0.58625292 0.71236874 1.95565779 1.61854689]]


We can observe that the vector we calculated using sklearn looks different than the one we calculated initially. Is it though? Let's check...

In [None]:
# Normalize both vectors to unit length so that only their direction is compared.
w_sklearn_normalized = w_sklearn / np.linalg.norm(w_sklearn)
w_normalized = w / np.linalg.norm(w)

# An eigenvector is only defined up to sign, so w and -w describe the same
# discriminant direction. Accept either orientation; otherwise a sign flip from
# the eigen-solver would be reported as "not proportional".
is_proportional = (
    np.allclose(w_sklearn_normalized, w_normalized)
    or np.allclose(w_sklearn_normalized, -w_normalized)
)
print("Are the two vectors proportional?", is_proportional)

Are the two vectors proportional? True


Both vectors are correct: they point in the same direction and differ only by a scalar factor. For LDA the overall scale (and sign) of the coefficient vector does not matter, only the direction of the discriminant vector.

sklearn LDA: It optimizes the separation of classes and returns `coef_` on the scale used by its decision function (`X @ coef_.T + intercept_`), so the vector is generally not of unit length.

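Manual Calculation: The scaling depends on how you solve the eigenvalue problem and normalize the eigenvector. Here `np.linalg.eig` returns unit-length eigenvectors with an arbitrary sign, so the result differs from sklearn's vector by a scalar factor.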

# Question 2 (b)

In [None]:
# Raw training data as read from disk.
df = pd.read_csv('DiabetesTraining.csv')

# Work on an independent copy. Plain assignment (`df2 = df`) would only alias
# the same object, so the column encodings below would overwrite `df` as well.
df2 = df.copy()

# Encode the categorical columns as integers. Series.map leaves NaN for any
# value that is not a key of the mapping, so categories missing from these
# dictionaries end up as NaN.
df2['gender'] = df2['gender'].map({'Female': 0, 'Male': 1})
df2['smoking_history'] = df2['smoking_history'].map({'ever': 0, 'current': 1, 'not current': 2, 'never': 3, 'former': 4, 'No Info': 5})

# Feature matrix (encoded) and binary target.
X = df2[['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history', 'bmi', 'HbA1c_level', 'blood_glucose_level']]
y = df['diabetes']
def gini_impurity(y):
    """Return the Gini impurity of a collection of binary (0/1) labels.

    The impurity is ``1 - p1**2 - p0**2`` where ``p1`` and ``p0`` are the
    fractions of labels equal to 1 and to 0.

    Parameters
    ----------
    y : array-like
        Class labels (list, NumPy array or pandas Series). Only the values
        0 and 1 are counted.

    Returns
    -------
    float
        The Gini impurity. An empty input returns 0.0 instead of dividing
        by zero, so an empty branch of a split contributes nothing.
    """
    # Convert first so that `y == 1` is an element-wise comparison even for
    # plain Python lists (for a list it would be the single value False).
    y = np.asarray(y)
    n = len(y)
    if n == 0:
        return 0.0
    p1 = np.sum(y == 1) / n
    p0 = np.sum(y == 0) / n
    return 1 - p1**2 - p0**2

def _gini_for_binary_split(labels, feature):
    """Split `labels` on a 0/1 `feature` and score the split with Gini impurity.

    Returns a tuple
    ``(labels_1, labels_0, gini_1, gini_0, weight_1, weight_0, weighted_gini)``
    where the ``_1`` / ``_0`` entries belong to the rows with ``feature == 1``
    and ``feature == 0``, each weight is that branch's share of all labels,
    and ``weighted_gini`` is the weight-averaged impurity of the two branches.
    """
    labels_1 = labels[feature == 1]
    labels_0 = labels[feature == 0]
    gini_1 = gini_impurity(labels_1)
    gini_0 = gini_impurity(labels_0)
    weight_1 = len(labels_1) / len(labels)
    weight_0 = len(labels_0) / len(labels)
    weighted_gini = weight_1 * gini_1 + weight_0 * gini_0
    return labels_1, labels_0, gini_1, gini_0, weight_1, weight_0, weighted_gini


# Impurity of the target before any split.
gini_before = gini_impurity(y)

# Candidate split 1: hypertension.
hypertension = X['hypertension']
(y_hyper_1, y_hyper_0, gini_hyper_1, gini_hyper_0,
 weight_hyper_1, weight_hyper_0, gini_after_hyper) = _gini_for_binary_split(y, hypertension)

# Candidate split 2: heart disease.
heart_disease = X['heart_disease']
(y_hd_1, y_hd_0, gini_hd_1, gini_hd_0,
 weight_hd_1, weight_hd_0, gini_after_hd) = _gini_for_binary_split(y, heart_disease)

print(f"Gini impurity before splitting: {gini_before}\n")
print(f"Gini impurity after splitting on 'hypertension': {gini_after_hyper}")
print(f"Gini impurity after splitting on 'heart disease': {gini_after_hd}")

Gini impurity before splitting: 0.14711045401222633

Gini impurity after splitting on 'hypertension': 0.14295615100629253
Gini impurity after splitting on 'heart disease': 0.14601404503959087


In [None]:
def entropy(y):
    """Return the Shannon entropy (in bits) of a collection of binary (0/1) labels.

    Parameters
    ----------
    y : array-like
        Class labels (list, NumPy array or pandas Series). Only the values
        0 and 1 are counted.

    Returns
    -------
    float or int
        ``-p1*log2(p1) - p0*log2(p0)`` where ``p1`` / ``p0`` are the fractions
        of labels equal to 1 / 0. Returns 0 when either fraction is zero and
        when the input is empty, instead of dividing by zero.
    """
    # Convert first so that `y == 1` is an element-wise comparison even for
    # plain Python lists (for a list it would be the single value False).
    y = np.asarray(y)
    n = len(y)
    if n == 0:
        return 0
    p1 = np.sum(y == 1) / n
    p0 = np.sum(y == 0) / n
    # log2(0) is undefined; by convention that term contributes nothing.
    if p1 == 0 or p0 == 0:
        return 0
    return -p1 * np.log2(p1) - p0 * np.log2(p0)

def _entropy_for_binary_split(labels, feature):
    """Split `labels` on a 0/1 `feature` and score the split with entropy.

    Returns a tuple ``(labels_1, labels_0, entropy_1, entropy_0, weighted)``
    where the ``_1`` / ``_0`` entries belong to the rows with ``feature == 1``
    and ``feature == 0``, and ``weighted`` is the average of the two branch
    entropies weighted by each branch's share of all labels.
    """
    labels_1 = labels[feature == 1]
    labels_0 = labels[feature == 0]
    entropy_1 = entropy(labels_1)
    entropy_0 = entropy(labels_0)
    weighted = (len(labels_1) / len(labels)) * entropy_1 + (len(labels_0) / len(labels)) * entropy_0
    return labels_1, labels_0, entropy_1, entropy_0, weighted


# Entropy of the target before any split.
entropy_before = entropy(y)

# Information gain = entropy before the split - weighted entropy after it.
(y_hyper_1, y_hyper_0, entropy_hyper_1, entropy_hyper_0,
 weighted_entropy_hyper) = _entropy_for_binary_split(y, hypertension)
info_gain_entropy_hyper = entropy_before - weighted_entropy_hyper

(y_hd_1, y_hd_0, entropy_hd_1, entropy_hd_0,
 weighted_entropy_hd) = _entropy_for_binary_split(y, heart_disease)
info_gain_entropy_hd = entropy_before - weighted_entropy_hd

print(f"Information gain for 'hypertension': {info_gain_entropy_hyper}")
print(f"Information gain for 'heart disease': {info_gain_entropy_hd}")


Information gain for 'hypertension': 0.014297005653686634
Information gain for 'heart disease': 0.003982146278906973
