# Discriminant Analysis

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn import metrics

# Switch the kernel's working directory so the relative "data/..." path used
# when loading the CSV resolves from here.
# NOTE(review): this is a machine-specific absolute path; adjust it (or make it
# configurable) before running the notebook elsewhere.
%cd "G:/Archive"

G:\Archive


## Data Prep

In [2]:
# Read the red-wine quality table; the first CSV row supplies the column names.
data = pd.read_csv("data/wine quality red.csv", header=0)

# Collapse the quality score into a two-class target:
# scores greater than 5 -> "above_avg", everything else -> "below_avg".
data["quality"] = pd.Categorical(
    np.where(data["quality"].gt(5), "above_avg", "below_avg")
)

# Separate predictors from the target by column position.
# NOTE(review): assumes the first 11 columns are the features and the column at
# index 11 is "quality" -- confirm against the CSV layout.
X = data.iloc[:, 0:11]
y = data.iloc[:, 11]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

## Linear Discriminant Analysis (LDA)

In [7]:
# Linear discriminant analysis with the estimator's default settings:
#   - no hyperparameters are tuned here
#   - features are used unscaled (discriminant analysis does not need scaling)
#   - variable types matter: continuous predictors, categorical response --
#     the mirror image of ANOVA (categorical factors, continuous response)
model1 = LinearDiscriminantAnalysis()

# 10-fold cross-validation on the training split, scored by accuracy.
# cross_val_score fits clones of the estimator, so model1 itself is not fitted here.
cv_score1 = cross_val_score(
    model1,
    X_train,
    y_train,
    scoring="accuracy",
    cv=10,
)
print("Average cross-validation score: ", np.mean(cv_score1))

# Fit on the full training split, then predict the held-out test split.
model1.fit(X_train, y_train)
pred1 = model1.predict(X_test)

# Test-set performance; "above_avg" is treated as the positive class for the
# binary recall / precision / F1 scores.
binary_opts = {"pos_label": "above_avg", "average": "binary"}
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, pred1)))
print("Recall: {0:.4f}".format(metrics.recall_score(y_test, pred1, **binary_opts)))
print("Precision: {0:.4f}".format(metrics.precision_score(y_test, pred1, **binary_opts)))
print("F1 score: {0:.4f}".format(metrics.f1_score(y_test, pred1, **binary_opts)))

Average cross-validation score:  0.7357344980314962
Accuracy: 0.7875
Recall: 0.7791
Precision: 0.8171
F1 score: 0.7976


## Quadratic Discriminant Analysis (QDA)

In [8]:
# Quadratic discriminant analysis with the estimator's default settings.
model2 = QuadraticDiscriminantAnalysis()

# 10-fold cross-validation on the training split, scored by accuracy.
# cross_val_score fits clones of the estimator, so model2 itself is not fitted here.
cv_score2 = cross_val_score(
    model2,
    X_train,
    y_train,
    scoring="accuracy",
    cv=10,
)
print("Average cross-validation score: ", np.mean(cv_score2))

# Fit on the full training split, then predict the held-out test split.
model2.fit(X_train, y_train)
pred2 = model2.predict(X_test)

# Test-set performance; "above_avg" is treated as the positive class for the
# binary recall / precision / F1 scores.
binary_opts = {"pos_label": "above_avg", "average": "binary"}
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, pred2)))
print("Recall: {0:.4f}".format(metrics.recall_score(y_test, pred2, **binary_opts)))
print("Precision: {0:.4f}".format(metrics.precision_score(y_test, pred2, **binary_opts)))
print("F1 score: {0:.4f}".format(metrics.f1_score(y_test, pred2, **binary_opts)))

Average cross-validation score:  0.7263779527559056
Accuracy: 0.7250
Recall: 0.8256
Precision: 0.7100
F1 score: 0.7634


## Visualization