### Linear Discriminant Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Open the .csv files
X_train = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/train_tfidf.csv')
X_test = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/test_tfidf.csv')

X_train = X_train.drop(['Unnamed: 0'], axis=1)
X_test = X_test.drop(['Unnamed: 0'], axis=1)

# Check the shape of the data
print(X_train.shape)
print(X_test.shape)

(22500, 1000)
(7500, 1000)


In [3]:
# Extract the features into a numpy array
X_train = X_train.values
X_test = X_test.values

# Open the labels
y_train_df = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/CSE343-ML-Project/Data/Preprocessed-Data/train_labels.csv')
y_test_df = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/CSE343-ML-Project/Data/Preprocessed-Data/test_labels.csv')

# Check the shape of the labels
print(y_train_df.shape)
print(y_test_df.shape)

(22500, 1)
(7500, 1)


In [4]:
# Convert class to non_suicide = 0 and suicide = 1
y_train_temp = y_train_df['class'].values
y_test_temp = y_test_df['class'].values

# Check the shape of the labels
print(y_train_temp.shape)
print(y_test_temp.shape)

y_train = np.array([0 if label == "non-suicide" else 1 for label in y_train_temp])
y_test = np.array([0 if label == "non-suicide" else 1 for label in y_test_temp])

# Check the shape of the labels
print(y_train.shape)
print(y_test.shape)

(22500,)
(7500,)
(22500,)
(7500,)


### Running without Hyperparameter Tuning

In [5]:
# Define the LDA classifier
lda_base = LinearDiscriminantAnalysis()

# Fit the model
lda_base.fit(X_train, y_train)

# Predict
y_pred = lda_base.predict(X_test)

In [6]:
# Print the accuracy score
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print AUC score
print("AUC Score:", roc_auc_score(y_test, y_pred))

# Print the classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_test, y_pred, average='weighted'))

Accuracy Score: 0.9130666666666667
Confusion Matrix:
 [[3470  238]
 [ 414 3378]]
AUC Score: 0.9133186200210288
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.91      3708
           1       0.93      0.89      0.91      3792

    accuracy                           0.91      7500
   macro avg       0.91      0.91      0.91      7500
weighted avg       0.91      0.91      0.91      7500

Weighted F1-score: 0.9130416384644972


### Hyperparameter Tuning

In [7]:
# Define the LDA classifier
lda = LinearDiscriminantAnalysis()

# Define a grid of hyperparameters to search
param_grid = {
    'solver': ['svd', 'lsqr', 'eigen'],  # LDA solver
    'n_components': [1, 2, 3],  # Number of components to keep
    'store_covariance': [True, False]  # Whether to store covariance matrices
}

# Create a grid search object with 5-fold cross validation
grid_search = GridSearchCV(estimator=lda, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END .n_components=1, solver=svd, store_covariance=False; total time=  49.6s
[CV] END .n_components=1, solver=svd, store_covariance=False; total time=  49.8s
[CV] END .n_components=1, solver=svd, store_covariance=False; total time=  50.6s
[CV] END ..n_components=1, solver=svd, store_covariance=True; total time=  53.2s
[CV] END ..n_components=1, solver=svd, store_covariance=True; total time=  53.4s
[CV] END ..n_components=1, solver=svd, store_covariance=True; total time=  54.1s
[CV] END ..n_components=1, solver=svd, store_covariance=True; total time=  54.2s
[CV] END ..n_components=1, solver=svd, store_covariance=True; total time=  54.2s
[CV] END .n_components=1, solver=lsqr, store_covariance=True; total time=   6.8s
[CV] END .n_components=1, solver=lsqr, store_covariance=True; total time=   6.9s
[CV] END .n_components=1, solver=lsqr, store_covariance=True; total time=   7.4s
[CV] END n_components=1, solver=lsqr, store_cova

62 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/discriminant_analysis.py", line 631, in fit
    self._solve_eigen(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/discriminant_analysis.py", line 463, in _solve_eigen
    evals, evecs = linalg.eigh(Sb, Sw)
  File "/Library/Frameworks/P

In [None]:
# Print the best hyperparameters found by the grid search
print("Best Hyperparameters:")
print(grid_search.best_params_)

In [7]:
# Best Model
best_lda = LinearDiscriminantAnalysis(n_components=1, solver='svd', store_covariance=True)

# Fit the model
best_lda.fit(X_train, y_train)

In [8]:
# Predict on the test data
y_pred = best_lda.predict(X_test)

# Print the accuracy score
print("Best LDA: Testing Data")
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print AUC score
print("AUC Score:", roc_auc_score(y_test, y_pred))

# Print the classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_test, y_pred, average='weighted'))

Best LDA: Testing Data
Accuracy Score: 0.9130666666666667
Confusion Matrix:
 [[3470  238]
 [ 414 3378]]
AUC Score: 0.9133186200210288
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.91      3708
           1       0.93      0.89      0.91      3792

    accuracy                           0.91      7500
   macro avg       0.91      0.91      0.91      7500
weighted avg       0.91      0.91      0.91      7500

Weighted F1-score: 0.9130416384644972


In [9]:
# Predict on training data
y_pred_train = best_lda.predict(X_train)

# Print the accuracy score
print("Best LDA: Training Data")
print("Accuracy Score:", accuracy_score(y_train, y_pred_train))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))

# Print AUC score
print("AUC Score:", roc_auc_score(y_train, y_pred_train))

# Print the classification report
print("Classification Report:\n", classification_report(y_train, y_pred_train))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_train, y_pred_train, average='weighted'))

Best LDA: Training Data
Accuracy Score: 0.9206666666666666
Confusion Matrix:
 [[10710   582]
 [ 1203 10005]]
AUC Score: 0.920562519484716
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.95      0.92     11292
           1       0.95      0.89      0.92     11208

    accuracy                           0.92     22500
   macro avg       0.92      0.92      0.92     22500
weighted avg       0.92      0.92      0.92     22500

Weighted F1-score: 0.9205979917765191


### Principal Component Analysis

In [10]:
# Perform PCA on the data and extract the features that explain 85% of the variance
pca = PCA(n_components=0.75)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
print(X_train_pca.shape)

(22500, 434)


In [11]:
# Train the LDA classifier on the PCA features
best_lda.fit(X_train_pca, y_train)

# Predict on the test data
y_pred = best_lda.predict(X_test_pca)

In [12]:
# Print the accuracy score
print("Best LDA: PCA Testing Data")
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print AUC score
print("AUC Score:", roc_auc_score(y_test, y_pred))

# Print the classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_test, y_pred, average='weighted'))

Best LDA: PCA Testing Data
Accuracy Score: 0.9093333333333333
Confusion Matrix:
 [[3477  231]
 [ 449 3343]]
AUC Score: 0.9096475461881938
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.91      3708
           1       0.94      0.88      0.91      3792

    accuracy                           0.91      7500
   macro avg       0.91      0.91      0.91      7500
weighted avg       0.91      0.91      0.91      7500

Weighted F1-score: 0.9092862328817355


In [13]:
# Predict on training data
y_pred_train = best_lda.predict(X_train_pca)

# Print the accuracy score
print("Best LDA: PCA Training Data")
print("Accuracy Score:", accuracy_score(y_train, y_pred_train))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))

# Print AUC score
print("AUC Score:", roc_auc_score(y_train, y_pred_train))

# Print the classification report
print("Classification Report:\n", classification_report(y_train, y_pred_train))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_train, y_pred_train, average='weighted'))

Best LDA: PCA Training Data
Accuracy Score: 0.9122666666666667
Confusion Matrix:
 [[10617   675]
 [ 1299  9909]]
AUC Score: 0.9121619046210351
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.91     11292
           1       0.94      0.88      0.91     11208

    accuracy                           0.91     22500
   macro avg       0.91      0.91      0.91     22500
weighted avg       0.91      0.91      0.91     22500

Weighted F1-score: 0.9121900280474452
