In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Open the .csv files
X_train = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/train_tfidf.csv')
X_test = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/test_tfidf.csv')

X_train = X_train.drop(['Unnamed: 0'], axis=1)
X_test = X_test.drop(['Unnamed: 0'], axis=1)

# Check the shape of the data
print(X_train.shape)
print(X_test.shape)

(22500, 1000)
(7500, 1000)


In [3]:
# Extract the features into a numpy array
X_train = X_train.values
X_test = X_test.values

# Open the labels
y_train_df = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/CSE343-ML-Project/Data/Preprocessed-Data/train_labels.csv')
y_test_df = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/CSE343-ML-Project/Data/Preprocessed-Data/test_labels.csv')

# Check the shape of the labels
print(y_train_df.shape)
print(y_test_df.shape)

(22500, 1)
(7500, 1)


In [4]:
# Convert class to non_suicide = 0 and suicide = 1
y_train_temp = y_train_df['class'].values
y_test_temp = y_test_df['class'].values

# Check the shape of the labels
print(y_train_temp.shape)
print(y_test_temp.shape)

y_train = np.array([0 if label == "non-suicide" else 1 for label in y_train_temp])
y_test = np.array([0 if label == "non-suicide" else 1 for label in y_test_temp])

# Check the shape of the labels
print(y_train.shape)
print(y_test.shape)

(22500,)
(7500,)
(22500,)
(7500,)


In [5]:
# Decision Tree Classifier
rfc = RandomForestClassifier()

# Fit the model
rfc.fit(X_train, y_train)

# Predict on the test data
y_pred = rfc.predict(X_test)

In [10]:
print("Random Forest Classifier: Test Data")
# Print the accuracy score
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print AUC score
print("AUC Score:", roc_auc_score(y_test, y_pred))

# Print the classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_test, y_pred, average='weighted'))

Random Forest Classifier: Test Data
Accuracy Score: 0.8866666666666667
Confusion Matrix:
 [[3269  439]
 [ 411 3381]]
AUC Score: 0.8866106297707318
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.88      0.88      3708
           1       0.89      0.89      0.89      3792

    accuracy                           0.89      7500
   macro avg       0.89      0.89      0.89      7500
weighted avg       0.89      0.89      0.89      7500

Weighted F1-score: 0.8866603467980485


In [11]:
# Predict on the train data
y_pred_train = rfc.predict(X_train)

print("Random Forest Classifier: Train Data")
# Print the accuracy score
print("Accuracy Score:", accuracy_score(y_train, y_pred_train))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))

# Print AUC score
print("AUC Score:", roc_auc_score(y_train, y_pred_train))

# Print the classification report
print("Classification Report:\n", classification_report(y_train, y_pred_train))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_train, y_pred_train, average='weighted'))

Random Forest Classifier: Train Data
Accuracy Score: 0.9987555555555555
Confusion Matrix:
 [[11289     3]
 [   25 11183]]
AUC Score: 0.9987518877892746
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     11292
           1       1.00      1.00      1.00     11208

    accuracy                           1.00     22500
   macro avg       1.00      1.00      1.00     22500
weighted avg       1.00      1.00      1.00     22500

Weighted F1-score: 0.9987555498229947


In [18]:
# Define hyperparameters grid
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [19]:
# Initialize GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(rfc, param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[CV] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   5.1s
[CV] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   5.2s
[CV] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   5.4s
[CV] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   5.4s
[CV] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   6.0s
[CV] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  19.0s
[CV] END bootstrap=True, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  20.1s
[CV] END bootstrap=True, criterion=gini, max_depth=None, min_s

In [20]:
# Print the best parameters
print(grid_search.best_params_)

{'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}


In [6]:
# Best Decision Tree Classifier
rf_best = RandomForestClassifier(criterion='entropy', max_depth=None, min_samples_leaf=2, min_samples_split=10,
                                    n_estimators =  200)

# Fit the model
rf_best.fit(X_train, y_train)

# Predict on the test data
y_pred = rf_best.predict(X_test)

In [22]:
# Print the accuracy score
print("Best Random Forest: Test Data")
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print AUC score
print("AUC Score:", roc_auc_score(y_test, y_pred))

# Print the classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_test, y_pred, average='weighted'))

Best Random Forest: Test Data
Accuracy Score: 0.8866666666666667
Confusion Matrix:
 [[3284  424]
 [ 426 3366]]
AUC Score: 0.8866554353911489
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.89      0.89      3708
           1       0.89      0.89      0.89      3792

    accuracy                           0.89      7500
   macro avg       0.89      0.89      0.89      7500
weighted avg       0.89      0.89      0.89      7500

Weighted F1-score: 0.8866669971357997


In [23]:
y_pred_train = rf_best.predict(X_train)
# Print the accuracy score
print("Best Random Forest: Train Data")
print("Accuracy Score:", accuracy_score(y_train, y_pred_train))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))

# Print AUC score
print("AUC Score:", roc_auc_score(y_train, y_pred_train))

# Print the classification report
print("Classification Report:\n", classification_report(y_train, y_pred_train))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_train, y_pred_train, average='weighted'))

Best Random Forest: Train Data
Accuracy Score: 0.9685333333333334
Confusion Matrix:
 [[11071   221]
 [  487 10721]]
AUC Score: 0.9684887578403463
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97     11292
           1       0.98      0.96      0.97     11208

    accuracy                           0.97     22500
   macro avg       0.97      0.97      0.97     22500
weighted avg       0.97      0.97      0.97     22500

Weighted F1-score: 0.9685275451738941


### Principal Component Analysis

In [7]:
pca = PCA(n_components=0.75)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [13]:
print(X_train_pca.shape)

(22500, 434)


In [None]:
#Run the model on the PCA data
rf_pca = RandomForestClassifier(criterion='entropy', max_depth=None, min_samples_leaf=2, min_samples_split=10,
                                    n_estimators =  200)

rf_pca.fit(X_train_pca, y_train)

y_pred_test_pca = rf_pca.predict(X_test_pca)

In [25]:
#Print the accuracy score
print("Random Forest Classifier with PCA : Test Data")
print("Accuracy Score:", accuracy_score(y_test, y_pred_test_pca))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test_pca))

# Print AUC score
print("AUC Score:", roc_auc_score(y_test, y_pred_test_pca))

# Print the classification report
print("Classification Report:\n", classification_report(y_test, y_pred_test_pca))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_test, y_pred_test_pca, average='weighted'))


Random Forest Classifier with PCA : Test Data
Accuracy Score: 0.8829333333333333
Confusion Matrix:
 [[3262  446]
 [ 432 3360]]
AUC Score: 0.8828977373588408
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.88      0.88      3708
           1       0.88      0.89      0.88      3792

    accuracy                           0.88      7500
   macro avg       0.88      0.88      0.88      7500
weighted avg       0.88      0.88      0.88      7500

Weighted F1-score: 0.8829304774596538


In [26]:
# Training Data
# Predict on the training data
y_pred_train_pca = rf_pca.predict(X_train_pca)

# Print the accuracy score
print("Random Forest Classifier with PCA: Training Data")
print("Accuracy Score:", accuracy_score(y_train, y_pred_train_pca))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train_pca))

# Print AUC score
print("AUC Score:", roc_auc_score(y_train, y_pred_train_pca))

# Print the classification report
print("Classification Report:\n", classification_report(y_train, y_pred_train_pca))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_train, y_pred_train_pca, average='weighted'))

Random Forest Classifier with PCA: Training Data
Accuracy Score: 0.9975555555555555
Confusion Matrix:
 [[11277    15]
 [   40 11168]]
AUC Score: 0.9975513732789923
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     11292
           1       1.00      1.00      1.00     11208

    accuracy                           1.00     22500
   macro avg       1.00      1.00      1.00     22500
weighted avg       1.00      1.00      1.00     22500

Weighted F1-score: 0.9975555423974964
