In [58]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
import glob
import numpy as np
import ast

In [59]:
import os

df_labels = pd.read_csv('data/innovaid_hackathon_anima/output_classes.csv')

X = []
y = []
files = glob.glob("data/innovaid_hackathon_anima/input_coords/*.csv")
for file in files:
    df = pd.read_csv(file)
    if df.shape[0] != 500:
        continue
    df.fillna(0, inplace=True)
    df.drop("SCENE_INDEX", inplace=True, axis=1)
    X.append(df.to_numpy().flatten())
    y.append(list(df_labels[df_labels['sid'] == file.split("/")[-1].split('.')[0]]['has_depression'])[0])

In [60]:
len(X), len(y)

(2963, 2963)

In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming X and y are your feature matrix and labels
# You can use your original data or load it as needed
# X, y = ...

# Concatenate X and y to handle the undersampling
data = pd.concat([pd.DataFrame(X), pd.DataFrame({'y': y})], axis=1)

# Separate data into majority (class 1) and minority (class 0) classes
majority_class = data[data['y'] == 1]
minority_class = data[data['y'] == 0]

# Undersample majority class to have the same number of samples as the minority class
undersampled_majority_class = majority_class.sample(n=minority_class.shape[0], random_state=42)

# Concatenate the undersampled majority class and the minority class
undersampled_data = pd.concat([undersampled_majority_class, minority_class], axis=0)

# Separate features and labels
X_undersampled = undersampled_data.drop('y', axis=1)
y_undersampled = undersampled_data['y']

X = X_undersampled
y = y_undersampled

len(X_undersampled), len(y_undersampled)


(1240, 1240)

In [62]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### MLP Classifier

In [63]:
model = MLPClassifier(hidden_layer_sizes=(1000,), max_iter=1000, activation='relu', random_state=42)

model.fit(X_train, y_train)

In [64]:
model.score(X_test, y_test)

0.5443548387096774

### SVM

In [65]:
from sklearn.svm import SVC

model = SVC(random_state=42)
model.fit(X_train, y_train)

In [66]:
model.score(X_test, y_test)

0.5241935483870968

---

In [67]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier



names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=42),
    SVC(gamma=2, C=1, random_state=42),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),
    DecisionTreeClassifier(max_depth=10, random_state=42),
    RandomForestClassifier(
        max_depth=5, n_estimators=500, max_features=1, random_state=42
    ),
    MLPClassifier(alpha=1, max_iter=10000, random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]


for name, clf in zip(names, classifiers):
        clf = make_pipeline(StandardScaler(), clf)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print(f'{name}: {round(score, 5)}')

Nearest Neighbors: 0.51613
Linear SVM: 0.54032
RBF SVM: 0.48387
Gaussian Process: 0.50403
Decision Tree: 0.53226
Random Forest: 0.5121
Neural Net: 0.52823
AdaBoost: 0.52823
Naive Bayes: 0.47177




QDA: 0.4879


In [68]:
predicted_labels = model.predict(X_test)

result_df = pd.DataFrame({'Actual': y_test, 'Predicted': predicted_labels})

# Identify misclassifications
misclassifications = result_df[result_df['Actual'] != result_df['Predicted']]

print("Total len:", len(X_test))

# Calculate the number of 0-labels and 1-labels where the model fails
false_negatives = misclassifications[misclassifications['Actual'] == 1].shape[0]
false_positives = misclassifications[misclassifications['Actual'] == 0].shape[0]

print(f"Number of false negatives (actual 1, predicted 0): {false_negatives}")
print(f"Number of false positives (actual 0, predicted 1): {false_positives}")


Total len: 248
Number of false negatives (actual 1, predicted 0): 59
Number of false positives (actual 0, predicted 1): 59
