In [30]:
import pandas as pd
import ast  
from ast import literal_eval
import numpy as np

In [16]:
def convert_to_list(value):
    """Best-effort parse of a stringified Python literal.

    Args:
        value: Typically a string such as ``"[1, 2, 3]"``; anything else is
            passed to ``ast.literal_eval`` unchanged.

    Returns:
        The evaluated literal (list, tuple, dict, number, ...) when parsing
        succeeds; otherwise ``value`` itself, untouched.
    """
    try:
        return ast.literal_eval(value)
    # literal_eval is documented to raise any of these on malformed input;
    # e.g. "{[1]: 2}" parses fine but fails with TypeError (unhashable key).
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return value

In [20]:
# Read the three pre-split datasets (train / validation / test) from CSV
# files located in the current working directory.
train_df, val_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'validation.csv', 'test.csv')
)

In [22]:
# Labels are taken as-is; every cell of the 'vector' column is parsed with
# literal_eval to turn its text form back into a Python object.
y_train = train_df["label"]
X_train = train_df["vector"].apply(literal_eval)

In [23]:
# Same extraction as for the training split, applied to the validation frame.
y_val = val_df["label"]
X_val = val_df["vector"].apply(literal_eval)

In [24]:
# Same extraction as for the training split, applied to the test frame.
y_test = test_df["label"]
X_test = test_df["vector"].apply(literal_eval)

In [34]:
# Stack the parsed per-row vectors vertically into a single NumPy array.
X_train_array = np.vstack(X_train.tolist())

In [46]:
# Stack the parsed test vectors vertically into a single NumPy array.
X_test_array = np.vstack(X_test.tolist())

In [39]:
# Stack the parsed validation vectors vertically into a single NumPy array.
X_val_array = np.vstack(X_val.tolist())

# Naive Bayes:

In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [36]:
# Multinomial Naive Bayes with default settings, trained on the training split.
# The fit call is left as the last expression so the estimator is displayed.
classifier = MultinomialNB()
classifier.fit(X=X_train_array, y=y_train)

MultinomialNB()

In [41]:
# Naive Bayes predictions for the validation split.
val_predictions = classifier.predict(X=X_val_array)

In [42]:
# Compare the current `val_predictions` against the validation labels:
# overall accuracy plus the per-class precision/recall/F1 text report.
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
val_classification_rep = classification_report(
    y_true=y_val,
    y_pred=val_predictions,
)

print(f"Validation Accuracy: {val_accuracy}")
print("Validation Classification Report:\n", val_classification_rep)

Validation Accuracy: 0.9860302677532014
Validation Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      0.99      0.99       645
         1.0       0.98      0.96      0.97       214

    accuracy                           0.99       859
   macro avg       0.98      0.98      0.98       859
weighted avg       0.99      0.99      0.99       859



In [47]:
# Naive Bayes predictions for the held-out test split.
test_predictions = classifier.predict(X=X_test_array)

In [48]:
# Compare the current `test_predictions` against the test labels:
# overall accuracy plus the per-class precision/recall/F1 text report.
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
test_classification_rep = classification_report(
    y_true=y_test,
    y_pred=test_predictions,
)

print(f"\nTest Accuracy: {test_accuracy}")
print("Test Classification Report:\n", test_classification_rep)


Test Accuracy: 0.9883720930232558
Test Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      0.99      0.99       633
         1.0       0.97      0.98      0.98       227

    accuracy                           0.99       860
   macro avg       0.98      0.99      0.99       860
weighted avg       0.99      0.99      0.99       860



# SVM

In [49]:
from sklearn.svm import LinearSVC


In [50]:
# Linear SVM trained on the training split. max_iter is raised to 10000 to
# give the solver more iterations than the library default.
# random_state is pinned (same seed as the Random Forest cells) so that a
# Restart & Run All reproduces the same model: LinearSVC's solver can shuffle
# the data using a pseudo-random generator.
classifier_2 = LinearSVC(max_iter=10000, random_state=42)
classifier_2.fit(X_train_array, y_train)


LinearSVC(max_iter=10000)

In [52]:
# Linear SVM predictions for the validation split.
# NOTE: this rebinds `val_predictions`, replacing the Naive Bayes predictions.
val_predictions = classifier_2.predict(X=X_val_array)


In [53]:
# Score whatever is currently bound to `val_predictions` against the
# validation labels: overall accuracy and the per-class text report.
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
val_classification_rep = classification_report(
    y_true=y_val,
    y_pred=val_predictions,
)

print(f"Validation Accuracy: {val_accuracy}")
print("Validation Classification Report:\n", val_classification_rep)


Validation Accuracy: 0.9837019790454016
Validation Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       645
         1.0       0.99      0.95      0.97       214

    accuracy                           0.98       859
   macro avg       0.98      0.97      0.98       859
weighted avg       0.98      0.98      0.98       859



In [54]:
# Linear SVM predictions for the held-out test split.
test_predictions = classifier_2.predict(X=X_test_array)

# Score those predictions: overall accuracy and the per-class text report.
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
test_classification_rep = classification_report(
    y_true=y_test,
    y_pred=test_predictions,
)

print(f"\nTest Accuracy: {test_accuracy}")
print("Test Classification Report:\n", test_classification_rep)


Test Accuracy: 0.9872093023255814
Test Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      0.99      0.99       633
         1.0       0.97      0.98      0.98       227

    accuracy                           0.99       860
   macro avg       0.98      0.98      0.98       860
weighted avg       0.99      0.99      0.99       860



# Random Forest

In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [56]:
# Baseline Random Forest: 100 trees, fixed seed for repeatable results.
# The fit call is left as the last expression so the estimator is displayed.
classifier_3 = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
classifier_3.fit(X=X_train_array, y=y_train)

RandomForestClassifier(random_state=42)

In [57]:
# Baseline Random Forest predictions for the validation split.
val_predictions_3 = classifier_3.predict(X=X_val_array)


In [58]:
# Score the baseline Random Forest on the validation labels:
# overall accuracy and the per-class text report.
val_accuracy_3 = accuracy_score(y_true=y_val, y_pred=val_predictions_3)
val_classification_rep_3 = classification_report(
    y_true=y_val,
    y_pred=val_predictions_3,
)

print(f"Validation Accuracy: {val_accuracy_3}")
print("Validation Classification Report:\n", val_classification_rep_3)

Validation Accuracy: 0.9767171129220024
Validation Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      1.00      0.98       645
         1.0       0.99      0.91      0.95       214

    accuracy                           0.98       859
   macro avg       0.98      0.95      0.97       859
weighted avg       0.98      0.98      0.98       859



In [64]:
# Candidate Random Forest settings: 2 x 3 x 2 = 12 combinations.
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [5, 10],
}

# Exhaustive search over the grid with 3-fold cross-validation on the
# training split; verbose=2 prints one line per completed fit.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    verbose=2,
)
grid_search.fit(X_train_array, y_train)

# Report the winning parameter combination.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END max_depth=None, min_samples_split=5, n_estimators=100; total time=  16.3s
[CV] END max_depth=None, min_samples_split=5, n_estimators=100; total time=  13.4s
[CV] END max_depth=None, min_samples_split=5, n_estimators=100; total time=  14.7s
[CV] END max_depth=None, min_samples_split=5, n_estimators=300; total time=  41.0s
[CV] END max_depth=None, min_samples_split=5, n_estimators=300; total time=  40.7s
[CV] END max_depth=None, min_samples_split=5, n_estimators=300; total time=  40.8s
[CV] END max_depth=None, min_samples_split=10, n_estimators=100; total time=  13.8s
[CV] END max_depth=None, min_samples_split=10, n_estimators=100; total time=  13.8s
[CV] END max_depth=None, min_samples_split=10, n_estimators=100; total time=  14.0s
[CV] END max_depth=None, min_samples_split=10, n_estimators=300; total time=  42.7s
[CV] END max_depth=None, min_samples_split=10, n_estimators=300; total time=  41.3s
[CV] END max_depth=No

In [65]:
# grid_search was built without a `refit` argument, so GridSearchCV's default
# (refit=True) applies: best_estimator_ has already been retrained on the full
# data passed to grid_search.fit. Calling .fit on it again would only repeat
# that training, so the estimator is used directly.
best_classifier = grid_search.best_estimator_
# Bare expression so the cell still displays the selected estimator.
best_classifier

RandomForestClassifier(min_samples_split=5, n_estimators=300, random_state=42)

In [66]:
# Tuned Random Forest: predict on the validation split, then score the
# predictions (overall accuracy and the per-class text report).
val_predictions_4 = best_classifier.predict(X=X_val_array)
val_accuracy_4 = accuracy_score(y_true=y_val, y_pred=val_predictions_4)
val_classification_rep_4 = classification_report(
    y_true=y_val,
    y_pred=val_predictions_4,
)

print(f"Validation Accuracy: {val_accuracy_4}")
print("Validation Classification Report:\n", val_classification_rep_4)

Validation Accuracy: 0.9767171129220024
Validation Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      1.00      0.98       645
         1.0       0.99      0.91      0.95       214

    accuracy                           0.98       859
   macro avg       0.98      0.95      0.97       859
weighted avg       0.98      0.98      0.98       859



In [67]:
# Tuned Random Forest predictions for the held-out test split.
test_predictions = best_classifier.predict(X=X_test_array)

# Score those predictions: overall accuracy and the per-class text report.
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
test_classification_rep = classification_report(
    y_true=y_test,
    y_pred=test_predictions,
)

print(f"\nTest Accuracy: {test_accuracy}")
print("Test Classification Report:\n", test_classification_rep)


Test Accuracy: 0.9755813953488373
Test Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      1.00      0.98       633
         1.0       1.00      0.91      0.95       227

    accuracy                           0.98       860
   macro avg       0.98      0.95      0.97       860
weighted avg       0.98      0.98      0.98       860



# Best model:

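The best model is Naive Bayes (MultinomialNB): it has the highest validation accuracy (98.60%) and the highest test accuracy (98.84%), ahead of LinearSVC (98.37% validation, 98.72% test) and the tuned Random Forest (97.67% validation, 97.56% test).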