In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score
import time

In [6]:
# Load data
dh = pd.read_csv("clear_lemmatize_expand_texts.csv")
dh = dh[dh['text'].notna()]

# Preprocess targets
dh['target'] = dh['target'].astype(str)
target_mapping = {'0': 0, '4': 1}
dh['target'] = dh['target'].replace(target_mapping)

# Split text into words
dh['text'] = dh['text'].apply(lambda x: x.split())

# Create Word2Vec model
w2v_model = Word2Vec(dh['text'], vector_size=500, window=3, min_count=1, workers=6)

# Function to convert a list of words into a Word2Vec vector
def w2v_vectorizer(text):
    return np.mean([w2v_model.wv[word] for word in text if word in w2v_model.wv], axis=0)

# Apply vectorizer to the text data
X_train, X_test, y_train, y_test = train_test_split(dh['text'], dh['target'], test_size=0.991, random_state=42)

X_train_w2v = np.array([w2v_vectorizer(text) for text in X_train])
X_test_w2v = np.array([w2v_vectorizer(text) for text in X_test])

# Find the longest sentence in the dataset
longest_sentence = max(dh['text'], key=len)
longest_sentence_w2v = w2v_vectorizer(longest_sentence).reshape(1, -1)

# Define classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=2024),
    'Extra Trees': ExtraTreesClassifier(),
    'XGBoost': XGBClassifier(eval_metric=["error"])
}

  dh['target'] = dh['target'].replace(target_mapping)


In [7]:
results = pd.DataFrame(columns=['Model', 'Test Accuracy', 'Std', 'Mean Inference Time', 'Inference Time Std'])

for clf_name, classifier in classifiers.items():
    # Train the classifier
    t1 = time.time()
    classifier.fit(X_train_w2v, y_train)
    train_accuracy = classifier.score(X_train_w2v, y_train)
    print(f"Training accuracy for {clf_name} : {train_accuracy}, time: {time.time() - t1}")

    # Test accuracy
    t1 = time.time()
    test_accuracy = accuracy_score(y_test, classifier.predict(X_test_w2v))
    print(f"test eval time: {time.time() - t1}")

    # Measure inference time for the longest sentence
    inference_times = []
    for _ in range(10000):
        t1 = time.time()
        classifier.predict(longest_sentence_w2v)
        inference_times.append(time.time() - t1)

    mean_inference_time = np.mean(inference_times)
    std_inference_time = np.std(inference_times)
    
    result_dict = {
        'Model': clf_name,
        'Test Accuracy': test_accuracy,
        'Std': 0,  # Placeholder, replace if you have a standard deviation for test accuracy
        'Mean Inference Time': mean_inference_time,
        'Inference Time Std': std_inference_time
    }
    results = pd.concat([results, pd.DataFrame([result_dict])], ignore_index=True)

print(results)

Training accuracy for Logistic Regression : 0.7502783189535207, time: 0.6957638263702393
test eval time: 1.9268624782562256


  results = pd.concat([results, pd.DataFrame([result_dict])], ignore_index=True)


Training accuracy for Random Forest : 0.9998608405232396, time: 28.207966327667236
test eval time: 30.68875241279602
Training accuracy for Decision Tree : 0.9998608405232396, time: 8.305766344070435
test eval time: 0.7030081748962402
Training accuracy for Extra Trees : 0.9998608405232396, time: 5.352341651916504
test eval time: 35.556556224823
Training accuracy for XGBoost : 0.999234622877818, time: 3.507412910461426
test eval time: 1.2194581031799316
                 Model  Test Accuracy Std  Mean Inference Time  \
0  Logistic Regression       0.738771   0             0.000041   
1        Random Forest       0.694244   0             0.001898   
2        Decision Tree       0.588495   0             0.000048   
3          Extra Trees       0.690709   0             0.001922   
4              XGBoost       0.709614   0             0.000393   

   Inference Time Std  
0            0.000201  
1            0.001038  
2            0.000220  
3            0.000732  
4            0.000562  
