In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score
import time

In [5]:
# Load the pre-processed corpus and force the text column to plain strings.
dh = pd.read_csv("./dataset/clear_lemmatize_expand_texts.csv")
dh['text'] = dh['text'].astype(str)

# Character length of every text; `index` is the label of the first row that
# reaches the maximum length.
dh['len'] = dh['text'].apply(len)
is_longest = dh['len'] == dh['len'].max()
index = dh[is_longest].index[0]

# Replace each text by its whitespace-separated tokens.
dh['text'] = dh['text'].apply(str.split)

# Train Word2Vec on the tokenised texts.
w2v_model = Word2Vec(
    dh['text'],
    vector_size=500,
    window=3,
    min_count=1,
    workers=6,
)

# Convert a list of words into a single averaged Word2Vec vector
def w2v_vectorizer(text, model=None):
    """Average the Word2Vec vectors of the in-vocabulary words of ``text``.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.
    model : object, optional
        Any object exposing its word vectors as ``model.wv`` (supporting
        ``word in wv``, ``wv[word]`` and ``wv.vector_size``). When omitted,
        the notebook-level ``w2v_model`` is looked up at call time.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in ``model.wv``.
        If no token is found (empty text or only unknown words) a zero vector
        of length ``wv.vector_size`` is returned.
    """
    keyed_vectors = (w2v_model if model is None else model).wv
    known_vectors = [keyed_vectors[word] for word in text if word in keyed_vectors]
    if not known_vectors:
        # np.mean([], axis=0) would return a scalar nan (with a RuntimeWarning);
        # stacking that next to real vectors yields a ragged array, so fall
        # back to a fixed-size zero vector (float32, gensim's vector dtype).
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(known_vectors, axis=0)

# Split the tokenised texts and targets; test_size=0.991 leaves 0.9% of the
# rows for fitting and holds out the rest.
X_train, X_test, y_train, y_test = train_test_split(
    dh['text'],
    dh['target'],
    test_size=0.991,
    random_state=42,
)

# Apply the vectorizer to every document of each split and stack the results.
X_train_w2v = np.array(list(map(w2v_vectorizer, X_train)))
X_test_w2v = np.array(list(map(w2v_vectorizer, X_test)))

# Registry of the models under comparison, keyed by display name. Insertion
# order is the order in which they are later fitted and benchmarked.
classifiers = dict([
    ('SVM', SVC(random_state=2024)),
    ('Logistic Regression', LogisticRegression(random_state=2024)),
    ('Random Forest', RandomForestClassifier(random_state=2024)),
    ('Decision Tree', DecisionTreeClassifier(random_state=2024)),
    ('Extra Trees', ExtraTreesClassifier(random_state=2024)),
    ('XGBoost', XGBClassifier(random_state=2024)),
])

In [19]:
# Accuracy scoring is optional and off by default, so by default this cell
# only fits the models. Note that the hold-out split is 99.1% of the data
# (see the train_test_split call), so enabling it scores on most of the corpus.
EVALUATE_MODELS = False

results = pd.DataFrame(columns=['Model', 'Test Accuracy', 'Std'])

# The models are fitted on targets integer-divided by 4, so any scoring has to
# compare against labels converted the same way. Compute both once up front.
y_train_scaled = y_train // 4
y_test_scaled = y_test // 4

result_rows = []
for clf_name, classifier in classifiers.items():
    t1 = time.time()
    classifier.fit(X_train_w2v, y_train_scaled)
    if not EVALUATE_MODELS:
        continue

    train_accuracy = classifier.score(X_train_w2v, y_train_scaled)
    print(f"Training accuracy for {clf_name} : {train_accuracy}, time: {time.time() - t1}")

    t1 = time.time()
    test_accuracy = accuracy_score(y_test_scaled, classifier.predict(X_test_w2v))
    print(f"test eval time: {time.time() - t1}")
    result_rows.append({'Model': clf_name, 'Test Accuracy': test_accuracy, 'Std': 0})

# Build the summary frame once from the collected rows instead of
# concatenating inside the loop; left empty when evaluation is disabled.
if result_rows:
    results = pd.DataFrame(result_rows, columns=['Model', 'Test Accuracy', 'Std'])

In [11]:
def inference_test(cls, x=dh['text'][index], times=10000):
    """Measure the mean latency of vectorising and predicting one document.

    Parameters
    ----------
    cls : estimator
        Classifier instance exposing ``predict`` (an instance, despite the
        parameter name).
    x : list of str
        Tokens of the document to predict. The default is evaluated once,
        when this function is defined: the ``dh['text']`` entry at ``index``.
    times : int
        Number of timed repetitions.

    Returns
    -------
    numpy.float64
        Mean duration in seconds of one ``w2v_vectorizer`` + ``predict`` call.
        The same value is printed, after a header line with the classifier
        class name and the vectorizer name.
    """
    print(cls.__class__.__name__, w2v_vectorizer.__name__)
    durations = []
    for _ in range(times):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which matters for calls that take only a few milliseconds.
        started = time.perf_counter()
        cls.predict([w2v_vectorizer(x)])
        durations.append(time.perf_counter() - started)
    mean_latency = np.mean(durations)
    print(mean_latency)
    return mean_latency

In [20]:
# Benchmark single-document prediction latency for every registered classifier.
for benchmarked_clf in classifiers.values():
    inference_test(benchmarked_clf)

SVC w2v_vectorizer
0.003599748396873474
