In [93]:
from sklearnex import patch_sklearn # intel device sklearn boosting
patch_sklearn()
import util
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from torch.cuda import is_available
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier # it take too much time to evaluation
from sklearn.svm import LinearSVC # if training dataset is too large, we will not train in default SVC
from sklearn.svm import SVC # only train in small training dataset
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from pycontractions import Contractions
from nltk.corpus import stopwords
import time
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
from optuna.visualization import plot_param_importances
from optuna.importance import MeanDecreaseImpurityImportanceEvaluator
from optuna.visualization import plot_optimization_history

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [None]:
dataset = util.load_dataset()
cont = Contractions(api_key="glove-twitter-100")
cont.load_models()
dataset['text'] = dataset['text'].apply(util.clear_text)
dataset['text'] = list(cont.expand_texts(dataset['text']))
dataset['text'] = dataset['text'].apply(util.lemmatize)

In [None]:
def clear_stopwords(text: str):
    return ' '.join(t for t in str(text).split() if t not in stopwords.words('english'))
dataset['text'] = dataset['text'].apply(clear_stopwords)
dataset.to_csv('./dataset/clear_lemmatize_expand_texts.csv', index=False)

In [16]:
dataset = pd.read_csv('./dataset/clear_lemmatize_expand_texts.csv')
dataset['text'] = dataset['text'].astype(str)

In [86]:
cv = CountVectorizer()
cv.fit(dataset['text'])
print(cv.transform([dataset.iloc[-1]['text']]).toarray().shape)

(1, 281408)


In [87]:
tfidf = TfidfVectorizer()
tfidf.fit(dataset['text'])
print(tfidf.transform([dataset.iloc[-1]['text']]).toarray().shape)

(1, 281408)


In [88]:
dataset['len'] = dataset['text'].apply(lambda x: len(x))
index = dataset[dataset['len'] == dataset['len'].max()].index[0]

In [102]:
def inference_test(cls, encoder, x=dataset['text'][index], times=10000):
    print(cls.__class__.__name__, encoder.__class__.__name__)
    a = []
    for _ in range(times):
        t1 = time.time()
        cls.predict(encoder.transform([x]))
        a.append(time.time() - t1)
    print(np.mean(a))
    return np.mean(a)

In [91]:
Y = dataset['target'].to_numpy() // 4

In [109]:
def train(model: BaseEstimator, x, y=Y, batch=None, test_ratio=0.991, verbose=False, acc=True):
    if verbose:
        print(model.__class__.__name__)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_ratio)
    t1 = time.time()
    model.fit(x_train[:batch], y_train[:batch])
    t2 = time.time()
    train_sc = None
    test_sc = None
    if acc:
        train_sc = model.score(x_train[:batch], y_train[:batch])
        test_sc = model.score(x_test[:batch], y_test[:batch])
    if verbose and acc:
        print('training time:', t2 - t1)
        print('train acc:', train_sc)
        print('test acc:', test_sc)
        print('eval time:', time.time() - t2)
    return train_sc, test_sc

In [95]:
rfc = RandomForestClassifier(random_state=2024)
xgb = XGBClassifier(random_state=2024)
dtc = DecisionTreeClassifier(random_state=2024)
lr = LogisticRegression(random_state=2024)
gnb = GaussianNB()
svc = SVC(random_state=2024)
extree = ExtraTreesClassifier(random_state=2024)

In [None]:
X = cv.transform(dataset['text'])
train(rfc, X, verbose=True, acc=False)
train(xgb, X, verbose=True, acc=False)
train(dtc, X, verbose=True, acc=False)
train(lr, X, verbose=True, acc=False)
train(extree, X, verbose=True, acc=False)
# train(gnb, X, verbose=True, acc=False)

In [111]:
inference_test(rfc, cv)
inference_test(xgb, cv)
inference_test(dtc, cv)
inference_test(lr, cv)
inference_test(extree, cv)
# inference_test(gnb, cv)

RandomForestClassifier CountVectorizer
0.11365972511768341
XGBClassifier CountVectorizer
0.05128001439571381
DecisionTreeClassifier CountVectorizer
0.0007828950881958007
LogisticRegression CountVectorizer
0.0002606998682022095
ExtraTreesClassifier CountVectorizer
0.06992638602256775


0.06992638602256775

In [112]:
X = tfidf.transform(dataset['text'])
train(rfc, X, verbose=True, acc=False)
train(xgb, X, verbose=True, acc=False)
train(dtc, X, verbose=True, acc=False)
train(lr, X, verbose=True, acc=False)
train(extree, X, verbose=True, acc=False)
# train(gnb, X, verbose=True, acc=False)

RandomForestClassifier
XGBClassifier
DecisionTreeClassifier
LogisticRegression
ExtraTreesClassifier


(None, None)

In [113]:
inference_test(rfc, tfidf)
inference_test(xgb, tfidf)
inference_test(dtc, tfidf)
inference_test(lr, tfidf)
inference_test(extree, tfidf)
# inference_test(gnb, tfidf)

RandomForestClassifier TfidfVectorizer
0.06837882320880889
XGBClassifier TfidfVectorizer
0.05341551949977875
DecisionTreeClassifier TfidfVectorizer
0.005056393265724182
LogisticRegression TfidfVectorizer
0.004581047964096069
ExtraTreesClassifier TfidfVectorizer
0.07218445415496826


0.07218445415496826