In [2]:
import wandb
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import xgboost as xgb 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import average_precision_score, f1_score, roc_auc_score
from sklearn.svm import LinearSVC


In [3]:
df_short = pd.read_csv("data/processed/processed_short.csv")
df_medium = pd.read_csv("data/processed/processed_medium.csv")
df_dank = pd.read_csv("data/processed/processed_dank.csv")

In [4]:
tfidf = TfidfVectorizer()

In [17]:
def model_trial(df):
    name =[x for x in globals() if globals()[x] is df][0]
    #creating the desired vectors
    text = df['text']
    y = df["label"]
    text_train, text_test, y_train, y_test = train_test_split(
    text, y, test_size=0.33, random_state=42)
    X_train = tfidf.fit_transform(text_train.astype('U').values)
    X_test = tfidf.transform(text_test.astype('U').values)
    #svm
    svm = LinearSVC()
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    print('--------------------------------------------------')
    print("Average presicion of SVM on " + str(name) + ": {:.2f}".format(average_precision_score(y_test,y_pred)))
    print("ROC-Auc of SVM on " + str(name) + ": {:.2f}".format(roc_auc_score(y_test,y_pred)))
    print("F1 score of SVM on " + str(name) + ": {:.2f}".format(f1_score(y_test,y_pred)))

    #xgboost
    booster = xgb.XGBClassifier()
    booster.fit(X_train, y_train)
    y_pred = booster.predict(X_test)
    print('--------------------------------------------------')
    print("Average presicion of XBG on " + str(name) + ": {:.2f}".format(average_precision_score(y_test,y_pred)))
    print("ROC-Auc of XGB on " + str(name) + ": {:.2f}".format(roc_auc_score(y_test,y_pred)))
    print("F1 score of XGB on " + str(name) + ": {:.2f}".format(f1_score(y_test,y_pred)))

In [18]:
model_trial(df_short)


--------------------------------------------------
Average presicion of SVM on df_short: 0.94
ROC-Auc of SVM on df_short: 0.97
F1 score of SVM on df_short: 0.96
--------------------------------------------------
Average presicion of XBG on df_short: 0.84
ROC-Auc of XGB on df_short: 0.92
F1 score of XGB on df_short: 0.91


In [19]:
model_trial(df_medium)


--------------------------------------------------
Average presicion of SVM on df_medium: 0.81
ROC-Auc of SVM on df_medium: 0.86
F1 score of SVM on df_medium: 0.86
--------------------------------------------------
Average presicion of XBG on df_medium: 0.77
ROC-Auc of XGB on df_medium: 0.82
F1 score of XGB on df_medium: 0.82


In [20]:
model_trial(df_dank)


--------------------------------------------------
Average presicion of SVM on df_dank: 0.54
ROC-Auc of SVM on df_dank: 0.55
F1 score of SVM on df_dank: 0.56
--------------------------------------------------
Average presicion of XBG on df_dank: 0.55
ROC-Auc of XGB on df_dank: 0.57
F1 score of XGB on df_dank: 0.53
