In [2]:
import wandb
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import xgboost as xgb 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import average_precision_score, f1_score, roc_auc_score
from sklearn.svm import LinearSVC


In [3]:
df_short = pd.read_csv("data/processed/processed_short.csv")
df_medium = pd.read_csv("data/processed/processed_medium.csv")
df_dank = pd.read_csv("data/processed/processed_dank.csv")

In [4]:
tfidf = TfidfVectorizer()

In [48]:
def model_trial(df):
    name =[x for x in globals() if globals()[x] is df][0]
    #creating the desired vectors
    text = df['text']
    y = df["label"]
    text_train, text_test, y_train, y_test = train_test_split(
    text, y, test_size=0.33, random_state=42)
    X_train = tfidf.fit_transform(text_train.astype('U').values)
    X_test = tfidf.transform(text_test.astype('U').values)
    D_train = xgb.DMatrix(X_train, label=y_train)
    D_test = xgb. DMatrix(X_test, label=y_test)

    #svm
    svm = LinearSVC()
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    print('--------------------------------------------------')
    print("Average presicion of SVM on " + str(name) + ": {:.2f}".format(average_precision_score(y_test,y_pred)))
    print("ROC-Auc of SVM on " + str(name) + ": {:.2f}".format(roc_auc_score(y_test,y_pred)))
    print("F1 score of SVM on " + str(name) + ": {:.2f}".format(f1_score(y_test,y_pred)))

    #xgboost
    booster = xgb.XGBClassifier()

    param = {
    'eta': 0.4, 
    'max_depth': 6,  
    'objective': 'binary:hinge'} 
    steps = 10

    model = xgb.train(param, D_train, steps)

    y_pred = model.predict(D_test)
    print('--------------------------------------------------')
    print("Average presicion of XBG on " + str(name) + ": {:.2f}".format(average_precision_score(y_test,y_pred)))
    print("ROC-Auc of XGB on " + str(name) + ": {:.2f}".format(roc_auc_score(y_test,y_pred)))
    print("F1 score of XGB on " + str(name) + ": {:.2f}".format(f1_score(y_test,y_pred)))

In [49]:
model_trial(df_short)


--------------------------------------------------
Average presicion of SVM on df_short: 0.89
ROC-Auc of SVM on df_short: 0.94
F1 score of SVM on df_short: 0.93
--------------------------------------------------
Average presicion of XBG on df_short: 0.70
ROC-Auc of XGB on df_short: 0.89
F1 score of XGB on df_short: 0.82


In [50]:
model_trial(df_medium)


--------------------------------------------------
Average presicion of SVM on df_medium: 0.81
ROC-Auc of SVM on df_medium: 0.86
F1 score of SVM on df_medium: 0.86
--------------------------------------------------
Average presicion of XBG on df_medium: 0.65
ROC-Auc of XGB on df_medium: 0.71
F1 score of XGB on df_medium: 0.75


In [51]:
model_trial(df_dank)



--------------------------------------------------
Average presicion of SVM on df_dank: 0.54
ROC-Auc of SVM on df_dank: 0.55
F1 score of SVM on df_dank: 0.56
--------------------------------------------------
Average presicion of XBG on df_dank: 0.53
ROC-Auc of XGB on df_dank: 0.53
F1 score of XGB on df_dank: 0.68
