In [2]:
import wandb
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import xgboost as xgb 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import average_precision_score, f1_score, roc_auc_score
from sklearn.svm import LinearSVC


In [3]:
df_short = pd.read_csv("data/processed/processed_short.csv")
df_medium = pd.read_csv("data/processed/processed_medium.csv")
df_long = pd.read_csv("data/processed/processed_long.csv")

In [21]:
tfidf = TfidfVectorizer()

In [5]:
def model_trial(df):
    name =[x for x in globals() if globals()[x] is df][0]
    #creating the desired vectors
    text_vec = tfidf.fit_transform(df["text"])
    y = df["label"]
    X_train, X_test, y_train, y_test = train_test_split(
    text_vec, y, test_size=0.33, random_state=42)
    #svm
    svm = LinearSVC()
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    print('--------------------------------------------------')
    print("Average presicion of SVM on " + str(name) + ": {:.2f}".format(average_precision_score(y_test,y_pred)))
    print("ROC-Auc of SVM on " + str(name) + ": {:.2f}".format(roc_auc_score(y_test,y_pred)))
    print("F1 score of SVM on " + str(name) + ": {:.2f}".format(f1_score(y_test,y_pred)))

    #xgboost
    booster = xgb.XGBClassifier()
    booster.fit(X_train, y_train)
    y_pred = booster.predict(X_test)
    print('--------------------------------------------------')
    print("Average presicion of XBG on " + str(name) + ": {:.2f}".format(average_precision_score(y_test,y_pred)))
    print("ROC-Auc of XGB on " + str(name) + ": {:.2f}".format(roc_auc_score(y_test,y_pred)))
    print("F1 score of XGB on " + str(name) + ": {:.2f}".format(f1_score(y_test,y_pred)))

In [18]:
model_trial(df_short)


--------------------------------------------------
Average presicion of SVM on df_short: 0.86
ROC-Auc of SVM on df_short: 0.92
F1 score of SVM on df_short: 0.91
--------------------------------------------------
Average presicion of XBG on df_short: 0.84
ROC-Auc of XGB on df_short: 0.93
F1 score of XGB on df_short: 0.90


In [19]:
model_trial(df_medium)


--------------------------------------------------
Average presicion of SVM on df_medium: 0.81
ROC-Auc of SVM on df_medium: 0.86
F1 score of SVM on df_medium: 0.86
--------------------------------------------------
Average presicion of XBG on df_medium: 0.78
ROC-Auc of XGB on df_medium: 0.82
F1 score of XGB on df_medium: 0.83


In [20]:
model_trial(df_long)


--------------------------------------------------
Average presicion of SVM on df_long: 1.00
ROC-Auc of SVM on df_long: 1.00
F1 score of SVM on df_long: 1.00
--------------------------------------------------
Average presicion of XBG on df_long: 0.99
ROC-Auc of XGB on df_long: 0.99
F1 score of XGB on df_long: 0.99
