In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import xgboost as xgb 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import precision_score, f1_score, roc_auc_score, accuracy_score, recall_score
from sklearn.svm import LinearSVC


In [3]:
# Load the three preprocessed datasets from disk.
df_short = pd.read_csv("data/processed/processed_short.csv")
df_medium = pd.read_csv("data/processed/processed_medium.csv")
df_dank = pd.read_csv("data/processed/processed_dank.csv")

# Stack the three frames into one with a fresh 0..n-1 index.
df_joint = pd.concat([df_short, df_medium, df_dank], ignore_index=True)
# Shuffle every row. The fixed seed keeps the shuffled order identical across
# kernel restarts so downstream results are reproducible.
df_train = df_joint.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
# Shared TF-IDF vectorizer with default settings. model_trial() re-fits this
# same object on the training split of every call, so after a call it holds
# the vocabulary of the most recently evaluated dataset.
tfidf = TfidfVectorizer()

In [20]:
def _classification_metrics(y_true, y_pred):
    """Return accuracy, precision, recall and ROC AUC for predicted labels.

    NOTE(review): ``roc`` is computed from hard predicted labels rather than
    continuous decision scores, so it describes a single operating point and
    not a full ROC curve. Kept as-is so the numbers stay comparable with the
    results already recorded in this notebook.
    """
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        "roc": roc_auc_score(y_true, y_pred),
    }


def model_trial(df: pd.DataFrame) -> None:
    """Train a linear SVM and an XGBoost model on TF-IDF features and print test metrics.

    The ``text`` column is vectorised with the module-level ``tfidf``
    vectorizer (re-fitted on the training split on every call) and ``label``
    is used as the target. Both models share one 67/33 train/test split made
    with ``random_state=42``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column and a binary ``label`` column
        (presumably encoded as 0/1, as the ``binary:hinge`` objective and the
        default binary precision/recall require -- confirm against the data).

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Build the TF-IDF feature matrices. The vectorizer is fitted on the
    # training texts only and then applied to the held-out texts.
    text = df['text']
    y = df["label"]
    text_train, text_test, y_train, y_test = train_test_split(
        text, y, test_size=0.33, random_state=42)
    # astype('U') casts every entry to a unicode string, so missing values
    # become literal strings instead of making the vectorizer fail.
    X_train = tfidf.fit_transform(text_train.astype('U').values)
    X_test = tfidf.transform(text_test.astype('U').values)

    # --- linear SVM ---
    svm = LinearSVC()
    svm.fit(X_train, y_train)
    metrics = _classification_metrics(y_test, svm.predict(X_test))
    print('------------------SVM-----------------------------')
    print(metrics)
    print('--------------------------------------------------')

    # --- XGBoost ---
    D_train = xgb.DMatrix(X_train, label=y_train)
    D_test = xgb.DMatrix(X_test, label=y_test)
    param = {
        'eta': 0.3,
        'max_depth': 6,
        # binary:hinge makes predict() emit hard 0/1 labels, not probabilities.
        'objective': 'binary:hinge',
    }
    steps = 15  # number of boosting rounds
    model = xgb.train(param, D_train, num_boost_round=steps)

    metrics = _classification_metrics(y_test, model.predict(D_test))
    print('------------------XGB-----------------------------')
    print(metrics)


In [22]:
# Evaluate both models on the short dataset (processed_short.csv).
model_trial(df_short)

------------------SVM-----------------------------
{'accuracy': 0.9825802939575394, 'precision': 0.9821428571428571, 'recall': 0.8870967741935484, 'roc': 0.9422897338557421}
--------------------------------------------------
------------------XGB-----------------------------
{'accuracy': 0.954817637452368, 'precision': 0.8634361233480177, 'recall': 0.7903225806451613, 'roc': 0.885406727704582}


In [23]:
# Evaluate both models on the medium dataset (processed_medium.csv).
model_trial(df_medium)

------------------SVM-----------------------------
{'accuracy': 0.8563636363636363, 'precision': 0.8593200468933178, 'recall': 0.8623529411764705, 'roc': 0.8561764705882353}
--------------------------------------------------
------------------XGB-----------------------------
{'accuracy': 0.7266666666666667, 'precision': 0.6950146627565983, 'recall': 0.8364705882352941, 'roc': 0.723235294117647}


In [24]:
# Evaluate both models on the dank dataset (processed_dank.csv).
model_trial(df_dank)

------------------SVM-----------------------------
{'accuracy': 0.5533333333333333, 'precision': 0.5681544028950543, 'recall': 0.5541176470588235, 'roc': 0.5533088235294118}
--------------------------------------------------
------------------XGB-----------------------------
{'accuracy': 0.5509090909090909, 'precision': 0.5367003367003367, 'recall': 0.9376470588235294, 'roc': 0.5388235294117647}
