In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.corpus import stopwords
import tensorflow as tf

In [3]:
aita_raw = pd.read_csv("data/processed/aita_xy_cleaned_full.csv", index_col=0)

def pp_regularize_text(df, body_or_title):
    stop_words = []
    for word in list(stopwords.words('english')):
        word = re.sub(r'[^A-Za-z ]+', '', word)
        stop_words.append(word)

    text_list = []
    for text in df[body_or_title]:
        text = text.lower()
        text = text.replace('\n', ' ').replace('\r', '').replace('’', '\'')
        text = re.sub(r'[^A-Za-z ]+', '', text)
        goodtext = text.encode('cp1252', 'ignore')
        goodtext = goodtext.decode('utf-8', 'ignore')

        token = nltk.word_tokenize(goodtext)
        filtered = [y for y in token if not y in stop_words]
        final_text = (" ").join(filtered)

        text_list.append(final_text)
    df[body_or_title] = text_list
    return df

aita_raw = pp_regularize_text(aita_raw, "body")
aita_raw = pp_regularize_text(aita_raw, "title")
print(aita_raw.shape)
aita_raw.head()

(20800, 4)


Unnamed: 0,target,binary_target,title,body
0,NTA,0,aita renting house telling neighbors go landlo...,house rent neighbors complaints property maint...
1,NTA,0,aita boyfriend let get breast reduction,tldr large breasts caused nothing back pain bo...
2,NTA,0,aita parents took wardrobe away punishment sai...,got trouble school fall im junior high school ...
3,YTA,1,aita going week long vacation without wife yea...,ill try make short im wife f married years met...
4,NTA,0,wibta warned current coworkers new problematic...,im senior broadcast producer manager announced...


In [4]:
def tokenize(df, body_or_title):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(df[body_or_title])
    return tokenizer

def convert_to_matrix(df, tokenizer, body_or_title):
    matrix = tokenizer.texts_to_matrix(df[body_or_title], mode="tfidf")
    final_df = pd.DataFrame(matrix)
    return final_df

In [5]:
tokenizer = tokenize(aita_raw, "body")
x_body = convert_to_matrix(aita_raw, tokenizer, "body")
y_binary = aita_raw[["binary_target"]]
print(y_binary.shape)
print(x_body.shape)

(20800, 1)
(20800, 10000)


In [6]:
def f1_and_confusion_matrix_scratch(y_test, y_pred):
    y_test = list(y_test["binary_target"])
    tn = tp = fn = fp = 0
    for num in range(len(y_test)):
        if y_test[num] == 1:
            if y_test[num] == y_pred[num]:
                tp += 1
            else:
                fn += 1
        else:
            if y_test[num] == y_pred[num]:
                tn += 1
            else:
                fp += 1
    precision = tp/(tp+fp)
    recall = tp/(tp+tn)
    try:
        f1 = 2*precision*recall/(precision+recall)
    except:
        f1 = 0
    prec_recl_f1 = [precision, recall, f1]
    conf_matrix = [[tn,fp], [fn,tp]]
    return conf_matrix, prec_recl_f1

In [7]:
X_train, X_test, y_train, y_test = train_test_split(x_body, y_binary, test_size=0.30, random_state=1)

In [8]:
clf = LogisticRegression(random_state=1, class_weight="balanced")
fit = clf.fit(X_train, y_train)
y_pred = fit.predict(X_test)

cm, f1 = f1_and_confusion_matrix_scratch(y_test, y_pred)
print(cm)
print(
    "F1:  ", round(f1[2], 3),
    "\nPrec:", round(f1[0], 3),
    "\nRecl:", round(f1[1], 3)
)

[[3439, 1243], [1056, 502]]
F1:   0.177 
Prec: 0.288 
Recl: 0.127


In [9]:
clf = RandomForestClassifier(random_state=1, class_weight="balanced")
fit = clf.fit(X_train, y_train)
y_pred = fit.predict(X_test)

cm, f1 = f1_and_confusion_matrix_scratch(y_test, y_pred)
print(cm)
print(
    "F1:  ", round(f1[2], 3),
    "\nPrec:", round(f1[0], 3),
    "\nRecl:", round(f1[1], 3)
)

[[4674, 8], [1553, 5]]
F1:   0.002 
Prec: 0.385 
Recl: 0.001


In [10]:
def undersample_majority(x, y, majority):
    y_min = y[y["binary_target"] != majority]
    y_maj = y[y["binary_target"] == majority]
    y_maj = y_maj.sample(n=len(y_min))
    y = pd.concat([y_min, y_maj])

    merged = pd.merge(y, x, how="inner", left_index=True, right_index=True)
    x = merged.drop(columns=["binary_target"])
    y = merged[["binary_target"]]
    return x, y

x_undersample, y_undersample = undersample_majority(x_body, y_binary, 0)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(x_undersample, y_undersample, test_size=0.30, random_state=1)

In [12]:
clf = LogisticRegression(random_state=1, class_weight=None)
fit = clf.fit(X_train, y_train)
y_pred = fit.predict(X_test)

cm, f1 = f1_and_confusion_matrix_scratch(y_test, y_pred)
print(cm)
print(
    "F1:  ", round(f1[2], 3),
    "\nPrec:", round(f1[0], 3),
    "\nRecl:", round(f1[1], 3)
)

[[861, 655], [706, 847]]
F1:   0.528 
Prec: 0.564 
Recl: 0.496


In [13]:
clf = RandomForestClassifier(random_state=1, class_weight=None)
fit = clf.fit(X_train, y_train)
y_pred = fit.predict(X_test)

cm, f1 = f1_and_confusion_matrix_scratch(y_test, y_pred)
print(cm)
print(
    "F1:  ", round(f1[2], 3),
    "\nPrec:", round(f1[0], 3),
    "\nRecl:", round(f1[1], 3)
)

[[974, 542], [764, 789]]
F1:   0.51 
Prec: 0.593 
Recl: 0.448
