In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re
import joblib

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline, make_pipeline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import confusion_matrix, classification_report, plot_roc_curve, roc_auc_score, mean_squared_error


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

from spacy.lemmatizer import Lemmatizer
import en_core_web_sm
from spacy import vocab

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [46]:
import warnings

# Hide DeprecationWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [47]:
# Load the pre-processed comment dataset (raw text, labels and the four
# processed text columns) and preview the first five rows.
final = pd.read_csv(filepath_or_buffer='datasets/final.csv')
final.head(5)

Unnamed: 0,post_id,comment_text,comment_score,yta,nta,cleaned_text,nltk_lem,nltk_pos,spacy_lem
0,jbswil,Thanks for this. When I was getting gaslit I c...,2,1,0,thanks getting gaslit came couple times got lo...,thanks getting gaslit came couple time got lot...,thanks get gaslit come couple time get lot sta...,thank get gaslit come couple time get lot stay...
1,k4owfz,does the 'no covid posts' rule extend to the c...,38,1,0,covid posts rule extend comments seen posts to...,covid post rule extend comment seen post top c...,covid post rule extend comment see post top co...,covid post rule extend comment see post commen...
2,k4owfz,"Where do mods draw the line as far as the ""acc...",15,1,0,mods draw line far accept judgment rule get tr...,mod draw line far accept judgment rule get tri...,mod draw line far accept judgment rule get tri...,mod draw line far accept judgment rule tricky ...
3,k4owfz,Does anyone ever ask AITA for cutting off fami...,28,1,0,anyone ever ask cutting family results verdict...,anyone ever ask cutting family result verdict ...,anyone ever ask cut family result verdict seem...,ask cut family result verdict like general rig...
4,k4owfz,AITA has reduced to recycling the same comment...,32,1,0,reduced recycling comments get upvotes well wr...,reduced recycling comment get upvotes well wri...,reduce recycle comment get upvotes well write ...,reduce recycling comment upvote write comment ...


In [48]:
# Dimensions of the loaded dataset as (rows, columns).
final.shape

(178015, 9)

In [49]:
# Per-column flag: True when the column contains at least one missing value.
final.isna().any(axis=0)

post_id          False
comment_text     False
comment_score    False
yta              False
nta              False
cleaned_text      True
nltk_lem          True
nltk_pos          True
spacy_lem         True
dtype: bool

Some comments consisted only of 'YTA' or 'NTA'. Once the stop words were filtered out, their processed text fields were left blank, and those blanks were read back in as null values when the CSV was imported. Since these rows carry no value for a language model, I will be dropping them.

In [50]:
# Drop every row that has a missing value in any column (the comments whose
# processed text came back empty).
final = final.dropna(axis=0, how='any')

# Machine Learning Models

In this next section, I generated several combinations of machine learning models using 4 of the different lemmatization formats created in the first part of this project. 

Texts
1. cleaned_text
2. nltk_lem
3. nltk_pos
4. spacy_lem


Vectorizers
1. CountVectorizer
2. TfidfVectorizer 

Sampled
1. Smote 

Classified through Model
1. MultinomialNB
2. LogisticRegression


In [51]:
# Stop-word set handed to the vectorizers: NLTK's English list plus the
# subreddit's verdict acronyms (presumably so the label itself cannot be read
# straight off the comment text).
special_stops = {'YTA', 'yta', 'NTA', 'nta', 'ESH', 'esh', 'NAH', 'nah', 'wibta', 'aita'}
stop_words = set(stopwords.words('english')) | special_stops

In [52]:
# Features: the lightly processed comment text. Target: 1 when the comment
# is labelled YTA, 0 otherwise.
X, y = final['cleaned_text'], final['yta']

In [53]:
# Hold out 33% of the rows for evaluation. Stratifying on the label keeps the
# class ratio the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.33,
    random_state=42,
)

In [54]:
# Baseline accuracy: the share of each class in the training labels. Always
# predicting the majority class would score the larger of the two fractions.
y_train.value_counts(normalize=True)

0    0.771874
1    0.228126
Name: yta, dtype: float64

The baseline accuracy to beat is 77%

## Parameters for Pipeline

In [11]:
# Skeleton imblearn pipeline. Every step is a placeholder; the grid searches
# below substitute concrete vectorizer / sampler / classifier objects through
# their parameter grids.
pl = Pipeline(steps=[(step_name, None) for step_name in ('vectorizer', 'sampler', 'classifier')])

In a vectorizer, max_df tells it to ignore terms that appear in more than the given share of documents, which cuts out words that occur too frequently. Conversely, min_df removes terms that occur too infrequently. With the parameters below, the vectorizer will ignore terms that appear in more than 90% or 95% of the documents, and terms that appear in fewer than 2, 3, or 4 documents.



In [12]:
# Search space for the Multinomial Naive Bayes pipeline:
# 2 vectorizers x 2 max_df x 3 min_df x 3 n-gram ranges = 36 candidates.
nb_param_grid = [
    {
        # Text representation: raw term counts or TF-IDF weights, both with
        # the custom stop-word set removed.
        'vectorizer': [
            CountVectorizer(stop_words=stop_words),
            TfidfVectorizer(stop_words=stop_words),
        ],
        # Vocabulary pruning by document frequency, and the n-gram sizes to extract.
        'vectorizer__max_df': [.95, .90],
        'vectorizer__min_df': [2, 3, 4],
        'vectorizer__ngram_range': [(1, 2), (1, 3), (2, 3)],

        # Oversampling step placed between the vectorizer and the classifier.
        'sampler': [SMOTE(random_state=42)],

        # The only classifier in this grid: multinomial Naive Bayes.
        'classifier': [MultinomialNB()],
    }
]

In [13]:
# Search space for the logistic-regression pipeline. It uses the same
# vectorizer and sampler options as the Naive Bayes grid (36 candidates).
lr_param_grid = [
    {
        # Text representation: raw term counts or TF-IDF weights, both with
        # the custom stop-word set removed.
        'vectorizer': [
            CountVectorizer(stop_words=stop_words),
            TfidfVectorizer(stop_words=stop_words),
        ],
        # Vocabulary pruning by document frequency, and the n-gram sizes to extract.
        'vectorizer__max_df': [.95, .90],
        'vectorizer__min_df': [2, 3, 4],
        'vectorizer__ngram_range': [(1, 2), (1, 3), (2, 3)],

        # Oversampling step placed between the vectorizer and the classifier.
        'sampler': [SMOTE(random_state=42)],

        # The only classifier in this grid: logistic regression with library defaults.
        # NOTE(review): no max_iter / solver is set here; confirm the fits converge.
        'classifier': [LogisticRegression()],
    }
]

## Cleaned text

### Cleaned text NB

In [14]:
# 5-fold grid search over the Naive Bayes grid on the cleaned_text training
# split. Candidates are ranked by accuracy and fitted on all available cores.
nb_cleaned_text = GridSearchCV(
    estimator=pl,
    param_grid=nb_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
nb_cleaned_text.fit(X_train, y_train)
nb_y_pred = nb_cleaned_text.predict(X_test)

# Accuracy of the selected pipeline on each partition.
print('Train Score:', nb_cleaned_text.score(X_train, y_train))
print('Test Score:', nb_cleaned_text.score(X_test, y_test))

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 18.3min finished


Train Score: 0.9519711574824987
Test Score: 0.8457377832377833


In [15]:
# Persist only the winning pipeline so it can be reloaded without re-running the search.
joblib.dump(value=nb_cleaned_text.best_estimator_, filename='models/nb_cleaned_text.pkl')

['models/nb_cleaned_text.pkl']

In [16]:
# Probability of the positive (yta = 1) class for every row of each partition.
y_train_pred = nb_cleaned_text.predict_proba(X_train)[:, 1]
y_test_pred = nb_cleaned_text.predict_proba(X_test)[:, 1]

# ROC AUC with the best parameters; the "Validation" figure is computed on the
# held-out test split.
print(f'Training AUC on best params: {roc_auc_score(y_train, y_train_pred)}')

print(f'Validation AUC on best params: {roc_auc_score(y_test, y_test_pred)}')

# Confusion matrix and per-class precision / recall / F1 on the held-out split.
print()
print(confusion_matrix(y_true=y_test, y_pred=nb_cleaned_text.predict(X_test)))
print()
print(classification_report(y_true=y_test, y_pred=nb_cleaned_text.predict(X_test)))

Training AUC on best params: 0.9833932575605958
Validation AUC on best params: 0.858733253987892

[[42160  3078]
 [ 5963  7407]]

              precision    recall  f1-score   support

           0       0.88      0.93      0.90     45238
           1       0.71      0.55      0.62     13370

    accuracy                           0.85     58608
   macro avg       0.79      0.74      0.76     58608
weighted avg       0.84      0.85      0.84     58608



### Cleaned text LR

In [17]:
# 5-fold grid search over the logistic-regression grid on the cleaned_text
# training split. Candidates are ranked by accuracy and fitted on all available cores.
lr_cleaned_text = GridSearchCV(
    estimator=pl,
    param_grid=lr_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
lr_cleaned_text.fit(X_train, y_train)
lr_y_pred = lr_cleaned_text.predict(X_test)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 20.8min finished


In [44]:
# Accuracy of the selected logistic-regression pipeline on each partition.
print('Train Score:', lr_cleaned_text.score(X_train, y_train))
print('Test Score:', lr_cleaned_text.score(X_test, y_test))

Train Score: 0.8698052793908783
Test Score: 0.7900116025116025


In [18]:
# Persist only the winning pipeline so it can be reloaded without re-running the search.
joblib.dump(value=lr_cleaned_text.best_estimator_, filename='models/lr_cleaned_text.pkl')

['models/lr_cleaned_text.pkl']

In [19]:
# Probability of the positive (yta = 1) class for every row of each partition.
y_train_pred = lr_cleaned_text.predict_proba(X_train)[:, 1]
y_test_pred = lr_cleaned_text.predict_proba(X_test)[:, 1]

# ROC AUC with the best parameters, rounded to three decimals; the "Validation"
# figure is computed on the held-out test split.
print(f'Training AUC on best params: {round(roc_auc_score(y_train, y_train_pred), 3)}')

print(f'Validation AUC on best params: {round(roc_auc_score(y_test, y_test_pred), 3)}')

# Confusion matrix and per-class precision / recall / F1 on the held-out split.
print()
print(confusion_matrix(y_true=y_test, y_pred=lr_cleaned_text.predict(X_test)))
print()
print(classification_report(y_true=y_test, y_pred=lr_cleaned_text.predict(X_test)))

Training AUC on best params: 0.927
Validation AUC on best params: 0.814

[[38011  7227]
 [ 5080  8290]]

              precision    recall  f1-score   support

           0       0.88      0.84      0.86     45238
           1       0.53      0.62      0.57     13370

    accuracy                           0.79     58608
   macro avg       0.71      0.73      0.72     58608
weighted avg       0.80      0.79      0.80     58608



## NLTK Lem

In [20]:
# Features: the NLTK-lemmatized text column. Target: the same yta label.
X_n, y_n = final['nltk_lem'], final['yta']

In [21]:
# 67/33 stratified split of the nltk_lem data. Stratify on this section's own
# label series (y_n) rather than on `y` from the cleaned_text section, so the
# cell does not depend on a variable from an unrelated section. Both series
# are final['yta'], so the resulting partition is unchanged.
X_n_train, X_n_test, y_n_train, y_n_test = train_test_split(X_n,
                                                    y_n,
                                                   test_size = 0.33,
                                                   random_state = 42,
                                                   stratify = y_n)

### NLTK Lem NB

In [22]:
# 5-fold grid search over the Naive Bayes grid on the nltk_lem training split.
# Candidates are ranked by accuracy and fitted on all available cores.
nb_nltk_lem = GridSearchCV(
    estimator=pl,
    param_grid=nb_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
nb_nltk_lem.fit(X_n_train, y_n_train)
nb_y_pred = nb_nltk_lem.predict(X_n_test)

# Accuracy of the selected pipeline on each partition.
print('Train Score:', nb_nltk_lem.score(X_n_train, y_n_train))
print('Test Score:', nb_nltk_lem.score(X_n_test, y_n_test))

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 20.1min finished


Train Score: 0.9503743980637192
Test Score: 0.841011466011466


In [23]:
# Persist only the winning pipeline so it can be reloaded without re-running the search.
joblib.dump(value=nb_nltk_lem.best_estimator_, filename='models/nb_nltk_lem.pkl')

['models/nb_nltk_lem.pkl']

In [24]:
# Probability of the positive (yta = 1) class for every row of each partition.
y_n_train_pred = nb_nltk_lem.predict_proba(X_n_train)[:, 1]
y_n_test_pred = nb_nltk_lem.predict_proba(X_n_test)[:, 1]

# ROC AUC with the best parameters; the "Validation" figure is computed on the
# held-out test split.
print(f'Training AUC on best params: {(roc_auc_score(y_n_train, y_n_train_pred))}')

print(f'Validation AUC on best params: {(roc_auc_score(y_n_test, y_n_test_pred))}')

# Confusion matrix and per-class precision / recall / F1 on the held-out split.
print()
print(confusion_matrix(y_true=y_n_test, y_pred=nb_nltk_lem.predict(X_n_test)))
print()
print(classification_report(y_true=y_n_test, y_pred=nb_nltk_lem.predict(X_n_test)))

Training AUC on best params: 0.9822770025395124
Validation AUC on best params: 0.8526532042630148

[[42038  3200]
 [ 6118  7252]]

              precision    recall  f1-score   support

           0       0.87      0.93      0.90     45238
           1       0.69      0.54      0.61     13370

    accuracy                           0.84     58608
   macro avg       0.78      0.74      0.75     58608
weighted avg       0.83      0.84      0.83     58608



### NLTK Lem LR

In [25]:
# 5-fold grid search over the logistic-regression grid on the nltk_lem training
# split. Candidates are ranked by accuracy and fitted on all available cores.
lr_nltk_lem = GridSearchCV(
    estimator=pl,
    param_grid=lr_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
lr_nltk_lem.fit(X_n_train, y_n_train)
lr_y_pred = lr_nltk_lem.predict(X_n_test)

# Accuracy of the selected pipeline on each partition.
print('Train Score:', lr_nltk_lem.score(X_n_train, y_n_train))
print('Test Score:', lr_nltk_lem.score(X_n_test, y_n_test))

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 21.4min finished


Train Score: 0.9184139976973048
Test Score: 0.7387046137046137


In [26]:
# Persist only the winning pipeline so it can be reloaded without re-running the search.
joblib.dump(value=lr_nltk_lem.best_estimator_, filename='models/lr_nltk_lem.pkl')

['models/lr_nltk_lem.pkl']

In [27]:
# Probability of the positive (yta = 1) class for every row of each partition.
y_n_train_pred = lr_nltk_lem.predict_proba(X_n_train)[:, 1]
y_n_test_pred = lr_nltk_lem.predict_proba(X_n_test)[:, 1]

# ROC AUC with the best parameters; the "Validation" figure is computed on the
# held-out test split.
print(f'Training AUC on best params: {(roc_auc_score(y_n_train, y_n_train_pred))}')

print(f'Validation AUC on best params: {(roc_auc_score(y_n_test, y_n_test_pred))}')

# Confusion matrix and per-class precision / recall / F1 on the held-out split.
print()
print(confusion_matrix(y_true=y_n_test, y_pred=lr_nltk_lem.predict(X_n_test)))
print()
print(classification_report(y_true=y_n_test, y_pred=lr_nltk_lem.predict(X_n_test)))

Training AUC on best params: 0.9725346152819807
Validation AUC on best params: 0.7810681323671896

[[34829 10409]
 [ 4905  8465]]

              precision    recall  f1-score   support

           0       0.88      0.77      0.82     45238
           1       0.45      0.63      0.53     13370

    accuracy                           0.74     58608
   macro avg       0.66      0.70      0.67     58608
weighted avg       0.78      0.74      0.75     58608



## NLTK POS

In [28]:
# Features: the NLTK POS-aware lemmatized text column. Target: the same yta label.
X_p, y_p = final['nltk_pos'], final['yta']

In [29]:
# 67/33 stratified split of the nltk_pos data. Stratify on this section's own
# label series (y_p) rather than on `y` from the cleaned_text section, so the
# cell does not depend on a variable from an unrelated section. Both series
# are final['yta'], so the resulting partition is unchanged.
X_p_train, X_p_test, y_p_train, y_p_test = train_test_split(X_p,
                                                    y_p,
                                                   test_size = 0.33,
                                                   random_state = 42,
                                                   stratify = y_p)

### NLTK POS NB

In [30]:
# 5-fold grid search over the Naive Bayes grid on the nltk_pos training split.
# Candidates are ranked by accuracy and fitted on all available cores.
nb_nltk_pos = GridSearchCV(
    estimator=pl,
    param_grid=nb_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
nb_nltk_pos.fit(X_p_train, y_p_train)
nb_y_pred = nb_nltk_pos.predict(X_p_test)

# Accuracy of the selected pipeline on each partition.
print('Train Score:', nb_nltk_pos.score(X_p_train, y_p_train))
print('Test Score:', nb_nltk_pos.score(X_p_test, y_p_test))

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 20.2min finished


Train Score: 0.9488952946021127
Test Score: 0.8405337155337156


In [31]:
# Persist the winning pipeline from the nltk_pos search. The original cell
# dumped nb_nltk_lem.best_estimator_ here, so models/nb_nltk_pos.pkl held the
# nltk_lem model instead of the POS model.
joblib.dump(nb_nltk_pos.best_estimator_, 'models/nb_nltk_pos.pkl')

['models/nb_nltk_pos.pkl']

In [32]:
# Probability of the positive (yta = 1) class for every row of each partition.
y_p_train_pred = nb_nltk_pos.predict_proba(X_p_train)[:, 1]
y_p_test_pred = nb_nltk_pos.predict_proba(X_p_test)[:, 1]

# ROC AUC with the best parameters; the "Validation" figure is computed on the
# held-out test split.
print(f'Training AUC on best params: {(roc_auc_score(y_p_train, y_p_train_pred))}')

print(f'Validation AUC on best params: {(roc_auc_score(y_p_test, y_p_test_pred))}')

# Confusion matrix and per-class precision / recall / F1 on the held-out split.
print()
print(confusion_matrix(y_true=y_p_test, y_pred=nb_nltk_pos.predict(X_p_test)))
print()
print(classification_report(y_true=y_p_test, y_pred=nb_nltk_pos.predict(X_p_test)))

Training AUC on best params: 0.9813997993959207
Validation AUC on best params: 0.8503895734296888

[[41978  3260]
 [ 6086  7284]]

              precision    recall  f1-score   support

           0       0.87      0.93      0.90     45238
           1       0.69      0.54      0.61     13370

    accuracy                           0.84     58608
   macro avg       0.78      0.74      0.75     58608
weighted avg       0.83      0.84      0.83     58608



## Spacy Lem

In [35]:
# Features: the spaCy-lemmatized text column. Target: the same yta label.
X_s, y_s = final['spacy_lem'], final['yta']

In [36]:
# 67/33 stratified split of the spacy_lem data. Stratify on this section's own
# label series (y_s) rather than on `y` from the cleaned_text section, so the
# cell does not depend on a variable from an unrelated section. Both series
# are final['yta'], so the resulting partition is unchanged.
X_s_train, X_s_test, y_s_train, y_s_test = train_test_split(X_s,
                                                    y_s,
                                                   test_size = 0.33,
                                                   random_state = 42,
                                                   stratify = y_s)

### Spacy Lem NB

In [37]:
# 5-fold grid search over the Naive Bayes grid on the spacy_lem training split.
# Candidates are ranked by accuracy and fitted on all available cores.
nb_spacy_lem = GridSearchCV(
    estimator=pl,
    param_grid=nb_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
nb_spacy_lem.fit(X_s_train, y_s_train)
nb_y_pred = nb_spacy_lem.predict(X_s_test)

# Accuracy of the selected pipeline on each partition.
print('Train Score:', nb_spacy_lem.score(X_s_train, y_s_train))
print('Test Score:', nb_spacy_lem.score(X_s_test, y_s_test))

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 18.1min finished


Train Score: 0.9404744896672858
Test Score: 0.8327361452361453


In [38]:
# Persist only the winning pipeline so it can be reloaded without re-running the search.
joblib.dump(value=nb_spacy_lem.best_estimator_, filename='models/nb_spacy_lem.pkl')

['models/nb_spacy_lem.pkl']

In [39]:
# Probability of the positive (yta = 1) class for every row of each partition.
y_s_train_pred = nb_spacy_lem.predict_proba(X_s_train)[:, 1]
y_s_test_pred = nb_spacy_lem.predict_proba(X_s_test)[:, 1]

# ROC AUC with the best parameters; the "Validation" figure is computed on the
# held-out test split.
print(f'Training AUC on best params: {(roc_auc_score(y_s_train, y_s_train_pred))}')

print(f'Validation AUC on best params: {(roc_auc_score(y_s_test, y_s_test_pred))}')

# Confusion matrix and per-class precision / recall / F1 on the held-out split.
print()
print(confusion_matrix(y_true=y_s_test, y_pred=nb_spacy_lem.predict(X_s_test)))
print()
print(classification_report(y_true=y_s_test, y_pred=nb_spacy_lem.predict(X_s_test)))

Training AUC on best params: 0.9774156666829124
Validation AUC on best params: 0.8441512111643025

[[41280  3958]
 [ 5845  7525]]

              precision    recall  f1-score   support

           0       0.88      0.91      0.89     45238
           1       0.66      0.56      0.61     13370

    accuracy                           0.83     58608
   macro avg       0.77      0.74      0.75     58608
weighted avg       0.83      0.83      0.83     58608



### Spacy Lem LR

In [40]:
# 5-fold grid search over the logistic-regression grid on the spacy_lem training
# split. Candidates are ranked by accuracy and fitted on all available cores.
lr_spacy_lem = GridSearchCV(
    estimator=pl,
    param_grid=lr_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
lr_spacy_lem.fit(X_s_train, y_s_train)
lr_y_pred = lr_spacy_lem.predict(X_s_test)

# Accuracy of the selected pipeline on each partition.
print('Train Score:', lr_spacy_lem.score(X_s_train, y_s_train))
print('Test Score:', lr_spacy_lem.score(X_s_test, y_s_test))

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 19.2min finished


Train Score: 0.8592666672269331
Test Score: 0.7788356538356538


In [41]:
# Persist only the winning pipeline so it can be reloaded without re-running the search.
joblib.dump(value=lr_spacy_lem.best_estimator_, filename='models/lr_spacy_lem.pkl')

['models/lr_spacy_lem.pkl']

In [42]:
# Reload the saved spaCy-lemma logistic-regression pipeline. The path must match
# the joblib.dump call above, which writes under models/; the original
# cell looked in the working directory instead.
# Note: from here on lr_spacy_lem is the fitted pipeline, not the GridSearchCV object.
lr_spacy_lem = joblib.load('models/lr_spacy_lem.pkl')

In [43]:
# Probability of the positive (yta = 1) class for every row of each partition.
y_s_train_pred = lr_spacy_lem.predict_proba(X_s_train)[:, 1]
y_s_test_pred = lr_spacy_lem.predict_proba(X_s_test)[:, 1]

# ROC AUC with the best parameters; the "Validation" figure is computed on the
# held-out test split.
print(f'Training AUC on best params: {(roc_auc_score(y_s_train, y_s_train_pred))}')

print(f'Validation AUC on best params: {(roc_auc_score(y_s_test, y_s_test_pred))}')

# Confusion matrix and per-class precision / recall / F1 on the held-out split.
print()
print(confusion_matrix(y_true=y_s_test, y_pred=lr_spacy_lem.predict(X_s_test)))
print()
print(classification_report(y_true=y_s_test, y_pred=lr_spacy_lem.predict(X_s_test)))

Training AUC on best params: 0.9123322783414028
Validation AUC on best params: 0.8011109067201233

[[37254  7984]
 [ 5165  8205]]

              precision    recall  f1-score   support

           0       0.88      0.82      0.85     45238
           1       0.51      0.61      0.56     13370

    accuracy                           0.78     58608
   macro avg       0.69      0.72      0.70     58608
weighted avg       0.79      0.78      0.78     58608



# Best Model & Evaluation
Surprisingly, the model with the highest accuracy was trained on the cleaned_text column, which received only limited text processing and no lemmatization. This Multinomial Naive Bayes model had a test score of 0.846, along with the highest precision for the YTA class (0.71) of all the models tested.

Let's pull out the features.

In [59]:
# Reload the saved cleaned_text Multinomial Naive Bayes pipeline from disk.
ctnb = joblib.load(filename='models/nb_cleaned_text.pkl')

In [60]:
# Pull the fitted vectorizer and classifier out of the pipeline by step name.
# NOTE(review): `tf` holds whichever vectorizer the grid search selected, which
# is not necessarily the TF-IDF one.
tf, nb = (ctnb.named_steps[step] for step in ('vectorizer', 'classifier'))

In [61]:
# One row per vocabulary term, paired with its log-probability under the first
# class of the Naive Bayes model (row 0 of feature_log_prob_). The column is
# called 'coef', but the values are log-probabilities, not regression coefficients.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 in favour
# of get_feature_names_out(); switch when the environment is upgraded.
features = pd.DataFrame(data=[tf.get_feature_names(), nb.feature_log_prob_[0]]).T
features.columns = ['features', 'coef']

In [62]:
# `top`: the 25 terms with the lowest 'coef' value; `bot`: the 25 with the highest.
top = features.sort_values(by='coef', ascending=True).iloc[:25]
bot = features.sort_values(by='coef', ascending=False).iloc[:25]

In [63]:
# Show the 25 terms with the lowest 'coef' value.
top.head(25)

Unnamed: 0,features,coef
325103,quotes needed,-13.7789
206883,keep decision,-13.7789
206891,keep dick,-13.7789
318200,probably feels utterly,-13.7789
393992,support structure,-13.7789
393994,support stupid,-13.7789
421539,transition whatever,-13.7789
206914,keep doubt,-13.7789
421534,transition need,-13.7789
421533,transition hard,-13.7789


The top predictors for the model contain far more negative language than the bottom 25. Several of the n-gram combinations use language like 'stupid' or 'asshole'. Additionally, many of the descriptors are actions, which indicates that when voting for YTA, people tend to use more active language such as 'keep *action-ing*'.

In [64]:
# Show the 25 terms with the highest 'coef' value.
bot

Unnamed: 0,features,coef
226845,like,-6.75228
459722,would,-6.88085
153347,get,-6.91859
301833,people,-7.19734
130384,family,-7.23697
437842,want,-7.24865
414743,time,-7.27165
295630,parents,-7.28151
285727,one,-7.28863
118006,even,-7.34657


The predictors for NTA are mostly single-word n-grams. A common theme among them seems to be family: 'parents', 'mom', 'sister', 'kids'.

# Test Predict

Finally, let's predict the final chosen model to the test data and have a look at the results. 

In [8]:
# Load the separate test dataset and preview the first five rows.
test = pd.read_csv(filepath_or_buffer='datasets/test.csv')
test.head(5)

Unnamed: 0,post_id,comment_text,comment_score,yta,nta,cleaned_text,nltk_lem,nltk_pos,spacy_lem
0,k4owfz,Is it possible for mods to pin some ‘YTA’ thre...,23,1,1,possible mods pin threads front page pretty pl...,possible mod pin thread front page pretty plea...,possible mod pin thread front page pretty plea...,possible mod pin thread page pretty single pos...
1,k4owfz,Is there anything which could be done about th...,7,1,1,anything could done section posts op get massi...,anything could done section post op get massiv...,anything could do section post op get massive ...,section post op massive majority respond small...
2,k4owfz,This sub has a really huge double standards pr...,-3,1,1,sub really huge double standards problem read ...,sub really huge double standard problem read p...,sub really huge double standard problem read p...,sub huge double standard problem read post hus...
3,k4owfz,"Or... You can pass your judgement, upvote the ...",13,1,1,pass judgement upvote judgements agree let chi...,pas judgement upvote judgement agree let chip ...,pas judgement upvote judgement agree let chip ...,pass judgement upvote judgement agree let chip...
4,k4owfz,I do upvote any YTA post and don’t upvote any ...,3,1,1,upvote post upvote posts regardless want ratio...,upvote post upvote post regardless want ratio ...,upvote post upvote post regardless want ratio ...,upvote post upvote post regardless want ratio ...


In [9]:
# Dimensions of the test dataset as (rows, columns).
test.shape

(880, 9)

In [10]:
# Reload the saved cleaned_text Multinomial Naive Bayes pipeline from disk.
ctnb = joblib.load(filename='models/nb_cleaned_text.pkl')

In [11]:
# Drop every test row that has a missing value in any column. The surviving
# rows keep their original index labels.
test = test.dropna(axis=0, how='any')

In [15]:
# Class probabilities from the saved pipeline for every remaining test comment:
# one row per comment, one column per class.
predict_proba = ctnb.predict_proba(test['cleaned_text'])


array([[0.4640047 , 0.5359953 ],
       [0.70148627, 0.29851373],
       [0.6083726 , 0.3916274 ],
       [0.45207695, 0.54792305],
       [0.4365492 , 0.5634508 ],
       [0.87443529, 0.12556471],
       [0.66427474, 0.33572526],
       [0.50913708, 0.49086292],
       [0.46703686, 0.53296314],
       [0.83062604, 0.16937396]])

In [16]:
# Preview the class probabilities of the first ten test comments.
predict_proba[:10]

array([[0.4640047 , 0.5359953 ],
       [0.70148627, 0.29851373],
       [0.6083726 , 0.3916274 ],
       [0.45207695, 0.54792305],
       [0.4365492 , 0.5634508 ],
       [0.87443529, 0.12556471],
       [0.66427474, 0.33572526],
       [0.50913708, 0.49086292],
       [0.46703686, 0.53296314],
       [0.83062604, 0.16937396]])

In [17]:
# Hard class labels (0 = NTA, 1 = YTA) from the same pipeline and the same rows
# used for the class probabilities. `predictions` was never assigned in any cell,
# so the notebook failed on Restart Kernel -> Run All; define it here.
predictions = ctnb.predict(test['cleaned_text'])
predictions[:10]

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 0])

In [18]:
# Tabulate the class probabilities: column 0 is the NTA (yta = 0) class and
# column 1 the YTA (yta = 1) class.
test_preds = pd.DataFrame(data=predict_proba, columns=['NTA', 'YTA'])

In [19]:
# Attach the raw comment text to each prediction row BY POSITION. Assigning the
# Series directly aligns on index labels: `test` keeps its original labels after
# dropna (possibly with gaps), while `test_preds` has a fresh 0..n-1 index, so
# comments could land on the wrong probabilities or become NaN. Converting to a
# plain array removes the index and keeps row i of the text with row i of the
# probabilities.
test_preds['comment'] = test['comment_text'].to_numpy()

In [20]:
# Add the hard class label next to the class probabilities.
# NOTE(review): `predictions` is not assigned in any visible cell; presumably it
# is ctnb.predict(test['cleaned_text']). Confirm it is defined before this runs.
test_preds['predicted'] = predictions

In [24]:
# Comments the model assigns to NTA with more than 75% probability.
r_nta = test_preds.loc[test_preds['NTA'] > 0.75]
r_nta.head(5)

Unnamed: 0,NTA,YTA,comment,predicted
5,0.874435,0.125565,In this specific case I see OP trying to defle...,0
9,0.830626,0.169374,NTA for serving the salad or not changing the ...,0
10,0.758963,0.241037,"NTA, because Claire is just overreacting, but ...",0
11,0.786301,0.213699,This woman is LITERAL garbage and YTA for brin...,0
13,0.811574,0.188426,YTA if you don't walk away from such a selfish...,0


In [37]:
# Comments the model assigns to YTA with more than 75% probability.
r_yta = test_preds.loc[test_preds['YTA'] > 0.75]
r_yta.head(5)

Unnamed: 0,NTA,YTA,comment,predicted
23,0.224109,0.775891,NTA. She dumps her kids on you unannounced and...,1
79,0.033502,0.966498,YTA *and* a horrible mother. I feel so sad for...,1
103,0.240079,0.759921,YTA to your wife as you ARE stating that you d...,1
125,0.235578,0.764422,YTA\n\nYou're the one in the same house as you...,1
126,0.240677,0.759323,"YTA. Absolutely. And as a teacher, let me teac...",1


In [36]:
# Full text of the high-confidence YTA comment with index label 263.
r_yta.loc[263, 'comment'] #0.76 #YTA

"YTA - you're happy to dump your kids on her, and you make an uneducated guess (you can't even name any of the reptiles) as to how dirty or dangerous they are over her advice as an educator.    Then you get mad when she tells your kids the truth!   There is not a single comment in your narrative that would make you NTA."

In [42]:
# Full text of the high-confidence YTA comment with index label 103.
r_yta.loc[103, 'comment']

'YTA to your wife as you ARE stating that you do not trust her.  It takes two to tango and your ex gf is at fault the same as your brother.  You wife is not your ex girlfriend, but you are treating her as if she were.  NTA to your brother as you have the right to want him out before you leave no matter the reasoning.'

In [39]:
# First five high-confidence NTA predictions.
r_nta.head()

Unnamed: 0,NTA,YTA,comment,predicted
5,0.874435,0.125565,In this specific case I see OP trying to defle...,0
9,0.830626,0.169374,NTA for serving the salad or not changing the ...,0
10,0.758963,0.241037,"NTA, because Claire is just overreacting, but ...",0
11,0.786301,0.213699,This woman is LITERAL garbage and YTA for brin...,0
13,0.811574,0.188426,YTA if you don't walk away from such a selfish...,0


In [197]:
# Last five high-confidence NTA predictions.
r_nta.tail()

Unnamed: 0,YTA,NTA,comment,predicted
723,0.228369,0.771631,NTA for not going but YTA for being upset she ...,1
776,0.165916,0.834084,Why are people saying YTA? NTA: you have a ri...,1
833,0.049038,0.950962,YTA. That's rude. If you simply suggested ther...,1
834,0.059344,0.940656,YTA bordering on E s h.\n\n\nShe isn't respect...,1
863,0.204134,0.795866,"Half NTA, Half YTA. I would say that it's a f...",1


In [41]:
# Full text of the high-confidence NTA comment with index label 13.
r_nta.loc[13, 'comment'] #0.81 # NTA

"YTA if you don't walk away from such a selfish person that she thinks a WHEELCHAIR bound child should be upstairs solely because a room that MIGHT have guests in it is larger.\n\nNTA for insisting the downstairs room is rightly more suitable for her. \n\nIf your fiance is fighting about this now, do you really trust her to be a good mother to your daughter when you aren't around?"

# Conclusion and Recommendations

In conclusion, by training on 4 types of text processing with a variety of vectorizers and classifiers, I identified a model that predicts whether a person would vote YTA or NTA with a test accuracy of 84.6% (AUC 0.859), beating the baseline accuracy of 77%. Although more advanced techniques were used in the lemmatization of the text, the simplest text model outperformed the lemmatized variants.

The end model seems to pick up reliably on the text. For example, in the above comment, the model was able to pick up on the language use even though active, strong language was used. From the extraction of the coefs, I would have assumed the model would recognize such language as 'YTA' rather than the strong 'NTA' the post was given.

Due to time constraints, many aspects of the project were not explored. A future improvement could be to access the post texts and do a word study between the comments and the original post, for example of the words common to both.