In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re
import string

from bs4 import BeautifulSoup  
from matplotlib_venn import venn2
from wordcloud import WordCloud

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix, roc_auc_score

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.tokens import Doc
from nltk.corpus import stopwords
import en_core_web_sm

# Data Cleaning & Preprocessing

In [2]:
import pandas as pd

## Importing CSV

In [8]:
c1 = pd.read_csv('datasets/comments.csv')
c1.head()

Unnamed: 0,post_id,comment_text,comment_distinguished,comment_score
0,jbswil,This is such a great post with wealth of infor...,,739
1,jbswil,Thank you so much! So glad to see this up and ...,,218
2,jbswil,Since joining this subreddit purely for entert...,,66
3,jbswil,Thank you for putting in the effort to help pr...,,159
4,jbswil,NTA \n\nThis is great. Almost like breaking th...,,97


In [9]:
c2 = pd.read_csv('datasets/comments2.csv')
c2.head()

Unnamed: 0,post_id,comment_text,comment_distinguished,comment_score
0,kgiekd,[The guy who couldn’t handle his teacher GF’s ...,,1732
1,kgiekd,This was my absolute FAVORITE comment this yea...,,1451
2,kgiekd,The recent thread of the couple who celebrated...,,1366
3,kgiekd,[This guy for Asshole of the year](https://www...,,1248
4,kgiekd,[The woman who mixed all her boyfriend's rice ...,,1024


In [13]:
c3 = pd.read_csv('datasets/comments3.csv')
c3.head()

Unnamed: 0,post_id,comment_text,comment_distinguished,comment_score
0,koga1e,How many accounts of our “my stepmom sucks” tr...,,49
1,koga1e,I feel like this subreddit has deteriorated ov...,,45
2,koga1e,"Can there be ban on ""aita for reporting my [th...",,91
3,koga1e,"Something has to be done against the ""sick bur...",,77
4,koga1e,People really need to be encouraged to use ESH...,,37


In [18]:
c4 = pd.read_csv('datasets/comments4_1.csv')
c4.head()

Unnamed: 0,post_id,comment_text,comment_distinguished,comment_score
0,koga1e,I feel like this subreddit has deteriorated ov...,,69
1,koga1e,How many accounts of our “my stepmom sucks” tr...,,62
2,koga1e,"Can there be ban on ""aita for reporting my [th...",,119
3,koga1e,"Something has to be done against the ""sick bur...",,114
4,koga1e,People really need to be encouraged to use ESH...,,47


In [19]:
c5 = pd.read_csv('datasets/comments5.csv')
c5.head()

Unnamed: 0,post_id,comment_text,comment_distinguished,comment_score
0,koga1e,I feel like this subreddit has deteriorated ov...,,73
1,koga1e,How many accounts of our “my stepmom sucks” tr...,,71
2,koga1e,"Can there be ban on ""aita for reporting my [th...",,122
3,koga1e,"Something has to be done against the ""sick bur...",,122
4,koga1e,People really need to be encouraged to use ESH...,,50


## Duplicate entries in dataset c3, c4 & c5

The first entries of datasets c3, c4 & c5 appear to be duplicates of one another. I'm not sure why, since they were pulled one week apart, just like c1 & c2, which show no such duplicates. The comment scores differ between the pulls, which shows that time has passed between them. Either way, I'm checking what else could have been duplicated.

In [15]:
# Row-by-row comparison: .eq() pairs rows by index label, so this shows only the rows
# where c1 and c2 hold identical text at the same position, not every comment the two
# files have in common (c1['comment_text'].isin(c2['comment_text']) would give that).
# The same applies to the pairwise checks in the cells that follow.
c1[c1['comment_text'].eq(c2['comment_text'])]

Unnamed: 0,post_id,comment_text,comment_distinguished,comment_score
16594,keq3jb,NTA,,1
23349,kei6y5,NTA,,0
23470,kei6y5,[deleted],,-13
28848,kehjtm,YTA,,2
42446,kegaah,NTA,,3
73119,kdvpp1,NTA,,1
76384,kdnydq,YTA,,1
77379,kdnydq,Your comment has been removed because it viola...,moderator,1


In [16]:
c2[c2['comment_text'].eq(c3['comment_text'])]

Unnamed: 0,post_id,comment_text,comment_distinguished,comment_score
2382,krxs5l,NTA,,1
5700,krq6il,NTA,,1
8635,krj7cr,[removed],,1
15653,kra9ca,[removed],,1
81040,kqe7js,NTA,,1


In [20]:
c3[c3['comment_text'].eq(c4['comment_text'])]

Unnamed: 0,post_id,comment_text,comment_distinguished,comment_score
2,koga1e,"Can there be ban on ""aita for reporting my [th...",,91
3,koga1e,"Something has to be done against the ""sick bur...",,77
4,koga1e,People really need to be encouraged to use ESH...,,37
34,koga1e,Besides reporting for No Interpersonal Conflic...,,12
4518,kulttx,[deleted],,-15
23057,ktuqop,NTA,,1
33118,ktrljn,[removed],,32
51545,ktac7r,NTA,,1
51855,ktac7r,[deleted],,0
51876,ktac7r,[deleted],,1


In [9]:
c3[c3['comment_text'].eq(c5['comment_text'])]

  c3[c3['comment_text'].eq(c5['comment_text'])]


Unnamed: 0,post_id,comment_text,comment_distinguished,comment_score
2,koga1e,"Can there be ban on ""aita for reporting my [th...",,91
3,koga1e,"Something has to be done against the ""sick bur...",,77
4,koga1e,People really need to be encouraged to use ESH...,,37
66,koga1e,Can we ban the whole parent or spouse finding ...,,9
10055,kupp8t,NTA,,1
15916,ku4lab,[removed],,1
17430,ku3xev,NTA,,1
18133,ku1f24,[removed],,1
39682,kun16f,YTA,,1
51622,ktac7r,Your comment has been removed because it viola...,moderator,1


In [10]:
c4[c4['comment_text'].eq(c5['comment_text'])]

  c4[c4['comment_text'].eq(c5['comment_text'])]


Unnamed: 0,post_id,comment_text,comment_distinguished,comment_score
0,koga1e,I feel like this subreddit has deteriorated ov...,,69
1,koga1e,How many accounts of our “my stepmom sucks” tr...,,62
2,koga1e,"Can there be ban on ""aita for reporting my [th...",,119
3,koga1e,"Something has to be done against the ""sick bur...",,114
4,koga1e,People really need to be encouraged to use ESH...,,47
5,koga1e,"can I just share, after reading through multip...",,45
6,koga1e,Why does everyone have twins? If you have twin...,,36
40,koga1e,"How about a stickied thread titled ""This is no...",,15
133,koga1e,I had been away from Reddit for a while but am...,,3
1048,l1meub,[removed],,1


It seems that the main culprit is the parent post with the post_id 'koga1e', likely a stickied post that has stayed pinned for the past 3 weeks.

In [26]:
post3 = pd.read_csv('datasets/posts3.csv')
post3.head()

Unnamed: 0,title,id,date_created,text,distinguished,score,upvote_ratio
0,Monthly Open Forum January 2021,koga1e,2021-01-01 18:33:45,Welcome to the monthly open forum! This is the...,,187,0.91
1,META: r/AmITheAsshole Best of 2020 Awards - Vo...,ksm2ot,2021-01-07 20:34:01,"# Hey assholes!\n\nAs you know, we are hosting...",,204,0.94
2,AITA for walking out on husband and babies?,kusiba,2021-01-11 01:52:35,Throw away and title sounds awful and I’m (35F...,,6207,0.97
3,"AITA for ""lying to my cat""",kulkxn,2021-01-10 19:57:39,Oh god this is stupid but I was told to ask ot...,,7654,0.96
4,AITA: For firing my son?,kuhtd9,2021-01-10 16:44:23,\nThrowaway because I don't want this seen on ...,,23104,0.97


In [27]:
post3['text'][0]

"Welcome to the monthly open forum! This is the place to share all your meta thoughts about the sub, and to have a dialog with the mod team.\n\n#Keep things civil. Rules still apply.\n\nIt's 2021! Everything is fixed now!\n\nA couple notes.\n\n* Our bot is live, but definitely still in testing. **Please help us by reporting the ~~judgement bot comment~~ post when it doesn't actually explain why they think they may be an asshole.** Some people are using it like a TL;DR or just copying and pasting their post as a reply. [ETA - sounds like the report option doesn't work on all platforms for the bot comment, so you can just report the post. The option is bundled with the META report]\n\n* Please stop PMing mods. We spam the hell out of the [modmail link](https://www.reddit.com/message/compose/?to=/r/AmItheAsshole). When you PM us, it's super easy for things to get buried in our inbox and delay your response time.\n\n#As always, do not directly link to posts/comments or post uncensored scre

Since this is the case, dropping duplicates will remove the repeated comments.

## Join Datasets/Drop Duplicates

In [29]:
full = pd.concat([c1, c2, c3, c4, c5], axis=0, ignore_index=True)
full.head()

Unnamed: 0,post_id,comment_text,comment_distinguished,comment_score
0,jbswil,This is such a great post with wealth of infor...,,739
1,jbswil,Thank you so much! So glad to see this up and ...,,218
2,jbswil,Since joining this subreddit purely for entert...,,66
3,jbswil,Thank you for putting in the effort to help pr...,,159
4,jbswil,NTA \n\nThis is great. Almost like breaking th...,,97


In [30]:
full.shape

(438598, 4)

In [31]:
# Keep the first row for each distinct comment_text and drop the rest; mutates `full` in place.
# NOTE(review): the key is the text alone, so identical comments left on different posts
# (e.g. a bare "NTA") are also collapsed to a single row. Confirm this is intended;
# subset=['post_id', 'comment_text'] would remove only the re-scraped copies.
full.drop_duplicates(subset='comment_text', keep='first',inplace=True)

In [32]:
full.shape

(417459, 4)

# Creating Target Variable

I will be mapping the subreddit's coded language onto the data by flagging whether each comment contains 'YTA' or 'NTA'; these flags form the target variable. Additionally, out of personal curiosity, I will also flag 'ESH' and 'NAH' to see what their distribution looks like.

In [16]:
def ah_ratings(df):
    """Flag which verdict codes appear in each comment.

    Adds four 0/1 integer columns to ``df`` -- ``yta``, ``esh``, ``nta`` and
    ``nah``, in that order -- each set to 1 when the matching upper-case code
    occurs anywhere in ``comment_text``. The test is a case-sensitive
    substring match, so a code embedded in a longer upper-case word counts too.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``comment_text`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the four flag columns added.
    """
    comments = df['comment_text']
    for code in ('YTA', 'ESH', 'NTA', 'NAH'):
        # regex=False -> plain substring test, same as `code in text`.
        # na=False   -> missing / non-string comments are flagged 0 instead of
        #               raising a TypeError the way `'YTA' in nan` would.
        df[code.lower()] = (
            comments.str.contains(code, regex=False, na=False).astype('int64')
        )
    return df

In [17]:
ah_ratings(full)

Unnamed: 0,post_id,comment_text,comment_score,yta,esh,nta,nah
0,jbswil,This is such a great post with wealth of infor...,739,0,0,0,0
1,jbswil,Thank you so much! So glad to see this up and ...,218,0,0,0,0
2,jbswil,Since joining this subreddit purely for entert...,66,0,0,0,1
3,jbswil,Thank you for putting in the effort to help pr...,159,0,0,0,0
4,jbswil,NTA \n\nThis is great. Almost like breaking th...,97,0,0,1,0
...,...,...,...,...,...,...,...
438364,l2buhx,um no don’t listen to this person. screaming a...,7,0,0,0,0
438365,l2buhx,"No, no she is not. Your. Mental. Health. Is. I...",1,0,0,0,0
438541,l0pixt,"For Americans, this is like LA to Portland in ...",1,0,0,0,0
438549,l0pixt,I had to go watch it and laugh again lol,2,0,0,0,0


In [18]:
full[['yta','nta','esh','nah']].value_counts()

yta  nta  esh  nah
0    0    0    0      224793
     1    0    0      137384
1    0    0    0       40631
0    0    1    0        7030
          0    1        5615
1    1    0    0         880
0    1    1    0         316
1    0    1    0         252
0    1    0    1         252
1    0    0    1         123
0    0    1    1          85
1    1    1    0          45
          0    1          21
0    1    1    1          13
1    0    1    1          12
     1    1    1           7
dtype: int64

In [19]:
# Helper for pulling out the rows that carry one specific combination of verdict flags
def seek(yta_val, nta_val, esh_val, nah_val, df=None):
    """Return the rows whose four verdict flags equal the given values.

    Parameters
    ----------
    yta_val, nta_val, esh_val, nah_val : int
        Required value (0 or 1) of the ``yta``, ``nta``, ``esh`` and ``nah``
        columns respectively.
    df : pandas.DataFrame, optional
        Frame to filter. When omitted, the notebook-level ``full`` frame is
        used, as it is read at call time (so later in-place edits to ``full``
        are reflected).

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    if df is None:
        df = full
    mask = (
        (df['yta'] == yta_val)
        & (df['nta'] == nta_val)
        & (df['esh'] == esh_val)
        & (df['nah'] == nah_val)
    )
    return df[mask]

In [20]:
seek(0,0,0,0)

Unnamed: 0,post_id,comment_text,comment_score,yta,esh,nta,nah
0,jbswil,This is such a great post with wealth of infor...,739,0,0,0,0
1,jbswil,Thank you so much! So glad to see this up and ...,218,0,0,0,0
3,jbswil,Thank you for putting in the effort to help pr...,159,0,0,0,0
5,jbswil,Thank you so much for doing this. We often rea...,28,0,0,0,0
6,jbswil,Thank you. It breaks my heart seeing so many p...,24,0,0,0,0
...,...,...,...,...,...,...,...
438364,l2buhx,um no don’t listen to this person. screaming a...,7,0,0,0,0
438365,l2buhx,"No, no she is not. Your. Mental. Health. Is. I...",1,0,0,0,0
438541,l0pixt,"For Americans, this is like LA to Portland in ...",1,0,0,0,0
438549,l0pixt,I had to go watch it and laugh again lol,2,0,0,0,0


In [21]:
seek(1,1,0,0)

Unnamed: 0,post_id,comment_text,comment_score,yta,esh,nta,nah
198,k4owfz,Is it possible for mods to pin some ‘YTA’ thre...,23,1,0,1,0
235,k4owfz,Is there anything which could be done about th...,7,1,0,1,0
265,k4owfz,This sub has a really huge double standards pr...,-3,1,0,1,0
420,k4owfz,"Or... You can pass your judgement, upvote the ...",13,1,0,1,0
438,k4owfz,I do upvote any YTA post and don’t upvote any ...,3,1,0,1,0
...,...,...,...,...,...,...,...
436455,l1h6oh,NTA for giving Alice 6p days to move out and e...,0,1,0,1,0
436481,l1h6oh,You're both. \nNTA when it comes to Alice but ...,-1,1,0,1,0
437773,l2hnwv,It depends on the tone you used when you told ...,2,1,0,1,0
437972,l2744i,A lot of people responding seem to be on the s...,7,1,0,1,0


In [22]:
# Drop the comments in which none of the four verdict codes was found (all flags are 0),
# since they carry no voting data. Mutates `full` in place.
full.drop(seek(0,0,0,0).index, inplace=True)

In [23]:
full.shape

(192666, 7)

Creating a dataframe consisting of only 'YTA' and 'NTA' comments. 

In [24]:
c_yta = seek(1,0,0,0)
c_yta.head()

Unnamed: 0,post_id,comment_text,comment_score,yta,esh,nta,nah
47,jbswil,Thanks for this. When I was getting gaslit I c...,2,1,0,0,0
127,k4owfz,does the 'no covid posts' rule extend to the c...,38,1,0,0,0
137,k4owfz,"Where do mods draw the line as far as the ""acc...",15,1,0,0,0
138,k4owfz,Does anyone ever ask AITA for cutting off fami...,28,1,0,0,0
164,k4owfz,AITA has reduced to recycling the same comment...,32,1,0,0,0


In [25]:
c_nta = seek(0,1,0,0)
c_nta.head()

Unnamed: 0,post_id,comment_text,comment_score,yta,esh,nta,nah
4,jbswil,NTA \n\nThis is great. Almost like breaking th...,97,0,0,1,0
10,jbswil,NTA,32,0,0,1,0
74,jbswil,Lmaoo NTA,9,0,0,1,0
77,jbswil,"It is indeed heartbreaking, but when they make...",4,0,0,1,0
121,k4owfz,i feel like people on here are not contextuali...,41,0,0,1,0


In [26]:
test = (seek(1,1,0,0)).copy(deep=True)

In [27]:
test

Unnamed: 0,post_id,comment_text,comment_score,yta,esh,nta,nah
198,k4owfz,Is it possible for mods to pin some ‘YTA’ thre...,23,1,0,1,0
235,k4owfz,Is there anything which could be done about th...,7,1,0,1,0
265,k4owfz,This sub has a really huge double standards pr...,-3,1,0,1,0
420,k4owfz,"Or... You can pass your judgement, upvote the ...",13,1,0,1,0
438,k4owfz,I do upvote any YTA post and don’t upvote any ...,3,1,0,1,0
...,...,...,...,...,...,...,...
436455,l1h6oh,NTA for giving Alice 6p days to move out and e...,0,1,0,1,0
436481,l1h6oh,You're both. \nNTA when it comes to Alice but ...,-1,1,0,1,0
437773,l2hnwv,It depends on the tone you used when you told ...,2,1,0,1,0
437972,l2744i,A lot of people responding seem to be on the s...,7,1,0,1,0


In [28]:
final = pd.concat([c_yta, c_nta], axis=0, ignore_index=True)
final.head()

Unnamed: 0,post_id,comment_text,comment_score,yta,esh,nta,nah
0,jbswil,Thanks for this. When I was getting gaslit I c...,2,1,0,0,0
1,k4owfz,does the 'no covid posts' rule extend to the c...,38,1,0,0,0
2,k4owfz,"Where do mods draw the line as far as the ""acc...",15,1,0,0,0
3,k4owfz,Does anyone ever ask AITA for cutting off fami...,28,1,0,0,0
4,k4owfz,AITA has reduced to recycling the same comment...,32,1,0,0,0


In [29]:
final.shape

(178015, 7)

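Because the project is aiming solely for YTA-vs-NTA classification, the 'esh' and 'nah' columns are no longer needed and are dropped from both dataframes.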

In [32]:
final.drop(columns= ['esh','nah'], inplace=True)

In [33]:
test.drop(columns= ['esh','nah'], inplace=True)

In [34]:
final.head()

Unnamed: 0,post_id,comment_text,comment_score,yta,nta
0,jbswil,Thanks for this. When I was getting gaslit I c...,2,1,0
1,k4owfz,does the 'no covid posts' rule extend to the c...,38,1,0
2,k4owfz,"Where do mods draw the line as far as the ""acc...",15,1,0
3,k4owfz,Does anyone ever ask AITA for cutting off fami...,28,1,0
4,k4owfz,AITA has reduced to recycling the same comment...,32,1,0


In [35]:
test.head()

Unnamed: 0,post_id,comment_text,comment_score,yta,nta
198,k4owfz,Is it possible for mods to pin some ‘YTA’ thre...,23,1,1
235,k4owfz,Is there anything which could be done about th...,7,1,1
265,k4owfz,This sub has a really huge double standards pr...,-3,1,1
420,k4owfz,"Or... You can pass your judgement, upvote the ...",13,1,1
438,k4owfz,I do upvote any YTA post and don’t upvote any ...,3,1,1


# Text Cleaning

In this section, I create 4 different sets of data.
1. cleaned_text: Stripped and cleaned text with only stopwords filtered
2. nltk_lem: Using the default wordnet lemmatizer
3. nltk_pos: Using nltk with parts of speech tagger to help filter the lemmatization
4. spacy_lem: Using the spaCy lemmatizer

In [46]:
# Stop-word vocabulary: NLTK's English list plus the subreddit's own verdict codes,
# so the label words themselves never reach the model as features.
special_stops = {'yta', 'nta', 'esh', 'nah', 'wibta', 'aita', 'YTA', 'NTA', 'ESH', 'NAH'}
stop_words = set(stopwords.words('english')) | special_stops

In [37]:
def clean_coded_words(text):
    """Reduce a comment to its lower-cased, non-stop-word tokens.

    Every character outside a-z/A-Z is replaced by a space, the result is
    lower-cased and split on whitespace, tokens found in the notebook-level
    ``stop_words`` set are discarded, and the survivors are re-joined with
    single spaces.
    """
    alpha_only = re.sub("[^a-zA-Z]", " ", text)
    kept = []
    for token in alpha_only.lower().split():
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [38]:
#Simple stripping
final['cleaned_text'] = final['comment_text'].apply(clean_coded_words)
final.head()

Unnamed: 0,post_id,comment_text,comment_score,yta,nta,cleaned_text
0,jbswil,Thanks for this. When I was getting gaslit I c...,2,1,0,thanks getting gaslit came couple times got lo...
1,k4owfz,does the 'no covid posts' rule extend to the c...,38,1,0,covid posts rule extend comments seen posts to...
2,k4owfz,"Where do mods draw the line as far as the ""acc...",15,1,0,mods draw line far accept judgment rule get tr...
3,k4owfz,Does anyone ever ask AITA for cutting off fami...,28,1,0,anyone ever ask cutting family results verdict...
4,k4owfz,AITA has reduced to recycling the same comment...,32,1,0,reduced recycling comments get upvotes well wr...


In [39]:
#Simple stripping
test['cleaned_text'] = test['comment_text'].apply(clean_coded_words)
test.head()

Unnamed: 0,post_id,comment_text,comment_score,yta,nta,cleaned_text
198,k4owfz,Is it possible for mods to pin some ‘YTA’ thre...,23,1,1,possible mods pin threads front page pretty pl...
235,k4owfz,Is there anything which could be done about th...,7,1,1,anything could done section posts op get massi...
265,k4owfz,This sub has a really huge double standards pr...,-3,1,1,sub really huge double standards problem read ...
420,k4owfz,"Or... You can pass your judgement, upvote the ...",13,1,1,pass judgement upvote judgements agree let chi...
438,k4owfz,I do upvote any YTA post and don’t upvote any ...,3,1,1,upvote post upvote posts regardless want ratio...


# NLTK Lemmatizer

In [40]:
# Shared by the NLTK cleaners: the punctuation characters to strip and a WordNet lemmatizer
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def nltk_clean(document):
    """Clean a comment and lemmatize each remaining word with WordNet defaults.

    Steps: replace non-letters with spaces, lower-case, drop tokens in
    ``stop_words``, strip any character in ``exclude``, then pass every word
    through ``lemma.lemmatize`` (no part-of-speech argument is supplied).
    Returns the lemmas joined by single spaces.
    """
    alpha_text = re.sub("[^a-zA-Z]", " ", document)
    content_words = [tok for tok in alpha_text.lower().split() if tok not in stop_words]
    unpunctuated = ''.join(filter(lambda ch: ch not in exclude, " ".join(content_words)))
    lemmas = [lemma.lemmatize(tok) for tok in unpunctuated.split()]
    return " ".join(lemmas)

In [41]:
final['nltk_lem']=final['comment_text'].apply(lambda x:nltk_clean(x))
final.head()

Unnamed: 0,post_id,comment_text,comment_score,yta,nta,cleaned_text,nltk_lem
0,jbswil,Thanks for this. When I was getting gaslit I c...,2,1,0,thanks getting gaslit came couple times got lo...,thanks getting gaslit came couple time got lot...
1,k4owfz,does the 'no covid posts' rule extend to the c...,38,1,0,covid posts rule extend comments seen posts to...,covid post rule extend comment seen post top c...
2,k4owfz,"Where do mods draw the line as far as the ""acc...",15,1,0,mods draw line far accept judgment rule get tr...,mod draw line far accept judgment rule get tri...
3,k4owfz,Does anyone ever ask AITA for cutting off fami...,28,1,0,anyone ever ask cutting family results verdict...,anyone ever ask cutting family result verdict ...
4,k4owfz,AITA has reduced to recycling the same comment...,32,1,0,reduced recycling comments get upvotes well wr...,reduced recycling comment get upvotes well wri...


In [42]:
test['nltk_lem']=test['comment_text'].apply(lambda x:nltk_clean(x))
test.head()

Unnamed: 0,post_id,comment_text,comment_score,yta,nta,cleaned_text,nltk_lem
198,k4owfz,Is it possible for mods to pin some ‘YTA’ thre...,23,1,1,possible mods pin threads front page pretty pl...,possible mod pin thread front page pretty plea...
235,k4owfz,Is there anything which could be done about th...,7,1,1,anything could done section posts op get massi...,anything could done section post op get massiv...
265,k4owfz,This sub has a really huge double standards pr...,-3,1,1,sub really huge double standards problem read ...,sub really huge double standard problem read p...
420,k4owfz,"Or... You can pass your judgement, upvote the ...",13,1,1,pass judgement upvote judgements agree let chi...,pas judgement upvote judgement agree let chip ...
438,k4owfz,I do upvote any YTA post and don’t upvote any ...,3,1,1,upvote post upvote posts regardless want ratio...,upvote post upvote post regardless want ratio ...


## NLTK POS

In [43]:
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Return the WordNet POS constant to hand to ``lemmatize()`` for ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of the resulting tag selects adjective (J), noun (N), verb (V) or
    adverb (R). Any other tag falls back to noun.
    """
    _, tagged_as = nltk.pos_tag([word])[0]
    initial = tagged_as[0].upper()
    wordnet_for_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return wordnet_for_initial.get(initial, wordnet.NOUN)

In [44]:
lemmatizer = WordNetLemmatizer()

def nltk_clean_pos(document):
    """Clean a comment and lemmatize each word using its part-of-speech tag.

    Same preparation as the plain NLTK cleaner (letters only, lower-case,
    ``stop_words`` removed, ``exclude`` characters stripped); the text is then
    tokenized with ``nltk.word_tokenize`` and each token is lemmatized with
    the POS returned by ``get_wordnet_pos`` for that token.
    Returns the lemmas joined by single spaces.
    """
    # Anything that is not a letter becomes a space
    alpha_text = re.sub("[^a-zA-Z]", " ", document)
    # Lower-case and discard stop words
    kept_words = [tok for tok in alpha_text.lower().split() if tok not in stop_words]
    without_stops = " ".join(kept_words)
    # Strip punctuation characters
    unpunctuated = ''.join(filter(lambda ch: ch not in exclude, without_stops))
    # Lemmatize token by token, each with its own POS tag
    lemmas = []
    for tok in nltk.word_tokenize(unpunctuated):
        lemmas.append(lemmatizer.lemmatize(tok, get_wordnet_pos(tok)))
    return ' '.join(lemmas)

In [45]:
final['nltk_pos']=final['comment_text'].apply(lambda x:nltk_clean_pos(x))
final.head()

Unnamed: 0,post_id,comment_text,comment_score,yta,nta,cleaned_text,nltk_lem,nltk_pos
0,jbswil,Thanks for this. When I was getting gaslit I c...,2,1,0,thanks getting gaslit came couple times got lo...,thanks getting gaslit came couple time got lot...,thanks get gaslit come couple time get lot sta...
1,k4owfz,does the 'no covid posts' rule extend to the c...,38,1,0,covid posts rule extend comments seen posts to...,covid post rule extend comment seen post top c...,covid post rule extend comment see post top co...
2,k4owfz,"Where do mods draw the line as far as the ""acc...",15,1,0,mods draw line far accept judgment rule get tr...,mod draw line far accept judgment rule get tri...,mod draw line far accept judgment rule get tri...
3,k4owfz,Does anyone ever ask AITA for cutting off fami...,28,1,0,anyone ever ask cutting family results verdict...,anyone ever ask cutting family result verdict ...,anyone ever ask cut family result verdict seem...
4,k4owfz,AITA has reduced to recycling the same comment...,32,1,0,reduced recycling comments get upvotes well wr...,reduced recycling comment get upvotes well wri...,reduce recycle comment get upvotes well write ...


In [46]:
test['nltk_pos']=test['comment_text'].apply(lambda x:nltk_clean_pos(x))
test.head()

Unnamed: 0,post_id,comment_text,comment_score,yta,nta,cleaned_text,nltk_lem,nltk_pos
198,k4owfz,Is it possible for mods to pin some ‘YTA’ thre...,23,1,1,possible mods pin threads front page pretty pl...,possible mod pin thread front page pretty plea...,possible mod pin thread front page pretty plea...
235,k4owfz,Is there anything which could be done about th...,7,1,1,anything could done section posts op get massi...,anything could done section post op get massiv...,anything could do section post op get massive ...
265,k4owfz,This sub has a really huge double standards pr...,-3,1,1,sub really huge double standards problem read ...,sub really huge double standard problem read p...,sub really huge double standard problem read p...
420,k4owfz,"Or... You can pass your judgement, upvote the ...",13,1,1,pass judgement upvote judgements agree let chi...,pas judgement upvote judgement agree let chip ...,pas judgement upvote judgement agree let chip ...
438,k4owfz,I do upvote any YTA post and don’t upvote any ...,3,1,1,upvote post upvote posts regardless want ratio...,upvote post upvote post regardless want ratio ...,upvote post upvote post regardless want ratio ...


## Spacy

In [50]:
# Load the en_core_web_sm spaCy pipeline once for reuse by spacy_lemma
nlp = en_core_web_sm.load()

def spacy_lemma(doc):
    """Return the space-joined spaCy lemmas of ``doc``, skipping stop-word tokens.

    ``doc`` is run through the ``nlp`` pipeline; every token for which
    ``is_stop`` is false contributes its ``lemma_``.
    """
    return " ".join(tok.lemma_ for tok in nlp(doc) if not tok.is_stop)

In [51]:
final['spacy_lem']=final['cleaned_text'].apply(lambda x:spacy_lemma(x))
final.head()

Unnamed: 0,post_id,comment_text,comment_score,yta,nta,cleaned_text,nltk_lem,nltk_pos,spacy_lem
0,jbswil,Thanks for this. When I was getting gaslit I c...,2,1,0,thanks getting gaslit came couple times got lo...,thanks getting gaslit came couple time got lot...,thanks get gaslit come couple time get lot sta...,thank get gaslit come couple time get lot stay...
1,k4owfz,does the 'no covid posts' rule extend to the c...,38,1,0,covid posts rule extend comments seen posts to...,covid post rule extend comment seen post top c...,covid post rule extend comment see post top co...,covid post rule extend comment see post commen...
2,k4owfz,"Where do mods draw the line as far as the ""acc...",15,1,0,mods draw line far accept judgment rule get tr...,mod draw line far accept judgment rule get tri...,mod draw line far accept judgment rule get tri...,mod draw line far accept judgment rule tricky ...
3,k4owfz,Does anyone ever ask AITA for cutting off fami...,28,1,0,anyone ever ask cutting family results verdict...,anyone ever ask cutting family result verdict ...,anyone ever ask cut family result verdict seem...,ask cut family result verdict like general rig...
4,k4owfz,AITA has reduced to recycling the same comment...,32,1,0,reduced recycling comments get upvotes well wr...,reduced recycling comment get upvotes well wri...,reduce recycle comment get upvotes well write ...,reduce recycling comment upvote write comment ...


In [52]:
test['spacy_lem']=test['cleaned_text'].apply(lambda x:spacy_lemma(x))
test.head()

Unnamed: 0,post_id,comment_text,comment_score,yta,nta,cleaned_text,nltk_lem,nltk_pos,spacy_lem
198,k4owfz,Is it possible for mods to pin some ‘YTA’ thre...,23,1,1,possible mods pin threads front page pretty pl...,possible mod pin thread front page pretty plea...,possible mod pin thread front page pretty plea...,possible mod pin thread page pretty single pos...
235,k4owfz,Is there anything which could be done about th...,7,1,1,anything could done section posts op get massi...,anything could done section post op get massiv...,anything could do section post op get massive ...,section post op massive majority respond small...
265,k4owfz,This sub has a really huge double standards pr...,-3,1,1,sub really huge double standards problem read ...,sub really huge double standard problem read p...,sub really huge double standard problem read p...,sub huge double standard problem read post hus...
420,k4owfz,"Or... You can pass your judgement, upvote the ...",13,1,1,pass judgement upvote judgements agree let chi...,pas judgement upvote judgement agree let chip ...,pas judgement upvote judgement agree let chip ...,pass judgement upvote judgement agree let chip...
438,k4owfz,I do upvote any YTA post and don’t upvote any ...,3,1,1,upvote post upvote posts regardless want ratio...,upvote post upvote post regardless want ratio ...,upvote post upvote post regardless want ratio ...,upvote post upvote post regardless want ratio ...


In [53]:
final.shape

(178015, 9)

In [54]:
test.shape

(880, 9)

In [55]:
final.to_csv('datasets/final.csv',index = False)

In [56]:
test.to_csv('datasets/test.csv',index = False)

Just a quick comparison to understand what the lemmatizers have done & the differences between them.

In [55]:
final['comment_text'][0]

'Thanks for this. When I was getting gaslit I came here a couple of times and got a lot of "YTA for staying with him/dating him in the first place". I think this happens more on the low traffic posts especially. Not particularly helpful, especially since I just took that quiz and the threshold for abuse is 5 and that relationship scored 33!'

In [56]:
final['cleaned_text'][0]

'thanks getting gaslit came couple times got lot staying dating first place think happens low traffic posts especially particularly helpful especially since took quiz threshold abuse relationship scored'

In [57]:
final['nltk_lem'][0]

'thanks getting gaslit came couple time got lot staying dating first place think happens low traffic post especially particularly helpful especially since took quiz threshold abuse relationship scored'

In [58]:
final['nltk_pos'][0]

'thanks get gaslit come couple time get lot stay date first place think happens low traffic post especially particularly helpful especially since take quiz threshold abuse relationship score'

In [59]:
final['spacy_lem'][0]

'thank get gaslit come couple time get lot stay date place think happen low traffic post especially particularly helpful especially take quiz threshold abuse relationship score'