In [1]:
import pandas as pd
import numpy as np
import os
import praw

E:\Current_Project


In [2]:
import pandas as pd

from collections import Counter
import re
 
import squarify
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
from spacy.tokenizer import Tokenizer

import en_core_web_lg
# Load the `en_core_web_lg` model package once and bind it to `nlp`; later cells
# read `nlp.vocab` and call `nlp(text)` to analyse post bodies.
nlp = en_core_web_lg.load()

In [6]:
# This notebook is a tool for scraping text data from self posts on multiple subreddits,
# then tokenizing and lemmatizing the text of each post.

# First, a scraper.
# client_id and client_secret are unique to each Reddit user and are secrets: they must
# never be written into the notebook. Export REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET
# in the environment before running; a missing variable raises KeyError here, up front,
# rather than failing later in the middle of the scrape.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='Scraping',
)

In [7]:
subs_list = ['suicidewatch', 'casualconversation', 'self', 'relationship_advice']

# Fixed seed so the row shuffle at the end of this cell is reproducible across runs.
SHUFFLE_SEED = 42

# Collect plain rows first and build the DataFrame once; `posts` is only ever bound
# to the final DataFrame so the name never changes type.
post_rows = []

for subs in subs_list:
    subreddit = reddit.subreddit(subs)
    # Top submissions of this subreddit, capped at 1000 per subreddit.
    for post in subreddit.top(limit=1000):
        # NOTE(review): `post.subreddit` is stored as returned by PRAW (not converted
        # to a plain string) so that downstream comparisons keep behaving as before.
        post_rows.append([
            post.title,
            post.score,
            post.id,
            post.subreddit,
            post.num_comments,
            post.selftext,
        ])

# Shuffle all rows so posts from different subreddits are interleaved.
posts = pd.DataFrame(
    post_rows,
    columns=['title', 'score', 'id', 'subreddit', 'num_comments', 'body'],
).sample(frac=1, random_state=SHUFFLE_SEED)

In [3]:
# Then a tokenizer/lemmatizer

# Bind the instance to a lowercase name. Assigning it back to `Tokenizer` would shadow
# the imported spaCy class, so re-running this cell would call the tokenizer instance
# with `nlp.vocab` instead of constructing a new tokenizer.
tokenizer = Tokenizer(nlp.vocab)

In [4]:
def count(docs):
    """Build a word-frequency table from a collection of tokenised documents.

    Parameters
    ----------
    docs : sized iterable of iterables of hashable tokens
        One inner iterable per document; `len(docs)` is used as the document total.

    Returns
    -------
    pandas.DataFrame
        One row per distinct token, sorted by `rank`, with the columns
        `word`, `appears_in` (number of documents containing the token),
        `count` (total occurrences), `rank` (1 = most frequent, ties broken by
        first appearance), `pct_total` (share of all occurrences),
        `cul_pct_total` (running sum of `pct_total` in rank order) and
        `appears_in_pct` (`appears_in` divided by the number of documents).
    """
    n_docs = len(docs)

    term_freq = Counter()
    doc_freq = Counter()
    for tokens in docs:
        term_freq.update(tokens)
        # A set makes each token count at most once per document.
        doc_freq.update(set(tokens))

    freq_table = pd.DataFrame(list(term_freq.items()), columns=['word', 'count'])
    freq_table['rank'] = freq_table['count'].rank(method='first', ascending=False)

    grand_total = freq_table['count'].sum()
    freq_table['pct_total'] = freq_table['count'].apply(lambda n: n / grand_total)

    # The cumulative share must be accumulated in rank order.
    freq_table = freq_table.sort_values(by='rank')
    freq_table['cul_pct_total'] = freq_table['pct_total'].cumsum()

    presence = pd.DataFrame(list(doc_freq.items()), columns=['word', 'appears_in'])
    merged = presence.merge(freq_table, on='word')
    merged['appears_in_pct'] = merged['appears_in'].apply(lambda n: n / n_docs)

    return merged.sort_values(by='rank')

def get_lemmas(text):
    """Return the lemmas of the informative tokens in `text`.

    The text is run through the module-level `nlp` pipeline; a token's
    `lemma_` is kept only when the token is not a stop word, is not
    punctuation, and its part-of-speech tag is not 'PRON'.
    """
    return [
        tok.lemma_
        for tok in nlp(text)
        if not tok.is_stop and not tok.is_punct and tok.pos_ != 'PRON'
    ]

In [5]:
import string
printable = set(string.printable)

def cleanup(x):
    """Normalise a raw post body into lowercase, space-separated ASCII text.

    Steps, in order: escaped and real newline/tab/carriage-return sequences and
    the punctuation marks , . ! ? become spaces; the text is lowercased; every
    character outside `string.printable` is dropped; runs of spaces are
    collapsed and leading/trailing spaces removed.
    """
    # Literal backslash sequences come first, then the real control characters,
    # then the sentence punctuation.
    separators = ("\\n", "\\t", "\\r", "\n", "\t", "\r", ",", ".", "!", "?")
    for sep in separators:
        x = x.replace(sep, " ")

    lowered = x.lower()
    ascii_only = "".join(ch for ch in lowered if ch in printable)

    # Split on single spaces only, so other whitespace characters are left alone.
    return " ".join(word for word in ascii_only.split(" ") if word)

In [9]:
# Preview the first rows of `posts`.
# NOTE(review): the saved output for this cell shows an `Unnamed: 0` column and boolean
# `subreddit` values, which the scraping cell does not produce — presumably `posts` was
# reloaded from a CSV in a cell that is no longer in the notebook; confirm.
posts.head()

Unnamed: 0,title,score,id,subreddit,num_comments,body
837,Contemplating suicide due to being a 50 year o...,165,9k0d4y,True,89,I'm so fucking sick and tired of this life. I ...
442,Just got told to kill myself,324,e1wfoz,True,66,I guess that’s it. \n \nI didn’t expe...
1600,I wonder how many tourist's photos I'm in. I w...,4279,aaauag,False,276,I live in St. Augustine and it is quite a tour...
1019,I remember one time when I was in like 1st gra...,19466,e0feat,False,354,"And he said, “Yeah, that makes sense. You prob..."
2833,Hey reddit... my wife just stayed up with me u...,725,envjq,False,250,Only you guys could truly appreciate what an a...


In [10]:
# Finally, some cleaning up of dirty text.

import time
import re

# Note carried over from the original cell: lemmatizing and stemming gave a lower
# ROC-AUC score, so this step only strips newlines, "[[User..." trailers, IP addresses
# and links from the post bodies.

# Patterns are raw strings so the regex escapes (\[, \d, \., \s) are not parsed as
# invalid Python string escapes, which newer interpreters warn about.
_NEWLINE_RE = re.compile(r'\n')
# Everything from "[[User" to the end of the text (newlines are already gone by then).
_USER_TAG_RE = re.compile(r'\[\[User.*')
# Dotted quads such as IP addresses.
_IP_RE = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
# A link plus the single whitespace character that follows it, if any. The original
# pattern only matched "http://", so "https://" links were left in the text.
_LINK_RE = re.compile(r'https?://\S*\s?')


def _strip_noise(text):
    """Apply the four regex clean-ups, in their original order, to one post body."""
    text = _NEWLINE_RE.sub(' ', str(text))
    text = _USER_TAG_RE.sub('', text)
    text = _IP_RE.sub('', text)
    return _LINK_RE.sub('', text)


start_time = time.time()
# One pass over the column instead of four; the per-row result is the same sequence
# of substitutions.
posts['body'] = posts['body'].map(_strip_noise)
end_time = time.time()
print("total time", end_time - start_time)

total time 0.2465674877166748


In [11]:
# Run `cleanup` over every post body, overwriting the `body` column.
# The IPython `%time` magic reports how long the statement took.
%time posts['body'] = posts['body'].apply(cleanup)

Wall time: 649 ms


In [12]:
# Run `get_lemmas` over every cleaned body and store the resulting token lists in a
# new `lemmas` column. The saved timing below shows this is the slow step of the notebook.
%time posts['lemmas'] = posts['body'].apply(get_lemmas)

Wall time: 1min 59s


In [27]:
# Preview `posts` again now that `body` has been cleaned and the `lemmas` column added.
posts.head()

Unnamed: 0,title,score,id,subreddit,num_comments,body,lemmas
837,Contemplating suicide due to being a 50 year o...,165,9k0d4y,1,89,i'm so fucking sick and tired of this life i j...,"[fuck, sick, tired, life, turn, 50, month, hav..."
442,Just got told to kill myself,324,e1wfoz,1,66,i guess thats it i didnt expect this to blow u...,"[guess, s, not, expect, blow, like, thank, lea..."
1600,I wonder how many tourist's photos I'm in. I w...,4279,aaauag,0,276,i live in st augustine and it is quite a touri...,"[live, st, augustine, tourist, attraction, bea..."
1019,I remember one time when I was in like 1st gra...,19466,e0feat,0,354,and he said yeah that makes sense you probably...,"[say, yeah, make, sense, probably, not, want, ..."
2833,Hey reddit... my wife just stayed up with me u...,725,envjq,0,250,only you guys could truly appreciate what an a...,"[guy, truly, appreciate, amazing, love, big, c..."


In [14]:
from symspellpy import SymSpell

# Flatten every post's lemma list into one flat corpus, skipping empty strings.
corpus = [
    token
    for line in posts['lemmas'].values
    for token in line
    if len(token) > 0
]

# Write the corpus to disk, one token per line, so SymSpell can read it back.
with open('toxicCorpus.txt', 'w') as filehandle:
    filehandle.writelines('%s\n' % listitem for listitem in corpus)

# NOTE(review): the spelling dictionary is built from the scraped corpus itself, so
# misspellings that occur in the posts presumably become dictionary entries too — confirm
# whether an external frequency dictionary was intended.
symspell = SymSpell()
symspell.create_dictionary(corpus="toxicCorpus.txt")

True

In [15]:
from symspellpy import Verbosity


def correctSpelling(x):
    """Return SymSpell's best suggestion for `x`, or `x` unchanged when there is none.

    Parameters
    ----------
    x : str
        A single lemma to look up in the module-level `symspell` dictionary.

    Returns
    -------
    str
        The `term` of the first suggestion, or `x` itself if the lookup is empty.
    """
    # `lookup` expects a `Verbosity` member; the original passed the bare integer 10,
    # which is not one. Only the first suggestion is used, so TOP is the matching mode.
    corr = symspell.lookup(x, verbosity=Verbosity.TOP)
    if corr:
        return corr[0].term

    return x

# Replace every lemma of every post with its corrected form, keeping the list-per-post shape.
posts['lemmas'] = [[correctSpelling(lemma) for lemma in line]
                   for line in posts['lemmas'].values]

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [21]:
# Presumably the model inputs: the per-post lemma lists as features (`X`) and the
# `subreddit` column as the target (`y`).
# NOTE(review): `X` holds lists of tokens, not strings — no step that joins them or
# configures the imported vectorizers for pre-tokenised input is visible here; confirm.
X = posts['lemmas']
y = posts['subreddit']

In [None]:
# Persist the processed frame to 'reddit_posts.csv'.
# NOTE(review): `to_csv` is called without `index=False`, so the shuffled index is
# presumably written as an unnamed first column — likely the source of the `Unnamed: 0`
# column in the saved `posts.head()` outputs; confirm with readers of this file before changing.
posts.to_csv('reddit_posts.csv')