# Topic Modeling

Purpose: Identify topics within the dataset with the LDA model
<br>Note: Cleaning isn't perfect results show there are distinguishable topics

In [57]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import re
import csv
import nltk
import math
import random
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [58]:
#Variables used in cleaning the posts
my_stopwords = nltk.corpus.stopwords.words('english')
my_stopwords.extend(['t','v','don','didn','gt','like','got','asshole','i','im',"'t"])
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'
Apos_dict={"'s":" is","n't":" not","'m":" am","'ll":" will",
           "'d":" would","'ve":" have","'re":" are"}

In [59]:
#Cleans each post
def clean_post(post, bigrams=False):

    #removing contractions
    for key,value in Apos_dict.items():
        if key in post:
            post = post.replace(key,value)
    post = post.lower() #lowercase
    post = post.replace("'","") #removing apostraphe
    post = re.sub('[' + my_punctuation + ']+', ' ', post) # strip punctuation
    post = re.sub('\s+', ' ', post) #remove double spacing
    post = re.sub('([0-9]+)', '', post) # remove numbers
    post_token_list = [word for word in post.split(' ')
                        if word not in my_stopwords] # remove stopwords

    post_token_list = [word_rooter(word) if '#' not in word else word
                        for word in post_token_list] # apply word rooter
    if bigrams:
        post_token_list = post_token_list+[post_token_list[i]+'_'+post_token_list[i+1]
                                            for i in range(len(post_token_list)-1)]
    post = ' '.join(post_token_list)
    return post

In [60]:
#Removes blank lines, deleted posts, or removed posts
def csvClean(infile):
    redditData = pd.read_csv(infile)
    data = redditData['selftext']
    data  = data.replace('[deleted]',np.nan)
    data= data.replace('[removed]', np.nan)
    data = data.dropna()
    return data
    

In [61]:
#Returns a dataframe with the topics & weights
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [62]:
#Removing blank lines, removed posts, or deleted posts
postData = csvClean('2018_posts.csv')
postData = postData.to_frame()

#Cleaning the remaining posts
postData['clean_posts'] = postData.selftext.apply(clean_post)

#Creating vector
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

#Transforming vector
tf = vectorizer.fit_transform(postData['clean_posts'].values.astype('U')).toarray()

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names()

#Creating model
number_of_topics = 10
no_top_words = 10
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
model.fit(tf)
topicData = display_topics(model, tf_feature_names, no_top_words)
print(topicData)

  Topic 0 words Topic 0 weights Topic 1 words Topic 1 weights Topic 2 words  \
0            ’t         15376.0           mom          3999.2            ’t   
1             ’         10871.2          year          3607.9             ’   
2             i         10231.4          time          3357.3             i   
3            ’m          6578.2            go          3175.9            ’m   
4             ”          4818.1          want          3146.8            go   
5           don          4478.0         would          3031.9          time   
6            it          3489.7        famili          3003.8           get   
7          didn          3338.3        sister          2784.4          want   
8            ’v          2905.9           dad          2676.3           don   
9           she          2341.1           get          2548.7            it   

  Topic 2 weights Topic 3 words Topic 3 weights Topic 4 words Topic 4 weights  \
0          5947.9          work          7190.8  