<a href="https://colab.research.google.com/github/anandaltekar/subredditTopics/blob/master/redditTopics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import sqlite3

# Text Cleaning
import string
import re
import nltk 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

#Generating n-grams
from gensim.models import Phrases

In [0]:
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
df = pd.read_csv('/content/drive/My Drive/Policygenius/Insurance_comments.csv')
df.drop(columns=['Author','ID','score'])
df = df[df['Author'] != '[deleted]']
df.describe()

Unnamed: 0,score
count,164751.0
mean,2.245273
std,3.209472
min,-47.0
25%,1.0
50%,1.0
75%,3.0
max,171.0


In [0]:
def first_clean(text):
    text = str(text)
    text = text.replace('\\n',' ')
    text = text.replace('&amp',' ')
    text = text.replace(';#x200B;',' ')
    text = text.replace('nbsp',' ')

    return text
df['body'] = df['body'].apply(lambda x : first_clean(x))

In [0]:
wordnet_lemmatizer = WordNetLemmatizer()
stopwords_english = stopwords.words('english')

In [0]:
def clean_text(text): 
    #make string lowercase 
    text = str(text)
    text = text.lower()
    
    #remove links
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)

    #tokenize
    tokens = nltk.word_tokenize(text) 
    clean_text = []
    
    #remove stopwords, puncuation, then lemmatize
    for word in tokens:
        if (word not in stopwords_english and word not in string.punctuation): 
            token = wordnet_lemmatizer.lemmatize(word)
            clean_text.append(token)
            
    #remove words of length 3 or smaller        
    clean_text = [token for token in clean_text if len(token) > 3] 
            
    return clean_text

In [0]:
df['body_clean'] = df['body'].apply(lambda x : clean_text(x))

**remove comments < 5 words**

In [0]:
df['body_length'] = df['body_clean'].apply(lambda x : len(x))
df = df[df['body_length'] >= 5]

In [0]:
df = df.drop(columns=['body_length'])

**creating Bi and Tri-grams**

In [0]:
docs = df['body_clean']
bigram = Phrases(docs, min_count=10)
trigram = Phrases(bigram[docs])

In [0]:
def add_ngram(doc): 
    return trigram[bigram[doc]]

In [0]:
df['body_ngrams'] = df['body_clean'].apply(lambda x : add_ngram(x))

## **Topic Model**

In [0]:
!pip install pyLDAvis

In [0]:
import matplotlib.pyplot as plt
from datetime import datetime

#language processsing 
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
%matplotlib inline

In [0]:
comments = df['body_clean']
dictionary = Dictionary(comments)
dictionary.filter_extremes(no_below=20, no_above=0.5)

corpus = [dictionary.doc2bow(doc) for doc in comments]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 8589
Number of documents: 159601


In [0]:
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = 1 
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# index to word dictionary
%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)


CPU times: user 26min 42s, sys: 464 ms, total: 26min 43s
Wall time: 26min 44s


In [0]:
vis =  pyLDAvis.gensim.prepare(model, corpus, dictionary)

In [0]:
vis

In [0]:
top_topics = model.top_topics(corpus)

# Average topic coherence = sum of topic coherences of all topics / number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -2.7714.
[([(0.028572386, 'would'),
   (0.018242227, 'like'),
   (0.013850699, 'time'),
   (0.012927533, 'make'),
   (0.012859421, 'know'),
   (0.012531349, 'need'),
   (0.011944751, 'going'),
   (0.011383575, 'could'),
   (0.011340816, 'work'),
   (0.010869248, 'want'),
   (0.010847188, 'even'),
   (0.01059936, 'also'),
   (0.009944916, 'thing'),
   (0.009395708, 'people'),
   (0.008957905, 'take'),
   (0.008746757, 'much'),
   (0.00836156, 'think'),
   (0.007937216, 'sure'),
   (0.007619242, 'good'),
   (0.00738317, 'really')],
  -1.8316223946778893),
 ([(0.13506289, 'insurance'),
   (0.08351016, 'claim'),
   (0.057518415, 'company'),
   (0.027367488, 'accident'),
   (0.01856414, 'damage'),
   (0.018113334, 'deductible'),
   (0.016034855, 'file'),
   (0.016019328, 'fault'),
   (0.015176741, 'insurer'),
   (0.014800169, 'likely'),
   (0.014781383, 'case'),
   (0.012712441, 'party'),
   (0.012605648, 'adjuster'),
   (0.012053311, 'driver'),
   (0.011960924, 'pe