In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/reddit-comments-may-2015/database.sqlite
/kaggle/input/reddit-comments-may-2015/reddit-comments-may-2015/database.sqlite


In [2]:
# read the database
import sqlite3
db = sqlite3.connect('/kaggle/input/reddit-comments-may-2015/reddit-comments-may-2015/database.sqlite')

documents = pd.read_sql_query("SELECT * FROM May2015 LIMIT 50000", db)
documents['index'] = documents.index
print(len(documents))
print(documents[:5])

50000
   created_utc  ups subreddit_id    link_id        name  score_hidden  \
0   1430438400    4     t5_378oi  t3_34di91  t1_cqug90g             0   
1   1430438400    4     t5_2qo4s  t3_34g8mx  t1_cqug90h             0   
2   1430438400    0     t5_2cneq  t3_34f7mc  t1_cqug90i             0   
3   1430438400    3     t5_2qh1i  t3_34f9rh  t1_cqug90j             0   
4   1430438400    3     t5_2qh1i  t3_34fvry  t1_cqug90k             0   

  author_flair_css_class author_flair_text  subreddit       id  ... archived  \
0                   None              None  soccer_jp  cqug90g  ...        0   
1                   Heat              Heat        nba  cqug90h  ...        0   
2                   None              None   politics  cqug90i  ...        0   
3                   None              None  AskReddit  cqug90j  ...        0   
4                   None              None  AskReddit  cqug90k  ...        0   

           author  score  retrieved_on  \
0           rx109      4    1432

In [3]:
# preprocessing imports

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2019)
import nltk
nltk.download('wordnet')

[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


False

In [4]:
# preprocessing functions

def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
            result.append(lemmatize(token))
    return result

doc_sample = documents[documents['index'] == 4310]['body'].values[0]
print('original document: ', doc_sample)
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document:  Cowboy and gandiva for chain? 
['Cowboy', 'and', 'gandiva', 'for', 'chain?', '']


 tokenized and lemmatized document: 
['cowboy', 'gandiva', 'chain']


In [5]:
# preprocess all docs

processed_docs = documents['body'].map(preprocess)
processed_docs[:10]

0                                          [図書館に出ねーかな]
1                           [watch, nfl, draft, guess]
2    [imply, return, time, near, political, environ...
3    [european, accent, exist, accent, europe, euro...
4                            [kid, remind, kevin, sad]
5    [haha, get, nauseous, ingame, experience, give...
6    [read, wholeheartedly, believe, let, simply, a...
7                                           [let, guy]
8       [buy, mystery, sampler, small, batch, request]
9    [nihilum, significantly, better, theory, think...
Name: body, dtype: object

In [6]:
# create pre-bow structure to filter out extremes

dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 draft
1 guess
2 nfl
3 watch
4 american
5 concept
6 environment
7 imply
8 luck
9 near
10 people


In [7]:
# create bow

bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[-1]

[(58, 1),
 (96, 1),
 (220, 1),
 (319, 1),
 (523, 1),
 (832, 2),
 (905, 1),
 (1446, 1),
 (1989, 1),
 (2338, 1),
 (2428, 1)]

In [8]:
# preview of bow

bow_doc_last = bow_corpus[-1]
for i in range(len(bow_doc_last)):
    print("Word {} (\"{}\") appears {} time(s).".format(bow_doc_last[i][0], dictionary[bow_doc_last[i][0]], bow_doc_last[i][1]))

Word 58 ("think") appears 1 time(s).
Word 96 ("bad") appears 1 time(s).
Word 220 ("kill") appears 1 time(s).
Word 319 ("heavy") appears 1 time(s).
Word 523 ("speed") appears 1 time(s).
Word 832 ("reload") appears 2 time(s).
Word 905 ("fine") appears 1 time(s).
Word 1446 ("wont") appears 1 time(s).
Word 1989 ("plenty") appears 1 time(s).
Word 2338 ("pve") appears 1 time(s).
Word 2428 ("pvp") appears 1 time(s).


In [9]:
# create tf*idf model

from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[]


In [10]:
# run lda model with bow

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.023*"play" + 0.016*"time" + 0.013*"game" + 0.012*"like" + 0.010*"get" + 0.008*"nice" + 0.008*"think" + 0.008*"good" + 0.008*"win" + 0.008*"man"
Topic: 1 
Words: 0.040*"http" + 0.039*"amp" + 0.039*"com" + 0.036*"thank" + 0.023*"www" + 0.019*"reddit" + 0.018*"wiki" + 0.018*"org" + 0.017*"https" + 0.012*"wikipedia"
Topic: 2 
Words: 0.036*"post" + 0.025*"comment" + 0.023*"reddit" + 0.022*"question" + 0.020*"message" + 0.019*"com" + 0.016*"http" + 0.016*"www" + 0.013*"yes" + 0.012*"lol"
Topic: 3 
Words: 0.017*"like" + 0.016*"go" + 0.016*"trade" + 0.014*"people" + 0.011*"get" + 0.010*"white" + 0.009*"think" + 0.009*"black" + 0.008*"day" + 0.008*"shit"
Topic: 4 
Words: 0.012*"time" + 0.012*"know" + 0.011*"right" + 0.011*"come" + 0.010*"like" + 0.009*"help" + 0.009*"guy" + 0.008*"actually" + 0.007*"get" + 0.007*"go"
Topic: 5 
Words: 0.051*"com" + 0.039*"http" + 0.033*"watch" + 0.022*"www" + 0.019*"https" + 0.016*"imgur" + 0.015*"youtube" + 0.009*"jpg" + 0.007*"video" + 0.007

In [11]:
# run lda model with tf*idf

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.452*"delete" + 0.023*"yes" + 0.011*"love" + 0.007*"pronounce" + 0.007*"let" + 0.007*"say" + 0.007*"wait" + 0.005*"wtf" + 0.005*"til" + 0.004*"disappoint"
Topic: 1 Word: 0.015*"trade" + 0.014*"know" + 0.012*"yeah" + 0.011*"wow" + 0.010*"want" + 0.010*"titans" + 0.010*"goodell" + 0.009*"right" + 0.008*"holy" + 0.007*"fuck"
Topic: 2 Word: 0.014*"fuck" + 0.007*"mean" + 0.006*"like" + 0.006*"think" + 0.005*"people" + 0.005*"try" + 0.004*"get" + 0.004*"know" + 0.004*"want" + 0.004*"say"
Topic: 3 Word: 0.011*"nice" + 0.010*"jag" + 0.008*"get" + 0.007*"fowler" + 0.005*"question" + 0.005*"like" + 0.005*"post" + 0.005*"good" + 0.005*"bot" + 0.004*"subreddit"
Topic: 4 Word: 0.011*"god" + 0.009*"shit" + 0.007*"happen" + 0.007*"work" + 0.007*"damn" + 0.006*"awesome" + 0.006*"great" + 0.006*"like" + 0.006*"eagle" + 0.006*"guy"
Topic: 5 Word: 0.030*"thank" + 0.015*"lol" + 0.011*"marcus" + 0.011*"marioto" + 0.011*"pick" + 0.008*"like" + 0.007*"cool" + 0.006*"people" + 0.005*"look" + 0

In [12]:
# performance evaluation

print(processed_docs[40000])

for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

['know', 'lot', 'people', 'shit', 'think', 'tnt', 'half', 'time', 'crew', 'pretty', 'funny', 'meh', 'insight', 'humor']

Score: 0.5498533844947815	 
Topic: 0.036*"like" + 0.024*"think" + 0.022*"look" + 0.021*"good" + 0.016*"try" + 0.011*"work" + 0.009*"people" + 0.009*"love" + 0.009*"get" + 0.008*"know"

Score: 0.05001950263977051	 
Topic: 0.040*"http" + 0.039*"amp" + 0.039*"com" + 0.036*"thank" + 0.023*"www" + 0.019*"reddit" + 0.018*"wiki" + 0.018*"org" + 0.017*"https" + 0.012*"wikipedia"

Score: 0.05001908540725708	 
Topic: 0.051*"com" + 0.039*"http" + 0.033*"watch" + 0.022*"www" + 0.019*"https" + 0.016*"imgur" + 0.015*"youtube" + 0.009*"jpg" + 0.007*"video" + 0.007*"nfl"

Score: 0.0500178225338459	 
Topic: 0.026*"people" + 0.014*"think" + 0.012*"say" + 0.011*"know" + 0.009*"like" + 0.007*"point" + 0.007*"things" + 0.006*"mean" + 0.006*"want" + 0.006*"get"

Score: 0.050016436725854874	 
Topic: 0.023*"play" + 0.016*"time" + 0.013*"game" + 0.012*"like" + 0.010*"get" + 0.008*"nice" + 0.