In [113]:
# Imports
import pandas as pd
import gensim
import numpy as np
import nltk
import re

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# nltk.download('punkt')

# Seed NumPy's global random number generator with a fixed value so that code drawing
# from it produces the same numbers on a fresh top-to-bottom run of the notebook.
np.random.seed(2018)


In [128]:
# Load data
datafolder = './data/'
tweets1 = pd.read_csv(datafolder + 'IRAhandle_tweets_1.csv')

# Keep only the text of tweets labelled English; these are the documents used for
# subject categorization. Rows with missing text are dropped first: pandas stores them
# as float NaN, which would make re.sub() raise a TypeError in the cleaning step below.
content = tweets1.loc[tweets1.language == 'English', 'content'].dropna()

# Compile the patterns once instead of re-parsing them for every tweet.
url_pattern = re.compile(r'http\S+')          # external links
punct_pattern = re.compile(r"['\"`:?~,.]")    # the only punctuation that gets stripped


def clean_tweet(text):
    """Normalise one tweet for tokenization.

    Steps, in order: remove links, remove the characters ' " ` : ? ~ , . (other symbols
    such as # and @ are kept), lower-case, and drop gensim stopwords.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet.
    """
    text = url_pattern.sub('', text)
    text = punct_pattern.sub('', text)
    # Lower-case before stopword removal (same order as the original pipeline);
    # presumably needed because the stopword list is lower-case -- verify for your gensim version.
    return remove_stopwords(text.lower())


# One pass over the tweets, then expose a plain NumPy array of strings to later cells.
content = content.apply(clean_tweet).values

In [129]:
# Tokenization and stemming: split every cleaned tweet into word tokens, then reduce the
# tokens longer than 3 characters to their English Snowball stem.

stemmer = SnowballStemmer("english")  # one stemmer instance shared by all tweets

# One list of tokens per tweet
content_tokens = list(map(nltk.word_tokenize, content))

# One list of stems per tweet; tokens of 3 characters or fewer are discarded before stemming
content_stemmed = [
    [stemmer.stem(token) for token in tokens if len(token) > 3]
    for tokens in content_tokens
]

In [130]:
# Build the vocabulary (token <-> integer id mapping) from the stemmed tweets
dictionary = gensim.corpora.Dictionary(content_stemmed)

# Prune the vocabulary in place. no_below=15 drops every token that occurs in fewer than
# 15 tweets (a document count, not a total number of occurrences).
# NOTE(review): the other thresholds stay at the library defaults, which gensim documents
# as no_above=0.5 and keep_n=100000 -- so very frequent tokens are pruned too; confirm
# against the installed gensim version.
dictionary.filter_extremes(no_below=15)

# Bag-of-words: each tweet becomes a list of (token_id, count) pairs over the pruned vocabulary
bow_corpus = list(map(dictionary.doc2bow, content_stemmed))

In [131]:
# Sanity check: print the vocabulary entries and counts recorded for one sample tweet
test = bow_corpus[126]
for word_id, count in test:
    print("Word {} (\"{}\") appears {} time.".format(word_id, dictionary[word_id], count))

Word 294 ("left") appears 1 time.
Word 449 ("absolut") appears 1 time.
Word 537 ("irma") appears 1 time.
Word 580 ("boat") appears 1 time.
Word 581 ("car") appears 1 time.
Word 582 ("larg") appears 1 time.
Word 583 ("mess") appears 1 time.
Word 584 ("road") appears 1 time.


In [132]:
# Create model on bag-of-words
#
# Train a 10-topic LDA model with 2 passes over the corpus, using 2 worker processes.
# random_state pins the model's own random generator to the same seed value the notebook
# gives NumPy (2018). Without it the model draws from the global NumPy stream, so
# re-running this cell on its own continues from wherever that stream was left and
# yields different topics each time.
# NOTE(review): multi-worker training may still not be bit-for-bit repeatable; confirm
# if exact reproducibility matters.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=2018,
)

In [133]:
# Show every topic (-1 = no limit) as its id followed by its weighted top-word string
topic_template = 'Topic: {} \nWords: {}'
for topic_id, topic_terms in lda_model.print_topics(-1):
    print(topic_template.format(topic_id, topic_terms))

Topic: 0 
Words: 0.021*"state" + 0.015*"hillari" + 0.013*"clinton" + 0.012*"investig" + 0.010*"head" + 0.010*"justic" + 0.008*"meet" + 0.007*"muslim" + 0.007*"obama" + 0.007*"speak"
Topic: 1 
Words: 0.044*"polit" + 0.015*"trump" + 0.015*"rt_america" + 0.011*"parti" + 0.008*"histori" + 0.008*"berni" + 0.008*"democrat" + 0.008*"blackhistorymonth" + 0.007*"celebr" + 0.007*"exercis"
Topic: 2 
Words: 0.031*"financ" + 0.027*"fit" + 0.013*"orlean" + 0.009*"peac" + 0.009*"break" + 0.009*"maryland" + 0.008*"power" + 0.007*"profile】" + 0.007*"【see" + 0.007*"rice"
Topic: 3 
Words: 0.079*"stock" + 0.048*"weight" + 0.042*"lose" + 0.036*"white" + 0.012*"hous" + 0.009*"loss" + 0.008*"market" + 0.007*"flint" + 0.006*"water" + 0.006*"inspir"
Topic: 4 
Words: 0.016*"citi" + 0.014*"love" + 0.013*"feel" + 0.011*"come" + 0.011*"good" + 0.010*"join" + 0.008*"like" + 0.008*"happi" + 0.008*"greec" + 0.008*"univers"
Topic: 5 
Words: 0.063*"black" + 0.044*"trump" + 0.016*"support" + 0.015*"presid" + 0.015*"raci