In [18]:
# Imports
import pandas as pd
import gensim
import numpy as np
import nltk
import re

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
nltk.download('punkt')

np.random.seed(2018) # set random seed


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/halimaschede/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [92]:
# Load data
datafolder = './data/'
tweets1 = pd.read_csv(datafolder + 'IRAhandle_tweets_1.csv')

# selecting content column for subject categorization
content = tweets1.content

# Remove external links to unknown websites and get rid of punctuation and lower case everything
content = content.apply(lambda x: re.sub(r'http\S+', '', x)).apply(lambda x: re.sub(r"'|\"|`|:|\?|~|,|\.", '', x))\
                .apply(lambda x: x.lower())

content = content.values

In [95]:
# Tokenization: Split text into words. Lowercase the words and remove punctuation

stemmer = SnowballStemmer("english") # initiate dictionary type
# create words from sentences
content_tokens = [nltk.word_tokenize(x) for x in content]
# Stem words (truncate)
content_stemmed = []
for tweet in content_tokens:
    stem = [stemmer.stem(x) for x in tweet if len(x) > 3] # only consider words more than 3 letters
    content_stemmed.append(stem)

In [100]:
# Create dictionary to know the frequency of words
dictionary = gensim.corpora.Dictionary(content_stemmed)
dictionary.filter_extremes(no_below=15) # remove words that have appearances less than 15 times

# Create bag-of-words
bow_corpus = [dictionary.doc2bow(doc) for doc in content_stemmed]

In [107]:
# Check shit is working
test = bow_corpus[126]
for i in range(len(test)):
    print("Word {} (\"{}\") appears {} time.".format(test[i][0],dictionary[test[i][0]], 
test[i][1]))

Word 192 ("what") appears 1 time.
Word 352 ("left") appears 1 time.
Word 466 ("behind") appears 1 time.
Word 519 ("absolut") appears 1 time.
Word 614 ("irma") appears 1 time.
Word 658 ("boat") appears 1 time.
Word 659 ("car") appears 1 time.
Word 660 ("larg") appears 1 time.
Word 661 ("mess") appears 1 time.
Word 662 ("road") appears 1 time.


In [110]:
# Create model on bag-of-words

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [111]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.031*"this" + 0.029*"that" + 0.018*"your" + 0.017*"have" + 0.016*"they" + 0.016*"peopl" + 0.013*"what" + 0.012*"like" + 0.010*"will" + 0.010*"when"
Topic: 1 
Words: 0.026*"workout" + 0.020*"maga" + 0.015*"trump" + 0.015*"million" + 0.012*"water" + 0.010*"with" + 0.009*"check" + 0.009*"over" + 0.008*"exercis" + 0.008*"flint"
Topic: 2 
Words: 0.080*"weight" + 0.017*"merkel" + 0.010*"north" + 0.010*"bank" + 0.009*"father" + 0.008*"rice" + 0.007*"minist" + 0.007*"threat" + 0.006*"sport" + 0.006*"korea"
Topic: 3 
Words: 0.065*"stock" + 0.020*"financ" + 0.019*"polit" + 0.014*"with" + 0.010*"more" + 0.010*"news" + 0.009*"trump" + 0.009*"profile】" + 0.009*"【see" + 0.009*"market"
Topic: 4 
Words: 0.032*"fit" + 0.032*"school" + 0.013*"high" + 0.009*"john" + 0.008*"blackhistori" + 0.007*"best" + 0.007*"meet" + 0.007*"star" + 0.007*"with" + 0.006*"honor"
Topic: 5 
Words: 0.042*"black" + 0.036*"polic" + 0.021*"white" + 0.017*"kill" + 0.016*"blacklivesmatt" + 0.015*"cop" + 0.013*"a