In [3]:
# Imports
import pandas as pd
import gensim
import numpy as np
import nltk
import re

from helpers  import *

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
#nltk.download('punkt')

np.random.seed(2018) # seed NumPy's global random number generator with a fixed value


In [4]:
# Load data
tweets1 = pd.read_csv(access_folder('data') + 'IRAhandle_tweets_1.csv')

# Select the text of tweets labelled as English for subject categorization.
# dropna() skips rows whose text is missing: pandas represents those as float NaN,
# and the re.sub / str.lower calls below would raise a TypeError on them.
content = tweets1.loc[tweets1.language == 'English', 'content'].dropna()

# Clean each tweet, in this order:
#   1. strip links (before punctuation removal, which would otherwise break URLs apart)
#   2. delete a fixed set of punctuation marks: ' " ` : ? ~ , .
#      (other symbols such as @, #, - and typographic quotes are left in place)
#   3. lower-case the text, then remove stop words with gensim's remove_stopwords
content = content.apply(lambda x: re.sub(r'http\S+', '', x)) \
                 .apply(lambda x: re.sub(r"'|\"|`|:|\?|~|,|\.", '', x)) \
                 .apply(lambda x: remove_stopwords(x.lower()))

# Later cells iterate over a plain array of cleaned strings
content = content.values

content

array(['sitting democrat senator trial corruption youve barely heard peep mainstream media @nedryun',
       'marshawn lynch arrives game anti-trump shirt judging sagging pants shirt lynch vs belt',
       'daughter fallen navy sailor delivers powerful monologue anthem protests burns nfl packers gear #boycottnfl',
       ...,
       '“this world canvas imagination” – henry david thoreau american essayist poet philosopher',
       'house republicans release dismantle affordable care act likely resulting uninsured',
       'retweeted citizen tv kenya (@citizentvkenya) koome court won’t sit week ask president'],
      dtype=object)

In [None]:
# Tokenization and stemming: split every cleaned tweet into word tokens with NLTK,
# then reduce the tokens to their stems.

stemmer = SnowballStemmer("english")  # Snowball stemmer configured for English

# One list of tokens per tweet
content_tokens = list(map(nltk.word_tokenize, content))

# Stem each token; tokens of 3 characters or fewer are dropped before stemming
content_stemmed = [
    [stemmer.stem(token) for token in tokens if len(token) > 3]
    for tokens in content_tokens
]

In [None]:
# Build the token <-> id vocabulary from the stemmed tweets
dictionary = gensim.corpora.Dictionary(content_stemmed)

# Prune the vocabulary: no_below=15 drops tokens found in fewer than 15 documents
# (a document count, not a total occurrence count). filter_extremes' other
# parameters keep their default values.
dictionary.filter_extremes(no_below=15)

# Bag-of-words: each tweet becomes a list of (token_id, count) pairs
bow_corpus = list(map(dictionary.doc2bow, content_stemmed))

In [None]:
# Sanity check: print the bag-of-words entries of one sample document
test = bow_corpus[126]
for token_id, count in test:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], count))

In [None]:
# Train a 10-topic LDA model on the bag-of-words corpus (2 passes, 2 worker processes).
# random_state gives the model its own fixed seed (same value as the np.random.seed
# call in the imports cell), so re-running this cell on its own does not depend on how
# much of the global NumPy random stream earlier cells have already consumed.
# NOTE(review): multi-worker training may still vary slightly between runs - confirm
# if bit-for-bit reproducibility matters.

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2,
                                       random_state=2018)

In [None]:
# Print every topic of the trained model (-1 requests all topics) with its top words
topic_template = 'Topic: {} \nWords: {}'
for topic_id, top_words in lda_model.print_topics(num_topics=-1):
    print(topic_template.format(topic_id, top_words))