In [255]:
# Imports
import pandas as pd
import gensim
import numpy as np
import nltk
import re

from helpers  import *

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

#nltk.download('wordnet')

np.random.seed(2018) # set random seed


In [256]:
import spacy
spacy.load('en_core_web_sm')
from spacy.lang.en import English
parser = English()

#import parser able to cut sentences into english words

def tokenize(text):
    '''
    cuts the sentences into words, performs also specific operations on URLs and @, plus lowercasing
    '''
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
#             lda_tokens.append('SCREEN_NAME')
            #PROBABLY BETTER THIS ONE BUT LDA CRASHES DUE TO THE HUGE NUMBER OF WORDS WITH 1 OCCURRENCE (NEED TO AGGREGATE)
            lda_tokens.append(token.string[1:].lower())
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [257]:
token = parser('La CACca è blu, @ciao')
for tok in token:
    print(tok.string[1:].lower())
    
tokenize('La CACca è blu, @ciao')

a 
acca 
 
lu
 
ciao


['la', 'cacca', 'è', 'blu', ',', 'ciao']

In [258]:
# Load data
tweets1 = pd.read_csv(access_folder('data') + 'IRAhandle_tweets_1.csv')

# selecting content columns for subject categorization (all should be in english)
content = tweets1[tweets1.language == 'English'].content

content[1]

'Marshawn Lynch arrives to game in anti-Trump shirt. Judging by his sagging pants the shirt should say Lynch vs. belt https://t.co/mLH1i30LZZ'

In [259]:
content[1:90]

1     Marshawn Lynch arrives to game in anti-Trump s...
2     Daughter of fallen Navy Sailor delivers powerf...
3     JUST IN: President Trump dedicates Presidents ...
4     19,000 RESPECTING our National Anthem! #StandF...
5     Dan Bongino: "Nobody trolls liberals better th...
6                           🐝🐝🐝 https://t.co/MorL3AQW0z
7     '@SenatorMenendez @CarmenYulinCruz Doesn't mat...
8     As much as I hate promoting CNN article, here ...
9     After the 'genocide' remark from San Juan Mayo...
10    After the 'genocide' remark from San Juan Mayo...
11    '@thehill Why won't she apologize to us for ly...
12    Sarah Sanders destroys NBC reporter: "Trump ma...
13    Hi @MichelleObama, remember when you said Wein...
14    Hi @MichelleObama, remember when you praised H...
15    Wow! Even CNN is slamming the Obamas for silen...
16    First lady Melania Trump visits infant opioid ...
17    BREAKING: The audio of sexual predator Harvey ...
18                   '@Breaking911 Build that wa

In [260]:
import nltk
nltk.download('wordnet')

from nltk.corpus import wordnet as wn

def get_lemma(word):
    '''
    To be understood better, it should transform words into their roots
    '''
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer

def get_lemma2(word):
    '''
    To be understood better, it should transform words into their roots. DIFFERENT VERSION
    '''
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utente\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [261]:
nltk.download('stopwords')

#define set of stopwords to be ignored 

en_stop = set(nltk.corpus.stopwords.words('english'))
print(en_stop)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utente\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
{'your', 'is', 'from', 'she', 'few', 't', 'yours', 'what', 'that', 'him', 'mightn', "isn't", 'up', 'needn', 'weren', 'but', 'not', 'out', 'in', 'very', 'during', 'whom', 'after', 'between', 'herself', 'having', 'you', "won't", 'nor', 'don', "hasn't", "wouldn't", 'won', 'didn', 'the', 'of', 'wouldn', "mightn't", 'while', 'do', 'ain', 'm', "you'd", 'our', 'which', 'll', "don't", "mustn't", 'these', 'shouldn', 'was', 'be', 'because', 'will', 'myself', 'only', 'them', 'with', 'those', 'off', 'for', 'over', 'yourself', 'a', 'd', 'each', 'have', "couldn't", 'any', 'more', 'being', "you've", 'me', 'did', 'their', 'we', 'can', 'her', 'y', "hadn't", 'too', 'so', 'when', 'it', 's', "shan't", 'yourselves', 'aren', 'how', 'here', 'all', 'has', 'by', "aren't", 'and', 'who', 'where', 'ma', 'own', "you're", 'below', 'on', 'hasn', 'had', 'hadn

In [262]:
def prepare_text_for_lda(text, filter_size = 3):
    
    '''
    Collection of all function described above + filtering for short words (default > 3 letters)
    '''
    
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > filter_size]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [263]:
#apply transformation to every tweets to fit the lda datatype

import random
text_data = []
for line in content:
    tokens = prepare_text_for_lda(line)
#     if random.random() > .9999:
#         print(tokens)
    text_data.append(tokens)

In [265]:
# drop the empty lines
text_data = [x for x in text_data if x != []]


[['marshawn',
  'lynch',
  'arrive',
  'game',
  'anti',
  'trump',
  'shirt',
  'judging',
  'sag',
  'pants',
  'shirt',
  'lynch',
  'belt'],
 ['daughter',
  'fall',
  'navy',
  'sailor',
  'deliver',
  'powerful',
  'monologue',
  'anthem',
  'protest',
  'burns',
  'packer',
  'gear',
  'boycottnfl'],
 ['president',
  'trump',
  'dedicate',
  'president',
  'golf',
  'tournament',
  'trophy',
  'people',
  'florida',
  'texas',
  'puerto',
  'rico'],
 ['19,000', 'respect', 'national', 'anthem', 'standforouranthem'],
 ['bongino',
  'nobody',
  'troll',
  'liberal',
  'better',
  'donald',
  'trump',
  'exactly'],
 ['senatormenendez ',
  'carmenyulincruz ',
  'matter',
  'report',
  'crime',
  'change',
  'fact',
  'going'],
 ['much',
  'hate',
  'promote',
  'article',
  'admit',
  'everything',
  'trump',
  'say',
  'relief',
  'days'],
 ['genocide',
  'remark',
  'juan',
  'mayor',
  'narrative',
  'change',
  'though',
  'cnn ',
  'fix',
  'reporting',
  'constantly'],
 ['genoci

In [266]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

#Build corpus of frequency occurrencies

corpus = [dictionary.doc2bow(text) for text in text_data]

In [268]:
np.version.version


'1.11.3'

In [269]:
'''
Train the model: 
-NUM_TOPICS: must be tuned with human judgement;

-alpha     : tunes the number of topics that could generate one word,
           the lower alpha the easier it is to get a word belonging only to few topics
'''

NUM_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=2)

In [270]:
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.047*"life" + 0.032*"first" + 0.025*"fitness" + 0.024*"matter"')
(1, '0.098*"police" + 0.048*"cop" + 0.035*"kill" + 0.034*"policebrutality"')
(2, '0.034*"never" + 0.031*"come" + 0.019*"someone" + 0.018*"always"')
(3, '0.026*"shit" + 0.021*"money" + 0.019*"stock" + 0.018*"must"')
(4, '0.029*"justice" + 0.020*"trump" + 0.020*"medium" + 0.019*"well"')
(5, '0.061*"weight" + 0.026*"become" + 0.024*"blackskinisnotacrime" + 0.023*"fire"')
(6, '0.033*"right" + 0.030*"blacktolive" + 0.024*"play" + 0.020*"music"')
(7, '0.036*"trump" + 0.027*"best" + 0.024*"support" + 0.024*"obama"')
(8, '0.151*"black" + 0.060*"people" + 0.047*"white" + 0.046*"blacklivesmatter"')
(9, '0.069*"like" + 0.042*"know" + 0.037*"love" + 0.031*"would"')
(10, '0.035*"start" + 0.028*"finance" + 0.023*"news" + 0.019*"real"')
(11, '0.034*"playing" + 0.028*"check" + 0.026*"-&gt" + 0.017*"happy"')
(12, '0.040*"school" + 0.036*"call" + 0.020*"house" + 0.019*"hear"')
(13, '0.069*"lose" + 0.020*"breaking" + 0.017*"teen" + 0.

In [271]:
for idx, topic in ldamodel.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.047*"life" + 0.032*"first" + 0.025*"fitness" + 0.024*"matter" + 0.021*"live" + 0.019*"time" + 0.017*"week" + 0.015*"kid" + 0.015*"die" + 0.014*"fact"
Topic: 1 
Words: 0.098*"police" + 0.048*"cop" + 0.035*"kill" + 0.034*"policebrutality" + 0.031*"officer" + 0.025*"shot" + 0.024*"blacklivesmatter" + 0.022*"arrest" + 0.018*"shooting" + 0.018*"death"
Topic: 2 
Words: 0.034*"never" + 0.031*"come" + 0.019*"someone" + 0.018*"always" + 0.014*"hope" + 0.013*"see" + 0.012*"season" + 0.011*"star" + 0.011*"back" + 0.010*"make"
Topic: 3 
Words: 0.026*"shit" + 0.021*"money" + 0.019*"stock" + 0.018*"must" + 0.017*"fucking" + 0.016*"bill" + 0.015*"hillary" + 0.013*"business" + 0.013*"mixtape" + 0.012*"clinton"
Topic: 4 
Words: 0.029*"justice" + 0.020*"trump" + 0.020*"medium" + 0.019*"well" + 0.018*"forget" + 0.016*"go" + 0.016*"nothing" + 0.013*"never" + 0.013*"remember" + 0.012*"beat"
Topic: 5 
Words: 0.061*"weight" + 0.026*"become" + 0.024*"blackskinisnotacrime" + 0.023*"fire" + 0

In [272]:
import pyLDAvis.gensim

lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)

pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
