# Análisis de Tópicos

In [3]:
# spaCy setup: `parser` is the object tokenize() calls on raw text.
import spacy
# NOTE(review): the return value of this call is not assigned, so the loaded
# 'en' pipeline is discarded -- confirm whether the call is still needed.
spacy.load('en')
from spacy.lang.en import English
# English language object used as the tokenizer in tokenize().
parser = English()
def tokenize(text):
    """Split *text* into normalized tokens for LDA.

    Whitespace-only tokens are dropped. URL-like tokens are replaced by the
    placeholder ``'URL'``, tokens starting with ``@`` by ``'SCREEN_NAME'``,
    and every other token is lowercased.

    Returns:
        A list of token strings in document order.
    """
    def normalize(token):
        # The URL check takes precedence over the '@' mention check.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [
        normalize(token)
        for token in parser(text)
        if not token.orth_.isspace()
    ]

In [4]:
import nltk
# Fetch the 'wordnet' NLTK data package before importing the corpus reader.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return ``wn.morphy(word)``, falling back to *word* when it is ``None``."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize *word* with a newly created ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\carlo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [5]:
# Fetch the 'stopwords' NLTK data package, then keep the English list as a set
# so the membership tests in prepare_text_for_lda() are constant-time.
nltk.download('stopwords')
english_stopwords = nltk.corpus.stopwords.words('english')
en_stop = set(english_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\carlo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [36]:
def prepare_text_for_lda(text):
    """Tokenize *text* and return the cleaned tokens used as one LDA document.

    Tokens of three characters or fewer and tokens found in ``en_stop`` are
    discarded; the remaining tokens are passed through ``get_lemma``.
    Filtering happens on the raw token, before lemmatization.
    """
    return [
        get_lemma(raw_token)
        for raw_token in tokenize(text)
        if len(raw_token) > 3 and raw_token not in en_stop
    ]

In [54]:
import random
import pandas as pd

# Load the dataset; the file is decoded as ISO-8859-1.
csv1 = pd.read_csv('twenty_twentyfive.csv', encoding='iso-8859-1')
# Group by the column name itself rather than a one-element list: with a list
# grouper, pandas >= 2.0 yields 1-tuples such as ('Thread 5',) as keys when the
# groupby is iterated, which would make the "Thread 5" lookup below fail.
csv1_grouped_by_thread = csv1.groupby('thread_number')

# Map each thread label to the list of values in its 'text' column.
texts1 = {}

for thread, data in csv1_grouped_by_thread:
    texts1[thread] = list(data['text'])

# Only a single thread is modelled here. To use every thread instead,
# flatten all the lists in texts1.values() into one list of documents.
documentos1 = texts1["Thread 5"]



# Tokenized documents that feed the gensim dictionary and corpus.
text_data = []

for line in documentos1:
    tokens = prepare_text_for_lda(line)
    # Random sub-sampling: a document is skipped when the draw is <= 0.09.
    # No seed is set in this cell, so the selection varies between runs.
    if random.random() <= .09:
        continue
    print(tokens)
    text_data.append(tokens)

['yale', 'history', 'prof', 'SCREEN_NAME', 'outline', 'lesson', '20th', 'century', 'prevent', 'tyranny', 'overcome']
['obey', 'advance', 'acquiesce', 'conform', 'regime', 'offering', 'comply', 'compliance']
['beware', 'party', 'state', 'fascism', 'tyranny', 'prevail', 'party', 'exist', 'defend', 'everyone', 'right']
['take', 'responsibility', 'face', 'world', 'ensure', 'symbol', 'bigotry', 'hate', 'remove', 'america', "we'v"]
['remember', 'professional', 'ethics', 'concentration', 'camp', 'director', 'seek', 'businessmen', 'interest', 'cheap', 'labor', 'professio']
['beware', 'paramilitary', 'especially', 'militia', 'march', 'weapon', 'torch', 'support', 'emerge', 'tyranni']
['reflective', 'must', 'arm', 'public', 'servant', 'carry', 'weapon', 'first', 'thank']
['kind', 'language', 'think', 'speaking', 'disconnect', 'internet', 'television', 'time']
['believe', 'truth', 'nothing', 'true', 'criticize', 'power', 'trump', 'admin', 'right', 'wing', 'medium']
['investigate', 'spend', 'time'

In [38]:
text_data

[['obey',
  'advance',
  'acquiesce',
  'conform',
  'regime',
  'offering',
  'comply',
  'compliance'],
 ['defend',
  'institution',
  'trump',
  'aggressively',
  'seek',
  'undermine',
  'independent',
  'institution',
  'oppose'],
 ['beware',
  'party',
  'state',
  'fascism',
  'tyranny',
  'prevail',
  'party',
  'exist',
  'defend',
  'everyone',
  'right'],
 ['take',
  'responsibility',
  'face',
  'world',
  'ensure',
  'symbol',
  'bigotry',
  'hate',
  'remove',
  'america',
  "we'v"],
 ['beware',
  'paramilitary',
  'especially',
  'militia',
  'march',
  'weapon',
  'torch',
  'support',
  'emerge',
  'tyranni'],
 ['stand',
  'something',
  'follow',
  'along',
  'freedom',
  'erode',
  'look',
  'example',
  'ghandi'],
 ['kind',
  'language',
  'think',
  'speaking',
  'disconnect',
  'internet',
  'television',
  'time'],
 ['believe',
  'truth',
  'nothing',
  'true',
  'criticize',
  'power',
  'trump',
  'admin',
  'right',
  'wing',
  'medium'],
 ['investigate',
  's

In [55]:
import pickle

from gensim import corpora

# Build the gensim dictionary from the tokenized documents, then convert each
# document to its bag-of-words form with doc2bow.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artifacts so they can be reloaded later. The context manager
# guarantees the pickle file is flushed and closed instead of leaving that to
# garbage collection.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [71]:
import gensim
NUM_TOPICS = 5

# Fit an LDA model on the bag-of-words corpus, making 15 passes over it.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
)
# Saved under a fixed file name.
ldamodel.save('model5.gensim')

# Print each topic with its five highest-weighted words.
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.017*"tyranny" + 0.017*"america" + 0.017*"SCREEN_NAME" + 0.017*"take" + 0.017*"responsibility"')
(1, '0.023*"right" + 0.023*"party" + 0.013*"labor" + 0.013*"remember" + 0.013*"seek"')
(2, '0.058*"good" + 0.030*"country" + 0.017*"make" + 0.017*"friend" + 0.017*"time"')
(3, '0.023*"take" + 0.023*"regime" + 0.023*"autonomy" + 0.023*"disaster" + 0.023*"calm"')
(4, '0.029*"weapon" + 0.029*"march" + 0.029*"beware" + 0.029*"support" + 0.029*"tyranni"')


In [67]:
import pyLDAvis.gensim

# Reload the dictionary, corpus and model that were saved to disk.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# NOTE: unpickling executes arbitrary code; only load a corpus.pkl that was
# produced by a trusted source. The context manager closes the file handle
# deterministically.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

# sort_topics=False is passed through to pyLDAvis unchanged.
# NOTE(review): presumably this keeps the visualization's topic order aligned
# with the model's topic ids -- confirm against the pyLDAvis documentation.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)