### Importar librerías

In [1]:
import pandas as pd 

In [2]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andres\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.stem import PorterStemmer 

def lemmatize_stemming(text):
    ps = PorterStemmer()
    return ps.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text): #  gensim.utils.simple_preprocess tokeniza el texto
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [4]:
data=pd.read_csv("datos/All_posts_comments.csv")
data_text = data[['comment']]
data_text['index'] = data_text.index
documents = data_text

In [5]:
print(len(documents))
print(documents[:5])

113310
                                             comment  index
0  Hi u/radome9! Welcome to r/RussiaUkraineWar202...      0
1  How fucked up is it that the promise of NOT be...      1
2  * Shooting your commanding officer? 8 years.\n...      2
3                                           Glorious      3
4  Hi u/MrAutoFem! Welcome to r/RussiaUkraineWar2...      4


^C


In [6]:
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('documento original: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n documento tokenizado y lematizado: ')
print(preprocess(doc_sample))

documento original: 
['No', 'its', 'not', 'likely', 'and', 'I', "won't", 'happen', 'the', 'next', '10years.']


 documento tokenizado y lematizado: 
['like', 'happen', 'year']


### Preprocesamiento de textos

In [8]:
processed_docs = documents['comment'].map(preprocess)
processed_docs[:10]

0    [radom, welcom, heavili, moder, subreddit, not...
1    [fuck, promis, exchang, incent, russian, want,...
2    [shoot, command, offic, year, voluntarili, sur...
3                                            [gloriou]
4    [mrautofem, welcom, heavili, moder, subreddit,...
5                                       [wrong, photo]
6                [look, modern, soviet, helicopt, see]
7                                     [portug, ukrain]
8                                  [know, spare, part]
9    [photo, right, type, http, wikipedia, wiki, ka...
Name: comment, dtype: object

### Construcción del diccionario

In [9]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 access
1 account
2 action
3 automat
4 ban
5 behaviour
6 channel
7 combat
8 comment
9 compos
10 concern


In [11]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

### Gensim doc2bow

In [15]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(54, 1), (143, 1), (664, 1)]

In [13]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 54 ("year") appears 1 time.
Word 143 ("like") appears 1 time.
Word 664 ("happen") appears 1 time.


### TF-IDF

In [16]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.13667569244376987),
 (1, 0.1353607084629341),
 (2, 0.12661576773189193),
 (3, 0.26212640200876935),
 (4, 0.19907863714156343),
 (5, 0.1373504870852835),
 (6, 0.13872417094206665),
 (7, 0.13231118776071307),
 (8, 0.2637110699771633),
 (9, 0.13141564432407454),
 (10, 0.13000481571621791),
 (11, 0.1305303576717125),
 (12, 0.13776605237020875),
 (13, 0.12718974394640373),
 (14, 0.2046982467371575),
 (15, 0.13448211971982818),
 (16, 0.1357085390458957),
 (17, 0.15059003858407272),
 (18, 0.12978462073936226),
 (19, 0.1376066276389906),
 (20, 0.2626025275097364),
 (21, 0.1955393397330564),
 (22, 0.1302767942989027),
 (23, 0.12230102133027823),
 (24, 0.12585157643826309),
 (25, 0.12475935671256531),
 (26, 0.1529077162326946),
 (27, 0.14539569905177344),
 (28, 0.15857758768027683),
 (29, 0.1377292031861966),
 (30, 0.26157121710885556),
 (31, 0.13231118776071307),
 (32, 0.14736457299787353),
 (33, 0.13500456705767683),
 (34, 0.14312523291623183),
 (35, 0.08713967839965166),
 (36, 0.133755

### Corriendo LDA usando la bolsa de palabras

In [17]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [18]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.067*"russian" + 0.016*"ukrainian" + 0.013*"nice" + 0.013*"germani" + 0.012*"nazi" + 0.012*"know" + 0.011*"german" + 0.009*"word" + 0.008*"believ" + 0.008*"live"
Topic: 1 
Words: 0.064*"http" + 0.054*"thank" + 0.048*"follow" + 0.044*"remind" + 0.040*"appear" + 0.038*"subreddit" + 0.038*"natur" + 0.038*"mobil" + 0.031*"contribut" + 0.031*"ensur"
Topic: 2 
Words: 0.026*"like" + 0.024*"shit" + 0.022*"good" + 0.020*"look" + 0.018*"video" + 0.015*"link" + 0.015*"russian" + 0.014*"dead" + 0.014*"guy" + 0.013*"menu"
Topic: 3 
Words: 0.022*"need" + 0.015*"slava" + 0.015*"russian" + 0.012*"ukraini" + 0.011*"great" + 0.011*"drone" + 0.011*"bridg" + 0.010*"bomb" + 0.008*"drop" + 0.008*"like"
Topic: 4 
Words: 0.033*"fuck" + 0.026*"orc" + 0.023*"hell" + 0.015*"bitch" + 0.014*"giphi" + 0.014*"dude" + 0.012*"sourc" + 0.012*"http" + 0.010*"bastard" + 0.009*"deserv"
Topic: 5 
Words: 0.055*"like" + 0.036*"peopl" + 0.033*"look" + 0.027*"russia" + 0.024*"putin" + 0.021*"russian" + 0.012*

### Corriendo  LDA usando TF-IDF

In [19]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.021*"good" + 0.011*"savevideo" + 0.010*"russian" + 0.008*"russia" + 0.007*"kill" + 0.007*"like" + 0.006*"peopl" + 0.006*"respect" + 0.005*"putin" + 0.005*"right"
Topic: 1 Word: 0.035*"hero" + 0.017*"rest" + 0.013*"peac" + 0.010*"brave" + 0.010*"putin" + 0.010*"stupid" + 0.008*"savevideobot" + 0.008*"base" + 0.008*"warrior" + 0.007*"true"
Topic: 2 Word: 0.027*"fuck" + 0.017*"russia" + 0.015*"russian" + 0.011*"love" + 0.008*"germani" + 0.008*"sourc" + 0.007*"ukrainian" + 0.007*"like" + 0.006*"bastard" + 0.006*"guy"
Topic: 3 Word: 0.010*"translat" + 0.008*"russian" + 0.008*"like" + 0.007*"look" + 0.007*"russia" + 0.007*"great" + 0.007*"lmao" + 0.007*"say" + 0.007*"fuck" + 0.006*"poor"
Topic: 4 Word: 0.021*"shit" + 0.019*"bitch" + 0.015*"song" + 0.014*"word" + 0.013*"piec" + 0.012*"real" + 0.012*"brother" + 0.011*"cool" + 0.011*"deport" + 0.009*"go"
Topic: 5 Word: 0.022*"delet" + 0.013*"nice" + 0.010*"hell" + 0.009*"russian" + 0.007*"know" + 0.007*"come" + 0.007*"orc" + 0.

### Evaluación del desempeño clasificando el documento de muestra usando el modelo LDA de bolsa de palabras

In [20]:
processed_docs[4310]

['like', 'happen', 'year']

In [21]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.7749541997909546	 
Topic: 0.055*"like" + 0.036*"peopl" + 0.033*"look" + 0.027*"russia" + 0.024*"putin" + 0.021*"russian" + 0.012*"countri" + 0.012*"think" + 0.011*"want" + 0.010*"kill"

Score: 0.025009844452142715	 
Topic: 0.026*"like" + 0.024*"shit" + 0.022*"good" + 0.020*"look" + 0.018*"video" + 0.015*"link" + 0.015*"russian" + 0.014*"dead" + 0.014*"guy" + 0.013*"menu"

Score: 0.025009023025631905	 
Topic: 0.017*"russia" + 0.015*"ukrain" + 0.013*"say" + 0.012*"know" + 0.011*"sure" + 0.009*"think" + 0.008*"mean" + 0.007*"rapid" + 0.007*"thing" + 0.007*"need"

Score: 0.0250083040446043	 
Topic: 0.038*"ukrain" + 0.033*"russia" + 0.018*"russian" + 0.014*"militari" + 0.011*"go" + 0.011*"weapon" + 0.009*"armi" + 0.008*"forc" + 0.008*"fight" + 0.007*"nato"

Score: 0.02500782534480095	 
Topic: 0.067*"russian" + 0.016*"ukrainian" + 0.013*"nice" + 0.013*"germani" + 0.012*"nazi" + 0.012*"know" + 0.011*"german" + 0.009*"word" + 0.008*"believ" + 0.008*"live"

Score: 0.02500583417713642	

### Evaluación del desempeño clasificando el documento de muestra usando el modelo LDA TF-IDF.

In [22]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.7749181985855103	 
Topic: 0.022*"delet" + 0.013*"nice" + 0.010*"hell" + 0.009*"russian" + 0.007*"know" + 0.007*"come" + 0.007*"orc" + 0.007*"happi" + 0.006*"russia" + 0.006*"like"

Score: 0.02501380629837513	 
Topic: 0.021*"slava" + 0.016*"like" + 0.015*"look" + 0.014*"ukraini" + 0.009*"russian" + 0.008*"ukrain" + 0.008*"send" + 0.006*"russia" + 0.006*"shoot" + 0.006*"sound"

Score: 0.02501256950199604	 
Topic: 0.017*"giphi" + 0.012*"dead" + 0.010*"hope" + 0.008*"stay" + 0.008*"russian" + 0.008*"damn" + 0.007*"ukrain" + 0.007*"bless" + 0.007*"burn" + 0.007*"awesom"

Score: 0.025010382756590843	 
Topic: 0.010*"nazi" + 0.010*"think" + 0.009*"russian" + 0.008*"fuck" + 0.008*"russia" + 0.008*"special" + 0.008*"like" + 0.007*"look" + 0.007*"oper" + 0.006*"yeah"

Score: 0.025010032579302788	 
Topic: 0.021*"good" + 0.011*"savevideo" + 0.010*"russian" + 0.008*"russia" + 0.007*"kill" + 0.007*"like" + 0.006*"peopl" + 0.006*"respect" + 0.005*"putin" + 0.005*"right"

Score: 0.02500950358

## Prueba del modelo con un documento no visto antes.

In [23]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.35029685497283936	 Topic: 0.067*"russian" + 0.016*"ukrainian" + 0.013*"nice" + 0.013*"germani" + 0.012*"nazi"
Score: 0.29998016357421875	 Topic: 0.055*"like" + 0.036*"peopl" + 0.033*"look" + 0.027*"russia" + 0.024*"putin"
Score: 0.2096201628446579	 Topic: 0.033*"fuck" + 0.026*"orc" + 0.023*"hell" + 0.015*"bitch" + 0.014*"giphi"
Score: 0.020021185278892517	 Topic: 0.038*"ukrain" + 0.033*"russia" + 0.018*"russian" + 0.014*"militari" + 0.011*"go"
Score: 0.020016038790345192	 Topic: 0.050*"fuck" + 0.022*"russia" + 0.019*"hero" + 0.018*"love" + 0.018*"ukrain"
Score: 0.020014991983771324	 Topic: 0.022*"need" + 0.015*"slava" + 0.015*"russian" + 0.012*"ukraini" + 0.011*"great"
Score: 0.020012659952044487	 Topic: 0.064*"http" + 0.054*"thank" + 0.048*"follow" + 0.044*"remind" + 0.040*"appear"
Score: 0.020012659952044487	 Topic: 0.026*"like" + 0.024*"shit" + 0.022*"good" + 0.020*"look" + 0.018*"video"
Score: 0.020012659952044487	 Topic: 0.047*"section" + 0.044*"automat" + 0.044*"subreddi