# Detección de tópicos por tweet en threads

## Importacion de librerias y definicion de funciones

Se importan las librerias necesarias y se definen las funciones para tokenizar, lemmatizar y para preparar el texto para el LDA.

En la tokenizacion se eliminan los hashtags y los usuarios citados

In [1]:
import os

In [2]:
import spacy
spacy.load('en')

from spacy.lang.en import English
parser = English()

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            continue
        elif token.orth_.startswith('#'):
            continue
        elif token.orth_.startswith('@'):
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

import nltk

nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 3]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\carlo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\carlo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Importacion de archivos

Se cargan los archivos csv y se agrupan los tweets por threads, para luego crear un diccionario de tweets por cada thread (thread 1 : tweet1, tweet2...)

In [3]:
import random
import pandas as pd


In [4]:

csv1 = pd.read_csv('five_ten.csv', encoding='iso-8859-1')
csv1_grouped_by_thread = csv1.groupby(['thread_number'])
threads1 = {}
documentos1 = []

csv2 = pd.read_csv('ten_fifteen.csv', encoding='iso-8859-1')
csv2_grouped_by_thread = csv2.groupby(['thread_number'])
threads2 = {}
documentos2 = []

csv3 = pd.read_csv('fifteen_twenty.csv', encoding='iso-8859-1')
csv3_grouped_by_thread = csv3.groupby(['thread_number'])
threads3 = {}
documentos3 = []

csv4 = pd.read_csv('twenty_twentyfive.csv', encoding='iso-8859-1')
csv4_grouped_by_thread = csv4.groupby(['thread_number'])
threads4 = {}
documentos4 = []

csv5 = pd.read_csv('twentyfive_thirty.csv', encoding='iso-8859-1')
csv5_grouped_by_thread = csv5.groupby(['thread_number'])
threads5 = {}
documentos5 = []

## Creación de diccionario de tweets por threads

Se agruparán los tweets de cada hilo en un diccionario para cada archivo.

In [5]:
for thread, data in dict(list(csv1_grouped_by_thread)).items():
    threads1[thread] = list(data['text'])
    


for thread, data in dict(list(csv2_grouped_by_thread)).items():
    threads2[thread] = list(data['text'])
    


for thread, data in dict(list(csv3_grouped_by_thread)).items():
    threads3[thread] = list(data['text'])
    


for thread, data in dict(list(csv4_grouped_by_thread)).items():
    threads4[thread] = list(data['text'])
    


for thread, data in dict(list(csv5_grouped_by_thread)).items():
    threads5[thread] = list(data['text'])
    



In [6]:
threads1

{'Thread 1': ['Extraordinary evidence at Treasury committee from Jon Thompson, CEO of HMRC on customs and Brexit today https://t.co/DJhIQhmVwJ',
  "The Brexiter favourite Max Fac - would cost business between Â£17 and Â£20bn a year\r\r\n\r\r\n- that's almost 1% of GDP\r\r\n\r\r\n- jusâ?¦ https://t.co/0MwIcwre4t",
  'How does he arrive at the figure\r\r\n\r\r\n200m export consignments at an average cost of Â£32.50 each = Â£6.5bn (times two beâ?¦ https://t.co/KxnkU2QiVO',
  "Theresa May's New Customs Partnership is much cheaper for business (almost zero cost)  because it seeks to replicatâ?¦ https://t.co/0LcsJHah0H",
  'Mr Thompson said he did not expect the EU to reciprocate over the customs partnership. \r\r\n\r\r\nWhat that means is UK collâ?¦ https://t.co/9c3uhhnZGX',
  'Both would not be ready by 2021. Max Fac needs 3 years. Customs Partnership requires 5, Mr Thompson said.\r\r\n\r\r\nThe bordâ?¦ https://t.co/luLzgUsiR4',
  '"We think we can manage the risk - we think we can" he sai

## LDA para cada thread de cada CSV

Se definen la cantidad de topicos a detectar, en conjunto con la cantidad de palabras que se mostraran al imprimir los topicos detectados.

La detección de tópicos se realizará a cada thread de todos los archivos CSV, por lo que se considerará cada tweet del thread como un documento.

In [7]:
import gensim
from gensim import corpora
NUM_TOPICS = 5
NUM_WORDS = 5
import pickle



### CSV five_ten

In [8]:
THIS_FOLDER = os.getcwd()
threads_leer = threads1
carpeta_guardar = "tpcsv1"

#Poblar text_data

for hilos in threads_leer:
    camino = os.path.join(THIS_FOLDER, carpeta_guardar)
    text_data = []
    documentos = []
    dictionary = []
    corpus = []
    print(hilos)
    documentos = threads_leer[hilos]

    #print(documentos)

    for line in documentos:
        #print(line)
        tokens = prepare_text_for_lda(line)
        if random.random() > .009:
            #print(tokens)
            text_data.append(tokens)

    #print(text_data) 
    NDIC = camino+"\\"+hilos+"_t_dictionary1.gensim"
    NMOD = camino+"\\"+hilos+"_t_model1.gensim"
    NCOR = camino+"\\"+hilos+"_t_corpus1.pkl"
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    pickle.dump(corpus, open(NCOR, 'wb'))
    dictionary.save(NDIC)

    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
    ldamodel.save(NMOD)
    topics = ldamodel.print_topics(num_words=NUM_WORDS)
    for topic in topics:
        print(topic)

Thread 1


FileNotFoundError: [Errno 2] No such file or directory: 'E:\\Documentos\\usm-memoria\\codigo\\twitter\\tpcsv1\\Thread 1_t_corpus1.pkl'

### CSV ten_fifteen

In [None]:
THIS_FOLDER = os.getcwd()
threads_leer = threads2
carpeta_guardar = "tpcsv2"

#Poblar text_data

for hilos in threads_leer:
    camino = os.path.join(THIS_FOLDER, carpeta_guardar)
    text_data = []
    documentos = []
    dictionary = []
    corpus = []
    print(hilos)
    documentos = threads_leer[hilos]

    #print(documentos)

    for line in documentos:
        #print(line)
        tokens = prepare_text_for_lda(line)
        if random.random() > .009:
            #print(tokens)
            text_data.append(tokens)

    #print(text_data) 
    NDIC = camino+"\\"+hilos+"_t_dictionary1.gensim"
    NMOD = camino+"\\"+hilos+"_t_model1.gensim"
    NCOR = camino+"\\"+hilos+"_t_corpus1.pkl"
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    pickle.dump(corpus, open(NCOR, 'wb'))
    dictionary.save(NDIC)

    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
    ldamodel.save(NMOD)
    topics = ldamodel.print_topics(num_words=NUM_WORDS)
    for topic in topics:
        print(topic)

### CSV fifteen_twenty

In [None]:
THIS_FOLDER = os.getcwd()
threads_leer = threads3
carpeta_guardar = "tpcsv3"

#Poblar text_data

for hilos in threads_leer:
    camino = os.path.join(THIS_FOLDER, carpeta_guardar)
    text_data = []
    documentos = []
    dictionary = []
    corpus = []
    print(hilos)
    documentos = threads_leer[hilos]

    #print(documentos)

    for line in documentos:
        #print(line)
        tokens = prepare_text_for_lda(line)
        if random.random() > .009:
            #print(tokens)
            text_data.append(tokens)

    #print(text_data) 
    NDIC = camino+"\\"+hilos+"_t_dictionary1.gensim"
    NMOD = camino+"\\"+hilos+"_t_model1.gensim"
    NCOR = camino+"\\"+hilos+"_t_corpus1.pkl"
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    pickle.dump(corpus, open(NCOR, 'wb'))
    dictionary.save(NDIC)

    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
    ldamodel.save(NMOD)
    topics = ldamodel.print_topics(num_words=NUM_WORDS)
    for topic in topics:
        print(topic)

### CSV twenty_twentyfive

In [None]:
THIS_FOLDER = os.getcwd()
threads_leer = threads4
carpeta_guardar = "tpcsv4"

#Poblar text_data

for hilos in threads_leer:
    camino = os.path.join(THIS_FOLDER, carpeta_guardar)
    text_data = []
    documentos = []
    dictionary = []
    corpus = []
    print(hilos)
    documentos = threads_leer[hilos]

    #print(documentos)

    for line in documentos:
        #print(line)
        tokens = prepare_text_for_lda(line)
        if random.random() > .009:
            #print(tokens)
            text_data.append(tokens)

    #print(text_data) 
    NDIC = camino+"\\"+hilos+"_t_dictionary1.gensim"
    NMOD = camino+"\\"+hilos+"_t_model1.gensim"
    NCOR = camino+"\\"+hilos+"_t_corpus1.pkl"
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    pickle.dump(corpus, open(NCOR, 'wb'))
    dictionary.save(NDIC)

    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
    ldamodel.save(NMOD)
    topics = ldamodel.print_topics(num_words=NUM_WORDS)
    for topic in topics:
        print(topic)

### CSV twentyfive_thirty

In [None]:
THIS_FOLDER = os.getcwd()
threads_leer = threads5
carpeta_guardar = "tpcsv5"

#Poblar text_data

for hilos in threads_leer:
    camino = os.path.join(THIS_FOLDER, carpeta_guardar)
    text_data = []
    documentos = []
    dictionary = []
    corpus = []
    print(hilos)
    documentos = threads_leer[hilos]

    #print(documentos)

    for line in documentos:
        #print(line)
        tokens = prepare_text_for_lda(line)
        if random.random() > .009:
            #print(tokens)
            text_data.append(tokens)

    #print(text_data) 
    NDIC = camino+"\\"+hilos+"_t_dictionary1.gensim"
    NMOD = camino+"\\"+hilos+"_t_model1.gensim"
    NCOR = camino+"\\"+hilos+"_t_corpus1.pkl"
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    pickle.dump(corpus, open(NCOR, 'wb'))
    dictionary.save(NDIC)

    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
    ldamodel.save(NMOD)
    topics = ldamodel.print_topics(num_words=NUM_WORDS)
    for topic in topics:
        print(topic)

# Deteccion de topicos por threads

Al contrario del apartado anterior, se buscarán tópicos en el archivo completo, por lo que se considerará cada thread como un documento, para esto se unirán los tweets siendo considerados parrafos separados por saltos de linea "\n".


In [None]:
string = " \n "

for thread, data in dict(list(csv1_grouped_by_thread)).items():
    threads1[thread] = string.join(list(data['text']))    
Tthreads1 = list(threads1.values())

for thread, data in dict(list(csv2_grouped_by_thread)).items():
    threads2[thread] = string.join(list(data['text']))
Tthreads2 = list(threads2.values())

for thread, data in dict(list(csv3_grouped_by_thread)).items():
    threads3[thread] = string.join(list(data['text']))
Tthreads3 = list(threads3.values())

for thread, data in dict(list(csv4_grouped_by_thread)).items():
    threads4[thread] = string.join(list(data['text']))
Tthreads4 = list(threads4.values())

for thread, data in dict(list(csv5_grouped_by_thread)).items():
    threads5[thread] = string.join(list(data['text']))
Tthreads5 = list(threads5.values())

In [None]:
Tthreads1

In [None]:
from gensim import corpora
import gensim
NUM_TOPICS = 20
NUM_WORDS = 10
import pickle

### CSV five_ten

In [None]:
THIS_FOLDER = os.getcwd()
threads_leer = Tthreads1
carpeta_guardar = "Ttpcsv1"

#Poblar text_data


camino = os.path.join(THIS_FOLDER, carpeta_guardar)
text_data = []
documentos = []
dictionary = []
corpus = []
documentos = threads_leer

#print(documentos)

for line in documentos:
    #print(line)
    tokens = prepare_text_for_lda(line)
    if random.random() > .009:
        #print(tokens)
        text_data.append(tokens)

#print(text_data) 
NDIC = camino+"\\t_dictionary1.gensim"
NMOD = camino+"\\t_model1.gensim"
NCOR = camino+"\\t_corpus1.pkl"
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
pickle.dump(corpus, open(NCOR, 'wb'))
dictionary.save(NDIC)

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save(NMOD)
topics = ldamodel.print_topics(num_words=NUM_WORDS)
for topic in topics:
    print(topic)

### CSV Ten_fifteen

In [None]:
THIS_FOLDER = os.getcwd()
threads_leer = Tthreads2
carpeta_guardar = "Ttpcsv2"

#Poblar text_data


camino = os.path.join(THIS_FOLDER, carpeta_guardar)
text_data = []
documentos = []
dictionary = []
corpus = []
documentos = threads_leer

#print(documentos)

for line in documentos:
    #print(line)
    tokens = prepare_text_for_lda(line)
    if random.random() > .009:
        #print(tokens)
        text_data.append(tokens)

#print(text_data) 
NDIC = camino+"\\t_dictionary1.gensim"
NMOD = camino+"\\t_model1.gensim"
NCOR = camino+"\\t_corpus1.pkl"
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
pickle.dump(corpus, open(NCOR, 'wb'))
dictionary.save(NDIC)

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save(NMOD)
topics = ldamodel.print_topics(num_words=NUM_WORDS)
for topic in topics:
    print(topic)

### CSV fifteen_twenty

In [None]:
THIS_FOLDER = os.getcwd()
threads_leer = Tthreads3
carpeta_guardar = "Ttpcsv3"

#Poblar text_data


camino = os.path.join(THIS_FOLDER, carpeta_guardar)
text_data = []
documentos = []
dictionary = []
corpus = []
documentos = threads_leer

#print(documentos)

for line in documentos:
    #print(line)
    tokens = prepare_text_for_lda(line)
    if random.random() > .009:
        #print(tokens)
        text_data.append(tokens)

#print(text_data) 
NDIC = camino+"\\t_dictionary1.gensim"
NMOD = camino+"\\t_model1.gensim"
NCOR = camino+"\\t_corpus1.pkl"
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
pickle.dump(corpus, open(NCOR, 'wb'))
dictionary.save(NDIC)

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save(NMOD)
topics = ldamodel.print_topics(num_words=NUM_WORDS)
for topic in topics:
    print(topic)

### CSV twenty_twentyfive

In [None]:
THIS_FOLDER = os.getcwd()
threads_leer = Tthreads4
carpeta_guardar = "Ttpcsv4"

#Poblar text_data


camino = os.path.join(THIS_FOLDER, carpeta_guardar)
text_data = []
documentos = []
dictionary = []
corpus = []
documentos = threads_leer

#print(documentos)

for line in documentos:
    #print(line)
    tokens = prepare_text_for_lda(line)
    if random.random() > .009:
        #print(tokens)
        text_data.append(tokens)

#print(text_data) 
NDIC = camino+"\\t_dictionary1.gensim"
NMOD = camino+"\\t_model1.gensim"
NCOR = camino+"\\t_corpus1.pkl"
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
pickle.dump(corpus, open(NCOR, 'wb'))
dictionary.save(NDIC)

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save(NMOD)
topics = ldamodel.print_topics(num_words=NUM_WORDS)
for topic in topics:
    print(topic)

### CSV twentyfive_thirty

In [None]:
THIS_FOLDER = os.getcwd()
threads_leer = Tthreads5
carpeta_guardar = "Ttpcsv5"

#Poblar text_data


camino = os.path.join(THIS_FOLDER, carpeta_guardar)
text_data = []
documentos = []
dictionary = []
corpus = []
documentos = threads_leer

#print(documentos)

for line in documentos:
    #print(line)
    tokens = prepare_text_for_lda(line)
    if random.random() > .009:
        #print(tokens)
        text_data.append(tokens)

#print(text_data) 
NDIC = camino+"\\t_dictionary1.gensim"
NMOD = camino+"\\t_model1.gensim"
NCOR = camino+"\\t_corpus1.pkl"
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
pickle.dump(corpus, open(NCOR, 'wb'))
dictionary.save(NDIC)

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save(NMOD)
topics = ldamodel.print_topics(num_words=NUM_WORDS)
for topic in topics:
    print(topic)

## Megacorpus

Como tercera alternativa de análisis, se decide unir todos los threads que se tienen en un megacorpus, por lo que se utilizan cada thread de todos los archivos como un documento, luego se detectan los topicos presentes en los aproximadamente 500 documentos entregados.

In [None]:
megatexto = Tthreads1+Tthreads2+Tthreads3+Tthreads4+Tthreads5

In [None]:
megatexto

In [None]:
from gensim import corpora
import gensim
NUM_TOPICS = 20
NUM_WORDS = 10
import pickle

In [None]:
THIS_FOLDER = os.getcwd()
threads_leer = megatexto
carpeta_guardar = "mega"

#Poblar text_data

camino = os.path.join(THIS_FOLDER, carpeta_guardar)
text_data = []
documentos = []
dictionary = []
corpus = []
documentos = threads_leer

#print(documentos)

for line in documentos:
    #print(line)
    tokens = prepare_text_for_lda(line)
    if random.random() > .009:
        #print(tokens)
        text_data.append(tokens)

print(text_data) 
NDIC = camino+"\\t_dictionary1.gensim"
NMOD = camino+"\\t_model1.gensim"
NCOR = camino+"\\t_corpus1.pkl"
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
pickle.dump(corpus, open(NCOR, 'wb'))
dictionary.save(NDIC)

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save(NMOD)
topics = ldamodel.print_topics(num_words=NUM_WORDS)
for topic in topics:
    print(topic)

### Analisis de resultados de megacorpus

Luego de detectar los topicos, se clasificarán los threads de un archivo según los tópicos obtenidos.

In [None]:
for hilo in Tthreads1:
    hilito = prepare_text_for_lda(hilo)
    hilito_bow = dictionary.doc2bow(hilito)
    print(hilo)
    print(ldamodel.get_document_topics(hilito_bow))

### Análisis de tópicos

Es posible analizar la relación entre los tópicos obtenidos a través de la librería pyLDAvis, la cual grafica la distancia entre los tópicos

In [None]:
dictionary = gensim.corpora.Dictionary.load(NDIC)
corpus = pickle.load(open(NCOR, 'rb'))
lda = gensim.models.ldamodel.LdaModel.load(NMOD)
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)