<a href="https://colab.research.google.com/github/blanco-herrero/Interviews/blob/main/TOPIC_MODELLING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing modules
# NOTE: the non-import statements in this cell (os.chdir, nltk.download,
# warnings.filterwarnings) run as side effects every time the cell executes.
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from gensim import corpora
from gensim import models
from gensim import matutils
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
# Move the working directory one level up; relative paths used later in the
# notebook resolve against the new directory. Re-running this cell moves up
# one more level each time.
os.chdir('..')
from glob import glob
from string import punctuation
import nltk
from nltk.sentiment import vader
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' resource that stopwords.words(...) reads later.
nltk.download('stopwords')
from nltk.corpus import stopwords  # repeated import of the same name (harmless)
import warnings
# Hide DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Upload the input file from the local machine to the Colab runtime.
# The return value of files.upload() is bound to a name so that the uploaded
# file contents are not echoed as the cell output; only the file names are shown.
from google.colab import files
uploaded_files = files.upload()
list(uploaded_files)

In [3]:
# Load the tab-separated text file (no header row) into a DataFrame, then
# transpose it and squeeze the result down to a single pandas object.
entrevistas_df = pd.read_csv("Entrevistas.txt", sep='\t', header=None)
entrevistas = entrevistas_df.transpose().squeeze()

In [None]:
# Print out the first rows of the file (head() keeps the output short
# instead of dumping every text)
entrevistas.head()

In [81]:
# Remove the columns (if the case)
# trans = trans.drop(columns=['id', 'location', 'language'], axis=1).sample(100)

In [None]:
# Regular-expression module used by the punctuation clean-up below.
# (The stray lambda expression that used to follow was never applied to any
# data; it only printed a function repr, so it has been removed.)
import re

In [None]:
# Remove punctuation: every character matched by the character class below
# is deleted (replaced with the empty string) in each text.
entrevistas = entrevistas.map(
    lambda texto: re.sub('[-–_–,:/.¡!¿?...]', '', texto)
)
entrevistas

In [None]:
# Convert uppercase to lowercase.
# Series.map returns a new Series, so the result must be assigned back;
# otherwise the lowercased texts are only displayed and then discarded.
entrevistas = entrevistas.map(lambda x: x.lower())
entrevistas

In [None]:
# PREPARE DATA FOR LDA ANALYSIS

# Generate the stop words: start from NLTK's Spanish list ...
stop_words = stopwords.words('spanish')

# ... and add corpus-specific noise words (URL fragments, fillers, names).
stop_words.extend(['http', 'https', 'tco', 'httpstco', 'co', 'pa', 'si',
                   'rt', 'etc', 'ect', 'tco', 'xa0', 't', 'si', 'q', 'd', 'lo', 'ell', 'call',
                   'pkly', 'onde', 'pese', 'tb', '000', 'dos', 'tres', 'cia', 'vez', 'ves',
                   'mira', 'tan', 'tal', 'dar', 'da', 'das', 'dan', 'uma', 'va', 'van', 'ser',
                   'ahí', 'ahi', 'tras', 'detrás', 'detras', 'creo', 'hecho', 'ejemplo', 'demas',
                   'demás', 'ademas', 'además', 'resto', 'pensar', 'sino', 'decir', 'lado',
                   'parece', 'piensa', 'pase', 'pesar', 'entender', 'alguien', 'dicho', 'supuesto',
                   'alli', 'allí', 'aun', 'cualquier', 'cuestión', 'contenido', 'contenidos',
                   'simplemente', 'símplemente', 'habia', 'parte', 'acaso', 'biden', 'toda', 'todas',
                   'gracias', 'despues', 'después', 'of', 'and', 'his', 'the', 'cada', 'to',
                   'in', 'ello', 'quiere', 'buenas', 'ningún', 'día', 'pocos', 'cómo', 'como', 'puede',
                   'pone', 'mientras', 'garcimoreno', 'cabrita', 'veintimillapier', 'qls', 'colocolo',
                   'gyzytqqtbm', 'sólo', 'solo', 'ahora', 'frente', 'hacen', 'hace', 'bien', 'años',
                   'nueva', 'luego', 'así', 'asi', 'claro', 'ver', 'debería', 'video', 'estan', 'mas',
                   'menos', 'hacer', 'solo', 'pues', 'incluso', 'meses', 'vale', 'dia', 'evidentemente',
                   'tambien', 'entonces', 'bueno', 'gente', 'discurso', 'aqui', 'veces', 'digo', 'vamos',
                   'momento', 'forma', 'cosas', 'tipo', 'siempre', 'caso', 'persona', 'personas', 'puedes',
                   'quiza', 'pasa', 'tema', 'alguna', 'algunas', 'manera', 'veo', 'final', 'nunca', 'muchas',
                   'veces', 'igual', 'quizas', 'dice', 'tener', 'hacia', 'digamos', 'bastante', 'mucha',
                   'tampoco', 'tambien', 'mismo', 'decia', 'voy', 'cosa', 'ambito', 'aunque', 'algun',
                   'puedo', 'haber', 'quiero', 'pueden', 'mejor', 'lleva', 'dicen', 'depende', 'sido',
                   'general'])

# The documents are tokenized with deacc=True (see sent_to_words), so tokens
# never carry accents. Add the accent-free variant of every stop word so that
# entries such as 'años' or 'cuestión' actually match their tokens.
# The list comprehension is fully built before extend() runs, so the list is
# not modified while it is being iterated.
stop_words.extend([gensim.utils.deaccent(word) for word in stop_words])

# Drop duplicate entries while keeping first-seen order.
stop_words = list(dict.fromkeys(stop_words))

len(stop_words)

In [None]:
stop_words

In [44]:
#spanish_stopwords.append('')
#spanish_stopwords.remove('')

In [73]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is cast to ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one result is
    yielded per input item, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(sentence), deacc=True) for sentence in sentences)

def remove_stopwords(texts):
    """Return ``texts`` with every stop word removed.

    Parameters
    ----------
    texts : iterable
        Documents to filter. Each document may be a list/tuple of tokens
        (as produced by ``sent_to_words``) or a raw string.

    Returns
    -------
    list of list of str
        One token list per input document, without any token found in the
        module-level ``stop_words`` collection.
    """
    # Set membership is O(1); the stop-word list is scanned only once here
    # instead of once per token.
    stop_set = set(stop_words)

    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            # Join the tokens with spaces rather than tokenizing the Python
            # repr of the list (brackets, quotes, escape sequences).
            raw_text = " ".join(str(token) for token in doc)
        else:
            raw_text = str(doc)
        cleaned.append(
            [word for word in simple_preprocess(raw_text) if word not in stop_set]
        )
    return cleaned

In [74]:
# Plain Python list of the texts, then one token list per text
data = entrevistas.values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]

In [75]:
# Filter the stop words out of every tokenized document
data_words = remove_stopwords(texts=data_words)

In [None]:
# Show the first 30 tokens of the first document
print(data_words[0][:30])

In [None]:
import gensim.corpora as corpora
# Build the token dictionary from the tokenized documents
id2word = corpora.Dictionary(data_words)
# The tokenized documents are the corpus texts
texts = data_words
# Convert each document to its bag-of-words form via the dictionary
corpus = list(map(id2word.doc2bow, texts))
# Show the first 30 entries of the first document
print(corpus[0][:30])

In [None]:
# LDA model training

from pprint import pprint
# Number of topics
num_topics = 4

# Build LDA model.
# random_state is fixed so that re-running the notebook yields the same
# topics; with random_state=None every run produced a different model.
lda_model = models.ldamodel.LdaModel(num_topics=num_topics, corpus=corpus, id2word=id2word, random_state=42,
                                     per_word_topics=True, alpha='auto', passes=2)
lda_model.print_topics(num_words=10)

In [None]:
# Apply the trained model to the whole corpus
doc_lda = lda_model[corpus]
# Pretty-print the topics with the default number of keywords ...
pprint(lda_model.print_topics())
# ... and display them again with 40 keywords per topic
lda_model.print_topics(num_words=40)


In [None]:
# Analyze LDA model results
!pip install pyLDAvis
!python -m pip install -U pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models
import pickle 

In [79]:
# Visualize the topics: switch pyLDAvis to notebook rendering and build the
# path of the file that will hold the prepared visualisation data
pyLDAvis.enable_notebook()
LDAvis_data_filepath = './ldavis_prepared_' + str(num_topics)

In [None]:
# This step is a bit time consuming - set the condition to False to skip it
if True:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    # Persist the prepared data so it can be reloaded from disk
    with open(LDAvis_data_filepath, 'wb') as pickle_file:
        pickle.dump(LDAvis_prepared, pickle_file)

In [None]:
# Read the pickled pyLDAvis data back from disk
with open(LDAvis_data_filepath, 'rb') as pickle_file:
    LDAvis_prepared = pickle.load(pickle_file)
# Export the visualisation as a standalone HTML page, then display it inline
pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_'+ str(num_topics) +'.html')
LDAvis_prepared

In [None]:
# Take a look at the texts and their topics.
# One row per document; each column is a topic id holding the probability
# returned by the model for that document (entries below 0.1 are omitted by
# minimum_probability and therefore show up as NaN).
topics = pd.DataFrame([dict(lda_model.get_document_topics(doc, minimum_probability=0.1))
                      for doc in corpus])
# Columns otherwise appear in first-seen order; sort them by topic id.
topics = topics.reindex(columns=sorted(topics.columns))
# Give the text column an explicit name so it cannot share a label with a
# topic column, and realign it on a fresh 0..N-1 index to match `topics`.
meta = entrevistas.rename('text').reset_index(drop=True)
tpd = pd.concat([meta, topics], axis=1)
tpd[:50]