# Visualizing Tweets Using LDA Based Clustering

### Load Data

In [1]:
# Code to Import Libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords 
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
# Rebinds `stopwords` from the imported nltk corpus object to a plain set of
# the English stopwords; MorePreprocessing tests token membership against it.
stopwords = set(stopwords.words('english'))
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
#nltk.download('stopwords')
#nltk.download('wordnet')

In [2]:
# Load the tweet data from AllTweetInfo.csv into a pandas DataFrame
tweetsInfo = pd.read_csv('AllTweetInfo.csv')

#Get Text for Topics
def MorePreprocessing(text):
    """
    Tokenize a text on whitespace and drop stopwords and purely numeric tokens.

    Parameters:
    ----------
    text : Text to tokenize. Anything that is not a string (for example the
        NaN that pandas produces for an empty CSV cell) is treated as having
        no tokens.

    Returns:
    -------
    List of the remaining tokens, in their original order
    """
    # Guard first: a missing value has no .split() and would otherwise raise
    # AttributeError and abort the whole DataFrame.apply call.
    if not isinstance(text, str):
        return []

    # `stopwords` is the module-level set of English stopwords.
    return [t for t in text.split() if t not in stopwords and not t.isnumeric()]
    
# Store the filtered token list for each row's 'text_features_new' value in a
# new 'topic_text' column.
tweetsInfo['topic_text'] =tweetsInfo['text_features_new'].apply(MorePreprocessing)

### Create LDA Model

In [3]:
# Train the LDA model
# One token list per row of the 'topic_text' column.
t_list = tweetsInfo['topic_text'].tolist()
# gensim Dictionary built from those token lists.
corpdict = corpora.Dictionary(t_list)

# doc2bow form of every document; also written to disk as corpus.mm so that
# later cells can reload it.
doc_term_matrix = [corpdict.doc2bow(doc) for doc in t_list]
corpora.MmCorpus.serialize('corpus.mm', doc_term_matrix)

# Fit a model with 20 topics.
# NOTE(review): no random_state is passed; set one if reproducible topics are
# needed — confirm against the gensim LdaModel documentation.
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(doc_term_matrix, num_topics=20, id2word = corpdict)


## Visualize Topics (Clusters)

In [4]:
# Visualize the trained model with pyLDAvis
import pyLDAvis
# NOTE(review): newer pyLDAvis releases appear to expose this submodule as
# pyLDAvis.gensim_models — confirm against the installed version.
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
# Reload the corpus that the training cell serialized to corpus.mm.
# NOTE: this rebinds `corpora`, hiding the gensim `corpora` module imported at
# the top of the notebook. Later cells rely on `corpora` being this MmCorpus,
# and re-running the training cell after this one would call
# corpora.Dictionary on the MmCorpus instead of the module.
corpora =  gensim.corpora.MmCorpus('corpus.mm')
vis =pyLDAvis.gensim.prepare(ldamodel, corpora, corpdict)
pyLDAvis.display(vis)

### Try to Find the Right Setting

In [None]:
# Code to find the right LDA setting
# Reference: https://markroxor.github.io/gensim/static/notebooks/gensim
def evaluate_graph(dictionary, corpus, texts, limit):
    """
    Train one LDA model per topic count and plot the c_v coherence of each.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : Tokenized documents handed to CoherenceModel for the c_v score
    limit : topic limit (exclusive); models are trained for 1 .. limit - 1 topics

    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    """
    # Shared by the training loop and the x axis so the two cannot drift apart.
    topic_counts = range(1, limit)

    c_v = []
    lm_list = []
    for num_topics in topic_counts:
        lm = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())

    # Show graph
    plt.plot(topic_counts, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    # The labels argument must be a sequence of strings. A bare ("c_v") is just
    # the string "c_v", which is consumed one character per line and labels
    # the curve "c"; the trailing comma makes it a one-element tuple.
    plt.legend(("c_v",), loc='best')
    plt.show()

    return lm_list, c_v

# Train and score models for 1 through 19 topics (limit is exclusive).
# NOTE: `corpora` here is the MmCorpus loaded in the visualization cell, not
# the gensim `corpora` module, so that cell must have been run first.
lmlist, c_v = evaluate_graph(dictionary=corpdict, corpus=corpora, texts=t_list, limit=20)


In [9]:
# lmlist starts at num_topics=1, so index 4 is the 5-topic model.
pyLDAvis.gensim.prepare(lmlist[4], corpora, corpdict)