In [1]:
import pandas as pd

# Importing Gensim
import gensim
from gensim import corpora

In [10]:
# Read the corpus file and keep the 'Text' column as a plain Python list,
# one entry per CSV row.
df = pd.read_csv('all_corpus.csv')
corpus = df['Text'].tolist()

In [11]:
# Whitespace-tokenize every document: doc_clean[k] is the token list
# obtained from corpus[k].
doc_clean = [document.split() for document in corpus]

# Sanity check: show the tokens of the first document.
print(doc_clean[0])

['thank', 'welcome', 'amd', 'first', 'quarter', 'conference', 'call', 'opportunity', 'review', 'copy', 'earning', 'release', 'cfo', 'commentary', 'accompany', 'slide', 'review', 'document', 'find', 'amd', 'website', 'iramdcom']


In [12]:
# Build the term dictionary of our corpus: every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Turn each tokenized document into its bag-of-words representation
# (the document-term matrix) using the dictionary built above.
doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))

In [13]:
# Short alias for gensim's LDA model class.
Lda = gensim.models.ldamodel.LdaModel

# Train a 3-topic LDA model on the document-term matrix (50 passes over the corpus).
# LDA training is stochastic; random_state is fixed so that the topics are
# reproducible after a kernel restart instead of changing from run to run.
ldamodel = Lda(
    doc_term_matrix,
    num_topics=3,
    id2word=dictionary,
    passes=50,
    random_state=42,
)

In [14]:
# Summarize each of the 3 topics by its 3 highest-weighted words.
topic_summaries = ldamodel.print_topics(num_topics=3, num_words=3)
print(topic_summaries)

[(0, '0.016*"think" + 0.015*"go" + 0.014*"see"'), (1, '0.029*"revenue" + 0.028*"quarter" + 0.023*"million"'), (2, '0.008*"really" + 0.007*"be" + 0.007*"cloud"')]


#### Prep data for pyLDAvis

In [15]:
# sklearn
from sklearn import datasets
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Configure logging at INFO level; each record is rendered as
# "<timestamp> : <level name> : <message>".
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [91]:
from sklearn.feature_extraction import text 

# Extra stop words layered on top of scikit-learn's built-in English list.
# (Duplicate entries were removed; the resulting set of words is unchanged.)
additional_sw = [
    'thank', 'caller', 'okay', 'join', 'operator', 'stacy', 'pron', 'question', 'good', 'great',
    'think', 'time', 'wea', 'look', 'say', 'yes', 'really', 'thing', 'ita', 'talk', 'number', 'right',
    'overall', 'thata', 'probably', 'dont', 'second', 'followon', 'chris', 'john', 'ambrish',
    'joe', 'thanks', 'amit', 'ross', 'harlan', 'inside', 'weve', 'thats', 'change',
    'turn', 'let', 'dave', 'hand', 'mark', 'ill', 'eyal', 'comment', 'bob', 'laura', 'brian', 'devinder',
    'answer', 'wrap', 'forwardlooke', 'materially', 'contain', 'cause', 'today', 'discussion',
    'involve', 'rafael', 'review', 'followup', 'ask', 'outlook', 'order', 'additional', 'limit', 'response',
    'ray', 'refer', 'appreciate', 'evening', 'kevin', 'february', 'lisa', 'afternoon', 'welcome', 'want', 'actually',
]

# ENGLISH_STOP_WORDS.union(...) yields a frozenset, but recent scikit-learn
# releases validate `stop_words` and accept only 'english', a list, or None.
# Converting to a sorted list keeps the same words, works on old and new
# versions alike, and gives a deterministic order.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(additional_sw))

# Count-based vectorizer. Its parameters are the shared configuration:
#   token_pattern - only purely alphabetic tokens of 3+ letters are kept
#   max_df=0.5    - drop terms that appear in more than half of the documents
#   min_df=10     - drop terms that appear in fewer than 10 documents
tf_vectorizer = CountVectorizer(
    strip_accents='unicode',
    stop_words=stop_words,
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.5,
    min_df=10,
)

  and should_run_async(code)


In [92]:
# Alternative model input: a TF-IDF document-term matrix.
# The TF-IDF vectorizer is built from tf_vectorizer's parameters and then
# fit on the raw corpus.
shared_params = tf_vectorizer.get_params()
tfidf_vectorizer = TfidfVectorizer(**shared_params)
dtm_tfidf = tfidf_vectorizer.fit_transform(corpus)
# Tip: inspect dtm_tfidf.shape to check the size of the resulting matrix.

  and should_run_async(code)


In [105]:
# Fit a 6-topic scikit-learn LDA on the TF-IDF document-term matrix,
# seeded for repeatable results.
lda_settings = {'n_components': 6, 'random_state': 42}
lda_tfidf = LatentDirichletAllocation(**lda_settings)
lda_tfidf.fit(dtm_tfidf)

  and should_run_async(code)


LatentDirichletAllocation(n_components=6, random_state=42)

### Visualizing with pyLDAvis

In [120]:
import pyLDAvis
import pyLDAvis.sklearn
# Enable inline pyLDAvis rendering in the notebook.
pyLDAvis.enable_notebook()

# Prepare the visualization for the scikit-learn LDA model fitted on the
# TF-IDF matrix; 'mmds' is passed as the `mds` option.
lda_panel = pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer, mds='mmds')
# To export to a standalone page instead:
#   pyLDAvis.save_html(lda_panel, 'view_pyLDAvis.html')
lda_panel

  and should_run_async(code)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [50]:
from sklearn.decomposition import NMF

  and should_run_async(code)


In [121]:
# Use NMF to look for n_topics topics in the TF-IDF matrix.
n_topics = 10
# random_state pins any randomness in NMF's initialization so that the
# extracted topics are reproducible across runs.
model = NMF(n_components=n_topics, random_state=42)
model.fit(dtm_tfidf)

# Number of top-weighted words to print per topic.
n_words = 15

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back otherwise.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# topic_list collects a short label per topic, built from its top 3 words.
topic_list = []
for topic_idx, topic in enumerate(model.components_):
    # argsort() is ascending, so the last n_words indices are the heaviest
    # terms; reversing puts the highest-weighted term first.
    top_indices = topic.argsort()[-n_words:][::-1]
    top_n = [feature_names[i] for i in top_indices]
    top_features = ' '.join(top_n)
    topic_list.append(f"topic_{'_'.join(top_n[:3])}")

    print(f"Topic {topic_idx}: {top_features}")

  and should_run_async(code)


Topic 0: revenue margin gross year profit ago yearoveryear billion segment low increase operate drive point high
Topic 1: million nongaap expense compare operate income charge approximately net share prior debt gaap diluted operating
Topic 2: cash flow free billion return dividend owner capital shareholder repurchase value trail operation period stock
Topic 3: cloud center platform datum new game data gpu performance gpus inference business compute world architecture
Topic 4: inventory year half little expect bit level product customer channel demand kind know start come
Topic 5: quarter fourth revenue decline result financial end increase strong total represent demand provide color start
Topic 6: website financial release statement earning risk available replay result web conference chief officer current differ
Topic 7: market growth continue industrial automotive grow business focus embed analog product investment opportunity provide year
Topic 8: ethernet infiniband gigabit switch r

In [None]:
import pyLDAvis
import pyLDAvis.gensim
# Enable inline pyLDAvis rendering in the notebook.
pyLDAvis.enable_notebook()
# pyLDAvis.gensim.prepare needs the gensim objects: the trained gensim LDA
# model, its bag-of-words corpus, and the gensim Dictionary. The previous call
# passed the scikit-learn model (lda_tfidf) and its sparse TF-IDF matrix,
# which belong with pyLDAvis.sklearn instead.
# NOTE(review): pyLDAvis >= 3.0 renamed this module to pyLDAvis.gensim_models -
# confirm the installed version.
pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)

In [None]:
# Prepare the gensim LDA visualization and write it to a standalone HTML file.
# The gensim model, its bag-of-words corpus and the gensim Dictionary are
# passed here; the scikit-learn model/matrix (lda_tfidf, dtm_tfidf) are not
# valid inputs for pyLDAvis.gensim.prepare.
vis_data = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
pyLDAvis.save_html(vis_data, 'output_filename.html')