### Import packages

In [513]:
import nltk; nltk.download('stopwords')
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim 
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

### Import Dataset

In [520]:
import pandas as pd
import nltk

df = pd.read_excel('Stability_Buzz.xlsx', sheet_name='Relevant')
df = df[['Sentences']]
df['target'] = '0'

data = df
data.columns = ['Sentences', 'target']


data_text = data[['Sentences']]

documents = data_text
documents.head()

Unnamed: 0,Sentences
0,Baidu formed a strategic partnership with Qual...
1,Blackbaud announced the results of a commissio...
2,Earnings per fully diluted ADS on a GAAP basis...
3,"Fortescue's current director of operations, Gr..."
4,GAAP net loss for the third fiscal quarter was...


In [521]:
data = documents.Sentences.values.tolist()

### Data Preprocess

In [522]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
data_words = list(sent_to_words(data))
print(data_words[:1])

[['baidu', 'formed', 'strategic', 'partnership', 'with', 'qualcomm', 'to', 'optimize', 'baidu', 'voice', 'assistant', 'dueros', 'for', 'smartphones', 'on', 'the', 'qualcomm', 'snapdragon', 'tm', 'mobile', 'platform', 'including', 'the', 'upcoming', 'snapdragon', 'mobile', 'platform']]


In [523]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=50) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  


bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

print(trigram_mod[bigram_mod[data_words[0]]])





['baidu', 'formed', 'strategic', 'partnership', 'with', 'qualcomm', 'to', 'optimize', 'baidu', 'voice', 'assistant', 'dueros', 'for', 'smartphones', 'on', 'the', 'qualcomm', 'snapdragon', 'tm', 'mobile', 'platform', 'including', 'the', 'upcoming', 'snapdragon', 'mobile', 'platform']


In [524]:
from gensim.utils import simple_preprocess
from nltk import word_tokenize
stop_words = stopwords.words('english')
stop_words.extend(['included','including','includes','one','two','three','general','generally','way','better','need','really','believe','provide','say','saying','says','include','said','also','would','could','end','recent','recently','us','like','make','may','look','still','come','see','across','even','much','get','allow','allows','allowed','allowing','well','go','gone','going','that','thats','take','thing','come','move','give','help','put','dont','people','got','kind','know','day','think','continue','look','year','years','new','use','there','want','wanted','due','become','organ'])

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


 

In [493]:
def clean_text(texts):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in stop_words and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text
 

tokenized_data = []
for text in data:
    tokenized_data.append(clean_text(texts))

In [525]:
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)


nlp = spacy.load('en', disable=['parser', 'ner'])

data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['baidu', 'form', 'strategic', 'partnership', 'qualcomm', 'optimize', 'baidu', 'voice', 'assistant', 'duero', 'smartphone', 'qualcomm', 'snapdragon', 'mobile', 'platform', 'upcome', 'snapdragon', 'mobile', 'platform']]


### Create Dictionary

In [526]:
texts = data_lemmatized
id2word = gensim.corpora.Dictionary(texts)


# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 2), (8, 2), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1)]]


In [527]:
id2word[1]

'baidu'

In [528]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('assistant', 1),
  ('baidu', 2),
  ('duero', 1),
  ('form', 1),
  ('mobile', 2),
  ('optimize', 1),
  ('partnership', 1),
  ('platform', 2),
  ('qualcomm', 2),
  ('smartphone', 1),
  ('snapdragon', 2),
  ('strategic', 1),
  ('upcome', 1),
  ('voice', 1)]]

### Build LDA model

In [529]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=1500,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True) 

In [530]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.033*"growth" + 0.014*"sale" + 0.012*"quarter" + 0.011*"increase" + '
  '0.011*"revenue" + 0.010*"market" + 0.009*"business" + 0.008*"strong" + '
  '0.007*"continue" + 0.006*"product"'),
 (1,
  '0.008*"market" + 0.007*"product" + 0.007*"company" + 0.007*"sale" + '
  '0.005*"customer" + 0.005*"revenue" + 0.004*"low" + 0.004*"president" + '
  '0.004*"solution" + 0.004*"global"'),
 (2,
  '0.011*"company" + 0.007*"revenue" + 0.006*"customer" + 0.005*"business" + '
  '0.005*"report" + 0.004*"investment" + 0.004*"plan" + 0.004*"first" + '
  '0.004*"growth" + 0.004*"ceo"'),
 (3,
  '0.009*"company" + 0.008*"growth" + 0.007*"customer" + 0.005*"management" + '
  '0.005*"business" + 0.005*"service" + 0.005*"grow" + 0.005*"application" + '
  '0.005*"market" + 0.004*"result"'),
 (4,
  '0.014*"company" + 0.013*"growth" + 0.008*"revenue" + 0.006*"expect" + '
  '0.005*"market" + 0.005*"rate" + 0.004*"business" + 0.004*"base" + '
  '0.004*"technology" + 0.003*"make"')]


####  Visualize the topics

In [534]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Test New Unseen Document

In [478]:
from nltk import word_tokenize
text = 'And a quick one as well on your working capital. I was surprised to see an upward movement in the fourth quarter, which is a bit counter-intuitive given the steady rise in oil prices. Could you elaborate a little bit on this?'
bow = id2word.doc2bow(clean_text(text))
print(lda_model[bow])


[(1, 0.54317534), (2, 0.42331395), (4, 0.018748444)]
