# Second Stage 2

In this stage, we mainly conduct an LDA (Latent Dirichlet Allocation) study on a sample of the original data. Because we use t-SNE to reduce the feature dimension, the sample size is limited by processing time. For larger data sets, another dimensionality-reduction method should be considered, or the data should be broken into chunks for easier processing.

*Major difference from the second stage: this notebook focuses on the topics of a single sector instead of all sectors.*

In [1]:
import warnings
import lda
import logging
import dill 
import os
import pandas as pd
import numpy as np
import bokeh.plotting as bp
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
from collections import Counter

# Keep the notebook output readable: hide DeprecationWarnings and only let
# WARNING-and-above messages from the "lda" logger through.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
lda_logger = logging.getLogger("lda")
lda_logger.setLevel(logging.WARNING)

# The data directory is derived from the working directory by swapping 'code' for 'data'.
# NOTE(review): str.replace substitutes every occurrence of 'code' in the path,
# not just the final folder name; confirm the working directory contains it once.
code_dir = os.getcwd()
data_dir = code_dir.replace('code', 'data')

In [2]:
# Load the pickled frame of lemmatised patent titles.
# NOTE: dill (like pickle) can execute arbitrary code while loading; only open trusted files.
# The context manager guarantees the file handle is closed once loading finishes.
with open(data_dir+'/#patent_title_lemma.pkd', 'rb') as lemma_file:
    sub_body = dill.load(lemma_file)

# Each 'title_lemma' entry is a sequence of tokens; join it into one space-separated string.
tokens = sub_body['title_lemma'].tolist()
titles = [' '.join(token) for token in tokens]

sub_body['title_lemma'] = titles
# Drop repeated titles, then rows with no sector label.
sub_body = sub_body.drop_duplicates(subset='title_lemma')
sub_body = sub_body.dropna(subset=['sector'])
sub_body.shape

(395873, 8)

In [None]:
# distinct sector labels available for the `chosen` filter
set(sub_body.loc[:, 'sector'])

In [3]:
# Pick the sector to study, then keep only rows dated after the cutoff.
chosen = 'Information Technology'

in_sector = sub_body['sector'] == chosen
sub_body1 = sub_body[in_sector]

# NOTE(review): the cutoff is compared as a string; assumes 'exec_dt' holds
# ISO-formatted dates or datetimes. TODO confirm.
after_cutoff = sub_body1['exec_dt'] > '2010-01-01'
sub_body2 = sub_body1[after_cutoff]
sub_body2.shape

(86222, 8)

In [4]:
# plain Python list of the joined lemma titles; preview the first three
title_list = list(sub_body2['title_lemma'])
title_list[0:3]

['pump throat', 'pump injector', 'adhesive sensor hot melt liquid adhesive']

In [5]:
# Based on the data size, adjust these parameters to make the result more reasonable.
#
# NOTE(review): ngram_range=(3,3) keeps only trigrams, so a title with fewer than
# three tokens (e.g. 'pump throat') contributes no features at all and its row in
# the document-term matrix is empty. Confirm this is intended.
cvectorizer = CountVectorizer(min_df=1, max_df=8000, max_features=2000, ngram_range=(3,3))
cvz = cvectorizer.fit_transform(sub_body2['title_lemma'])

n_topics = 20
n_iter = 1000
# LDA fitting is stochastic: fix the seed (same value as the t-SNE step uses) so
# that Restart & Run All reproduces the same topics.
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=0)
X_topics = lda_model.fit_transform(cvz)



In [6]:
n_top_words = 10

# One row of weights per topic, one column per vocabulary term.
topic_word = lda_model.topic_word_
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); this call assumes an older version is installed.
vocab = cvectorizer.get_feature_names()
vocab_array = np.array(vocab)

# For every topic, keep the n_top_words terms with the largest weight
# (highest first) and store them as one comma-separated string.
topic_summaries = []
for topic_dist in topic_word:
    ranked = np.argsort(topic_dist)[::-1]
    top_terms = vocab_array[ranked[:n_top_words]]
    topic_summaries.append(', '.join(top_terms))

In [8]:
# Persist the list of topic summaries (its Python string form) to file.txt,
# overwriting any previous content.
summary_text = str(topic_summaries)
with open("file.txt", "w") as output:
    output.write(summary_text)

In [9]:
# Project the document-topic matrix down to two dimensions for plotting.
# The seed is fixed so the embedding is repeatable.
tsne_params = dict(
    n_components=2,
    random_state=0,
    learning_rate=30,
    n_iter=300,
    angle=0.8,
    init='pca',
)
tsne_model = TSNE(**tsne_params)
tsne_lda = tsne_model.fit_transform(X_topics)

In [10]:
# doc_topic has one row per document and one column per topic.
doc_topic = lda_model.doc_topic_
# Dominant topic of each document = column index of the largest value in its row.
# Computed in one vectorised call instead of looping over an unrelated column
# just to count rows; the result is still a list with one entry per document.
lda_keys = list(doc_topic.argmax(axis=1))

In [None]:
# plot_lda = bp.figure(plot_width=700, plot_height=600, title="LDA topic visualization",
#     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
#     x_axis_type=None, y_axis_type=None, min_border=1)

In [11]:
# One row per document: 2-D t-SNE coordinates plus title, ticker and dominant topic.
lda_df = pd.DataFrame(tsne_lda, columns=['x', 'y'])
lda_df['title'] = list(sub_body2['title_lemma'])
lda_df['ticker'] = list(sub_body2['ticker'])
# store the topic ids as plain integers
lda_df['topic'] = [int(key) for key in lda_keys]
lda_df.head()

Unnamed: 0,x,y,title,ticker,topic
0,0.276429,0.487958,pump throat,NDSN,0
1,0.276429,0.487958,pump injector,NDSN,0
2,0.276429,0.487958,adhesive sensor hot melt liquid adhesive,NDSN,0
3,0.276429,0.487958,cover adhesive dispensing gun,NDSN,0
4,0.276429,0.487958,thermally insulated applicator,NDSN,0


In [12]:
# Palette of 30 hex colours, indexable by topic id.
_topic_palette = (
    "#6d8dca", "#69de53", "#723bca", "#c3e14c", "#c84dc9",
    "#68af4e", "#6e6cd5", "#e3be38", "#4e2d7c", "#5fdfa8",
    "#d34690", "#3f6d31", "#d44427", "#7fcdd8", "#cb4053",
    "#5e9981", "#803a62", "#9b9e39", "#c88cca", "#e1c37b",
    "#34223b", "#bdd8a3", "#6e3326", "#cfbdce", "#d07d3c",
    "#52697d", "#7d6d33", "#d27c88", "#36422b", "#b68f79",
)
colormap = np.array(_topic_palette)

In [None]:
# plot_lda.scatter(source=lda_df, x='x', y='y', color=colormap[lda_keys])
# hover = plot_lda.select(dict(type=HoverTool))
# hover.tooltips={"title":"@title", "topic":"@topic", "ticker":"@ticker"}
# show(plot_lda)

# visualization of topics using pyLDAvis

In [13]:
# document length = number of space-separated pieces in each title
lda_df['len_docs'] = [len(title.split(' ')) for title in lda_df['title']]

In [14]:
def prepareLDAData():
    """Collect the keyword arguments that ``pyLDAvis.prepare`` is called with.

    Reads the notebook-level names ``vocab``, ``lda_model``, ``lda_df`` and
    ``cvz``; takes no parameters.

    Returns:
        dict with the keys 'vocab', 'doc_topic_dists', 'doc_lengths',
        'term_frequency' and 'topic_term_dists'.
    """
    # Corpus-wide count of every vocabulary term = column sums of the
    # document-term matrix. (cvectorizer.vocabulary_ maps each term to its
    # column index, so it is not a frequency and must not be used here.)
    # np.asarray(...).ravel() flattens the 1 x n_terms result of a sparse sum.
    term_frequency = np.asarray(cvz.sum(axis=0)).ravel()

    data = {
        'vocab': vocab,
        'doc_topic_dists': lda_model.doc_topic_,
        'doc_lengths': list(lda_df['len_docs']),
        'term_frequency': term_frequency,
        'topic_term_dists': lda_model.components_
    }
    return data

In [15]:
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top.
import pyLDAvis
# Put pyLDAvis into notebook mode (presumably so prepared visualisations render inline).
pyLDAvis.enable_notebook()

In [16]:
# gather the pyLDAvis inputs (vocab, distributions, lengths, frequencies) into one dict
ldadata = prepareLDAData()

In [17]:
# Build the pyLDAvis visualisation data from the dict above; mds='tsne' is passed
# as the scaling option for laying out the topics.
prepared_data = pyLDAvis.prepare(mds='tsne', **ldadata)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [18]:
# Write the interactive visualisation to './interactive plots/<sector>.html'.
# Create the output folder first so the save does not fail when it is missing.
plot_dir = './interactive plots'
os.makedirs(plot_dir, exist_ok=True)
pyLDAvis.save_html(prepared_data, '{}/{}.html'.format(plot_dir, chosen))

In [None]:
# Number of documents assigned to each topic, listed from the most to the
# least populated topic as (topic id, count) pairs.
c = Counter(lda_df['topic'])
c.most_common()

In [None]:
# NOTE(review): `test` is not defined in any cell visible in this notebook, so this
# line raises NameError on a fresh kernel (Restart & Run All). It looks like a
# leftover from a deleted cell: remove it or point it at the intended frame.
test.shape