# Second Stage 2

In this stage, mainly conducted LDA (Latent Dirichlet Allocation) study on a sample size of the original data. As we used t-SNE to reduce feature dimension, sample size is limited due to the processing time. Maybe other method should be considered if the data size is big or should break down data into chunks for easier processing.

*Major difference with second stage: focusing on one sector's topic instead of everything*

In [1]:
import warnings
import lda
import logging
import dill 
import os
import pandas as pd
import numpy as np
import bokeh.plotting as bp
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
from collections import Counter

warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.getLogger("lda").setLevel(logging.WARNING)
code_dir = os.getcwd()
data_dir = code_dir.replace('code', 'data')

In [2]:
sub_body = dill.load(open(data_dir+'/#patent_title_lemma.pkd', 'rb'))

tokens = sub_body['title_lemma'].tolist()
titles = []
for token in tokens:
    titles.append(' '.join(token))

sub_body['title_lemma'] = titles
sub_body = sub_body.drop_duplicates(subset='title_lemma')
sub_body = sub_body.dropna(subset=['sector'])
sub_body.shape

(395873, 8)

In [7]:
sub_body1 = sub_body[sub_body['sector'] == 'Information Technology']
sub_body2 = sub_body1[sub_body1['exec_dt']>'2010-01-01']

In [10]:
sub_body2.shape

(86222, 7)

In [37]:
title_list = sub_body2['title_lemma'].tolist()
# title_list

In [82]:
cvectorizer = CountVectorizer(min_df=10, max_df=3000, max_features=2000, ngram_range=(3,3))
cvz = cvectorizer.fit_transform(sub_body2['title_lemma'])

n_topics = 5
n_iter = 1000
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(cvz)



In [83]:
n_top_words = 8
topic_summaries = []

topic_word = lda_model.topic_word_  # get the topic words
vocab = cvectorizer.get_feature_names()

for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(', '.join(topic_words))

topic_summaries

['mobile communication device, non volatile memory, method apparatus system, system method managing, graphical user interface, portable electronic device, electronic device method, system method using',
 'field effect transistor, dispersed storage network, dc dc converter, near field communication, peer to peer, mobile computing device, light emitting diode, replacement metal gate',
 'system method providing, apparatus system method, apparatus associated method, printed circuit board, wireless communication system, method apparatus providing, networked computing environment, system method controlling',
 'system apparatus method, method apparatus controlling, flash memory device, method system apparatus, method system providing, system method apparatus, data storage system, direct memory access',
 'computer program product, wireless communication device, method computer program, system method computer, handheld electronic device, method system computer, mobile wireless communication, ra

In [84]:
X_topics.shape

(86222, 5)

In [85]:
tsne_model = TSNE(n_components=2, random_state=0, learning_rate=30, n_iter=300, angle=0.8, init='pca')
tsne_lda = tsne_model.fit_transform(X_topics)

In [86]:
doc_topic = lda_model.doc_topic_
lda_keys = []
for i, tweet in enumerate(sub_body2['title']):
    lda_keys += [doc_topic[i].argmax()]

In [87]:
plot_lda = bp.figure(plot_width=700, plot_height=600, title="LDA topic visualization",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

In [88]:
lda_df = pd.DataFrame(tsne_lda, columns=['x','y'])
lda_df['title'] = sub_body2['title_lemma'].tolist()
lda_df['ticker'] = sub_body2['ticker'].tolist()
lda_df.head()

Unnamed: 0,x,y,title,ticker
0,0.123207,0.010681,pump throat,NDSN
1,0.123207,0.010681,pump injector,NDSN
2,0.123207,0.010681,adhesive sensor hot melt liquid adhesive,NDSN
3,0.123207,0.010681,cover adhesive dispensing gun,NDSN
4,0.123207,0.010681,thermally insulated applicator,NDSN


In [40]:
# Counter(sample['sector'])

In [89]:
lda_df['topic'] = lda_keys
lda_df['topic'] = lda_df['topic'].map(int)

In [24]:
colormap = np.array(["#6d8dca", "#69de53", "#723bca", "#c3e14c", "#c84dc9", "#68af4e", "#6e6cd5",
"#e3be38", "#4e2d7c", "#5fdfa8", "#d34690", "#3f6d31", "#d44427", "#7fcdd8", "#cb4053", "#5e9981",
"#803a62", "#9b9e39", "#c88cca", "#e1c37b", "#34223b", "#bdd8a3", "#6e3326", "#cfbdce", "#d07d3c",
"#52697d", "#7d6d33", "#d27c88", "#36422b", "#b68f79"])

In [26]:
# plot_lda.scatter(source=lda_df, x='x', y='y', color=colormap[lda_keys])
# hover = plot_lda.select(dict(type=HoverTool))
# hover.tooltips={"title":"@title", "topic":"@topic", "ticker":"@ticker"}
# show(plot_lda)

# visualization of topics using pyLDAvis

In [90]:
lda_df['len_docs'] = lda_df['title'].apply(lambda x: len(x.split(' ')))

In [75]:
def prepareLDAData():
    data = {
        'vocab': vocab,
        'doc_topic_dists': lda_model.doc_topic_,
        'doc_lengths': list(lda_df['len_docs']),
        'term_frequency':cvectorizer.vocabulary_,
        'topic_term_dists': lda_model.components_
    } 
    return data

In [91]:
ldadata = prepareLDAData()

In [77]:
import pyLDAvis
pyLDAvis.enable_notebook()

In [92]:
prepared_data = pyLDAvis.prepare(**ldadata)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [93]:
pyLDAvis.save_html(prepared_data,'./InformationTech_LDA.html')

In [94]:
test = lda_df[lda_df['topic'] == 0]
test

Unnamed: 0,x,y,title,ticker,topic,len_docs
0,0.123207,0.010681,pump throat,NDSN,0,2
1,0.123207,0.010681,pump injector,NDSN,0,2
2,0.123207,0.010681,adhesive sensor hot melt liquid adhesive,NDSN,0,6
3,0.123207,0.010681,cover adhesive dispensing gun,NDSN,0,4
4,0.123207,0.010681,thermally insulated applicator,NDSN,0,3
5,0.123207,0.010681,apparatus method injecting high viscosity derm...,NDSN,0,7
6,0.123207,0.010681,apparatus method injecting dermal filler,NDSN,0,5
7,0.123207,0.010681,self-contained pressurized injection device,NDSN,0,4
8,0.123207,0.010681,powder flow monitoring using grounded hose,NDSN,0,6
9,0.123207,0.010681,coating nozzle coating method inner volume con...,NDSN,0,8


In [95]:
c = Counter(test['ticker'])
c.most_common()

[('IBM', 24580),
 ('MSFT', 10777),
 ('INTC', 9195),
 ('BBRY', 4566),
 ('XRX', 2997),
 ('TXN', 2981),
 ('CY', 2006),
 ('ADBE', 1233),
 ('NVDA', 1160),
 ('HRS', 1026),
 ('SYMC', 1015),
 ('CIEN', 856),
 ('LNKD', 722),
 ('MSCC', 666),
 ('LRCX', 622),
 ('JDSU', 550),
 ('SYNA', 432),
 ('MCHP', 346),
 ('LSCC', 341),
 ('NDSN', 266),
 ('UIS', 261),
 ('NATI', 260),
 ('IPHI', 246),
 ('OLED', 234),
 ('ACCO', 221),
 ('FNSR', 216),
 ('IMMR', 193),
 ('LLTC', 176),
 ('XCRA', 171),
 ('MITL', 170),
 ('QTM', 159),
 ('INFN', 148),
 ('DMRC', 146),
 ('ELX', 144),
 ('AMCC', 143),
 ('FEIC', 143),
 ('CUB', 127),
 ('CLIR', 117),
 ('OIIM', 106),
 ('FLDM', 98),
 ('NPTN', 95),
 ('CGNX', 94),
 ('APH', 92),
 ('TDC', 92),
 ('ALOG', 91),
 ('XXIA', 91),
 ('SMTC', 79),
 ('LUNA', 77),
 ('KOPN', 74),
 ('EMKR', 71),
 ('SANM', 69),
 ('GOOG', 67),
 ('IOTS', 63),
 ('OTEX', 61),
 ('AAPL', 57),
 ('CTS', 53),
 ('IPGP', 52),
 ('MSTR', 49),
 ('WATT', 48),
 ('WDC', 47),
 ('WSTC', 43),
 ('IXYS', 40),
 ('WU', 39),
 ('LRAD', 36),
 ('I

In [96]:
test.shape

(71910, 6)