# Second Stage

In this stage, mainly conducted LDA (Latent Dirichlet Allocation) study on a sample size of the original data. As we used t-SNE to reduce feature dimension, sample size is limited due to the processing time. Maybe other method should be considered if the data size is big or should break down data into chunks for easier processing.

In [17]:
import warnings
import lda
import logging
import dill 
import os
import pandas as pd
import numpy as np
import bokeh.plotting as bp
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
from collections import Counter

warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.getLogger("lda").setLevel(logging.WARNING)
code_dir = os.getcwd()
data_dir = code_dir.replace('code', 'data')

In [139]:
sub_body = dill.load(open(data_dir+'/#patent_title_lemma.pkd', 'rb'))

tokens = sub_body['title_lemma'].tolist()
titles = []
for token in tokens:
    titles.append(' '.join(token))

sub_body['title_lemma'] = titles
sub_body = sub_body.drop_duplicates(subset='title_lemma')
sub_body = sub_body.dropna(subset=['sector'])
sub_body.shape

(395873, 8)

In [108]:
c = Counter(sub_body['sector'].tolist())
c.most_common()

[('Information Technology', 256658),
 ('Industrials', 62841),
 ('Telecom Services', 22903),
 ('Financials', 14141),
 ('Healthcare', 13445),
 ('Consumer Cyclicals', 9033),
 ('Energy', 7300),
 ('Raw & Intermediate Materials', 6459),
 ('Consumer Staples', 2265),
 ('Utilities', 394)]

In [140]:
sample = sub_body.sample(3000)
sample.head()

Unnamed: 0,exec_dt,or_name,title,ee_name_matched,ticker,sector,title_stem,title_lemma
162630,2002-04-09,"HO, JIN-MENG","[signaling, parameterized, quality, service, (...",TEXAS INSTRUMENTS INCORPORATED,TXN,Information Technology,"[signal, parameter, qualiti, servic, qo, support]",signaling parameterized quality service qos su...
31916,2001-01-04,"MCCLOSKEY, PATRICK JOSEPH","[optimization, polycarbonate, preparation, tra...",GENERAL ELECTRIC COMPANY,GE,Industrials,"[optim, polycarbon, prepar, transesterif]",optimization polycarbonate preparation transes...
249883,2001-05-02,"BATES, CARY LEE","[operation, control, system, ensuring, availab...",INTERNATIONAL BUSINESS MACHINES CORPORATION,IBM,Information Technology,"[oper, control, system, ensur, avail, purchas,...",operation control system ensuring availability...
1155297,2010-06-17,"PAEK, TIMOTHY S.","[real-time, typing, assistance]",MICROSOFT CORPORATION,MSFT,Information Technology,"[real-tim, type, assist]",real-time typing assistance
1680815,2008-10-27,"ANDERSEN, DAVID M.","[local, collector]",BANK OF AMERICA CORPORATION,BAC,Financials,"[local, collector]",local collector


In [156]:
title_list = sample['title_lemma'].tolist()
title_list

['signaling parameterized quality service qos support',
 'optimization polycarbonate preparation transesterification',
 'operation control system ensuring availability purchasable item networked machine',
 'real-time typing assistance',
 'local collector',
 'radiation curable silicon containing polyacrylate hardcoat composition method making use',
 'input/output core design method manufacture therefor',
 'pseudo wire mobility management',
 'technique introduce advanced functional behavior database management system without introducing new data type',
 'microdisplay eye gaze detection',
 'multi-space distribution pattern recognition based mixed continuous discrete observation',
 'method system apparatus measuring temperature cold junction compensation',
 'system method touch screen control ultrasound system',
 'method apparatus binding subscriber authentication device authentication communication system',
 'intelligent migration device different hardware software configuration',
 'appar

In [165]:
cvectorizer = CountVectorizer(min_df=4, max_features=2000, ngram_range=(2,3))
cvz = cvectorizer.fit_transform(sample['title_lemma'])

n_topics = 10
n_iter = 1000
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(cvz)



In [179]:
n_top_words = 8
topic_summaries = []

topic_word = lda_model.topic_word_  # get the topic words
vocab = cvectorizer.get_feature_names()

for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(', '.join(topic_words))

In [182]:
# topic_word

In [183]:
tsne_model = TSNE(n_components=2, random_state=0, learning_rate=30, n_iter=300, angle=0.7, init='pca')
tsne_lda = tsne_model.fit_transform(X_topics)

In [184]:
doc_topic = lda_model.doc_topic_
lda_keys = []
for i, tweet in enumerate(sample['title']):
    lda_keys += [doc_topic[i].argmax()]

In [185]:
plot_lda = bp.figure(plot_width=700, plot_height=600, title="LDA topic visualization",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

In [187]:
lda_df = pd.DataFrame(tsne_lda, columns=['x','y'])
lda_df['title'] = sample['title_lemma'].tolist()
lda_df['sector'] = sample['sector'].tolist()
lda_df['ticker'] = sample['ticker'].tolist()
lda_df.head()

Unnamed: 0,x,y,title,sector,ticker
0,0.590467,1.369476,signaling parameterized quality service qos su...,Information Technology,TXN
1,0.590467,1.369476,optimization polycarbonate preparation transes...,Industrials,GE
2,-5.596375,-1.139622,operation control system ensuring availability...,Information Technology,IBM
3,-4.307505,5.045486,real-time typing assistance,Information Technology,MSFT
4,0.590467,1.369476,local collector,Financials,BAC


In [40]:
# Counter(sample['sector'])

In [188]:
lda_df['topic'] = lda_keys
lda_df['topic'] = lda_df['topic'].map(int)

In [33]:
colormap = np.array(["#6d8dca", "#69de53", "#723bca", "#c3e14c", "#c84dc9", "#68af4e", "#6e6cd5",
"#e3be38", "#4e2d7c", "#5fdfa8", "#d34690", "#3f6d31", "#d44427", "#7fcdd8", "#cb4053", "#5e9981",
"#803a62", "#9b9e39", "#c88cca", "#e1c37b", "#34223b", "#bdd8a3", "#6e3326", "#cfbdce", "#d07d3c",
"#52697d", "#7d6d33", "#d27c88", "#36422b", "#b68f79"])

In [119]:
# plot_lda.scatter(source=lda_df, x='x', y='y', color=colormap[lda_keys])

# hover = plot_lda.select(dict(type=HoverTool))
# hover.tooltips={"title":"@title", "topic":"@topic", "sector":"@sector", "ticker":"@ticker"}
# show(plot_lda)

# visualization of topics using pyLDAvis

In [189]:
lda_df['len_docs'] = lda_df['title'].apply(lambda x: len(x.split(' ')))

In [190]:
def prepareLDAData():
    data = {
        'vocab': vocab,
        'doc_topic_dists': lda_model.doc_topic_,
        'doc_lengths': list(lda_df['len_docs']),
        'term_frequency':cvectorizer.vocabulary_,
        'topic_term_dists': lda_model.components_
    } 
    return data

In [191]:
ldadata = prepareLDAData()

In [123]:
import pyLDAvis
pyLDAvis.enable_notebook()

In [192]:
prepared_data = pyLDAvis.prepare(**ldadata)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [193]:
pyLDAvis.save_html(prepared_data,'./pyldadavis.html')

In [194]:
test = lda_df[lda_df['topic'] == 0]
test

Unnamed: 0,x,y,title,sector,ticker,topic,len_docs
0,0.590467,1.369476,signaling parameterized quality service qos su...,Information Technology,TXN,0,6
1,0.590467,1.369476,optimization polycarbonate preparation transes...,Industrials,GE,0,4
4,0.590467,1.369476,local collector,Financials,BAC,0,2
7,0.590467,1.369476,pseudo wire mobility management,Telecom Services,QCOM,0,4
8,0.192266,-3.692449,technique introduce advanced functional behavi...,Information Technology,IBM,0,13
9,0.590467,1.369476,microdisplay eye gaze detection,Information Technology,INTC,0,4
10,0.590467,1.369476,multi-space distribution pattern recognition b...,Information Technology,MSFT,0,9
13,-4.480450,6.587622,method apparatus binding subscriber authentica...,Telecom Services,QCOM,0,9
14,0.590467,1.369476,intelligent migration device different hardwar...,Telecom Services,QCOM,0,7
18,0.590467,1.369476,inkjet ink comprising modified pigment attache...,Raw & Intermediate Materials,CBT,0,8


In [131]:
Counter(test['sector'])

Counter({'Consumer Cyclicals': 61,
         'Consumer Staples': 25,
         'Energy': 56,
         'Financials': 100,
         'Healthcare': 97,
         'Industrials': 446,
         'Information Technology': 1846,
         'Raw & Intermediate Materials': 52,
         'Telecom Services': 132,
         'Utilities': 2})

In [96]:
test.shape

(81, 7)