# Second Stage 2

In this stage, we conduct a topic-modeling study with Gensim on a sample of the original data. Because t-SNE is used to reduce the feature dimensionality, the sample size is limited by processing time. For larger data sets, another dimensionality-reduction method should be considered, or the data should be split into chunks for easier processing.

*Major difference from the second stage: this notebook focuses on the topics of a single sector instead of all sectors.*

In [7]:
import warnings
import gensim
import logging
import dill 
import os
import pandas as pd
import numpy as np
import bokeh.plotting as bp
import random

from gensim import corpora, models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
from collections import Counter

# Locate the data directory: take the current working directory and swap every
# occurrence of 'code' in its path for 'data'.
code_dir = os.getcwd()
data_dir = code_dir.replace('code', 'data')

# Hide DeprecationWarning messages and raise the "lda" logger threshold to WARNING.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
logging.getLogger(name="lda").setLevel(level=logging.WARNING)

In [2]:
# Load the pickled patent-title frame. The file is opened in a `with` block so
# the handle is closed as soon as deserialisation finishes.
# NOTE(review): dill.load can execute arbitrary code while unpickling -- only
# point this at a trusted file.
with open(data_dir+'/#patent_title_lemma.pkd', 'rb') as pkd_file:
    sub_body = dill.load(pkd_file)

# Collapse each row's list of lemma tokens into one space-separated string so
# titles can be compared and de-duplicated as plain text.
tokens = sub_body['title_lemma'].tolist()
titles = [' '.join(token) for token in tokens]
sub_body['title_lemma'] = titles

# Keep one row per distinct lemmatised title and drop rows without a sector.
sub_body = sub_body.drop_duplicates(subset='title_lemma')
sub_body = sub_body.dropna(subset=['sector'])
sub_body.shape

(395873, 8)

In [3]:
# Distinct sector labels present after cleaning.
set(sub_body['sector'].unique())

{'Consumer Cyclicals',
 'Consumer Staples',
 'Energy',
 'Financials',
 'Healthcare',
 'Industrials',
 'Information Technology',
 'Raw & Intermediate Materials',
 'Telecom Services',
 'Utilities'}

In [4]:
# Pick the sector to analyse, then keep only rows whose exec_dt compares
# greater than '2010-01-01'.
chosen = 'Information Technology'
in_chosen_sector = sub_body['sector'] == chosen
sub_body1 = sub_body[in_chosen_sector]
after_cutoff = sub_body1['exec_dt'] > '2010-01-01'
sub_body2 = sub_body1[after_cutoff]
sub_body2.shape

(86222, 8)

In [5]:
# Preview the first rows of the sector- and date-filtered frame.
sub_body2.head()

Unnamed: 0,exec_dt,or_name,title,ee_name_matched,ticker,sector,title_stem,title_lemma
4132,2010-01-12,"FULKERSON, TERRENCE M.","[pump, throat]",NORDSON CORPORATION,NDSN,Information Technology,"[pump, throat]",pump throat
4138,2010-01-12,"FULKERSON, TERRENCE M.","[pump, injector]",NORDSON CORPORATION,NDSN,Information Technology,"[pump, injector]",pump injector
4174,2010-06-10,"GAON, MARTIN, MR.","[adhesive, sensor, hot, melt, liquid, adhesives]",NORDSON CORPORATION,NDSN,Information Technology,"[adhes, sensor, hot, melt, liquid, adhes]",adhesive sensor hot melt liquid adhesive
4176,2010-07-01,"BONDESON, BENJAMIN J.","[cover, adhesive, dispensing, gun]",NORDSON CORPORATION,NDSN,Information Technology,"[cover, adhes, dispens, gun]",cover adhesive dispensing gun
4184,2010-07-01,"BONDESON, BENJAMIN J.","[thermally, insulated, applicator]",NORDSON CORPORATION,NDSN,Information Technology,"[thermal, insul, applic]",thermally insulated applicator


In [6]:
# Plain Python list of the space-joined lemmatised titles.
title_list = sub_body2['title_lemma'].values.tolist()

In [9]:
# Split every title back into its tokens on single spaces.
title_tokens = list(map(lambda title: title.split(' '), title_list))

In [12]:
# based on different data size, adjust parameters to make result more reasonable

# Earlier approach (the `lda` package on trigram counts), kept for reference.
# NOTE(review): later cells still read `lda_model`, `cvectorizer` and `X_topics`,
# which only exist if this commented-out code has been run.
# cvectorizer = CountVectorizer(min_df=1, max_df=8000, max_features=20000, ngram_range=(3,3))
# cvz = cvectorizer.fit_transform(sub_body2['title_lemma'])

# n_iter = 1000
# lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
# X_topics = lda_model.fit_transform(cvz)

# Number of topics. Defined as live code because the company-ranking cell
# iterates over range(n_topics).
n_topics = 20
# Fixed seed so repeated runs start from the same random state.
LDA_SEED = 0

# Map tokens to integer ids, convert each title to a bag of words, and fit LDA.
dictionary = corpora.Dictionary(title_tokens)
corpus = [dictionary.doc2bow(doc) for doc in title_tokens]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=n_topics,
                                          id2word=dictionary, passes=50,
                                          random_state=LDA_SEED)

In [23]:
# Show a single topic as a formatted "weight*word + ..." string.
ldamodel.show_topics(num_topics=1, num_words=10, log=False, formatted=True)

[(18,
  '0.092*"test" + 0.079*"memory" + 0.035*"cell" + 0.028*"search" + 0.026*"equipment" + 0.023*"probe" + 0.021*"parametric" + 0.021*"using" + 0.021*"error" + 0.020*"camera"')]

In [24]:
# List up to 20 topics with their ten highest-weighted words.
ldamodel.print_topics(num_topics=20, num_words=10)

[(0,
  '0.034*"event" + 0.031*"module" + 0.031*"notification" + 0.031*"key" + 0.027*"monitoring" + 0.019*"variable" + 0.019*"messaging" + 0.019*"displaying" + 0.018*"distribution" + 0.017*"utilizing"'),
 (1,
  '0.110*"display" + 0.071*"optical" + 0.043*"testing" + 0.037*"controlling" + 0.036*"antenna" + 0.021*"link" + 0.021*"configuration" + 0.020*"programmable" + 0.017*"receiver" + 0.017*"machine"'),
 (2,
  '0.041*"access" + 0.035*"data" + 0.031*"file" + 0.026*"routing" + 0.025*"transfer" + 0.021*"memory" + 0.020*"processor" + 0.020*"group" + 0.017*"pattern" + 0.017*"using"'),
 (3,
  '0.214*"device" + 0.113*"electronic" + 0.105*"method" + 0.031*"related" + 0.022*"including" + 0.020*"portable" + 0.019*"assembly" + 0.016*"associated" + 0.016*"tester" + 0.013*"sensing"'),
 (4,
  '0.043*"low" + 0.039*"high" + 0.034*"adaptive" + 0.031*"protocol" + 0.028*"battery" + 0.025*"rate" + 0.024*"remote" + 0.023*"control" + 0.022*"efficient" + 0.017*"point"'),
 (5,
  '0.083*"management" + 0.044*"rad

In [98]:
# Summarise each topic by its n_top_words highest-weighted vocabulary terms,
# joined into one comma-separated string.
n_top_words = 10

topic_word = lda_model.topic_word_  # per-topic weights over the vocabulary
vocab = cvectorizer.get_feature_names()
vocab_terms = np.array(vocab)

topic_summaries = [
    # argsort is ascending, so reverse it and keep the leading n_top_words terms
    ', '.join(vocab_terms[np.argsort(weights)][::-1][:n_top_words])
    for weights in topic_word
]

In [8]:
# with open("file.txt", "w") as output:
#     output.write(str(topic_summaries))

In [99]:
# Project the document-topic matrix down to two dimensions with t-SNE.
tsne_settings = dict(
    n_components=2,
    init='pca',
    random_state=0,
    learning_rate=30,
    n_iter=300,
    angle=0.8,
)
tsne_model = TSNE(**tsne_settings)
tsne_lda = tsne_model.fit_transform(X_topics)

In [10]:
# For every title, record the index of its highest-weighted topic.
doc_topic = lda_model.doc_topic_
n_titles = len(sub_body2['title'])
lda_keys = [doc_topic[row].argmax() for row in range(n_titles)]

In [None]:
# plot_lda = bp.figure(plot_width=700, plot_height=600, title="LDA topic visualization",
#     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
#     x_axis_type=None, y_axis_type=None, min_border=1)

In [91]:
# Number of titles assigned to each dominant topic.
topic_sizes = Counter()
topic_sizes.update(lda_keys)
topic_sizes

Counter({0: 65111,
         1: 1105,
         2: 1332,
         3: 1263,
         4: 1054,
         5: 1219,
         6: 1254,
         7: 1237,
         8: 1249,
         9: 1125,
         10: 1170,
         11: 1101,
         12: 986,
         13: 969,
         14: 1074,
         15: 1049,
         16: 765,
         17: 1050,
         18: 1025,
         19: 1084})

In [11]:
# Combine the 2-D t-SNE coordinates with each title's text, ticker and
# dominant topic in a single frame.
lda_df = pd.DataFrame(tsne_lda, columns=['x', 'y'])
for target_col, source_col in (('title', 'title_lemma'), ('ticker', 'ticker')):
    lda_df[target_col] = sub_body2[source_col].tolist()
# Store topic ids as plain integers.
lda_df['topic'] = [int(key) for key in lda_keys]
lda_df.head()

Unnamed: 0,x,y,title,ticker,topic
0,0.276429,0.487958,pump throat,NDSN,0
1,0.276429,0.487958,pump injector,NDSN,0
2,0.276429,0.487958,adhesive sensor hot melt liquid adhesive,NDSN,0
3,0.276429,0.487958,cover adhesive dispensing gun,NDSN,0
4,0.276429,0.487958,thermally insulated applicator,NDSN,0


In [12]:
# Palette of 30 hex colours, indexable by topic id.
_PALETTE_HEX = """
6d8dca 69de53 723bca c3e14c c84dc9 68af4e 6e6cd5
e3be38 4e2d7c 5fdfa8 d34690 3f6d31 d44427 7fcdd8 cb4053 5e9981
803a62 9b9e39 c88cca e1c37b 34223b bdd8a3 6e3326 cfbdce d07d3c
52697d 7d6d33 d27c88 36422b b68f79
""".split()
colormap = np.array(['#' + hex_code for hex_code in _PALETTE_HEX])

In [None]:
# plot_lda.scatter(source=lda_df, x='x', y='y', color=colormap[lda_keys])
# hover = plot_lda.select(dict(type=HoverTool))
# hover.tooltips={"title":"@title", "topic":"@topic", "ticker":"@ticker"}
# show(plot_lda)

# visualization of topics using pyLDAvis

In [13]:
# Token count of each title (split on single spaces).
lda_df['len_docs'] = [len(title.split(' ')) for title in lda_df['title']]

In [14]:
def prepareLDAData(model=None, doc_term_matrix=None, vocabulary=None, doc_lengths=None):
    """Assemble the keyword arguments passed to ``pyLDAvis.prepare``.

    Every parameter is optional. When one is omitted, the matching notebook
    global is used instead: ``lda_model``, ``cvz``, ``vocab`` and
    ``lda_df['len_docs']`` respectively.

    Parameters
    ----------
    model : object, optional
        Fitted topic model exposing ``doc_topic_`` and ``components_``.
    doc_term_matrix : array-like or sparse matrix, optional
        Document-term count matrix; its column sums become the term frequencies.
        Columns are assumed to be in the same order as ``vocabulary``.
    vocabulary : sequence of str, optional
        Vocabulary terms.
    doc_lengths : iterable of int, optional
        Length of each document.

    Returns
    -------
    dict
        Keys ``vocab``, ``doc_topic_dists``, ``doc_lengths``,
        ``term_frequency`` and ``topic_term_dists``.
    """
    if model is None:
        model = lda_model
    if doc_term_matrix is None:
        doc_term_matrix = cvz
    if vocabulary is None:
        vocabulary = vocab
    if doc_lengths is None:
        doc_lengths = lda_df['len_docs']

    # Term frequency is the corpus-wide count of each term, i.e. the column sums
    # of the document-term matrix. The vectorizer's `vocabulary_` attribute maps
    # terms to column indices, so it must not be used as a frequency.
    # np.asarray(...).ravel() flattens the 1 x V result a sparse matrix returns.
    term_frequency = np.asarray(doc_term_matrix.sum(axis=0)).ravel()

    # NOTE(review): doc_lengths defaults to per-title word counts, which may not
    # equal the row sums of doc_term_matrix -- confirm that is intended.
    return {
        'vocab': vocabulary,
        'doc_topic_dists': model.doc_topic_,
        'doc_lengths': list(doc_lengths),
        'term_frequency': term_frequency,
        'topic_term_dists': model.components_,
    }

In [15]:
# pyLDAvis is imported here rather than in the top import cell; enable_notebook()
# is called so the visualisation can be displayed inline in the notebook.
import pyLDAvis
pyLDAvis.enable_notebook()

In [16]:
# Collect the model outputs into the keyword-argument dict for pyLDAvis.prepare.
ldadata = prepareLDAData()

In [17]:
# Build the pyLDAvis data structure, using t-SNE (mds='tsne') for the topic layout.
prepared_data = pyLDAvis.prepare(mds='tsne', **ldadata)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [18]:
# Write the interactive visualisation to an HTML file named after the chosen sector.
html_path = './interactive plots/{}.html'.format(chosen)
pyLDAvis.save_html(prepared_data, html_path)

In [20]:
# Topic sizes, largest first: (topic id, number of titles) pairs.
c = Counter()
c.update(lda_df['topic'])
c.most_common(None)

[(0, 65111),
 (2, 1332),
 (3, 1263),
 (6, 1254),
 (8, 1249),
 (7, 1237),
 (5, 1219),
 (10, 1170),
 (9, 1125),
 (1, 1105),
 (11, 1101),
 (19, 1084),
 (14, 1074),
 (4, 1054),
 (17, 1050),
 (15, 1049),
 (18, 1025),
 (12, 986),
 (13, 969),
 (16, 765)]

## for each topic, check the company rankings

In [57]:
# One column per topic, holding the (ticker, title count) pairs of its ten most
# frequent tickers. Each column is wrapped in a Series and the frame is built in
# one step, so a topic with fewer than ten distinct tickers is padded with NaN
# instead of raising a length-mismatch error on column assignment.
top_companies_by_topic = {}
for i in range(n_topics):
    topic_df = lda_df[lda_df['topic'] == i]
    top_companies = Counter(topic_df['ticker']).most_common(10)
    # dtype=object keeps each (ticker, count) tuple as a single cell value
    top_companies_by_topic[i] = pd.Series(top_companies, dtype=object)
topic_company_rank_df = pd.DataFrame(top_companies_by_topic)

In [64]:
# Label each topic column with its comma-separated top-term summary.
topic_company_rank_df.columns = pd.Index(topic_summaries)

In [96]:
# Display the per-topic summary strings.
topic_summaries

['cloud computing environment, wireless communication system, system method dynamic, system method wireless, question answer system, management system method, out of order, forward error correction, system method protecting, system method improving',
 'system method controlling, system method generating, phase change memory, data processing device, apparatus system method, programmable logic device, integrated circuit design, service level agreement, solid state memory, method system program',
 'apparatus associated method, printed circuit board, method apparatus providing, question answering system, method apparatus processing, associated method facilitating, radio communication system, system method creating, database management system, pulse width modulation',
 'handheld electronic device, graphical user interface, peer to peer, mobile communication device, system method device, mobile electronic device, mobile communication system, end to end, high aspect ratio, method system proce

In [65]:
# Display the per-topic company ranking table.
topic_company_rank_df

Unnamed: 0,"field effect transistor, random access memory, light emitting diode, metal oxide semiconductor, organic light emitting, static random access, source drain region, fin field effect, dynamic random access, effect transistor device","method apparatus computer, system method apparatus, system method program, system method detecting, self aligned contact, apparatus computer program, method program product, plasma processing system, high aspect ratio, based access control","method system apparatus, three dimensional object, system method using, system method device, virtual machine image, plasma processing chamber, method apparatus using, system method protecting, multi core processor, system method apparatus","system method managing, non volatile memory, phase change ink, volatile memory cell, application programming interface, system on chip, method device system, rf power amplifier, downlink control channel, method manufacturing semiconductor","handheld electronic device, non volatile memory, system method determining, data processing device, device associated method, volatile memory device, graphic processing unit, electronic device associated, handheld communication device, forward error correction","portable electronic device, direct memory access, method apparatus pertaining, method apparatus managing, natural language processing, method apparatus performing, programmable logic device, solid state drive, software defined network, system method dynamic","apparatus system method, system method providing, device system method, processor method system, system method performing, method system instruction, system method wireless, computer readable recording, system method processing, readable recording medium","printed circuit board, system method controlling, electronic device method, data storage system, integrated circuit device, integrated circuit package, optical communication system, mobile data network, point in time, portable electronic 
device","system method providing, system method implementing, system method generating, system method detecting, method apparatus generating, intermediate transfer member, magnetic recording medium, database management system, user interface element, using mobile device","method apparatus providing, near field communication, communication system providing, field communication nfc, system method identifying, feature related method, integrated circuit chip, system method creating, built in self, content management system","graphical user interface, method apparatus controlling, flash memory device, phase change memory, shallow trench isolation, question answer system, time of flight, method system detecting, flash memory cell, input output device","wireless communication system, wireless communication network, semiconductor device including, integrated circuit design, silicon on insulator, data loss prevention, semiconductor on insulator, system method data, communication system including, pulse width modulation","method apparatus system, method system computer, computer readable medium, system computer readable, system computer program, local area network, question answering system, computer program product, wireless local area, phase locked loop","wireless communication device, mobile communication device, mobile wireless communication, programmable gate array, field programmable gate, communication device method, communication device including, antenna related method, low power state, method system identifying","dispersed storage network, system apparatus method, peer to peer, device to device, encoded data slice, data dispersed storage, distributed storage network, apparatus method performing, to device communication, method system creating","cloud computing environment, mobile computing device, analog to digital, to digital converter, bipolar junction transistor, method apparatus processing, method operation thereof, radio access technology, mobile 
telecommunication system, method system generating","computer program product, method computer program, system method computer, data processing system, computer software application, system method user, graphic processing unit, using social network, distributed computing system, apparatus method processing","apparatus associated method, networked computing environment, flash memory device, system method reducing, method apparatus reducing, semiconductor memory device, radio communication system, method system managing, associated method facilitating, out of order","dc dc converter, portable electronic device, electronic device including, method apparatus use, mobile communication device, touch sensitive display, system method facilitating, system method dynamically, instruction logic provide, wireless power transmission","method system providing, apparatus method system, semiconductor device method, memory device method, replacement metal gate, system method using, method system determining, method system processing, universal serial bus, system method adjusting"
0,"(IBM, 23141)","(IBM, 323)","(IBM, 345)","(IBM, 306)","(BBRY, 410)","(IBM, 425)","(INTC, 373)","(IBM, 378)","(IBM, 227)","(IBM, 303)","(IBM, 423)","(IBM, 315)","(INTC, 228)","(BBRY, 419)","(IBM, 525)","(IBM, 275)","(IBM, 290)","(IBM, 261)","(BBRY, 251)","(IBM, 268)"
1,"(MSFT, 10320)","(INTC, 137)","(INTC, 229)","(INTC, 189)","(IBM, 133)","(BBRY, 234)","(IBM, 233)","(BBRY, 173)","(BBRY, 176)","(BBRY, 200)","(CY, 152)","(BBRY, 198)","(IBM, 220)","(IBM, 122)","(INTC, 181)","(INTC, 218)","(NVDA, 168)","(BBRY, 239)","(IBM, 167)","(INTC, 139)"
2,"(INTC, 8023)","(BBRY, 111)","(BBRY, 189)","(XRX, 144)","(INTC, 122)","(INTC, 147)","(BBRY, 154)","(INTC, 147)","(INTC, 169)","(INTC, 182)","(INTC, 147)","(INTC, 179)","(BBRY, 131)","(INTC, 110)","(BBRY, 81)","(BBRY, 136)","(INTC, 75)","(CY, 161)","(INTC, 135)","(BBRY, 117)"
3,"(BBRY, 3286)","(LRCX, 73)","(XRX, 111)","(BBRY, 135)","(NVDA, 71)","(MSFT, 58)","(MSFT, 60)","(TXN, 86)","(XRX, 161)","(MSFT, 75)","(MSFT, 97)","(MSFT, 55)","(XXIA, 95)","(HRS, 59)","(MSFT, 53)","(XRX, 74)","(BBRY, 34)","(INTC, 104)","(TXN, 67)","(XRX, 99)"
4,"(TXN, 2804)","(XRX, 63)","(MSFT, 82)","(CY, 121)","(CY, 54)","(XRX, 50)","(XRX, 58)","(CY, 67)","(MSFT, 69)","(XRX, 51)","(BBRY, 77)","(TXN, 46)","(TXN, 53)","(MSCC, 58)","(CY, 49)","(MSFT, 69)","(MSFT, 23)","(XRX, 65)","(CY, 64)","(CY, 74)"
5,"(XRX, 2603)","(MSFT, 63)","(SYMC, 55)","(SYMC, 56)","(TXN, 53)","(CY, 35)","(SYMC, 49)","(CIEN, 44)","(SYMC, 59)","(SYMC, 47)","(TXN, 45)","(SYMC, 45)","(XRX, 38)","(MSFT, 34)","(XRX, 43)","(TXN, 35)","(XRX, 20)","(MSFT, 40)","(MSFT, 50)","(MSFT, 71)"
6,"(CY, 1697)","(SYMC, 44)","(LRCX, 54)","(TXN, 50)","(MSFT, 48)","(TXN, 30)","(TXN, 40)","(MSFT, 40)","(TXN, 39)","(TXN, 34)","(XRX, 41)","(HRS, 39)","(MSFT, 34)","(XRX, 30)","(SYMC, 19)","(CY, 22)","(CY, 20)","(TXN, 29)","(XRX, 40)","(CIEN, 37)"
7,"(ADBE, 1127)","(TXN, 38)","(CIEN, 40)","(MSFT, 50)","(XRX, 30)","(CIEN, 28)","(NVDA, 32)","(XRX, 38)","(CY, 39)","(CY, 32)","(ADBE, 25)","(XRX, 30)","(SYMC, 23)","(TXN, 18)","(TXN, 17)","(NVDA, 20)","(LNKD, 15)","(HRS, 24)","(NVDA, 31)","(SYMC, 36)"
8,"(NVDA, 1048)","(CY, 33)","(TXN, 33)","(MSCC, 19)","(SYMC, 16)","(LSCC, 23)","(CY, 29)","(XCRA, 32)","(CIEN, 33)","(ADBE, 27)","(SYMC, 22)","(CY, 28)","(MSCC, 19)","(SYMC, 18)","(NVDA, 12)","(HRS, 19)","(SYMC, 14)","(SYMC, 17)","(SYMC, 28)","(TXN, 30)"
9,"(HRS, 913)","(ADBE, 30)","(CY, 27)","(CIEN, 19)","(SYNA, 14)","(SYMC, 23)","(CIEN, 19)","(NVDA, 31)","(WDC, 31)","(LNKD, 27)","(HRS, 20)","(CLIR, 17)","(LSCC, 17)","(ADBE, 11)","(CIEN, 10)","(SYMC, 16)","(IMMR, 14)","(NVDA, 15)","(WATT, 25)","(NVDA, 23)"
