# Second Stage 2

In this stage, mainly conducted Topic modeling using LDA (Latent Dirichlet Allocation) on a sample size of the original data. As we used t-SNE to reduce feature dimension, sample size is limited due to the processing time. Maybe other method should be considered if the data size is big or should break down data into chunks for easier processing.

*Major difference with second stage: focusing on one sector's topic instead of everything*

In [19]:
import warnings
import lda
import logging
import dill 
import os
import pandas as pd
import numpy as np
import bokeh.plotting as bp
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
from collections import Counter

warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.getLogger("lda").setLevel(logging.WARNING)
code_dir = os.getcwd()
data_dir = code_dir.replace('code', 'data')

In [2]:
sub_body = dill.load(open(data_dir+'/#patent_title_lemma.pkd', 'rb'))

tokens = sub_body['title_lemma'].tolist()
titles = []
for token in tokens:
    titles.append(' '.join(token))

sub_body['title_lemma'] = titles
sub_body = sub_body.drop_duplicates(subset='title_lemma')
sub_body = sub_body.dropna(subset=['sector'])
sub_body.shape

(395873, 8)

In [None]:
set(sub_body['sector'])

In [3]:
# set the sub dataframe that you want to focus on
chosen = 'Information Technology'
sub_body1 = sub_body[sub_body['sector'] == chosen]
sub_body2 = sub_body1[sub_body1['exec_dt']>'2010-01-01']
sub_body2.shape

(86222, 8)

In [70]:
sub_body2.head()

Unnamed: 0,exec_dt,or_name,title,ee_name_matched,ticker,sector,title_stem,title_lemma
4132,2010-01-12,"FULKERSON, TERRENCE M.","[pump, throat]",NORDSON CORPORATION,NDSN,Information Technology,"[pump, throat]",pump throat
4138,2010-01-12,"FULKERSON, TERRENCE M.","[pump, injector]",NORDSON CORPORATION,NDSN,Information Technology,"[pump, injector]",pump injector
4174,2010-06-10,"GAON, MARTIN, MR.","[adhesive, sensor, hot, melt, liquid, adhesives]",NORDSON CORPORATION,NDSN,Information Technology,"[adhes, sensor, hot, melt, liquid, adhes]",adhesive sensor hot melt liquid adhesive
4176,2010-07-01,"BONDESON, BENJAMIN J.","[cover, adhesive, dispensing, gun]",NORDSON CORPORATION,NDSN,Information Technology,"[cover, adhes, dispens, gun]",cover adhesive dispensing gun
4184,2010-07-01,"BONDESON, BENJAMIN J.","[thermally, insulated, applicator]",NORDSON CORPORATION,NDSN,Information Technology,"[thermal, insul, applic]",thermally insulated applicator


In [4]:
title_list = sub_body2['title_lemma'].tolist()
title_list[:3]

['pump throat', 'pump injector', 'adhesive sensor hot melt liquid adhesive']

In [97]:
# based on different data size, adjust parameters to make result more reasonable

cvectorizer = CountVectorizer(min_df=1, max_df=8000, max_features=20000, ngram_range=(3,3))
cvz = cvectorizer.fit_transform(sub_body2['title_lemma'])

n_topics = 20
n_iter = 1000
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(cvz)



In [98]:
n_top_words = 10
topic_summaries = []

topic_word = lda_model.topic_word_  # get the topic words
vocab = cvectorizer.get_feature_names()

for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(', '.join(topic_words))

In [8]:
# with open("file.txt", "w") as output:
#     output.write(str(topic_summaries))

In [99]:
tsne_model = TSNE(n_components=2, random_state=0, learning_rate=30, 
                  n_iter=300, angle=0.8, init='pca')
tsne_lda = tsne_model.fit_transform(X_topics)

In [100]:
doc_topic = lda_model.doc_topic_
lda_keys = []
for i, tweet in enumerate(sub_body2['title']):
    lda_keys += [doc_topic[i].argmax()]

In [None]:
# plot_lda = bp.figure(plot_width=700, plot_height=600, title="LDA topic visualization",
#     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
#     x_axis_type=None, y_axis_type=None, min_border=1)

In [101]:
Counter(lda_keys)

Counter({0: 47682,
         1: 2140,
         2: 2051,
         3: 2109,
         4: 2263,
         5: 2216,
         6: 2047,
         7: 2158,
         8: 2231,
         9: 2231,
         10: 2100,
         11: 1995,
         12: 1764,
         13: 1886,
         14: 1964,
         15: 1768,
         16: 1986,
         17: 1844,
         18: 1819,
         19: 1968})

In [102]:
lda_df = pd.DataFrame(tsne_lda, columns=['x','y'])
lda_df['title'] = sub_body2['title_lemma'].tolist()
lda_df['ticker'] = sub_body2['ticker'].tolist()
lda_df['topic'] = lda_keys
lda_df['topic'] = lda_df['topic'].map(int)
lda_df.head()

Unnamed: 0,x,y,title,ticker,topic
0,0.329216,-0.930769,pump throat,NDSN,0
1,0.329216,-0.930769,pump injector,NDSN,0
2,0.329216,-0.930769,adhesive sensor hot melt liquid adhesive,NDSN,0
3,0.329216,-0.930769,cover adhesive dispensing gun,NDSN,0
4,0.329216,-0.930769,thermally insulated applicator,NDSN,0


In [12]:
colormap = np.array(["#6d8dca", "#69de53", "#723bca", "#c3e14c", "#c84dc9", "#68af4e", "#6e6cd5",
"#e3be38", "#4e2d7c", "#5fdfa8", "#d34690", "#3f6d31", "#d44427", "#7fcdd8", "#cb4053", "#5e9981",
"#803a62", "#9b9e39", "#c88cca", "#e1c37b", "#34223b", "#bdd8a3", "#6e3326", "#cfbdce", "#d07d3c",
"#52697d", "#7d6d33", "#d27c88", "#36422b", "#b68f79"])

In [None]:
# plot_lda.scatter(source=lda_df, x='x', y='y', color=colormap[lda_keys])
# hover = plot_lda.select(dict(type=HoverTool))
# hover.tooltips={"title":"@title", "topic":"@topic", "ticker":"@ticker"}
# show(plot_lda)

# visualization of topics using pyLDAvis

In [103]:
lda_df['len_docs'] = lda_df['title'].apply(lambda x: len(x.split(' ')))

In [14]:
def prepareLDAData():
    data = {
        'vocab': vocab,
        'doc_topic_dists': lda_model.doc_topic_,
        'doc_lengths': list(lda_df['len_docs']),
        'term_frequency':cvectorizer.vocabulary_,
        'topic_term_dists': lda_model.components_
    } 
    return data

In [104]:
import pyLDAvis
pyLDAvis.enable_notebook()

In [105]:
ldadata = prepareLDAData()

In [106]:
prepared_data = pyLDAvis.prepare(mds='tsne', **ldadata)

In [107]:
pyLDAvis.save_html(prepared_data,'./interactive plots/{}.html'.format(chosen))

In [108]:
# number of items in each topic
c = Counter(lda_df['topic'])
c.most_common()

[(0, 47682),
 (4, 2263),
 (9, 2231),
 (8, 2231),
 (5, 2216),
 (7, 2158),
 (1, 2140),
 (3, 2109),
 (10, 2100),
 (2, 2051),
 (6, 2047),
 (11, 1995),
 (16, 1986),
 (19, 1968),
 (14, 1964),
 (13, 1886),
 (17, 1844),
 (18, 1819),
 (15, 1768),
 (12, 1764)]

## for each topic, check the company rankings

In [109]:
topic_company_rank_df = pd.DataFrame()
for i in range(n_topics):
    topic_df = lda_df[lda_df['topic']==i]
    top_companies = Counter(topic_df['ticker']).most_common(10)
    topic_company_rank_df[i] = top_companies

In [110]:
topic_company_rank_df.columns = topic_summaries

In [111]:
topic_summaries

['mobile computing device, analog to digital, system method reducing, to digital converter, method apparatus reducing, system method identifying, method system determining, reducing power consumption, system method dynamically, integrated development environment',
 'computer readable medium, system computer readable, method system computer, light emitting diode, handheld communication device, organic light emitting, high aspect ratio, metal oxide semiconductor, light emitting device, long term evolution',
 'handheld electronic device, mobile electronic device, semiconductor memory device, electronic device associated, device associated method, radio access technology, integrated circuit design, mobile telecommunication system, non volatile semiconductor, distributed computing system',
 'dispersed storage network, encoded data slice, shallow trench isolation, data dispersed storage, time of flight, built in self, distributed storage network, automatic gain control, method use therewith,

In [112]:
topic_company_rank_df

Unnamed: 0,"mobile computing device, analog to digital, system method reducing, to digital converter, method apparatus reducing, system method identifying, method system determining, reducing power consumption, system method dynamically, integrated development environment","computer readable medium, system computer readable, method system computer, light emitting diode, handheld communication device, organic light emitting, high aspect ratio, metal oxide semiconductor, light emitting device, long term evolution","handheld electronic device, mobile electronic device, semiconductor memory device, electronic device associated, device associated method, radio access technology, integrated circuit design, mobile telecommunication system, non volatile semiconductor, distributed computing system","dispersed storage network, encoded data slice, shallow trench isolation, data dispersed storage, time of flight, built in self, distributed storage network, automatic gain control, method use therewith, sidewall image transfer","system method using, system method implementing, system method device, mobile data network, management system method, optical transport network, thermal interface material, system method generating, image forming device, system method secure","flash memory device, printed circuit board, direct memory access, natural language processing, intermediate transfer member, system method automatically, self aligned contact, system on chip, flash memory cell, service level agreement","dc dc converter, method system apparatus, local area network, wireless local area, method apparatus providing, plasma processing system, solid state drive, end to end, software defined network, digital right management","apparatus associated method, system method detecting, method apparatus controlling, phase change ink, device to device, method apparatus generating, radio communication system, system method facilitating, optical communication system, associated method facilitating","cloud computing environment, data storage system, system method generating, system method performing, method system managing, system method dynamic, apparatus method system, method system processing, method system detecting, method system controlling","system method providing, graphical user interface, portable electronic device, wireless communication system, wireless communication network, method apparatus providing, method system providing, method operation thereof, method program system, ink based digital","method apparatus system, system method managing, graphic processing unit, application programming interface, silicon on insulator, bipolar junction transistor, semiconductor on insulator, based access control, content addressable memory, device control method","processor method system, integrated circuit device, integrated circuit package, method apparatus performing, method system instruction, question answer system, integrated circuit chip, semiconductor integrated circuit, method apparatus use, testing integrated circuit","wireless communication device, mobile wireless communication, near field communication, programmable gate array, electronic device including, field programmable gate, touch sensitive display, communication system providing, field communication nfc, portable electronic device","field effect transistor, mobile communication device, system method controlling, fin field effect, source drain region, effect transistor device, method apparatus use, method system creating, distributed processing system, integrated circuit ic","data processing device, replacement metal gate, data processing system, method apparatus processing, magnetic recording medium, high metal gate, metal oxide semiconductor, near eye display, hand held electronic, three dimensional 3d","computer program product, method computer program, system method computer, system computer program, method system computer, method apparatus computer, system method program, apparatus computer program, method program product, system method multi","apparatus system method, system method determining, system apparatus method, device system method, question answering system, system method wireless, out of order, forward error correction, database management system, point in time","non volatile memory, system method enabling, method apparatus managing, volatile memory cell, volatile memory device, system method processing, programmable logic device, user interface element, system method adjusting, multi core processor","random access memory, peer to peer, electronic device method, phase locked loop, three dimensional object, semiconductor device method, memory device method, communication device method, static random access, device method controlling","networked computing environment, system method apparatus, phase change memory, method apparatus pertaining, computer readable recording, readable recording medium, voltage controlled oscillator, virtual machine image, method apparatus using, content management system"
0,"(IBM, 16917)","(IBM, 655)","(IBM, 561)","(IBM, 868)","(IBM, 730)","(IBM, 716)","(IBM, 560)","(IBM, 530)","(IBM, 688)","(IBM, 573)","(IBM, 738)","(IBM, 625)","(BBRY, 592)","(IBM, 801)","(IBM, 605)","(IBM, 660)","(IBM, 558)","(IBM, 467)","(IBM, 454)","(IBM, 646)"
1,"(MSFT, 8625)","(INTC, 293)","(BBRY, 532)","(INTC, 245)","(INTC, 326)","(INTC, 268)","(INTC, 291)","(BBRY, 332)","(INTC, 312)","(BBRY, 361)","(INTC, 363)","(INTC, 336)","(IBM, 328)","(BBRY, 266)","(INTC, 233)","(INTC, 200)","(INTC, 393)","(INTC, 308)","(INTC, 251)","(INTC, 335)"
2,"(INTC, 5851)","(BBRY, 189)","(INTC, 255)","(MSFT, 153)","(BBRY, 184)","(CY, 265)","(BBRY, 227)","(INTC, 248)","(BBRY, 190)","(INTC, 333)","(BBRY, 212)","(BBRY, 179)","(INTC, 180)","(INTC, 213)","(BBRY, 209)","(MSFT, 180)","(BBRY, 214)","(CY, 184)","(BBRY, 194)","(BBRY, 225)"
3,"(BBRY, 2124)","(MSFT, 167)","(MSFT, 129)","(CY, 109)","(MSFT, 160)","(BBRY, 167)","(TXN, 136)","(XRX, 209)","(XRX, 163)","(MSFT, 212)","(MSFT, 133)","(MSFT, 156)","(MSFT, 105)","(MSFT, 109)","(MSFT, 183)","(NVDA, 168)","(MSFT, 120)","(BBRY, 169)","(CY, 136)","(MSFT, 124)"
4,"(TXN, 1996)","(TXN, 112)","(CY, 121)","(TXN, 103)","(XRX, 154)","(MSFT, 142)","(CY, 136)","(MSFT, 144)","(MSFT, 160)","(XRX, 118)","(XRX, 96)","(XRX, 114)","(XRX, 67)","(XRX, 107)","(XRX, 103)","(BBRY, 90)","(TXN, 114)","(MSFT, 142)","(MSFT, 116)","(XRX, 91)"
5,"(XRX, 1797)","(XRX, 102)","(XRX, 93)","(BBRY, 95)","(SYMC, 79)","(XRX, 117)","(MSFT, 131)","(SYMC, 122)","(TXN, 108)","(SYMC, 63)","(TXN, 87)","(TXN, 89)","(HRS, 66)","(CY, 56)","(NVDA, 73)","(TXN, 88)","(XRX, 89)","(XRX, 88)","(TXN, 112)","(TXN, 71)"
6,"(CY, 1022)","(XXIA, 98)","(TXN, 44)","(XRX, 82)","(TXN, 76)","(TXN, 86)","(XRX, 59)","(TXN, 73)","(CY, 80)","(CY, 58)","(CY, 80)","(CY, 64)","(MSCC, 66)","(TXN, 50)","(TXN, 64)","(XRX, 54)","(SYMC, 57)","(TXN, 48)","(XRX, 86)","(CY, 57)"
7,"(ADBE, 847)","(CY, 56)","(SYMC, 32)","(HRS, 42)","(CIEN, 67)","(SYMC, 44)","(LRCX, 54)","(ADBE, 59)","(SYMC, 60)","(NVDA, 45)","(SYMC, 74)","(SYMC, 41)","(TXN, 58)","(NVDA, 25)","(SYMC, 59)","(CY, 41)","(CY, 40)","(MSCC, 44)","(LRCX, 57)","(HRS, 43)"
8,"(NVDA, 763)","(OLED, 47)","(NVDA, 29)","(NVDA, 37)","(CY, 59)","(NVDA, 32)","(CIEN, 47)","(CY, 54)","(NVDA, 48)","(TXN, 43)","(NVDA, 47)","(NVDA, 33)","(CY, 53)","(CIEN, 22)","(CY, 56)","(JDSU, 26)","(NVDA, 35)","(NVDA, 44)","(HRS, 37)","(ADBE, 43)"
9,"(HRS, 633)","(HRS, 37)","(HRS, 24)","(CIEN, 36)","(NVDA, 37)","(HRS, 31)","(LNKD, 34)","(HRS, 48)","(HRS, 47)","(HRS, 41)","(ADBE, 26)","(HRS, 31)","(NVDA, 38)","(ADBE, 21)","(CIEN, 39)","(ADBE, 25)","(WATT, 35)","(SYMC, 36)","(MSCC, 32)","(NVDA, 40)"
