In [206]:
import pandas as pd 
import dill
%matplotlib inline

# TFIDF based clustering 

- for each company and each patent, convert text desp to numeric scores
- if company has multiple patents, its score will be the average score (averaging across patents)


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [5]:
import os

code_dir = os.getcwd()
data_dir = code_dir.replace('code','data')

In [11]:
data = pd.read_csv(data_dir+'/ee_or_matched.csv', index_col=0, dtype='str')

In [207]:
titles = dill.load(open(data_dir+'/tokened_titles.pkd', 'rb'))

In [19]:
tickerlist = pd.read_csv(data_dir+'/companytickers.csv')

In [211]:
len(titles)

1377018

In [25]:
ticks = list(tickerlist['Ticker'])

In [212]:
data_s = titles.sample(frac=0.1)
len(data_s)

137702

In [38]:
from tqdm import tqdm

In [70]:
# data_s[data_s['ticker'] == 'GE']['title'].astype(str)

In [213]:
# each company's patent title combine to one document and then put all
# into documents list.
documents = []
ticker_valid = []
for ticker in tqdm(ticks):    
    uniq_titles = data_s[data_s["ticker"]==ticker]["title"].astype(str)
    if len(uniq_titles)>0:
        titles = (', '.join(list(uniq_titles)))
        documents.append(titles)
        ticker_valid.append(ticker)

100%|██████████| 6653/6653 [01:28<00:00, 74.88it/s]


In [219]:
len(documents)

465

In [220]:
vectorizer = TfidfVectorizer(ngram_range=(2,2),stop_words='english', max_df=0.7)
X = vectorizer.fit_transform(documents)

# K Means

In [221]:
true_k = 4
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=4, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [222]:
# model by kmeans
order_centroids = model.cluster_centers_.argsort()[:,::-10]
terms = vectorizer.get_feature_names()
cluster = dict()
for i in range(true_k):
    cluster[i] = [terms[ind] for ind in order_centroids[i, :10]]

In [223]:
df2 = pd.DataFrame()
df2['ticker'] = ticker_valid
df2['cluster'] = model.labels_

In [224]:
clustered = df2.sort_values(by='cluster')
clustered.groupby('cluster').size()

cluster
0     13
1    326
2    107
3     19
dtype: int64

In [225]:
cluster0_tickers_km = list(df2[df2['cluster'] == 0]['ticker'])
cluster1_tickers_km = list(df2[df2['cluster'] == 1]['ticker'])
cluster2_tickers_km = list(df2[df2['cluster'] == 2]['ticker'])
cluster3_tickers_km = list(df2[df2['cluster'] == 3]['ticker'])

In [227]:
clusters=[cluster0_tickers_km, cluster1_tickers_km, cluster2_tickers_km, cluster3_tickers_km]

In [228]:
dill.dump(clusters, open('clusters_kmeans_tokened.pkd', 'wb'))

In [226]:
cluster

{0: ['methods systems',
  'feeder programming',
  'systems dynamic',
  'entities interface',
  'mapped sensor',
  'sulfide ores',
  'monitoring defense',
  'wireless sound',
  'including arsenic',
  'values including'],
 1: ['method apparatus',
  'use thereof',
  'method providing',
  'methods using',
  'power supply',
  'devices systems',
  'method fabricating',
  'composition method',
  'structure method',
  'mobile device'],
 2: ['fpd aircraft',
  'alfalfa variety',
  'cantilever array',
  'compounds antibacterial',
  'function tester',
  'lipid rafts',
  'receipt management',
  'centralizing lock',
  'units transaction',
  'exhibit display'],
 3: ['fuel cell',
  'oxide fuel',
  'function heat',
  'electric power',
  'designs assembly',
  'magnetic recording',
  'assembly method',
  'tool retractable',
  'gas continuous',
  'ash solids']}

# affinity propagation

In [196]:
from sklearn.cluster import AffinityPropagation

ap = AffinityPropagation(damping=0.5, preference=-3.6)
ap.fit(X)

len(ap.cluster_centers_indices_)

4

In [205]:
ap.cluster_centers_indices_
print(terms[52],
    terms[218],
    terms[233],
    terms[467])

12 antibodies 2write register 360 discharge 91348793299 rubber


In [197]:
# from affinity propagation model
df = pd.DataFrame()
df['ticker'] = ticker_valid
df['cluster'] = ap.labels_

In [198]:
# from affinity propagation model
df.groupby('cluster').size()

cluster
0     83
1      8
2    301
3     76
dtype: int64

In [199]:
cluster0_tickers_ap = list(df[df['cluster'] == 0]['ticker'])
cluster1_tickers_ap = list(df[df['cluster'] == 1]['ticker'])
cluster2_tickers_ap = list(df[df['cluster'] == 2]['ticker'])
cluster3_tickers_ap = list(df[df['cluster'] == 3]['ticker'])

# others

In [147]:
companylist=pd.read_csv('companylist.csv', index_col=0)
companylist = companylist.rename(columns={'Symbol':'ticker'})
mergedlist = pd.merge(df, companylist, on='ticker', how='inner' )
tickerlist = list(mergedlist[mergedlist['cluster']==0]['ticker'].unique())

In [148]:
titles = []

for ticker in tickerlist:
    title = list(set(df_sector[df_sector['ticker']==ticker]['title']))
    titles.append(title)