<a href="https://colab.research.google.com/github/aaubs/ds-master/blob/main/UNISTRA_w2v_sci_pat_link.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://sds-aau.github.io/UNISTRA-DS-2022/workshops/2021/sci_pat_match.zip

--2023-01-30 16:35:18--  https://sds-aau.github.io/UNISTRA-DS-2022/workshops/2021/sci_pat_match.zip
Resolving sds-aau.github.io (sds-aau.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to sds-aau.github.io (sds-aau.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4134574 (3.9M) [application/zip]
Saving to: ‘sci_pat_match.zip’


2023-01-30 16:35:18 (116 MB/s) - ‘sci_pat_match.zip’ saved [4134574/4134574]



In [2]:
!unzip /content/sci_pat_match.zip

Archive:  /content/sci_pat_match.zip
  inflating: patent_nlp_match.csv    
  inflating: __MACOSX/._patent_nlp_match.csv  
  inflating: scopus_neuron.csv       
  inflating: __MACOSX/._scopus_neuron.csv  


In [2]:
!pip install -q umap-learn 
!pip install -q hdbscan
!pip install --upgrade gensim -q
!pip install --force-reinstall -qq numpy==1.22.4

In [3]:
import pandas as pd
import numpy as np

import umap
import hdbscan

import itertools
import random

from collections import Counter

# progress bar
import tqdm

# spaCy: load the small English pipeline used for tokenisation and lemmatisation
import spacy
nlp = spacy.load("en_core_web_sm")



In [4]:
# Read both CSV files relative to the working directory: unzip extracts them
# there, so a hard-coded "/content/" prefix only works when the notebook
# happens to run from that directory.
data_sci = pd.read_csv('scopus_neuron.csv')
data_pat = pd.read_csv('patent_nlp_match.csv')

In [5]:
# Combine title and abstract into a single text field per publication.
# Missing values are replaced by empty strings first: casting NaN with
# astype(str) would inject the literal word "nan" into the text, and a missing
# title would turn the whole concatenation into NaN.
data_sci['text'] = (
    data_sci['Title']
    .fillna('')
    .str.cat(data_sci['Abstract'].fillna('').astype(str), sep=' ')
)

In [6]:
# Preview the first two publication records
data_sci.iloc[:2]

Unnamed: 0,Authors,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,Page start,Page end,...,Authors with affiliations,Abstract,Author Keywords,Index Keywords,Document Type,Publication Stage,Open Access,Source,EID,text
0,"Jelinčić V., von Leupoldt A.",57198435909;55665379900;,To breathe or not to breathe: Interoceptive pr...,2021,Neuron,109,24,,3904,3907,...,"Jelinčić, V., Research Group Health Psychology...",Respiration is gaining traction as an importan...,,anxiety; article; brain; breathing; interocept...,Article,Final,,Scopus,2-s2.0-85122846445,To breathe or not to breathe: Interoceptive pr...
1,"Lavin M.F., Yeo A.J.",35427118100;57412660600;,DNA damage rather than type I IFN signaling is...,2021,Neuron,109,24,,3897,3900,...,"Lavin, M.F., UQ Centre for Clinical Research, ...",Mutations in genes that function in nucleic me...,,,Article,Final,,Scopus,2-s2.0-85122845025,DNA damage rather than type I IFN signaling is...


In [7]:
# Preview the first two patent records
data_pat.iloc[:2]

Unnamed: 0,patent_id,claim_len,section_id,subsection_id,group_id,subgroup_id,text,number,country,date,abstract,title,kind,num_claims,uuid
0,8688141,257,H,H04,H04L,H04L51/20,A method comprising: receiving one or more net...,8688141,US,4/1/2014,"In certain embodiments, a method for proximity...",System and method for providing communication ...,B2,26,0bcf7529-ebc1-11ea-a344-121df0c29c1e
1,8688140,231,"G, G","G01, G01","G01S, G01S","G01S3/48, G01S3/043",A method for locating a radio frequency tag us...,8688140,US,4/1/2014,Determination of the location and bearing of a...,Radio frequency tag location system and method,B2,28,03637655-ebba-11ea-a344-121df0c29c1e


In [8]:
# run progress bare and clean up using spacy but without some heavy parts of the pipeline

%%time
clean_sci = []


pbar = tqdm.tqdm(total=len(data_sci['text']),position=0, leave=True)

for text in nlp.pipe(data_sci['text'], disable=["tagger", "parser", "ner"]):

  txt = [token.lemma_.lower() for token in text 
         if token.is_alpha 
         and not token.is_stop 
         and not token.is_punct]

  clean_sci.append(txt)

  pbar.update(1)

 99%|█████████▉| 1610/1629 [00:15<00:00, 176.29it/s]

CPU times: user 14.6 s, sys: 588 ms, total: 15.2 s
Wall time: 15.3 s


In [9]:
# Attach the token lists to the publications (one list of lemmas per row)
data_sci['text_cl'] = pd.Series(clean_sci, index=data_sci.index)

In [10]:
# run progress bare and clean up using spacy but without some heavy parts of the pipeline

%%time
clean_pat = []


pbar = tqdm.tqdm(total=len(data_pat['text']),position=0, leave=True)

for text in nlp.pipe(data_pat['text'], disable=["tagger", "parser", "ner"]):

  txt = [token.lemma_.lower() for token in text 
         if token.is_alpha 
         and not token.is_stop 
         and not token.is_punct]

  clean_pat.append(txt)

  pbar.update(1)

100%|██████████| 1629/1629 [00:15<00:00, 105.54it/s]
100%|█████████▉| 4979/5002 [00:41<00:00, 176.15it/s]

CPU times: user 40 s, sys: 224 ms, total: 40.2 s
Wall time: 41.3 s


In [11]:
# Attach the token lists to the patents (one list of lemmas per row)
data_pat['text_cl'] = pd.Series(clean_pat, index=data_pat.index)

In [12]:
# Training corpus for the embeddings: patent token lists followed by publication token lists
all_sent = [*clean_pat, *clean_sci]

In [13]:
# get tooling for Word2Vec model
from gensim.models import Word2Vec

In [18]:
# Logging: drop whatever handlers are attached to the root logger, then
# install a single INFO-level configuration with a timestamped format.
import logging

for existing_handler in list(logging.root.handlers):
    logging.root.removeHandler(existing_handler)

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [15]:
# Fit a 300-dimensional Word2Vec model on the combined patent + publication corpus
w2v_model = Word2Vec(
    sentences=all_sent,
    vector_size=300,
    window=5,
    min_count=2,
    workers=2,
    epochs=5,
)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Identity helper: lets TfidfVectorizer consume text that is already tokenised
def dummy_fun(doc):
    """Return ``doc`` unchanged.

    Passed to TfidfVectorizer as both tokenizer and preprocessor so that it
    performs no text processing of its own.
    """
    return doc

In [19]:
# All built-in preprocessing is switched off and the vocabulary is fixed to the
# one used by the embeddings. The TF-IDF columns then line up with the rows of
# the word-vector matrix, so TF-IDF weights can be used to weight the embeddings.
tfidf_sci = TfidfVectorizer(
    vocabulary=w2v_model.wv.key_to_index.keys(),
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
)

In [20]:
# Document-term TF-IDF matrix for the publications (could also be searched directly)
sci_tfidf = tfidf_sci.fit_transform(raw_documents=data_sci['text_cl'])

In [21]:
# One matrix product for all publications: TF-IDF weights times the word vectors
sci_w2v_tfidf = sci_tfidf.dot(w2v_model.wv.vectors)

In [22]:
# (number of publications, embedding dimensions)
np.shape(sci_w2v_tfidf)

(1629, 300)

In [23]:
# Project the publication vectors to 2 dimensions with UMAP (fixed random_state)
umap_reducer_sci = umap.UMAP(n_components=2, random_state=42)
embeddings_sci = umap_reducer_sci.fit_transform(sci_w2v_tfidf)

In [24]:
# Display the 2-D UMAP coordinates computed for the publications
embeddings_sci

array([[ 3.30014   , 11.025271  ],
       [ 1.0435172 , 11.1762705 ],
       [ 3.4412305 ,  8.806387  ],
       ...,
       [-0.20638339,  5.7905617 ],
       [-1.1322123 ,  6.770193  ],
       [-0.27902526,  5.7675257 ]], dtype=float32)

In [34]:
# Cluster the 2-D publication embeddings with HDBSCAN and store the label per publication
clusterer_sci = hdbscan.HDBSCAN(min_cluster_size=15, min_samples=3).fit(embeddings_sci)
data_sci['cluster'] = clusterer_sci.labels_

In [35]:
# Distinct cluster labels assigned to the publications, in order of first appearance
pd.unique(data_sci['cluster'])

array([14,  2, 18,  3,  9, 20, 10, 19, 13, 17, -1,  7,  1,  5, 16,  0, 15,
       12,  6, 11,  4, 21,  8])

In [36]:
import altair as alt

In [37]:
# Plotting frame for the publications, starting from the 2-D coordinates
df_plot = pd.DataFrame(data=embeddings_sci, columns=['x', 'y'])

In [38]:
# Add the publication metadata shown in the tooltips, plus the cluster label
df_plot = df_plot.assign(
    Title=data_sci['Title'],
    doctype=data_sci['Document Type'],
    abstract=data_sci['Abstract'],
    year=data_sci['Year'],
    cluster=clusterer_sci.labels_,
)

In [39]:
# Keep only rows whose cluster label is not -1
df_plot = df_plot.loc[df_plot['cluster'].ne(-1)]

In [40]:
# Interactive scatter plot of the publications, coloured by cluster (nominal)
(
    alt.Chart(df_plot)
    .mark_circle(size=60)
    .encode(
        x='x',
        y='y',
        color=alt.Color('cluster:N', scale=alt.Scale(scheme='category20')),
        tooltip=['Title', 'abstract', 'cluster', 'year'],
    )
    .properties(width=800, height=600)
    .interactive()
)

In [41]:
# All built-in preprocessing is switched off and the vocabulary is fixed to the
# one used by the embeddings. The TF-IDF columns then line up with the rows of
# the word-vector matrix, so TF-IDF weights can be used to weight the embeddings.
tfidf_pat = TfidfVectorizer(
    vocabulary=w2v_model.wv.key_to_index.keys(),
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
)

In [42]:
# Document-term TF-IDF matrix for the patents (could also be searched directly)
pat_tfidf = tfidf_pat.fit_transform(raw_documents=data_pat['text_cl'])

In [43]:
# One matrix product for all patents: TF-IDF weights times the word vectors
pat_w2v_tfidf = pat_tfidf.dot(w2v_model.wv.vectors)

In [44]:
# Project the patent vectors to 2 dimensions with UMAP (fixed random_state)
umap_reducer_pat = umap.UMAP(n_components=2, random_state=42)
embeddings_pat = umap_reducer_pat.fit_transform(pat_w2v_tfidf)

In [45]:
# Cluster the 2-D patent embeddings with HDBSCAN and store the label per patent
clusterer_pat = hdbscan.HDBSCAN(min_cluster_size=8, min_samples=5).fit(embeddings_pat)
data_pat['cluster'] = clusterer_pat.labels_

In [46]:
# Distinct cluster labels assigned to the patents, in order of first appearance
pd.unique(data_pat['cluster'])

array([ 4, -1, 51, 10,  9, 40, 90, 54, 75, 49, 48, 55, 31, 27,  8, 64, 36,
       50, 59, 16, 68, 30, 86,  7, 20, 17, 11,  1, 15, 26, 14, 24,  3, 21,
        0,  2, 12, 18,  6, 13, 19, 23, 45, 41, 32, 58, 34, 88, 66, 53, 46,
       87, 60, 81, 33, 67, 83, 42, 35, 39, 61, 92, 38, 25, 91, 76, 78, 84,
       22, 69, 57, 71, 43, 82, 73, 63, 65, 72, 70, 85, 95, 74, 94, 37, 79,
        5, 89, 77, 29, 28, 52, 47, 62, 44, 93, 56, 80])

In [47]:
# Plotting frame for the patents, starting from the 2-D coordinates
df_plot_p = pd.DataFrame(data=embeddings_pat, columns=['x', 'y'])

In [48]:
# Add the patent metadata shown in the tooltips, plus the cluster label
df_plot_p = df_plot_p.assign(
    title=data_pat['title'],
    abstract=data_pat['abstract'],
    section_id=data_pat['section_id'],
    subsection_id=data_pat['subsection_id'],
    cluster=clusterer_pat.labels_,
)

In [49]:
# Keep only rows whose cluster label is not -1
df_plot_p = df_plot_p.loc[df_plot_p['cluster'].ne(-1)]

In [50]:
# Down-sample the patents for plotting. A fixed random_state makes the plotted
# subset reproducible, and capping n at the number of available rows avoids a
# ValueError when fewer than 1500 rows are left after filtering.
df_plot_p = df_plot_p.sample(n=min(1500, len(df_plot_p)), random_state=42)

In [51]:
# Optional: uncomment to restrict the plot to selected patent clusters
#df_plot_p = df_plot_p[df_plot_p.cluster.isin([43,13])]

In [52]:
# Interactive scatter plot of the sampled patents, coloured by cluster.
# The cluster id is a label, not a quantity, so it is encoded as nominal (":N"),
# matching the publication chart and the categorical colour scheme.
alt.Chart(df_plot_p).mark_circle(size=60).encode(
    x='x',
    y='y',
    color=alt.Color('cluster:N', scale=alt.Scale(scheme='category20')),
    tooltip=['title', 'abstract', 'section_id','subsection_id', 'cluster']
).properties(
    width=800,
    height=600
).interactive()

In [53]:
# Vocabulary terms as a list, in the iteration order of the model's key_to_index mapping
w2v_dict = list(w2v_model.wv.key_to_index)

In [73]:
# Keywords for publication cluster 18: the 100 vocabulary terms with the
# highest mean TF-IDF weight across the publications in that cluster.
tf_ix = data_sci.index[data_sci['cluster'] == 18]
tf_w_ix = np.flip(
    np.argsort(np.mean(sci_tfidf[tf_ix], axis=0))
).tolist()[0][:100]
topic_kws = list(map(w2v_dict.__getitem__, tf_w_ix))

In [55]:
# Keywords for patent cluster 31: the 100 vocabulary terms with the highest
# mean TF-IDF weight across the patents in that cluster.
# The results get their own names: reusing tf_ix / tf_w_ix / topic_kws here
# would overwrite the publication-cluster keywords when the notebook is run
# top to bottom, and those are the ones the later patent search samples from.
pat_tf_ix = data_pat[data_pat['cluster'] == 31].index
pat_tf_w_ix = np.flip(np.argsort(np.mean(pat_tfidf[pat_tf_ix], axis=0))).tolist()[0][:100]
pat_topic_kws = [w2v_dict[i] for i in pat_tf_w_ix]

In [56]:
# Turn an already-tokenised query into a TF-IDF-weighted embedding

def get_tfidf_vector(word2vec_model, model_tfidf, query):
    """Embed ``query`` as the product of its TF-IDF row and the word vectors.

    Parameters
    ----------
    word2vec_model : object exposing the word-vector matrix as ``.wv.vectors``
    model_tfidf : fitted vectorizer; its ``transform`` is called on ``[query]``
    query : sequence of tokens

    Returns
    -------
    The matrix product of the transformed query and ``word2vec_model.wv.vectors``,
    or an empty list when ``query`` has no elements.
    """
    if len(query) < 1:
        return []
    weights = model_tfidf.transform([query])
    return weights @ word2vec_model.wv.vectors

In [57]:
!pip install -q annoy

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/648.0 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m648.0/648.0 KB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for annoy (setup.py) ... [?25l[?25hdone


In [62]:
from annoy import AnnoyIndex

# Instantiate an Annoy index sized to the patent vectors' dimensionality,
# using the 'angular' metric
t = AnnoyIndex(pat_w2v_tfidf.shape[1], 'angular')

In [63]:
# Build the index on disk so the file can be reused later if it is stored somewhere
t.on_disk_build('patents_search_tree.annoy')

True

In [64]:
# Add every patent vector to the index, keyed by its row number i
for i in tqdm.trange(pat_w2v_tfidf.shape[0], position=0, leave=True):
    t.add_item(i, pat_w2v_tfidf[i])

100%|██████████| 5002/5002 [00:00<00:00, 8218.83it/s]


In [65]:
# Build the search forest with 50 trees. The trees partition the data (a bit
# like clustering), and later queries only look inside the nearest partitions,
# which reduces search time considerably.
t.build(50, n_jobs=-1)

True

In [66]:
# Example query: embed a hand-picked list of keywords
v = get_tfidf_vector(
    w2v_model,
    tfidf_sci,
    ['circuit', 'circadian', 'neuron', 'sleep', 'cell', 'remodel', 'neural', 'progenitor'],
)

In [67]:
# Ten nearest patents to the query vector, returned as (ids, distances)
r = t.get_nns_by_vector(v[0], 10, include_distances=True)

In [68]:
# Display the search result: a tuple of (item ids, distances)
r

([317, 275, 4696, 2463, 4264, 2500, 416, 3017, 419, 261],
 [0.42572659254074097,
  0.4299052357673645,
  0.44022247195243835,
  0.449957937002182,
  0.4537235498428345,
  0.45614469051361084,
  0.4603694975376129,
  0.4662367105484009,
  0.4668695032596588,
  0.4754190146923065])

In [69]:
# Titles of the patents returned by the search
data_pat.loc[r[0], 'title']

317     Methods to enhance T-cell mediated immune resp...
275     Methods of using proteinacious channels to ide...
4696    Approximate functional matching in electronic ...
2463    Methods for eliminating at least a substantial...
4264         Electrochemical molecular recognition probes
2500    Molecules with effects on cellular development...
416     Compositions and methods for the treatment of ...
3017    Methods for producing a non human model for ao...
419            Medical methods and agents for use therein
261     Methods and compositions for the detection of ...
Name: title, dtype: object

In [75]:
# Draw 10 random queries of 25 keywords each from the topic keywords.
# A dedicated, seeded generator makes the sampled queries (and therefore the
# search results) reproducible without touching the global random state.
_kw_rng = random.Random(42)
topic_search_strings = [_kw_rng.sample(topic_kws, 25) for _ in range(10)]

In [76]:
# Embed each sampled keyword query
topic_search_vecs = [
    get_tfidf_vector(w2v_model, tfidf_sci, keywords)
    for keywords in topic_search_strings
]

In [77]:
# Run every query against the patent index and pool the ids and distances
search_results_ix, search_results_dist = [], []

for v in topic_search_vecs:
    r = t.get_nns_by_vector(v[0], n=10, include_distances=True)
    search_results_ix += r[0]
    search_results_dist += r[1]

In [78]:
# Pooled results as a two-column table: patent row id (0) and distance (1)
pd.DataFrame(list(zip(search_results_ix, search_results_dist)))

Unnamed: 0,0,1
0,4782,0.374288
1,653,0.400098
2,4188,0.409338
3,4696,0.416741
4,1973,0.417328
...,...,...
95,1989,0.400118
96,2195,0.423319
97,4234,0.424487
98,761,0.426044


In [79]:
# For every publication cluster: take its 100 top TF-IDF keywords, draw 10
# random 25-keyword queries from them, search the patent index with each
# query, and record the hits together with the originating cluster label.
search_results_ix = []
search_results_dist = []
search_results_clusternr = []

# Dedicated, seeded generator: the keyword sampling - and with it the whole
# science-to-patent linkage - is reproducible across runs.
kw_rng = random.Random(42)

for j in data_sci['cluster'].unique():
    tf_ix = data_sci[data_sci['cluster'] == j].index
    tf_w_ix = np.flip(np.argsort(np.mean(sci_tfidf[tf_ix], axis=0))).tolist()[0][:100]
    topic_kws = [w2v_dict[i] for i in tf_w_ix]

    topic_search_strings = [kw_rng.sample(topic_kws, 25) for _ in range(10)]
    topic_search_vecs = [
        get_tfidf_vector(w2v_model, tfidf_sci, keywords)
        for keywords in topic_search_strings
    ]

    for v in topic_search_vecs:
        r = t.get_nns_by_vector(v[0], n=10, include_distances=True)
        search_results_ix.extend(r[0])
        search_results_dist.extend(r[1])
        # repeat the cluster label once per returned hit so the three lists stay aligned
        search_results_clusternr.extend(len(r[1]) * [j])
  


In [80]:
# One row per search hit: patent row id, distance to the query, originating publication cluster.
# NOTE(review): the distances come from an index built with the 'angular' metric;
# confirm against the Annoy docs whether that equals cosine distance before
# interpreting the 'cos_distance' values.
s_t_link_df = pd.DataFrame({
    'pat_id': search_results_ix,
    'cos_distance': search_results_dist,
    'sci_cluster': search_results_clusternr,
})

In [81]:
# Keep the first hit per (patent, publication cluster) pair
s_t_link_df = s_t_link_df.drop_duplicates(subset=['pat_id', 'sci_cluster'])

In [82]:
# Drop hits whose publication cluster label is -1
s_t_link_df = s_t_link_df.loc[s_t_link_df['sci_cluster'].ne(-1)]

In [83]:
# Mean distance of the linked patents per publication cluster, smallest first
s_t_link_df.groupby('sci_cluster')['cos_distance'].mean().sort_values()

sci_cluster
13    0.251789
1     0.256481
0     0.275059
19    0.279914
8     0.285980
21    0.286326
20    0.300806
3     0.302143
5     0.306953
6     0.318011
14    0.321299
17    0.326421
11    0.329610
7     0.336041
15    0.347213
2     0.347924
10    0.363382
9     0.393041
4     0.393050
12    0.437040
18    0.447740
16    0.459018
Name: cos_distance, dtype: float64

In [85]:
# Patent row ids linked to publication cluster 13
pat_ix_select = s_t_link_df.loc[s_t_link_df['sci_cluster'] == 13, 'pat_id'].tolist()

In [86]:
# Count how the selected patents are spread over the patent clusters
data_pat.loc[pat_ix_select, 'cluster'].value_counts()

 17    10
 18     6
-1      4
 24     2
 19     1
Name: cluster, dtype: int64