# FOS Similarity

This notebook contains instructions how to obtain similarities between FOS'es

In [1]:
import pickle 
import json
from sklearn.metrics.pairwise import cosine_similarity

### Load Dependencies

Download the following zip file which contains the dependencies needed for proceeding the next steps: https://drive.google.com/file/d/1QtYXkXXfYUgooj_00OQJI4GFfR6kfnQ0/view?usp=sharing

In [2]:
#Load the three files download previously
# Mapping between FOS IDs and Names
with open("FOSMAP.json") as file :
    fos_map = json.loads(file.read()) #Map the fos ids to the fos names
    
#Mapping between FOS ID and its row number in the matrix
with open("FOSIndex.json") as file :
    fos_index = json.loads(file.read()) #perform an indexing on the fos ids
    
fos_index2 = {item:index for index, item in enumerate(fos_index)}

#FOS TF-IDF vector matrix
with open("CombinedNGRAMMatrixCSR.pkl", "rb") as file :
    fos_matrix = pickle.load(file)

### Define similarity function

In [18]:
def get_similarity_scores( fos_id ):
    """
    Function to get top 100 most similar FOS'es to a given FOS ID
    PARAMS
    fos_id (string) FOS ID
    
    RETURNS
    list of tuples [(fos_id, similarity score)]
    """
    
    # Take a row from FOS matrix 
    fos_row = fos_matrix.getrow( fos_index2[fos_id] )
    
    #Calculate similarity between the row and matrix
    fos_result_mat = cosine_similarity( fos_matrix, Y = fos_row , dense_output=True )
    
    #Convert result vector to dict
    fos_result_raw = [ (x,i[0]) for x,i in enumerate( fos_result_mat ) ]

    #Sort Select TOP 100 FOS
    fos_final_result = [ (fos_index[ i[0] ] , i[1]) for i in sorted( fos_result_raw , key = lambda kv : kv[1] ,reverse = True )[0:100] ]
    
    return fos_final_result

### This section can get a fos id and give the most similar fos'es to the given fos id.

In [11]:
fos_map["2779652045"]

'Pharmaceutical industry'

In [21]:
#recives the fos id and gives the top five similar fos'es
similar_fos = get_similarity_scores( "2779652045" )

similar_fos[0:5]

[('2779652045', 1.0000000000000018),
 ('178855305', 0.7003413520864034),
 ('2909735440', 0.6761556891037742),
 ('2779535233', 0.661582139881108),
 ('2781419290', 0.6511705997499511)]

In [22]:
#fos names can be mached to the fos ids
for item in similar_fos[0:10]:
    print( fos_map[item[0]])

Pharmaceutical industry
Pharmaceutical sciences
Drug marketing
Pharmaceutical drug
Pharmaceutical marketing
Drug Company
Drug development
Drug pipeline
New chemical entity
Pharmaceutical policy


### This section will get rang of fos ids and give the similarity between them in an adjacency matrix

In [13]:
from scipy.sparse import vstack

def get_similarity_scores_v3( fos_ids ):
    """
    Function to get similarity scores to given list of FOS IDs
    PARAMS
    fos_ids (list of strings)[fos_id] FOS IDs
   
    RETURNS
    array 153247 x len(fos_ids)
    """
    fos_rows = []
    # Take a row from FOS matrix
    for fos_id in fos_ids :
        fos_row = fos_matrix.getrow( fos_index2[fos_id] )
        fos_rows.append( fos_row )
   
    fos_mat = vstack( fos_rows )
   
    #Calculate similarity between the row and matrix
    fos_result_mat = cosine_similarity( fos_mat , dense_output=True )
   
   
    return fos_result_mat

In [14]:
#for example five fos ids are given and the result is showing the fos similarities.
similar_fos = get_similarity_scores_v3 (["178855305", "2779652045", "2909735440", "2779535233", "2781419290"])

similar_fos

array([[1.        , 0.70034135, 0.62248533, 0.86785788, 0.61750199],
       [0.70034135, 1.        , 0.67615569, 0.66158214, 0.6511706 ],
       [0.62248533, 0.67615569, 1.        , 0.73287974, 0.67798056],
       [0.86785788, 0.66158214, 0.73287974, 1.        , 0.59640948],
       [0.61750199, 0.6511706 , 0.67798056, 0.59640948, 1.        ]])

### Annex: Input data types/shape

In [23]:
fos_map

{'1443462': 'Immobiliser',
 '1576492': 'Matrix pencil',
 '2657588': 'Combinatorial topology',
 '3079626': 'Quantum electrodynamics',
 '3535393': 'Mesophase',
 '5688416': 'Neutron cross section',
 '7271767': 'Primer (molecular biology)',
 '10389098': 'Batch file',
 '11045955': 'ElGamal encryption',
 '12404463': 'Chemical industry',
 '13818915': '2–3 tree',
 '14981831': 'Market clearing',
 '16287357': 'U.S. Standard Atmosphere',
 '19044487': 'Control zone',
 '19159745': 'Social policy',
 '19513336': 'Snapshot isolation',
 '20137635': 'Bionomics',
 '20692236': 'Algorism',
 '22081632': "Turán's inequalities",
 '22664368': 'Oracle Unified Method',
 '24914591': 'Specific force',
 '24992206': 'Roadway noise',
 '26268613': 'Cell disruption',
 '30658544': 'Mobile collaboration',
 '31571166': 'Picard horn',
 '33107880': 'Retarder',
 '33341308': 'Terminal node controller',
 '33493656': 'Elliott–Halberstam conjecture',
 '35294091': 'Loading dose',
 '37914503': 'Mathematical physics',
 '39078644': 

In [25]:
fos_index2

{'10000559': 0,
 '100012643': 1,
 '100017401': 2,
 '100020394': 3,
 '100022809': 4,
 '100028377': 5,
 '100036529': 6,
 '100044566': 7,
 '100044655': 8,
 '10005042': 9,
 '10005230': 10,
 '100053769': 11,
 '100065037': 12,
 '100068826': 13,
 '100082104': 14,
 '100083437': 15,
 '100086909': 16,
 '100087112': 17,
 '100094221': 18,
 '100094513': 19,
 '100100210': 20,
 '100102862': 21,
 '10010492': 22,
 '100106864': 23,
 '100107663': 24,
 '100129307': 25,
 '100134115': 26,
 '100136789': 27,
 '100139498': 28,
 '100142294': 29,
 '100143864': 30,
 '100148910': 31,
 '100149206': 32,
 '100155856': 33,
 '100157288': 34,
 '100158260': 35,
 '100175707': 36,
 '100175905': 37,
 '100185428': 38,
 '100187453': 39,
 '100188561': 40,
 '10020367': 41,
 '100203831': 42,
 '100206155': 43,
 '100206748': 44,
 '100207259': 45,
 '100207952': 46,
 '10021369': 47,
 '10021790': 48,
 '100220312': 49,
 '100220837': 50,
 '100229915': 51,
 '100234860': 52,
 '100243477': 53,
 '100245971': 54,
 '100253034': 55,
 '1002591

In [26]:
fos_matrix

<153247x8956464 sparse matrix of type '<class 'numpy.int64'>'
	with 125789134 stored elements in Compressed Sparse Row format>

In [27]:
print(fos_matrix)

  (0, 0)	88
  (0, 1)	64
  (0, 2)	24
  (0, 3)	14
  (0, 4)	38
  (0, 5)	80
  (0, 6)	32
  (0, 7)	33
  (0, 8)	26
  (0, 9)	133
  (0, 10)	17
  (0, 11)	64
  (0, 12)	48
  (0, 13)	34
  (0, 15)	34
  (0, 16)	60
  (0, 17)	17
  (0, 18)	93
  (0, 19)	28
  (0, 20)	27
  (0, 21)	29
  (0, 22)	25
  (0, 23)	20
  (0, 24)	37
  (0, 25)	13
  :	:
  (153246, 264973)	11
  (153246, 289952)	11
  (153246, 305295)	11
  (153246, 365543)	35
  (153246, 378647)	11
  (153246, 396136)	17
  (153246, 398650)	15
  (153246, 522081)	48
  (153246, 523629)	16
  (153246, 584468)	11
  (153246, 624096)	29
  (153246, 696572)	25
  (153246, 836243)	24
  (153246, 886922)	11
  (153246, 913500)	14
  (153246, 1048998)	11
  (153246, 1086800)	15
  (153246, 1097751)	17
  (153246, 1383734)	13
  (153246, 1387256)	19
  (153246, 1974659)	12
  (153246, 2839478)	13
  (153246, 5424177)	11
  (153246, 6446782)	17
  (153246, 6446783)	14


In [29]:
fos_map["10000559"]

'Brooks–Iyengar algorithm'