In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

# Load the tab-separated symptom/disease association table into a DataFrame.
df1 = pd.read_csv("1045.txt", sep="\t")

In [5]:
# Show the freshly loaded table as the cell's rich output so the columns
# (symptom term, disease term, PubMed occurrence, TFIDF score) can be checked.
df1

Unnamed: 0,MeSH Symptom Term,MeSH Disease Term,PubMed occurrence,TFIDF score
0,"Aging, Premature",Respiratory Syncytial Virus Infections,1,3.464551
1,"Aging, Premature",Orthomyxoviridae Infections,1,3.464551
2,"Aging, Premature",HIV Infections,3,10.393654
3,"Aging, Premature",Acquired Immunodeficiency Syndrome,3,10.393654
4,"Aging, Premature",Breast Neoplasms,1,3.464551
...,...,...,...,...
147973,Hirsutism,Tobacco Use Disorder,1,2.483722
147974,Hirsutism,Radius Fractures,1,2.483722
147975,Hirsutism,Burns,1,2.483722
147976,Hirsutism,Colles' Fracture,1,2.483722


In [6]:
# Remove the raw co-occurrence count; the cells below only use 'TFIDF score'
# as the feature value.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: running it a second time no longer raises KeyError because
# the column has already been dropped.
df1 = df1.drop(columns=['PubMed occurrence'], errors='ignore')
df1

Unnamed: 0,MeSH Symptom Term,MeSH Disease Term,TFIDF score
0,"Aging, Premature",Respiratory Syncytial Virus Infections,3.464551
1,"Aging, Premature",Orthomyxoviridae Infections,3.464551
2,"Aging, Premature",HIV Infections,10.393654
3,"Aging, Premature",Acquired Immunodeficiency Syndrome,10.393654
4,"Aging, Premature",Breast Neoplasms,3.464551
...,...,...,...
147973,Hirsutism,Tobacco Use Disorder,2.483722
147974,Hirsutism,Radius Fractures,2.483722
147975,Hirsutism,Burns,2.483722
147976,Hirsutism,Colles' Fracture,2.483722


In [7]:
# Keep only the rows that actually name a symptom.
combine_product_review = df1.dropna(subset=['MeSH Symptom Term'], axis=0)

# Number of non-null TF-IDF entries recorded for each symptom.
# NOTE: despite its name, 'totalTFIDFscore' is a row count (.count()),
# not a sum of the scores.
symptom_groups = combine_product_review.groupby('MeSH Symptom Term')
product_reviewCount = (
    symptom_groups['TFIDF score']
    .count()
    .rename('totalTFIDFscore')
    .reset_index()
)
product_reviewCount.head()

Unnamed: 0,MeSH Symptom Term,totalTFIDFscore
0,"Abdomen, Acute",1002
1,Abdominal Pain,1599
2,Acute Coronary Syndrome,314
3,Aerophagy,72
4,Ageusia,117


In [8]:
# Left-join the per-symptom entry count back onto every symptom/disease row,
# so each row also carries how many entries its symptom has in total.
review_with_totalReviewCount = pd.merge(
    combine_product_review,
    product_reviewCount,
    how='left',
    on='MeSH Symptom Term',
)
review_with_totalReviewCount.head()

Unnamed: 0,MeSH Symptom Term,MeSH Disease Term,TFIDF score,totalTFIDFscore
0,"Aging, Premature",Respiratory Syncytial Virus Infections,3.464551,132
1,"Aging, Premature",Orthomyxoviridae Infections,3.464551,132
2,"Aging, Premature",HIV Infections,10.393654,132
3,"Aging, Premature",Acquired Immunodeficiency Syndrome,10.393654,132
4,"Aging, Premature",Breast Neoplasms,3.464551,132


In [9]:
# From here on, pandas renders floats with exactly three decimals.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Distribution of the per-symptom entry counts (min / quartiles / max).
count_summary = product_reviewCount['totalTFIDFscore'].describe()
print(count_summary)

count    322.000
mean     459.559
std      456.715
min        1.000
25%      123.250
50%      310.000
75%      682.500
max     2593.000
Name: totalTFIDFscore, dtype: float64


In [11]:
# Wide feature matrix: one row per symptom, one column per disease, cell value
# is the TF-IDF score (pivot_table's default aggregation averages duplicate
# symptom/disease pairs). Pairs that never co-occur are filled with 0.
product_features_df = pd.pivot_table(
    review_with_totalReviewCount,
    values='TFIDF score',
    index='MeSH Symptom Term',
    columns='MeSH Disease Term',
).fillna(0)
product_features_df.head()

MeSH Disease Term,22q11 Deletion Syndrome,"46, XX Disorders of Sex Development","46, XY Disorders of Sex Development","ACTH Syndrome, Ectopic",ACTH-Secreting Pituitary Adenoma,"AIDS Arteritis, Central Nervous System",AIDS Dementia Complex,AIDS-Associated Nephropathy,AIDS-Related Complex,AIDS-Related Opportunistic Infections,...,Zygomatic Fractures,Zygomycosis,alpha 1-Antitrypsin Deficiency,alpha-Mannosidosis,alpha-Thalassemia,beta-Mannosidosis,beta-Thalassemia,von Hippel-Lindau Disease,"von Willebrand Disease, Type 2",von Willebrand Diseases
MeSH Symptom Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Abdomen, Acute",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.188,...,0.0,0.0,0.0,0.0,0.0,0.0,2.875,0.0,0.0,2.875
Abdominal Pain,0.0,0.0,0.0,0.848,0.0,0.0,0.0,0.0,0.0,14.424,...,0.0,1.697,0.848,0.0,1.697,0.0,2.545,0.848,0.0,0.848
Acute Coronary Syndrome,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.598,0.0,0.0,0.0
Aerophagy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ageusia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
from scipy.sparse import csr_matrix

# The pivot is mostly zeros, so hand the values to scikit-learn as a
# compressed sparse row matrix.
product_features_df_matrix = csr_matrix(product_features_df.to_numpy())

In [13]:
from sklearn.neighbors import NearestNeighbors

# Exhaustive (brute-force) neighbour search using cosine distance between
# symptom rows of the sparse feature matrix.
model_knn = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
)
model_knn.fit(product_features_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [14]:
# Seeded generator so that Restart & Run All always selects the same query
# symptom; change SEED to explore a different one.
SEED = 42
# Six hits are requested: the query row is expected to be among them,
# which leaves five recommendations.
N_NEIGHBORS = 6

rng = np.random.default_rng(SEED)
query_index = int(rng.integers(product_features_df.shape[0]))
print(query_index)

# kneighbors rejects n_neighbors larger than the number of fitted rows,
# so clamp the request for small feature matrices.
n_query_neighbors = min(N_NEIGHBORS, product_features_df.shape[0])

# Double-bracket iloc keeps the row two-dimensional (1, n_diseases), the shape
# kneighbors expects; to_numpy() matches the array input the model was fit on.
query_vector = product_features_df.iloc[[query_index]].to_numpy()
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=n_query_neighbors)

154


In [15]:
# kneighbors returns arrays of shape (1, k); flatten them to plain 1-D sequences.
flat_indices = indices.flatten()
flat_distances = distances.flatten()

if flat_indices.size:
    # Exclude the query symptom by its row index rather than by assuming it is
    # always the first hit: when several rows are equidistant (e.g. identical
    # feature vectors) the query can appear at a later position, and skipping
    # position 0 would then list the query as its own recommendation.
    # The slice keeps the list at k - 1 entries even if the query row is not
    # among the hits at all.
    recommendations = [
        (neighbor_idx, distance)
        for neighbor_idx, distance in zip(flat_indices, flat_distances)
        if neighbor_idx != query_index
    ][:flat_indices.size - 1]

    print('Recommendations for {0}:\n'.format(product_features_df.index[query_index]))
    for rank, (neighbor_idx, distance) in enumerate(recommendations, start=1):
        print('{0}: {1},  with distance of {2}:'.format(rank, product_features_df.index[neighbor_idx], distance))

Recommendations for Hypercalciuria:

1: Renal Colic,  with distance of 0.5855605086864419:
2: Flank Pain,  with distance of 0.7224051136152105:
3: Colic,  with distance of 0.7889499831437368:
4: Proteinuria,  with distance of 0.8296214878293613:
5: Dysuria,  with distance of 0.8571587275137172:
