In [207]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [121]:
# Load the raw records from the CSV export into `df_base`.
# low_memory=False makes pandas parse the file in a single pass rather than in
# chunks, which avoids per-chunk dtype guessing on mixed-type columns.
df_base = pd.read_csv('hay_fever.csv', low_memory = False)

In [124]:
df_base.head()

Unnamed: 0,PMID,OWN,STAT,LR,IS,VI,IP,DP,PG,LID,...,ORI,SPIN,UIN,CP,RF,RPF,RPI,RIN,GS,Year
0,33191381,NLM,In-Data-Review,20201116.0,0021-4884 (Print) 0021-4884 (Linking),69.0,9.0,2020,928-929,10.15036/arerugi.69.928 [doi],...,,,,,,,,,,2020
1,33171477,NLM,Publisher,20201110.0,1423-0097 (Electronic) 1018-2438 (Linking),,,2020 Nov 10,1-10,10.1159/000510942 [doi],...,,,,,,,,,,2020
2,33160969,NLM,Publisher,20201108.0,1097-6825 (Electronic) 0091-6749 (Linking),,,2020 Nov 5,,S0091-6749(20)31561-X [pii] 10.1016/j.jaci.202...,...,,,,,,,,,,2020
3,33141493,NLM,Publisher,20201103.0,1365-2222 (Electronic) 0954-7894 (Linking),,,2020 Nov 3,,10.1111/cea.13775 [doi],...,,,,,,,,,,2020
4,33134525,NLM,PubMed-not-MEDLINE,20201103.0,2378-8038 (Print) 2378-8038 (Linking),5.0,5.0,2020 Oct,807-808,10.1002/lio2.459 [doi],...,,,,,,,,,,2020


In [125]:
df_base.shape

(15766, 78)

In [127]:
# Raise the row display limit so the per-column summary below is not truncated.
pd.set_option('display.max_rows', 4000)

# Number of missing values in each column of the raw frame.
df_base.isnull().sum()

PMID        0
OWN        19
STAT        0
LR         19
IS        196
VI        295
IP       1637
DP          0
PG         89
LID     12116
LA          0
PT          0
PL          0
TA         19
JT         19
JID        19
SB        423
OTO     13185
OT      13185
EDAT        0
MHDA        0
CRDT       19
PHST       19
AID      5996
PST        19
SO         19
TI          6
AB       5599
CI      14256
FAU       250
AU        250
AD       6346
DEP     12981
AUID    15507
PMC     14358
COIS    15634
DCOM      264
MH        490
GR      14847
TT      13022
CN      15513
RN       4823
DA      15747
CTDT    15754
PB      15747
BTI     15747
CDAT    15747
IR      15695
FIR     15695
SI      15616
EFR     15762
UOF     15758
PMCR    15759
EIN     15638
MID     15600
CIN     15312
CON     15509
OAB     15759
OABL    15759
PS      15742
FPS     15742
IRAD    15763
CTI     15762
OID     15006
GN      15754
ISBN    15762
FED     15763
ED      15763
ORI     15765
SPIN    15764
UIN     15758
CP    

Columns of Interest

| Column | Field |
|--------|-------|
|'AB'|Abstract|
|'AD'|Affiliation|
|'AID'|Article Identifier|
|'AU'|Author|
|'CI'|Copyright Information|
|'CIN'|Comment In|
|'DP'|Date of Publication|
|'FAU'|Full Author|
|'FIR'|Full Investigator Name|
|'IP'|Issue|
|'IR'|Investigator Name|
|'IS'|ISSN|
|'JID'|NLM Unique ID|
|'JT'|Journal Title|
|'LA'|Language|
|'LID'|Location Identifier|
|'LR'|Date Last Revised|
|'MH'|MeSH Terms|
|'OT'|Other Term|
|'OWN'|Owner|
|'PG'|Pagination|
|'PHST'|Publication History Status|
|'PL'|Place of Publication|
|'PMID'|PubMed Unique Identifier|
|'PST'|Publication Status|
|'PT'|Publication Type|
|'RF'|Number of References|
|'SB'|Subset|
|'SO'|Source|
|'STAT'|Status|
|'TA'|Journal Title Abbreviation|
|'TI'|Title|
|'VI'|Volume|
|'Year'|Year|

Removed Columns


| Column | Field |
|--------|-------|
|'OTO'|Other Term Owner|
|'RN'|Registry Number|
|'PMCR'|PubMed Central Release|
|'EIN'|Erratum in|
|'MID'|Manuscript Identifier|
|'EDAT'|Entrez Date|
|'MHDA'|MeSH Date|
|'CRDT'|Create Date|
|'COIS'|Conflict of Interest Statement|
|'AUID'|Author Identifier|
|'GR'|Grant Number|
|'DCOM'|Date Completed|
|'PMC'|Pubmed Central Identifier|
|'DEP'|Date of Electronic Publication|
|'CI'|Copyright Information|
|'LID'|Location Identifier|
|'EFR'|Erratum For|
|'SI'|Secondary Source ID|
|'CON'|Comment On|
|'CN'|Corporate Author|
|'TT'|Transliterated Title|
|'RIN'|Retraction In|
|'ECI'|Expression of Concern|
|'OID'|Other ID|
|'DA'|Date Created|
|'ISBN'|ISBN|
|'PB'|
|'BTI'|Book Title|
|'FPS'|Full Personal Name as Subject|
|'CDAT'|
|'CP'|
|'FED'|Full Editor Name|
|'ED'|Editor Name|
|'RPF'|Republished From|
|'PS'|Personal Name as Subject|
|'GN'|General Note|
|'DRDT'|
|'CTDT'|
|'RPI'|Republished In|
|'GS'|Gene Symbol|


In [128]:
# Columns kept for the analysis (see the "Columns of Interest" table).
cols = [
    'AB', 'AD', 'AID', 'AU', 'CI', 'CIN', 'DP', 'FAU', 'FIR', 'IP', 'IR', 'IS',
    'JID', 'JT', 'LA', 'LID', 'LR', 'MH', 'OT', 'OWN', 'PG', 'PHST', 'PL', 'PMID',
    'PST', 'PT', 'RF', 'SB', 'SO', 'STAT', 'TA', 'TI', 'VI', 'Year',
]

# Take an explicit copy: `df` is modified later (the RF column is filled and
# converted), and working on an independent frame avoids pandas'
# SettingWithCopy / chained-assignment pitfalls and leaves `df_base` untouched.
df = df_base[cols].copy()


# FSM

Our FSM will consist of finding the document in our search with the highest number of references (the `RF` column), and then using content-based recommendation to find the 5 other documents that are most similar to it with regard to its key terms (the MeSH terms in `MH`).

In [130]:
# Treat a missing reference count as 0.
# Assign the result back instead of calling fillna(inplace=True) on df['RF']:
# the in-place call operates on an intermediate Series and is not guaranteed to
# update `df` (it does not under pandas copy-on-write).
df = df.assign(RF=df['RF'].fillna(0))

In [131]:
# Store the integer conversion on the frame; a bare `.astype(int)` only displays
# the converted Series and leaves df['RF'] as float.
df = df.assign(RF=df['RF'].astype(int))

# Show the converted column.
df['RF']

0        0
1        0
2        0
3        0
4        0
        ..
15761    0
15762    0
15763    0
15764    0
15765    0
Name: RF, Length: 15766, dtype: int32

In [156]:
# Size of the subset of records that carry MeSH terms (MH is not null).
df[df['MH'].notna()].shape

(15276, 35)

In [154]:
# Inspect one raw MH value: it is a string, so strip the surrounding brackets and
# split on single quotes to see the pieces.
# NOTE(review): the filtered Series keeps df's original index, so `[5]` is
# presumably a label lookup (the row labelled 5), not "the sixth remaining row".
df[~df['MH'].isna()]['MH'][5].strip('][').split('\'')

['',
 'Humans',
 ', ',
 'Japan',
 ', ',
 'Macrophages',
 ', ',
 '*Rhinitis, Allergic',
 ', ',
 '*Rhinitis, Allergic, Seasonal',
 ', ',
 'Th1 Cells',
 ', ',
 'Th2 Cells',
 '']

In [195]:
# Keep only the records that have MeSH terms, renumbering rows from 0.
df_terms = df.dropna(subset=['MH']).reset_index(drop=True)

In [196]:
df_terms.head()

Unnamed: 0,AB,AD,AID,AU,CI,CIN,DP,FAU,FIR,IP,...,PT,RF,SB,SO,STAT,TA,TI,VI,Year,terms
0,"In recent decades, many patients have been suf...","['Department of Pathobiochemistry, Osaka City ...",['10.1254/fpj.20051 [doi]'],['Iwasaki N'],,,2020,"['Iwasaki, Naruhito']",,6.0,...,['Journal Article'],0.0,IM,Nihon Yakurigaku Zasshi. 2020;155(6):369-374. ...,MEDLINE,Nihon Yakurigaku Zasshi,[Th2 cells and macrophages induce novel type-I...,155,2020,
1,Allergic rhinitis caused by pollen exposure is...,"['Department of Pulmonology, Leiden University...","['S0048-9697(20)33926-7 [pii]', '10.1016/j.sci...","['de Weger LA', 'Molster F', 'de Raat K', 'den...",['Copyright (c) 2020 The Authors. Published by...,,2020 Nov 1,"['de Weger, Letty A', 'Molster, Frank', 'de Ra...",,,...,['Journal Article'],0.0,IM,Sci Total Environ. 2020 Nov 1;741:140404. doi:...,MEDLINE,Sci Total Environ,A new portable sampler to monitor pollen at st...,741,2020,"['*Allergens', 'Betula', 'Cities', 'Humans', '..."
2,Objective:To explore the correlation between a...,['Department of Otolaryngology Head and Neck S...,['10.13201/j.issn.2096-7993.2020.06.016 [doi]'],"['Wang X', 'Wang M', 'Wang J', 'Tian H', 'Zhon...",['Copyright(c) by the Editorial Department of ...,,2020 Jun,"['Wang, Xinlan', 'Wang, Mei', 'Wang, Jinyan', ...",,6.0,...,['Journal Article'],0.0,IM,Lin Chung Er Bi Yan Hou Tou Jing Wai Ke Za Zhi...,MEDLINE,Lin Chung Er Bi Yan Hou Tou Jing Wai Ke Za Zhi,[Research on the correlation between otorhinol...,34,2020,"['*Air Pollutants', '*Air Pollution', 'Child',..."
3,The importance of grass pollen to the global b...,"['Queensland University of Technology, Brisban...","['S0048-9697(20)34718-5 [pii]', '10.1016/j.sci...","['Campbell BC', 'Al Kouba J', 'Timbrell V', 'N...",['Copyright (c) 2020 Elsevier B.V. All rights ...,,2020 Dec 10,"['Campbell, B C', 'Al Kouba, J', 'Timbrell, V'...",,,...,['Journal Article'],0.0,IM,Sci Total Environ. 2020 Dec 10;747:141189. doi...,MEDLINE,Sci Total Environ,Tracking seasonal changes in diversity of poll...,747,2020,"['Allergens', 'Australia', 'Cities', '*Poaceae..."
4,Japanese cedar (Cryptomeria japonica) pollinos...,"['Division of Molecular Immunology, Research C...","['S0161-5890(20)30397-7 [pii]', '10.1016/j.mol...","['Saito S', 'Takagi H', 'Wakasa Y', 'Ozawa K',...",['Copyright (c) 2020 Elsevier Ltd. All rights ...,,2020 Sep,"['Saito, Saburo', 'Takagi, Hidenori', 'Wakasa,...",,,...,"['Journal Article', ""Research Support, Non-U.S...",0.0,IM,Mol Immunol. 2020 Sep;125:63-69. doi: 10.1016/...,MEDLINE,Mol Immunol,Safety and efficacy of rice seed-based oral al...,125,2020,['Allergens/*administration & dosage/immunolog...


In [197]:
def clean_terms(x):
    """Turn the string form of a term list into one space-separated string.

    Parameters
    ----------
    x : str
        Text such as "['Humans', 'Japan']", i.e. a Python list literal stored
        as a string.

    Returns
    -------
    str
        The individual terms joined by single spaces; empty entries are dropped.

    Notes
    -----
    The value is parsed as a Python literal first, so terms that contain an
    apostrophe (which the list repr wraps in double quotes) stay intact.
    If the text is not a valid list literal, the function falls back to
    stripping the brackets and splitting on single quotes.
    """
    import ast  # local import keeps this cell self-contained

    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)) and all(isinstance(t, str) for t in parsed):
        pieces = parsed
    else:
        # Fallback for text that is not a list literal: quote-based splitting.
        pieces = x.strip('][').split('\'')

    # Drop empty fragments and the ', ' separators left over by quote splitting.
    return ' '.join(p for p in pieces if p not in ('', ', '))

In [198]:
# Build the flattened term string for every record from its raw MH value.
df_terms['terms'] = df_terms['MH'].map(clean_terms)

In [199]:
df_terms['terms']

0        Humans Japan Macrophages *Rhinitis, Allergic *...
1        *Allergens Betula Cities Humans Pollen/immunol...
2        *Air Pollutants *Air Pollution Child China Hum...
3        Allergens Australia Cities *Poaceae *Pollen Se...
4        Allergens/*administration & dosage/immunology ...
                               ...                        
15271    Anaphylaxis/*therapy Asthma/*therapy Diphenhyd...
15272    *Allergens *Biometry Humans *Pollen Rhinitis, ...
15273    *Allergens *Biometry Humans *Pollen Rhinitis, ...
15274    *Conjunctiva Humans *Immunity *Rhinitis, Aller...
15275    *Ambrosia *Anaphylaxis *Antigens *Biometry *Hy...
Name: terms, Length: 15276, dtype: object

In [200]:
# Order records from most to fewest references, then renumber rows from 0
# so that row 0 is the record with the largest RF.
df_terms = (
    df_terms
    .sort_values('RF', ascending=False)
    .reset_index(drop=True)
)

In [201]:
# One-row frame holding the top record after the sort by RF.
best = df_terms.iloc[:1]

In [202]:
best

Unnamed: 0,AB,AD,AID,AU,CI,CIN,DP,FAU,FIR,IP,...,PT,RF,SB,SO,STAT,TA,TI,VI,Year,terms
0,,"['University Hospital and INSERM, Hopital Arna...","['ALL1620 [pii]', '10.1111/j.1398-9995.2007.01...","['Bousquet J', 'Khaltaev N', 'Cruz AA', 'Denbu...",,,2008 Apr,"['Bousquet, J', 'Khaltaev, N', 'Cruz, A A', 'D...",,,...,"['Journal Article', 'Practice Guideline', ""Res...",2241.0,IM,Allergy. 2008 Apr;63 Suppl 86:8-160. doi: 10.1...,MEDLINE,Allergy,Allergic Rhinitis and its Impact on Asthma (AR...,63 Suppl 86,2008,Adolescent Asthma/epidemiology/*etiology/thera...


In [203]:
# Bag-of-words counts over the flattened term strings: one row per document,
# one column per vocabulary token. Row 0 is the query document (the first row
# of df_terms).
cv = CountVectorizer()
X = cv.fit_transform(df_terms['terms']).toarray()

# Cosine similarity between the query row and every other row, computed with
# vectorised numpy operations instead of per-row Python loops.
numerators = X[1:] @ X[0]                          # dot product with the query
row_norms = np.sqrt(np.einsum('ij,ij->i', X, X))   # Euclidean norm of each row
denominators = row_norms[0] * row_norms[1:]

# A document with no recognised tokens has a zero vector and therefore a zero
# denominator. Score it 0.0 rather than NaN, because NaN sorts last in
# np.argsort and would be picked up as a "top" similarity.
results = np.divide(
    numerators,
    denominators,
    out=np.zeros_like(denominators),
    where=denominators != 0,
)

In [219]:
print(results)

[0.24366633 0.60328193 0.20286368 ... 0.08084521 0.2655274  0.28924061]


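Results will be found by taking the 5 highest cosine similarities and then matching them back to the correct rows of `df_terms`. Note that `results[i]` corresponds to row `i + 1` of `df_terms`, because row 0 is the query document itself. TODO: finish this step.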