In [2]:
import fasttext
import pandas
import re
from scipy import spatial

In [3]:
# Load the MSH citation table from the working directory, decoding it as UTF-8.
df = pandas.read_csv('MSH.csv', encoding="utf-8")

In [4]:
# Display the last rows of the frame as a quick sanity check of the load.
df.tail()

Unnamed: 0,PMID-integer,citation-string,class
35353,20629255,Graying plague: by 2015 over half of HIV in U....,M1
35354,20629256,"Rapid aging, frailty common in older HIV. Pre...",M1
35355,20629258,Experts tips for treating older HIV patients. ...,M1
35356,20629259,Perinatal HIV: decline but disparities persist...,M1
35357,20629260,'Persistent stigmas' fueling HIV in black comm...,M1


In [5]:
# Characters turned into a space BEFORE lower-casing. "λ" is blanked at this
# stage, so only a lower-case lambda present in the input is affected.
_BLANK_BEFORE_LOWER = str.maketrans(dict.fromkeys(
    ".;,:'\"?!()[]{}<>-*@#$%^&=+`|" "←≠·”“–λ", " "))

# Characters turned into a space AFTER lower-casing. The backslash is part of
# this table because it may only be blanked once the literal "\n" escape
# (backslash followed by "n") has been handled.
_BLANK_AFTER_LOWER = str.maketrans(dict.fromkeys(
    "—_™/~½¼¢\\" "0123456789", " "))

# Lower-cased mojibake sequences and the plain letter each one is mapped to.
# NOTE(review): the second character of "ã " is reproduced as it appears in the
# notebook; if it was originally a non-breaking space, restore it here.
_MOJIBAKE_FIXES = (("ã©", "e"), ("ã ", "u"), ("ã´", "o"), ("ã¯", "i"))


def standardize_text(text_field):
    r"""Normalise raw citation text into lower-case, space-separated word tokens.

    Steps, in order:

    1. Blank the literal substring ``http``, doubled backslashes and ``//``.
    2. Blank punctuation and symbols (see ``_BLANK_BEFORE_LOWER``).
    3. Lower-case the text.
    4. Blank digit runs preceded by whitespace, then single ASCII letters that
       stand between word boundaries.
    5. Blank the two-character escape ``\n`` (backslash + "n"), remaining
       digits, slashes, backslashes and a few symbols.
    6. Map a handful of mojibake sequences to plain letters.
    7. Collapse whitespace and drop every one-character alphabetic token.

    Changes relative to the previous implementation:

    * ``"\b[a-z]\b"`` was a non-raw string, so ``\b`` meant the backspace
      character and the rule never matched; it is now a raw-string regex with
      real word boundaries.
    * ``"\s[0-9]+"`` is now a raw string (the old form is an invalid escape
      sequence in a normal string literal).
    * Regex-looking patterns that were passed to ``str.replace`` (which matches
      literally, e.g. ``http\S+``, ``[a-z]+``) never matched real text and are
      removed rather than "activated": as real regexes several of them would
      erase every word.
    * Duplicate or unreachable replacements (e.g. ``@`` -> ``"at"`` after ``@``
      had already been blanked) are removed.

    Parameters
    ----------
    text_field : str
        Text to clean.

    Returns
    -------
    str
        Cleaned text with tokens separated by single spaces; empty string if
        nothing survives.
    """
    text = text_field.replace("http", " ")
    text = text.replace("\\\\", " ")  # two consecutive backslashes
    text = text.replace("//", " ")
    text = text.translate(_BLANK_BEFORE_LOWER)

    text = text.lower()

    text = re.sub(r"\s[0-9]+", " ", text)
    text = re.sub(r"\b[a-z]\b", " ", text)

    # Must run before the backslash itself is blanked by the table below.
    text = text.replace("\\n", " ")
    text = text.translate(_BLANK_AFTER_LOWER)

    for broken, fixed in _MOJIBAKE_FIXES:
        text = text.replace(broken, fixed)

    # split() collapses all whitespace; single alphabetic characters are noise.
    return " ".join(
        token for token in text.split()
        if not (token.isalpha() and len(token) == 1)
    )

In [11]:
# Build the fastText training corpus: every citation, normalised, in one string.
# Missing values are skipped (str.join would raise on NaN) and citations are
# joined with a space so the last word of one citation cannot fuse with the
# first word of the next.
l = list(df['citation-string'].dropna().astype(str))
s = ' '.join(l)
s = standardize_text(s)

# The CSV was read as UTF-8; write the corpus explicitly as UTF-8 too, so the
# result does not depend on the platform's default encoding. The context
# manager closes the file even if the write fails.
with open("concatMSH.txt", "w", encoding="utf-8") as text_file:
    text_file.write(s)

In [12]:
# Set to True to reuse a model stored in "model_filename.bin" instead of
# training a new skip-gram model on the corpus file.
model_saved = False
model = (
    fasttext.load_model("model_filename.bin")
    if model_saved
    else fasttext.train_unsupervised("concatMSH.txt", model='skipgram')
)
# To persist a freshly trained model, run: model.save_model("model_filename.bin")

In [13]:
# Fetch the model vocabulary together with the per-word frequencies.
# NOTE(review): on_unicode_error='replace' presumably substitutes undecodable
# bytes in vocabulary entries instead of raising; confirm against the fastText docs.
words, freqs = model.get_words(include_freq=True, on_unicode_error='replace')

In [14]:
# For every target word, rank the whole vocabulary by cosine similarity.
# Result: distances[target] is a list of (vocabulary_word, similarity) pairs,
# sorted from most to least similar.
distances = {}
polysemy_list = ["single", "growth", "evaluation", "surgery",
                 "reduction", "inhibition", "pressure", "support",
                 "weight", "frequency", "sensitivity", "failure",
                 "culture", "resistance", "degree", "determination",
                 "energy", "lead", "glucose", "scale",
                 "strains", "sex", "condition",
                 "variation", "transport", "man",
                 "radiation", "transient", "white",
                 "depression", "repair", "pathology", "fat",
                 "extraction", "ultrasound", "discharge",
                 "nutrition", "adjustment", "japanese", "cold",
                 "fit", "mosaic", "mole", "stem"]  # list from the paper, filtered out words with >=2 tokens.

# Look up each vocabulary vector once, not once per target word.
vocab_vectors = [(vocab_word, model[vocab_word]) for vocab_word in words]

for word_one in polysemy_list:
    # The target vector is loop-invariant for the inner pass; fetch it once.
    target_vector = model[word_one]
    similarities = {
        # scipy returns the cosine *distance*; 1 - distance is the similarity.
        word_two: 1 - spatial.distance.cosine(target_vector, vector_two)
        for word_two, vector_two in vocab_vectors
    }
    distances[word_one] = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

In [15]:
# Print each target word followed by entries 1..49 of its ranked neighbour list.
# Entry 0 is skipped; presumably it is the target word itself when that word is
# in the vocabulary.
for target_word, ranked_neighbours in distances.items():
    print(target_word)
    print(ranked_neighbours[1:50])
    print("\n")

single
[('double', 0.6229436993598938), ('stranded', 0.6220369935035706), ('singlet', 0.596306562423706), ('two', 0.5872929096221924), ('three', 0.5797744393348694), ('large', 0.5734736323356628), ('snapshot', 0.572464644908905), ('either', 0.5697457790374756), ('insertionally', 0.5693119168281555), ('one', 0.5678057670593262), ('gye', 0.5672317743301392), ('based', 0.5662776231765747), ('doublet', 0.565670371055603), ('sscp', 0.5621582269668579), ('microsecond', 0.5616905093193054), ('snapshots', 0.5591970086097717), ('intrachain', 0.5583614110946655), ('multiplexed', 0.556364119052887), ('intercross', 0.5510907173156738), ('interchain', 0.5496376156806946), ('kringle', 0.5435205698013306), ('consecutive', 0.5430819988250732), ('hole', 0.5407620668411255), ('oligonucleotide', 0.539962887763977), ('alternating', 0.5390279293060303), ('insertional', 0.5387564301490784), ('octapeptide', 0.5386770367622375), ('repeating', 0.5367775559425354), ('intercalating', 0.5361183285713196), ('inter