In [26]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import re

def get_fnames(folder="./abstracts/awards_2002"):
    """Collect the paths of all ``.txt`` files found under a folder.

    The folder is searched recursively with ``os.walk``.

    Args:
        folder: Root directory to search. Defaults to the 2002 awards folder,
            so existing ``get_fnames()`` calls behave as before.

    Returns:
        list[str]: One path per file whose name ends in ".txt", built by
        joining the directory it was found in with the file name. The order
        is whatever ``os.walk`` yields (it is not sorted). The list is empty
        when nothing matches; a missing folder also yields an empty list
        because ``os.walk`` ignores listing errors by default.
    """
    fnames = []
    for root, _, files in os.walk(folder):
        for fname in files:
            if fname.endswith(".txt"):
                fnames.append(os.path.join(root, fname))
    return fnames

# Report how many abstract files were found for 2002
print(
    "Number of abstracts in folder awards_2002: {}".format(
        len(get_fnames())
    )
)

Number of abstracts in folder awards_2002: 9923


In [27]:
# Paths of all abstract files returned by get_fnames(); read one by one below
name_list = get_fnames()

def read_file(fname):
    """Return the abstract of one award file as a single-line string.

    Every line up to and including the first one containing the marker
    ``"Abstract    :"`` is skipped. The remaining lines are stripped, joined
    with single spaces, and runs of spaces are collapsed to one space.

    Args:
        fname: Path of the text file; it is decoded as ISO-8859-1.

    Returns:
        str: The abstract text. Empty when the marker never appears or when
        nothing follows it. Blank lines at the start or end of the abstract
        can leave a single leading or trailing space.
    """
    with open(fname, 'r', encoding="ISO-8859-1") as f:
        # skip all lines until abstract
        for line in f:
            if "Abstract    :" in line:
                break

        # get abstract as a single string.
        # strip() already removes the trailing newline; slicing off the last
        # character first would eat a real character when the final line of
        # the file has no newline.
        abstract = ' '.join(line.strip() for line in f)
        return re.sub(' +', ' ', abstract)  # remove double spaces

In [28]:
# Load every abstract, in the same order as name_list
documents = [read_file(path) for path in name_list]

In [29]:


# Fast and simple tokenization
# NOTE(review): only the vectorizer's tokenizer is used here. Per the
# scikit-learn docs, build_tokenizer() just splits text with token_pattern, so
# the stop_words / lowercase / ngram_range / min_df / tf-idf options configured
# below do not affect tokenized_text (the sample output still shows 'This',
# 'to', 'the').
new_vectorizer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    ngram_range=(1, 3),
    min_df=5,
    use_idf=True,
    sublinear_tf=True,
    max_df=1.0,
)
word_tokenizer = new_vectorizer.build_tokenizer()
# One token list per abstract
tokenized_text = list(map(word_tokenizer, documents))



In [30]:
# Peek at the tokens of the first abstract
tokenized_text[0]

['This',
 'Small',
 'Business',
 'Innovation',
 'Research',
 'SBIR',
 'Phase',
 'II',
 'Project',
 'proposes',
 'to',
 'develop',
 'the',
 'database',
 'and',
 'associated',
 'software',
 'to',
 'enable',
 'analysis',
 'of',
 'protein',
 'trafficking',
 'and',
 'localization',
 'The',
 'system',
 'will',
 'be',
 'designed',
 'to',
 'enable',
 'drug',
 'discovery',
 'researchers',
 'to',
 'identify',
 'elucidate',
 'eliminate',
 'and',
 'design',
 'leads',
 'and',
 'targets',
 'while',
 'facilitating',
 'the',
 'general',
 'training',
 'of',
 'researchers',
 'During',
 'the',
 'Phase',
 'work',
 'proteins',
 'involved',
 'in',
 'trafficking',
 'and',
 'diseases',
 'related',
 'to',
 'mislocalization',
 'were',
 'identified',
 'and',
 'relational',
 'database',
 'to',
 'house',
 'information',
 'on',
 'protein',
 'trafficking',
 'was',
 'constructed',
 'Curation',
 'interface',
 'applications',
 'were',
 'created',
 'to',
 'allow',
 'remote',
 'data',
 'entry',
 'and',
 'graphical',
 'us

In [31]:
### Train word vectors

import logging

import gensim  # Make sure you also have cython installed to accelerate computation!

# Show gensim's training progress messages in the cell output
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Train word2vec model: skip-gram (sg=1), 100-dimensional vectors, words seen
# fewer than 5 times are dropped.
# NOTE: `size` (and `wv.vocab` used later) are gensim 3.x names; gensim 4
# renamed them to `vector_size` and `wv.key_to_index`.
vectors = gensim.models.Word2Vec(
    tokenized_text,
    size=100,
    min_count=5,
    sg=1,
    workers=4,
)  # TODO (original author): change tokenized_text to tfidf.matrix!

2021-02-26 21:45:03,167 : INFO : collecting all words and their counts
2021-02-26 21:45:03,168 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-26 21:45:03,905 : INFO : collected 63538 word types from a corpus of 2656274 raw words and 9923 sentences
2021-02-26 21:45:03,906 : INFO : Loading a fresh vocabulary
2021-02-26 21:45:04,022 : INFO : effective_min_count=5 retains 20752 unique words (32% of original 63538, drops 42786)
2021-02-26 21:45:04,023 : INFO : effective_min_count=5 leaves 2585188 word corpus (97% of original 2656274, drops 71086)
2021-02-26 21:45:04,152 : INFO : deleting the raw counts dictionary of 63538 items
2021-02-26 21:45:04,154 : INFO : sample=0.001 downsamples 26 most-common words
2021-02-26 21:45:04,155 : INFO : downsampling leaves estimated 2005620 word corpus (77.6% of prior 2585188)
2021-02-26 21:45:04,223 : INFO : estimated required memory for 20752 words and 100 dimensions: 26977600 bytes
2021-02-26 21:45:04,224 : INFO : res

In [32]:
# Probe words: the vocabulary entries at positions 1000, 2000, ..., 5000
seed_word = [
    vocab_words[pos]
    for vocab_words in (list(vectors.wv.vocab.keys()),)  # build the key list once
    for pos in range(1000, 5001, 1000)
]

In [33]:
# Display the five probe words picked from the `vectors` vocabulary
seed_word

['greatly', 'every', 'includes', 'transcription', 'largest']

In [18]:
# Words most similar to 'greatly' in vectors (skip-gram, size=100, min_count=5)
print("Most similar to:", 'greatly')
print(vectors.wv.most_similar('greatly'), end="\n\n")

2021-02-26 13:33:02,414 : INFO : precomputing L2-norms of word weight vectors


Most similar to: greatly
[('significantly', 0.8013008236885071), ('substantially', 0.7442965507507324), ('dramatically', 0.7396800518035889), ('vastly', 0.667165994644165), ('enlarge', 0.6540445685386658), ('MPICH', 0.6524852514266968), ('streamline', 0.6513089537620544), ('Successful', 0.6315666437149048), ('capability', 0.6274981498718262), ('sharpen', 0.6218451261520386)]



In [19]:
# Words most similar to 'every' in vectors (skip-gram, size=100, min_count=5)
print("Most similar to:", 'every')
print(vectors.wv.most_similar('every'), end="\n\n")

Most similar to: every
[('each', 0.6906091570854187), ('Every', 0.6714961528778076), ('almost', 0.6605528593063354), ('essentially', 0.6477319002151489), ('normally', 0.6423296332359314), ('judiciously', 0.6373945474624634), ('roughly', 0.6333438158035278), ('nearly', 0.6281598806381226), ('again', 0.6260768175125122), ('morning', 0.6238182783126831)]



In [20]:
# Words most similar to 'includes' in vectors (skip-gram, size=100, min_count=5)
print("Most similar to:", 'includes')
print(vectors.wv.most_similar('includes'), end="\n\n")

Most similar to: includes
[('involves', 0.7543442249298096), ('consists', 0.7009209990501404), ('comprises', 0.6838739514350891), ('encompasses', 0.6799794435501099), ('supports', 0.6580832004547119), ('emphasizes', 0.6541271805763245), ('involve', 0.6530083417892456), ('include', 0.6529262065887451), ('integrates', 0.6281484961509705), ('introduces', 0.6225360631942749)]



In [21]:
# Words most similar to 'transcription' in vectors (skip-gram, size=100, min_count=5)
print("Most similar to:", 'transcription')
print(vectors.wv.most_similar('transcription'), end="\n\n")

Most similar to: transcription
[('chromatin', 0.8633143305778503), ('silencing', 0.8584595918655396), ('mRNA', 0.8561317920684814), ('meiotic', 0.8526526689529419), ('transcriptional', 0.8488618731498718), ('repressor', 0.8470075130462646), ('homologous', 0.8436334729194641), ('replication', 0.8433512449264526), ('mRNAs', 0.8415851593017578), ('virulence', 0.8390907645225525)]



In [22]:
# Words most similar to 'largest' in vectors (skip-gram, size=100, min_count=5)
print("Most similar to:", 'largest')
print(vectors.wv.most_similar('largest'), end="\n\n")

Most similar to: largest
[('oldest', 0.739079475402832), ('populous', 0.7280933856964111), ('southeastern', 0.7225464582443237), ('towns', 0.7119206190109253), ('rest', 0.7053264379501343), ('richest', 0.6938225626945496), ('province', 0.6938113570213318), ('northeastern', 0.6901578307151794), ('deepest', 0.6891354322433472), ('endemic', 0.6792290210723877)]



In [23]:
# Second model: CBOW (sg=0), 10-dimensional vectors, min_count=1 keeps every word
vectors2 = gensim.models.Word2Vec(
    tokenized_text,
    size=10,
    min_count=1,
    sg=0,
    workers=4,
)

2021-02-26 13:33:28,251 : INFO : collecting all words and their counts
2021-02-26 13:33:28,252 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-26 13:33:28,806 : INFO : collected 63538 word types from a corpus of 2656274 raw words and 9923 sentences
2021-02-26 13:33:28,808 : INFO : Loading a fresh vocabulary
2021-02-26 13:33:29,051 : INFO : effective_min_count=1 retains 63538 unique words (100% of original 63538, drops 0)
2021-02-26 13:33:29,053 : INFO : effective_min_count=1 leaves 2656274 word corpus (100% of original 2656274, drops 0)
2021-02-26 13:33:29,346 : INFO : deleting the raw counts dictionary of 63538 items
2021-02-26 13:33:29,348 : INFO : sample=0.001 downsamples 25 most-common words
2021-02-26 13:33:29,350 : INFO : downsampling leaves estimated 2081427 word corpus (78.4% of prior 2656274)
2021-02-26 13:33:29,524 : INFO : estimated required memory for 63538 words and 10 dimensions: 36852040 bytes
2021-02-26 13:33:29,525 : INFO : resetting 

In [24]:
# Probe words: the vocabulary entries at positions 1000, 2000, ..., 5000
seed_word2 = [
    vocab_words[pos]
    for vocab_words in (list(vectors2.wv.vocab.keys()),)  # build the key list once
    for pos in range(1000, 5001, 1000)
]

In [25]:
# Display the five probe words picked from the `vectors2` vocabulary
seed_word2

['half', 'whether', 'neuroscience', 'Lonza', 'tightly']

In [26]:
# Words most similar to 'half' in vectors2 (CBOW, size=10, min_count=1)
print("Most similar to:", 'half')
print(vectors2.wv.most_similar('half'), end="\n\n")

2021-02-26 13:35:45,115 : INFO : precomputing L2-norms of word weight vectors


Most similar to: half
[('thousand', 0.9656727313995361), ('80', 0.9651178121566772), ('percent', 0.9607995748519897), ('70', 0.9601696729660034), ('torn', 0.9574507474899292), ('parallelepipeds', 0.953171968460083), ('roughly', 0.9500912427902222), ('million', 0.9493248462677002), ('catecholamines', 0.9489266872406006), ('Dickenson', 0.9426918029785156)]



In [27]:
# Words most similar to 'whether' in vectors2 (CBOW, size=10, min_count=1)
print("Most similar to:", 'whether')
print(vectors2.wv.most_similar('whether'), end="\n\n")

Most similar to: whether
[('what', 0.9122190475463867), ('macromodels', 0.9094517230987549), ('why', 0.9062631130218506), ('fissions', 0.8964521288871765), ('how', 0.8916631937026978), ('farmer', 0.8860977292060852), ('SWING', 0.8760378956794739), ('Proximate', 0.870087206363678), ('if', 0.8693884611129761), ('manhood', 0.857232391834259)]



In [28]:
# Words most similar to 'neuroscience' in vectors2 (CBOW, size=10, min_count=1)
print("Most similar to:", 'neuroscience')
print(vectors2.wv.most_similar('neuroscience'), end="\n\n")

Most similar to: neuroscience
[('populace', 0.9711309671401978), ('LCLUC', 0.9577138423919678), ('focusing', 0.9521087408065796), ('0231010', 0.9496864676475525), ('macroeconomics', 0.9477670192718506), ('lifeline', 0.9440898895263672), ('codimension', 0.9428080320358276), ('sustainability', 0.941412627696991), ('naturalization', 0.9316829442977905), ('arena', 0.9310978651046753)]



In [29]:
# Words most similar to 'Lonza' in vectors2 (CBOW, size=10, min_count=1)
print("Most similar to:", 'Lonza')
print(vectors2.wv.most_similar('Lonza'), end="\n\n")

Most similar to: Lonza
[('Flory', 0.9816685318946838), ('nontransforming', 0.9795741438865662), ('Numerous', 0.9769992828369141), ('SAR324', 0.9755681157112122), ('dictionaries', 0.9745925664901733), ('graphein', 0.9740005731582642), ('discordant', 0.973209798336029), ('Ibn', 0.9724632501602173), ('pyridylmethyl', 0.972217321395874), ('sulfoxides', 0.9721080660820007)]



In [30]:
# Words most similar to 'tightly' in vectors2 (CBOW, size=10, min_count=1)
print("Most similar to:", 'tightly')
print(vectors2.wv.most_similar('tightly'), end="\n\n")

Most similar to: tightly
[('spectrally', 0.974089503288269), ('screened', 0.9697147607803345), ('seeding', 0.9688395857810974), ('reversible', 0.9639790654182434), ('sum', 0.9636118412017822), ('periodically', 0.9626333713531494), ('computa', 0.96170574426651), ('inconvenient', 0.9616869688034058), ('prescribed', 0.9607305526733398), ('corrections', 0.9584357142448425)]



In [31]:
# Third model: CBOW (sg=0), 100-dimensional vectors, words seen fewer than 5 times dropped
vectors3 = gensim.models.Word2Vec(
    tokenized_text,
    size=100,
    min_count=5,
    sg=0,
    workers=4,
)

2021-02-26 13:36:05,065 : INFO : collecting all words and their counts
2021-02-26 13:36:05,067 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-26 13:36:05,624 : INFO : collected 63538 word types from a corpus of 2656274 raw words and 9923 sentences
2021-02-26 13:36:05,625 : INFO : Loading a fresh vocabulary
2021-02-26 13:36:05,781 : INFO : effective_min_count=5 retains 20752 unique words (32% of original 63538, drops 42786)
2021-02-26 13:36:05,782 : INFO : effective_min_count=5 leaves 2585188 word corpus (97% of original 2656274, drops 71086)
2021-02-26 13:36:05,888 : INFO : deleting the raw counts dictionary of 63538 items
2021-02-26 13:36:05,890 : INFO : sample=0.001 downsamples 26 most-common words
2021-02-26 13:36:05,891 : INFO : downsampling leaves estimated 2005620 word corpus (77.6% of prior 2585188)
2021-02-26 13:36:05,956 : INFO : estimated required memory for 20752 words and 100 dimensions: 26977600 bytes
2021-02-26 13:36:05,957 : INFO : res

In [32]:
# Probe words: the vocabulary entries at positions 1000, 2000, ..., 5000
seed_word3 = [
    vocab_words[pos]
    for vocab_words in (list(vectors3.wv.vocab.keys()),)  # build the key list once
    for pos in range(1000, 5001, 1000)
]

In [33]:
# Display the five probe words picked from the `vectors3` vocabulary
seed_word3

['greatly', 'every', 'includes', 'transcription', 'largest']

In [34]:
# Words most similar to 'greatly' in vectors3 (CBOW, size=100, min_count=5)
print("Most similar to:", 'greatly')
print(vectors3.wv.most_similar('greatly'), end="\n\n")

2021-02-26 13:36:24,757 : INFO : precomputing L2-norms of word weight vectors


Most similar to: greatly
[('significantly', 0.8782045245170593), ('substantially', 0.7808791399002075), ('dramatically', 0.6995285749435425), ('ultimately', 0.6894906163215637), ('capability', 0.6457507014274597), ('improved', 0.638140082359314), ('enhanced', 0.629065215587616), ('thus', 0.6160373091697693), ('increased', 0.5911352038383484), ('improvements', 0.5893326997756958)]



In [35]:
# Words most similar to 'every' in vectors3 (CBOW, size=100, min_count=5)
print("Most similar to:", 'every')
print(vectors3.wv.most_similar('every'), end="\n\n")

Most similar to: every
[('ten', 0.6816279888153076), ('almost', 0.6725512742996216), ('few', 0.6707489490509033), ('half', 0.6691499352455139), ('except', 0.6654048562049866), ('roughly', 0.663583517074585), ('least', 0.6603336334228516), ('per', 0.6559103727340698), ('old', 0.6518365740776062), ('billion', 0.651665449142456)]



In [36]:
# Words most similar to 'includes' in vectors3 (CBOW, size=100, min_count=5)
print("Most similar to:", 'includes')
print(vectors3.wv.most_similar('includes'), end="\n\n")

Most similar to: includes
[('involves', 0.8537815809249878), ('combines', 0.7831259369850159), ('supports', 0.7734456062316895), ('integrates', 0.7514086961746216), ('emphasizes', 0.7439501881599426), ('utilizes', 0.7368146777153015), ('consists', 0.7192471623420715), ('develops', 0.7155880331993103), ('encompasses', 0.7074522972106934), ('explores', 0.6937383413314819)]



In [37]:
# Words most similar to 'transcription' in vectors3 (CBOW, size=100, min_count=5)
print("Most similar to:", 'transcription')
print(vectors3.wv.most_similar('transcription'), end="\n\n")

Most similar to: transcription
[('eukaryotic', 0.9008668661117554), ('transcriptional', 0.900171160697937), ('replication', 0.8730258941650391), ('receptor', 0.8679792881011963), ('intracellular', 0.859769880771637), ('signaling', 0.8548659086227417), ('cis', 0.8540043234825134), ('putative', 0.847966194152832), ('silencing', 0.841754138469696), ('viral', 0.8399631977081299)]



In [38]:
# Words most similar to 'largest' in vectors3 (CBOW, size=100, min_count=5)
print("Most similar to:", 'largest')
print(vectors3.wv.most_similar('largest'), end="\n\n")

Most similar to: largest
[('oldest', 0.852716326713562), ('earliest', 0.7771295309066772), ('north', 0.7501647472381592), ('province', 0.7465780973434448), ('Appalachian', 0.7434969544410706), ('Great', 0.7431834936141968), ('island', 0.7421600222587585), ('richest', 0.738149881362915), ('Asian', 0.7377354502677917), ('southeastern', 0.7294509410858154)]



In [34]:
# Fourth model: skip-gram (sg=1), 10-dimensional vectors, min_count=1 keeps every word
vectors4 = gensim.models.Word2Vec(
    tokenized_text,
    size=10,
    min_count=1,
    sg=1,
    workers=4,
)

2021-02-26 21:46:04,474 : INFO : collecting all words and their counts
2021-02-26 21:46:04,475 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-26 21:46:05,118 : INFO : collected 63538 word types from a corpus of 2656274 raw words and 9923 sentences
2021-02-26 21:46:05,119 : INFO : Loading a fresh vocabulary
2021-02-26 21:46:16,734 : INFO : effective_min_count=1 retains 63538 unique words (100% of original 63538, drops 0)
2021-02-26 21:46:16,736 : INFO : effective_min_count=1 leaves 2656274 word corpus (100% of original 2656274, drops 0)
2021-02-26 21:46:17,180 : INFO : deleting the raw counts dictionary of 63538 items
2021-02-26 21:46:17,183 : INFO : sample=0.001 downsamples 25 most-common words
2021-02-26 21:46:17,184 : INFO : downsampling leaves estimated 2081427 word corpus (78.4% of prior 2656274)
2021-02-26 21:46:17,408 : INFO : estimated required memory for 63538 words and 10 dimensions: 36852040 bytes
2021-02-26 21:46:17,409 : INFO : resetting 

In [35]:
# Probe words: the vocabulary entries at positions 1000, 2000, ..., 5000
seed_word4 = [
    vocab_words[pos]
    for vocab_words in (list(vectors4.wv.vocab.keys()),)  # build the key list once
    for pos in range(1000, 5001, 1000)
]

In [36]:
# Display the five probe words picked from the `vectors4` vocabulary
seed_word4

['half', 'whether', 'neuroscience', 'Lonza', 'tightly']

In [37]:
# Vocabulary size of vectors4 (counting the keys directly, no intermediate list)
len(vectors4.wv.vocab.keys())

63538

In [42]:
# Words most similar to 'half' in vectors4 (skip-gram, size=10, min_count=1)
print("Most similar to:", 'half')
print(vectors4.wv.most_similar('half'), end="\n\n")

2021-02-26 13:37:38,820 : INFO : precomputing L2-norms of word weight vectors


Most similar to: half
[('representing', 0.9732823371887207), ('nearly', 0.972795307636261), ('black', 0.9664586186408997), ('least', 0.9610405564308167), ('few', 0.9545682668685913), ('around', 0.9482489824295044), ('every', 0.9429680705070496), ('covering', 0.9417109489440918), ('over', 0.9415010809898376), ('spanning', 0.940800130367279)]



In [43]:
# Words most similar to 'whether' in vectors4 (skip-gram, size=10, min_count=1)
print("Most similar to:", 'whether')
print(vectors4.wv.most_similar('whether'), end="\n\n")

Most similar to: whether
[('explain', 0.9642354249954224), ('skew', 0.9559688568115234), ('hypothesized', 0.9536693096160889), ('endogenous', 0.9525479078292847), ('trait', 0.9506796598434448), ('what', 0.9503313899040222), ('ask', 0.9473053812980652), ('if', 0.9449271559715271), ('asymmetry', 0.9439817667007446), ('unknown', 0.9433817267417908)]



In [38]:
# Words most similar to 'neuroscience' in vectors4 (skip-gram, size=10, min_count=1)
print("Most similar to:", 'neuroscience')
print(vectors4.wv.most_similar('neuroscience'), end="\n\n")

2021-02-26 21:50:18,179 : INFO : precomputing L2-norms of word weight vectors


Most similar to: neuroscience
[('furthering', 0.9842066764831543), ('informational', 0.9749946594238281), ('relevance', 0.9741153717041016), ('neurobiology', 0.96800696849823), ('interest', 0.9663718342781067), ('informatics', 0.9641363620758057), ('technological', 0.9639126062393188), ('advancing', 0.9617303609848022), ('playing', 0.9597301483154297), ('sciences', 0.9593894481658936)]



In [45]:
# Words most similar to 'Lonza' in vectors4 (skip-gram, size=10, min_count=1)
print("Most similar to:", 'Lonza')
print(vectors4.wv.most_similar('Lonza'), end="\n\n")

Most similar to: Lonza
[('domes', 0.997254490852356), ('Curation', 0.9971436262130737), ('CCC', 0.9964454174041748), ('HPF', 0.9954668283462524), ('ImmunoPrecipitation', 0.9951314330101013), ('Holographic', 0.9947695136070251), ('eyewear', 0.9944987893104553), ('electrogenerated', 0.9944267272949219), ('Quantized', 0.9943544268608093), ('HiSS', 0.9943377375602722)]



In [46]:
# Words most similar to 'tightly' in vectors4 (skip-gram, size=10, min_count=1)
print("Most similar to:", 'tightly')
print(vectors4.wv.most_similar('tightly'), end="\n\n")

Most similar to: tightly
[('routes', 0.9862256646156311), ('manipulating', 0.9775048494338989), ('tailor', 0.9773040413856506), ('inductive', 0.9732342958450317), ('reverse', 0.9725438952445984), ('render', 0.9721719622612), ('domain', 0.9718952178955078), ('channels', 0.9717416763305664), ('resultant', 0.9705893397331238), ('tuned', 0.9697235226631165)]



In [47]:
# Fifth model: skip-gram (sg=1), 100-dimensional vectors, min_count=1 keeps every word
vectors5 = gensim.models.Word2Vec(
    tokenized_text,
    size=100,
    min_count=1,
    sg=1,
    workers=4,
)

2021-02-26 13:37:57,083 : INFO : collecting all words and their counts
2021-02-26 13:37:57,084 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-26 13:37:57,617 : INFO : collected 63538 word types from a corpus of 2656274 raw words and 9923 sentences
2021-02-26 13:37:57,618 : INFO : Loading a fresh vocabulary
2021-02-26 13:37:57,880 : INFO : effective_min_count=1 retains 63538 unique words (100% of original 63538, drops 0)
2021-02-26 13:37:57,881 : INFO : effective_min_count=1 leaves 2656274 word corpus (100% of original 2656274, drops 0)
2021-02-26 13:37:58,185 : INFO : deleting the raw counts dictionary of 63538 items
2021-02-26 13:37:58,188 : INFO : sample=0.001 downsamples 25 most-common words
2021-02-26 13:37:58,190 : INFO : downsampling leaves estimated 2081427 word corpus (78.4% of prior 2656274)
2021-02-26 13:37:58,373 : INFO : estimated required memory for 63538 words and 100 dimensions: 82599400 bytes
2021-02-26 13:37:58,375 : INFO : resetting

In [48]:
# Probe words: the vocabulary entries at positions 1000, 2000, ..., 5000
seed_word5 = [
    vocab_words[pos]
    for vocab_words in (list(vectors5.wv.vocab.keys()),)  # build the key list once
    for pos in range(1000, 5001, 1000)
]

In [49]:
# Display the five probe words picked from the `vectors5` vocabulary
seed_word5 

['half', 'whether', 'neuroscience', 'Lonza', 'tightly']

In [50]:
# Words most similar to 'half' in vectors5 (skip-gram, size=100, min_count=1)
print("Most similar to:", 'half')
print(vectors5.wv.most_similar('half'), end="\n\n")

2021-02-26 13:39:16,488 : INFO : precomputing L2-norms of word weight vectors


Most similar to: half
[('25', 0.790177583694458), ('80', 0.7899731397628784), ('ten', 0.7695549726486206), ('70', 0.7545516490936279), ('percent', 0.7527369856834412), ('65', 0.749315619468689), ('trillion', 0.7484829425811768), ('About', 0.7458590865135193), ('150', 0.7433521747589111), ('60', 0.7410141229629517)]



In [51]:
# Words most similar to 'whether' in vectors5 (skip-gram, size=100, min_count=1)
print("Most similar to:", 'whether')
print(vectors5.wv.most_similar('whether'), end="\n\n")

Most similar to: whether
[('if', 0.7773952484130859), ('how', 0.7436619400978088), ('why', 0.705829918384552), ('what', 0.6954272985458374), ('sex', 0.6872129440307617), ('androdioecy', 0.6792768239974976), ('Are', 0.6774872541427612), ('Is', 0.676652729511261), ('Or', 0.6747390031814575), ('ask', 0.6745706796646118)]



In [52]:
# Words most similar to 'neuroscience' in vectors5 (skip-gram, size=100, min_count=1)
print("Most similar to:", 'neuroscience')
print(vectors5.wv.most_similar('neuroscience'), end="\n\n")

Most similar to: neuroscience
[('endocrinology', 0.8928981423377991), ('neurobiology', 0.8739007115364075), ('neuroendocrinology', 0.8608860373497009), ('sociology', 0.8580520749092102), ('linguistics', 0.8549712896347046), ('epidemiology', 0.8538591861724854), ('neurophysiology', 0.8537481427192688), ('revolutionizing', 0.8456230163574219), ('archeology', 0.8387148976325989), ('informs', 0.8276082277297974)]



In [53]:
# Words most similar to 'Lonza' in vectors5 (skip-gram, size=100, min_count=1)
print("Most similar to:", 'Lonza')
print(vectors5.wv.most_similar('Lonza'), end="\n\n")

Most similar to: Lonza
[('04cc', 0.9778331518173218), ('Discover', 0.9769275784492493), ('lampposts', 0.9763314723968506), ('drums', 0.9755334854125977), ('COPS', 0.9749007225036621), ('diaminopropane', 0.9746689796447754), ('lobules', 0.9742003083229065), ('Cytoplamic', 0.9737499952316284), ('mutase', 0.9733877182006836), ('CDV', 0.9733231067657471)]



In [54]:
# Words most similar to 'tightly' in vectors5 (skip-gram, size=100, min_count=1)
print("Most similar to:", 'tightly')
print(vectors5.wv.most_similar('tightly'), end="\n\n")

Most similar to: tightly
[('loosely', 0.7940118312835693), ('mimic', 0.788912296295166), ('V1', 0.7777059078216553), ('deformable', 0.7661259174346924), ('flexibly', 0.7651008367538452), ('seamlessly', 0.7630290389060974), ('behaviorally', 0.7627905607223511), ('conveniently', 0.762204647064209), ('1D', 0.7617976665496826), ('internally', 0.7611943483352661)]



In [55]:
# Sixth model: CBOW (sg=0), 100-dimensional vectors, min_count=1 keeps every word
vectors6 = gensim.models.Word2Vec(
    tokenized_text,
    size=100,
    min_count=1,
    sg=0,
    workers=4,
)

2021-02-26 13:39:46,315 : INFO : collecting all words and their counts
2021-02-26 13:39:46,317 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-26 13:39:46,850 : INFO : collected 63538 word types from a corpus of 2656274 raw words and 9923 sentences
2021-02-26 13:39:46,852 : INFO : Loading a fresh vocabulary
2021-02-26 13:39:47,018 : INFO : effective_min_count=1 retains 63538 unique words (100% of original 63538, drops 0)
2021-02-26 13:39:47,020 : INFO : effective_min_count=1 leaves 2656274 word corpus (100% of original 2656274, drops 0)
2021-02-26 13:39:47,319 : INFO : deleting the raw counts dictionary of 63538 items
2021-02-26 13:39:47,322 : INFO : sample=0.001 downsamples 25 most-common words
2021-02-26 13:39:47,324 : INFO : downsampling leaves estimated 2081427 word corpus (78.4% of prior 2656274)
2021-02-26 13:39:47,498 : INFO : estimated required memory for 63538 words and 100 dimensions: 82599400 bytes
2021-02-26 13:39:47,500 : INFO : resetting

In [56]:
# Probe words: the vocabulary entries at positions 1000, 2000, ..., 5000
seed_word6 = [
    vocab_words[pos]
    for vocab_words in (list(vectors6.wv.vocab.keys()),)  # build the key list once
    for pos in range(1000, 5001, 1000)
]

In [57]:
# Display the five probe words picked from the `vectors6` vocabulary
seed_word6

['half', 'whether', 'neuroscience', 'Lonza', 'tightly']

In [58]:
# Words most similar to 'half' in vectors6 (CBOW, size=100, min_count=1)
print("Most similar to:", 'half')
print(vectors6.wv.most_similar('half'), end="\n\n")

2021-02-26 13:40:36,736 : INFO : precomputing L2-norms of word weight vectors


Most similar to: half
[('80', 0.8614833354949951), ('million', 0.8565804958343506), ('70', 0.8434327840805054), ('percent', 0.8345407247543335), ('25', 0.828275203704834), ('50', 0.822907567024231), ('square', 0.8213421106338501), ('1000', 0.8211793899536133), ('150', 0.8172222375869751), ('days', 0.8170233368873596)]



In [59]:
# Words most similar to 'whether' in vectors6 (CBOW, size=100, min_count=1)
print("Most similar to:", 'whether')
print(vectors6.wv.most_similar('whether'), end="\n\n")

Most similar to: whether
[('if', 0.8375376462936401), ('why', 0.8201342225074768), ('how', 0.8175545930862427), ('what', 0.8073095083236694), ('How', 0.7672230005264282), ('bicoordinate', 0.6755235195159912), ('responses', 0.6288872957229614), ('inactivating', 0.6159760355949402), ('sex', 0.6069273948669434), ('Phospho', 0.5972170829772949)]



In [60]:
# Words most similar to 'neuroscience' in vectors6 (CBOW, size=100, min_count=1)
print("Most similar to:", 'neuroscience')
print(vectors6.wv.most_similar('neuroscience'), end="\n\n")

Most similar to: neuroscience
[('sociology', 0.8529632091522217), ('economics', 0.842451274394989), ('biochemistry', 0.8309021592140198), ('endocrinology', 0.8099850416183472), ('medicine', 0.8027428388595581), ('ecology', 0.7961971759796143), ('conservation', 0.7931435108184814), ('microbiology', 0.7917824983596802), ('oceanography', 0.7916997075080872), ('contemporary', 0.7900481224060059)]



In [61]:
# Words most similar to 'Lonza' in vectors6 (CBOW, size=100, min_count=1)
print("Most similar to:", 'Lonza')
print(vectors6.wv.most_similar('Lonza'), end="\n\n")

Most similar to: Lonza
[('Rapt', 0.946742057800293), ('Cisco', 0.9320446848869324), ('Serbia', 0.9216082692146301), ('PBD', 0.9192733764648438), ('vegetables', 0.9182940125465393), ('inconsistency', 0.9156026840209961), ('vicariant', 0.9148354530334473), ('Sternoptychidae', 0.9147069454193115), ('0209202', 0.9143208861351013), ('your', 0.9138616919517517)]



In [62]:
# Words most similar to 'tightly' in vectors6 (CBOW, size=100, min_count=1)
print("Most similar to:", 'tightly')
print(vectors6.wv.most_similar('tightly'), end="\n\n")

Most similar to: tightly
[('structurally', 0.8403840065002441), ('intrinsically', 0.8381390571594238), ('weakly', 0.8356295824050903), ('deformable', 0.8241686224937439), ('tunnels', 0.8216641545295715), ('gates', 0.8200643658638), ('functionally', 0.8147437572479248), ('mechanically', 0.8121119737625122), ('intimately', 0.8119672536849976), ('clustered', 0.8089224100112915)]



In [63]:
# Seventh model: skip-gram (sg=1), 10-dimensional vectors, words seen fewer than 5 times dropped
vectors7 = gensim.models.Word2Vec(
    tokenized_text,
    size=10,
    min_count=5,
    sg=1,
    workers=4,
)

2021-02-26 13:40:50,350 : INFO : collecting all words and their counts
2021-02-26 13:40:50,351 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-26 13:40:50,889 : INFO : collected 63538 word types from a corpus of 2656274 raw words and 9923 sentences
2021-02-26 13:40:50,891 : INFO : Loading a fresh vocabulary
2021-02-26 13:40:51,080 : INFO : effective_min_count=5 retains 20752 unique words (32% of original 63538, drops 42786)
2021-02-26 13:40:51,082 : INFO : effective_min_count=5 leaves 2585188 word corpus (97% of original 2656274, drops 71086)
2021-02-26 13:40:51,182 : INFO : deleting the raw counts dictionary of 63538 items
2021-02-26 13:40:51,186 : INFO : sample=0.001 downsamples 26 most-common words
2021-02-26 13:40:51,187 : INFO : downsampling leaves estimated 2005620 word corpus (77.6% of prior 2585188)
2021-02-26 13:40:51,244 : INFO : estimated required memory for 20752 words and 10 dimensions: 12036160 bytes
2021-02-26 13:40:51,245 : INFO : rese

In [66]:
# Probe words: the vocabulary entries at positions 1000, 2000, ..., 5000
seed_word7 = [
    vocab_words[pos]
    for vocab_words in (list(vectors7.wv.vocab.keys()),)  # build the key list once
    for pos in range(1000, 5001, 1000)
]

In [67]:
# Display the five probe words picked from the `vectors7` vocabulary
seed_word7

['greatly', 'every', 'includes', 'transcription', 'largest']

In [68]:
# Words most similar to 'greatly' in vectors7 (skip-gram, size=10, min_count=5)
print("Most similar to:", 'greatly')
print(vectors7.wv.most_similar('greatly'), end="\n\n")

2021-02-26 14:04:14,419 : INFO : precomputing L2-norms of word weight vectors


Most similar to: greatly
[('significantly', 0.9823163747787476), ('aid', 0.9707620143890381), ('ability', 0.9605613350868225), ('improved', 0.9523959159851074), ('linking', 0.9482380151748657), ('assessing', 0.9464660286903381), ('improvement', 0.9447351694107056), ('facilitating', 0.9437584280967712), ('participatory', 0.9437243938446045), ('Identify', 0.9425539970397949)]



In [69]:
# Words most similar to 'every' in vectors7 (skip-gram, size=10, min_count=5)
print("Most similar to:", 'every')
print(vectors7.wv.most_similar('every'), end="\n\n")

Most similar to: every
[('one', 0.9544808864593506), ('forty', 0.9348524212837219), ('fifty', 0.9336089491844177), ('least', 0.9320022463798523), ('attracts', 0.9301199316978455), ('round', 0.9274606108665466), ('hold', 0.9169936180114746), ('given', 0.916497528553009), ('holding', 0.9150835871696472), ('ample', 0.9130387902259827)]



In [70]:
# Words most similar to 'includes' in vectors7 (skip-gram, size=10, min_count=5)
print("Most similar to:", 'includes')
print(vectors7.wv.most_similar('includes'), end="\n\n")

Most similar to: includes
[('include', 0.9577757120132446), ('The', 0.9568184614181519), ('an', 0.9527792930603027), ('involves', 0.9490988850593567), ('Primary', 0.9430201649665833), ('for', 0.9428271055221558), ('comprises', 0.9396464228630066), ('Conduct', 0.9395472407341003), ('Through', 0.9394429922103882), ('on', 0.9357964396476746)]



In [71]:
# Words most similar to 'transcription' in vectors7 (skip-gram, size=10, min_count=5)
print("Most similar to:", 'transcription')
print(vectors7.wv.most_similar('transcription'), end="\n\n")

Most similar to: transcription
[('regulated', 0.98469078540802), ('replication', 0.9833722710609436), ('receptor', 0.9808540940284729), ('gene', 0.9795259237289429), ('signaling', 0.9783895015716553), ('splicing', 0.9763568043708801), ('pathway', 0.9750600457191467), ('silencing', 0.9727146625518799), ('secretion', 0.970633864402771), ('chloroplast', 0.9689313173294067)]



In [72]:
# Words most similar to 'largest' in vectors7 (skip-gram, size=10, min_count=5)
print("Most similar to:", 'largest')
print(vectors7.wv.most_similar('largest'), end="\n\n")

Most similar to: largest
[('Saharan', 0.9502067565917969), ('mountainous', 0.9391655325889587), ('border', 0.935000479221344), ('northeastern', 0.9338515996932983), ('towns', 0.9337120056152344), ('southeastern', 0.9299823045730591), ('gatherer', 0.9286332726478577), ('1970s', 0.9252861738204956), ('deserts', 0.924917995929718), ('biomes', 0.9243955016136169)]



In [73]:
# Eighth model: CBOW (sg=0), 10-dimensional vectors, words seen fewer than 5 times dropped
vectors8 = gensim.models.Word2Vec(
    tokenized_text,
    size=10,
    min_count=5,
    sg=0,
    workers=4,
)

2021-02-26 14:04:22,542 : INFO : collecting all words and their counts
2021-02-26 14:04:22,542 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-26 14:04:23,112 : INFO : collected 63538 word types from a corpus of 2656274 raw words and 9923 sentences
2021-02-26 14:04:23,113 : INFO : Loading a fresh vocabulary
2021-02-26 14:04:23,189 : INFO : effective_min_count=5 retains 20752 unique words (32% of original 63538, drops 42786)
2021-02-26 14:04:23,190 : INFO : effective_min_count=5 leaves 2585188 word corpus (97% of original 2656274, drops 71086)
2021-02-26 14:04:23,298 : INFO : deleting the raw counts dictionary of 63538 items
2021-02-26 14:04:23,300 : INFO : sample=0.001 downsamples 26 most-common words
2021-02-26 14:04:23,301 : INFO : downsampling leaves estimated 2005620 word corpus (77.6% of prior 2585188)
2021-02-26 14:04:23,360 : INFO : estimated required memory for 20752 words and 10 dimensions: 12036160 bytes
2021-02-26 14:04:23,361 : INFO : rese

In [74]:
# Probe words: the vocabulary entries at positions 1000, 2000, ..., 5000
seed_word8 = [
    vocab_words[pos]
    for vocab_words in (list(vectors8.wv.vocab.keys()),)  # build the key list once
    for pos in range(1000, 5001, 1000)
]

In [75]:
# Display the five probe words picked from the `vectors8` vocabulary
seed_word8

['greatly', 'every', 'includes', 'transcription', 'largest']

In [76]:
# Words most similar to 'greatly' in vectors8 (CBOW, size=10, min_count=5)
print("Most similar to:", 'greatly')
print(vectors8.wv.most_similar('greatly'), end="\n\n")

2021-02-26 14:04:45,154 : INFO : precomputing L2-norms of word weight vectors


Most similar to: greatly
[('significantly', 0.9625788331031799), ('ability', 0.9101622104644775), ('substantially', 0.8781238794326782), ('to', 0.8764215707778931), ('necessary', 0.8735238313674927), ('us', 0.8630886077880859), ('better', 0.8587857484817505), ('effectively', 0.8520721793174744), ('strategies', 0.850286602973938), ('should', 0.8486435413360596)]



In [77]:
# Words most similar to 'every' in vectors8 (CBOW, size=10, min_count=5)
print("Most similar to:", 'every')
print(vectors8.wv.most_similar('every'), end="\n\n")

Most similar to: every
[('later', 0.924665093421936), ('few', 0.9168813824653625), ('classified', 0.9053330421447754), ('chosen', 0.8952016830444336), ('thirty', 0.8723949193954468), ('rarely', 0.8720383644104004), ('just', 0.8637912273406982), ('inaccessible', 0.8631974458694458), ('millions', 0.8623626232147217), ('least', 0.8604147434234619)]



In [78]:
# Words most similar to 'includes' in vectors8 (CBOW, size=10, min_count=5)
print("Most similar to:", 'includes')
print(vectors8.wv.most_similar('includes'), end="\n\n")

Most similar to: includes
[('involves', 0.9217371344566345), ('include', 0.9062110781669617), ('combines', 0.8593859672546387), ('capitalizes', 0.8403268456459045), ('consists', 0.8359469771385193), ('provides', 0.8328524231910706), ('emphasize', 0.826043426990509), ('supports', 0.8258134126663208), ('brings', 0.8222417831420898), ('upon', 0.8124222755432129)]



In [79]:
# Words most similar to 'transcription' in vectors8 (CBOW, size=10, min_count=5)
print("Most similar to:", 'transcription')
print(vectors8.wv.most_similar('transcription'), end="\n\n")

Most similar to: transcription
[('receptor', 0.9635428786277771), ('galaxy', 0.9552401900291443), ('eukaryotic', 0.9497318267822266), ('mRNA', 0.9465463161468506), ('transcriptional', 0.9457406997680664), ('MHC', 0.9427085518836975), ('muscle', 0.9385768175125122), ('conserved', 0.9299783110618591), ('silencing', 0.9290547370910645), ('phenotypic', 0.9256062507629395)]



In [80]:
# Words most similar to 'largest' in vectors8 (CBOW, size=10, min_count=5)
print("Most similar to:", 'largest')
print(vectors8.wv.most_similar('largest'), end="\n\n")

Most similar to: largest
[('North', 0.9387719631195068), ('earliest', 0.923382043838501), ('America', 0.9213746190071106), ('eighteenth', 0.9158466458320618), ('gatherers', 0.9120859503746033), ('Asian', 0.908501148223877), ('Andean', 0.9053014516830444), ('Asia', 0.902595043182373), ('island', 0.8988133668899536), ('19th', 0.8920904397964478)]



### QUESTION 2B

In [39]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import re

def get_fnames(folder="./abstracts"):
    """Collect the paths of all ``.txt`` files under a folder, recursively.

    Only paths are gathered here; the files themselves are not opened.

    Args:
        folder: Directory to walk. Defaults to ``"./abstracts"``, so existing
            zero-argument calls behave as before.

    Returns:
        A list of paths, each built by joining the directory reported by
        ``os.walk`` with the file name, in the order ``os.walk`` yields them.
        The list is empty when the folder does not exist or contains no
        ``.txt`` files.
    """
    return [
        os.path.join(root, fname)
        for root, _, files in os.walk(folder)
        for fname in files
        if fname.endswith(".txt")
    ]

# Report how many .txt paths get_fnames() returns. The list itself is not
# kept here, so the folder tree is walked again when name_list is built.
print("Number of abstracts in folder awards: {}".format(len(get_fnames())))

Number of abstracts in folder awards: 132372


In [40]:
# Paths of every abstract file found by get_fnames(); each one is passed to read_file() further down
name_list = get_fnames()

def read_file(fname):
    """Return the abstract text of one award file as a single string.

    Lines are skipped up to and including the first one that contains the
    ``"Abstract    :"`` marker. Every following line is stripped, the pieces
    are joined with single spaces, and runs of spaces are collapsed to one.

    Args:
        fname: Path of the file to read; it is decoded as ISO-8859-1.

    Returns:
        The abstract as one string. If the marker never appears, or nothing
        follows it, the result is an empty string.
    """
    with open(fname, 'r', encoding="ISO-8859-1") as f:
        # Advance the file iterator past the marker line.
        for line in f:
            if "Abstract    :" in line:
                break

        # strip() already removes the newline along with the indentation.
        # Slicing off the last character first would eat a real character
        # whenever the final line has no trailing newline.
        abstract = ' '.join(line.strip() for line in f)
        # Blank lines leave consecutive spaces after the join; collapse them.
        return re.sub(' +', ' ', abstract)

In [41]:
# Load the abstract text of every file, in the same order as name_list, so
# documents_full[k] is the text read from name_list[k]. A comprehension keeps
# the loop variable out of the notebook's global namespace.
documents_full = [read_file(path) for path in name_list]

In [42]:
# Configure a TfidfVectorizer, borrow its tokenizer, and split every document
# of the full corpus into a list of tokens.
# NOTE(review): build_tokenizer() appears to apply only the token pattern.
# An identically configured tokenizer earlier in this notebook emitted
# 'This', 'to' and 'the', so the stop-word, lowercase and n-gram settings
# below do not seem to reach the tokens. Confirm this is intended.
new_vectorizer1 = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    ngram_range=(1, 3),
    min_df=5,
    use_idf=True,
    sublinear_tf=True,
    max_df=1.0,
)
word_tokenizer1 = new_vectorizer1.build_tokenizer()
tokenized_text1 = list(map(word_tokenizer1, documents_full))

In [43]:
# Peek at the token list of the first document.
# NOTE(review): an empty list here means read_file() produced no tokenizable
# text for that file. Presumably the file lacks the "Abstract    :" marker
# or has nothing after it; confirm, and consider dropping such documents.
tokenized_text1[0]

[]

In [44]:
# One token list per entry of documents_full, so this is the number of documents in the corpus
len(tokenized_text1)

132372

In [46]:
# Fit a Word2Vec model (sg=1, 10-dimensional vectors) on the full tokenized
# corpus; min_count=1 keeps tokens that occur only once.
# NOTE(review): gensim's documentation says fully reproducible runs need a
# single worker thread, so with workers=4 the neighbours may vary between
# runs. Confirm whether that matters for this comparison.
vectors9 = gensim.models.Word2Vec(
    tokenized_text1,
    size=10,
    min_count=1,
    sg=1,
    workers=4,
)

2021-02-26 21:56:34,048 : INFO : collecting all words and their counts
2021-02-26 21:56:34,049 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-26 21:56:34,422 : INFO : PROGRESS: at sentence #10000, processed 1565323 words, keeping 47155 word types
2021-02-26 21:56:34,922 : INFO : PROGRESS: at sentence #20000, processed 3528075 words, keeping 79367 word types
2021-02-26 21:56:35,403 : INFO : PROGRESS: at sentence #30000, processed 5442667 words, keeping 105577 word types
2021-02-26 21:56:35,944 : INFO : PROGRESS: at sentence #40000, processed 7205533 words, keeping 127561 word types
2021-02-26 21:56:36,840 : INFO : PROGRESS: at sentence #50000, processed 9646664 words, keeping 149006 word types
2021-02-26 21:56:37,624 : INFO : PROGRESS: at sentence #60000, processed 12202268 words, keeping 167318 word types
2021-02-26 21:56:38,278 : INFO : PROGRESS: at sentence #70000, processed 14413047 words, keeping 184849 word types
2021-02-26 21:56:38,745 : INFO :

2021-02-26 21:58:59,064 : INFO : EPOCH 1 - PROGRESS: at 79.08% examples, 333728 words/s, in_qsize 7, out_qsize 0
2021-02-26 21:59:00,070 : INFO : EPOCH 1 - PROGRESS: at 81.21% examples, 333726 words/s, in_qsize 7, out_qsize 0
2021-02-26 21:59:01,081 : INFO : EPOCH 1 - PROGRESS: at 83.51% examples, 333543 words/s, in_qsize 7, out_qsize 0
2021-02-26 21:59:02,110 : INFO : EPOCH 1 - PROGRESS: at 85.22% examples, 333409 words/s, in_qsize 7, out_qsize 0
2021-02-26 21:59:03,139 : INFO : EPOCH 1 - PROGRESS: at 86.83% examples, 333327 words/s, in_qsize 7, out_qsize 0
2021-02-26 21:59:04,169 : INFO : EPOCH 1 - PROGRESS: at 88.40% examples, 333333 words/s, in_qsize 7, out_qsize 0
2021-02-26 21:59:05,191 : INFO : EPOCH 1 - PROGRESS: at 89.84% examples, 333297 words/s, in_qsize 7, out_qsize 0
2021-02-26 21:59:06,217 : INFO : EPOCH 1 - PROGRESS: at 91.39% examples, 333259 words/s, in_qsize 7, out_qsize 0
2021-02-26 21:59:07,227 : INFO : EPOCH 1 - PROGRESS: at 92.67% examples, 332987 words/s, in_qsiz

2021-02-26 22:00:10,178 : INFO : EPOCH 2 - PROGRESS: at 86.12% examples, 317109 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:00:11,181 : INFO : EPOCH 2 - PROGRESS: at 87.74% examples, 317299 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:00:12,186 : INFO : EPOCH 2 - PROGRESS: at 89.20% examples, 317494 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:00:13,188 : INFO : EPOCH 2 - PROGRESS: at 90.69% examples, 317745 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:00:14,194 : INFO : EPOCH 2 - PROGRESS: at 92.12% examples, 317898 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:00:15,199 : INFO : EPOCH 2 - PROGRESS: at 93.36% examples, 318066 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:00:16,214 : INFO : EPOCH 2 - PROGRESS: at 94.56% examples, 318198 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:00:17,233 : INFO : EPOCH 2 - PROGRESS: at 95.73% examples, 318434 words/s, in_qsize 8, out_qsize 0
2021-02-26 22:00:18,253 : INFO : EPOCH 2 - PROGRESS: at 97.02% examples, 318766 words/s, in_qsiz

2021-02-26 22:01:20,929 : INFO : EPOCH 3 - PROGRESS: at 88.29% examples, 302906 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:01:21,932 : INFO : EPOCH 3 - PROGRESS: at 89.58% examples, 302847 words/s, in_qsize 8, out_qsize 0
2021-02-26 22:01:22,948 : INFO : EPOCH 3 - PROGRESS: at 90.93% examples, 302619 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:01:23,967 : INFO : EPOCH 3 - PROGRESS: at 92.06% examples, 301973 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:01:24,988 : INFO : EPOCH 3 - PROGRESS: at 93.21% examples, 301958 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:01:26,014 : INFO : EPOCH 3 - PROGRESS: at 94.34% examples, 301916 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:01:27,028 : INFO : EPOCH 3 - PROGRESS: at 95.15% examples, 300663 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:01:28,086 : INFO : EPOCH 3 - PROGRESS: at 95.97% examples, 299602 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:01:29,093 : INFO : EPOCH 3 - PROGRESS: at 96.94% examples, 298909 words/s, in_qsiz

2021-02-26 22:02:31,555 : INFO : EPOCH 4 - PROGRESS: at 75.96% examples, 270187 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:02:32,582 : INFO : EPOCH 4 - PROGRESS: at 77.49% examples, 270657 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:02:33,592 : INFO : EPOCH 4 - PROGRESS: at 79.35% examples, 271176 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:02:34,605 : INFO : EPOCH 4 - PROGRESS: at 81.26% examples, 271651 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:02:35,609 : INFO : EPOCH 4 - PROGRESS: at 82.85% examples, 271087 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:02:36,626 : INFO : EPOCH 4 - PROGRESS: at 84.37% examples, 270472 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:02:37,674 : INFO : EPOCH 4 - PROGRESS: at 85.55% examples, 270123 words/s, in_qsize 6, out_qsize 1
2021-02-26 22:02:38,687 : INFO : EPOCH 4 - PROGRESS: at 86.76% examples, 269717 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:02:39,692 : INFO : EPOCH 4 - PROGRESS: at 88.18% examples, 270345 words/s, in_qsiz

2021-02-26 22:03:42,915 : INFO : EPOCH 5 - PROGRESS: at 66.69% examples, 261265 words/s, in_qsize 8, out_qsize 0
2021-02-26 22:03:43,960 : INFO : EPOCH 5 - PROGRESS: at 68.02% examples, 261638 words/s, in_qsize 8, out_qsize 0
2021-02-26 22:03:44,973 : INFO : EPOCH 5 - PROGRESS: at 69.06% examples, 261570 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:03:46,006 : INFO : EPOCH 5 - PROGRESS: at 70.15% examples, 261834 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:03:47,046 : INFO : EPOCH 5 - PROGRESS: at 71.08% examples, 261110 words/s, in_qsize 7, out_qsize 1
2021-02-26 22:03:48,054 : INFO : EPOCH 5 - PROGRESS: at 72.27% examples, 261335 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:03:49,083 : INFO : EPOCH 5 - PROGRESS: at 73.27% examples, 260942 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:03:50,089 : INFO : EPOCH 5 - PROGRESS: at 74.15% examples, 260153 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:03:51,119 : INFO : EPOCH 5 - PROGRESS: at 75.08% examples, 259663 words/s, in_qsiz

In [47]:
# Run vectors9.epochs more training passes over the same tokenized corpus.
# NOTE(review): the log of the cell that constructed vectors9 already shows
# EPOCH 1-5 progress, so this looks like a second round of training on top
# of the first rather than the initial fit. Confirm that is intended.
vectors9.train(tokenized_text1, total_examples=vectors9.corpus_count, epochs=vectors9.epochs)

2021-02-26 22:04:39,229 : INFO : training model with 4 workers on 257022 vocabulary and 10 features, using sg=1 hs=0 sample=0.001 negative=5 window=5
2021-02-26 22:04:40,279 : INFO : EPOCH 1 - PROGRESS: at 2.55% examples, 358649 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:04:41,286 : INFO : EPOCH 1 - PROGRESS: at 4.38% examples, 334630 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:04:42,296 : INFO : EPOCH 1 - PROGRESS: at 6.53% examples, 334164 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:04:43,362 : INFO : EPOCH 1 - PROGRESS: at 8.34% examples, 325247 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:04:44,393 : INFO : EPOCH 1 - PROGRESS: at 9.82% examples, 320936 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:04:45,427 : INFO : EPOCH 1 - PROGRESS: at 11.49% examples, 318451 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:04:46,453 : INFO : EPOCH 1 - PROGRESS: at 12.82% examples, 316871 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:04:47,555 : INFO : EPOCH 1 - PROGRESS: at 14.35% e

2021-02-26 22:05:50,263 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-02-26 22:05:50,290 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-02-26 22:05:50,291 : INFO : EPOCH - 1 : training on 27374377 raw words (21282478 effective words) took 71.1s, 299516 effective words/s
2021-02-26 22:05:51,307 : INFO : EPOCH 2 - PROGRESS: at 2.10% examples, 287341 words/s, in_qsize 6, out_qsize 1
2021-02-26 22:05:52,371 : INFO : EPOCH 2 - PROGRESS: at 3.79% examples, 283194 words/s, in_qsize 8, out_qsize 0
2021-02-26 22:05:53,386 : INFO : EPOCH 2 - PROGRESS: at 5.57% examples, 281405 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:05:54,392 : INFO : EPOCH 2 - PROGRESS: at 7.58% examples, 296127 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:05:55,400 : INFO : EPOCH 2 - PROGRESS: at 9.13% examples, 294371 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:05:56,406 : INFO : EPOCH 2 - PROGRESS: at 10.63% examples, 300044 words/s, in_qsize 7, out_qsize 0
2021-

2021-02-26 22:07:00,352 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-02-26 22:07:00,353 : INFO : EPOCH - 2 : training on 27374377 raw words (21281193 effective words) took 70.1s, 303760 effective words/s
2021-02-26 22:07:01,391 : INFO : EPOCH 3 - PROGRESS: at 2.26% examples, 310603 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:07:02,393 : INFO : EPOCH 3 - PROGRESS: at 4.05% examples, 310919 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:07:03,395 : INFO : EPOCH 3 - PROGRESS: at 6.26% examples, 321531 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:07:04,406 : INFO : EPOCH 3 - PROGRESS: at 7.99% examples, 316419 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:07:05,408 : INFO : EPOCH 3 - PROGRESS: at 9.61% examples, 318558 words/s, in_qsize 8, out_qsize 0
2021-02-26 22:07:06,422 : INFO : EPOCH 3 - PROGRESS: at 10.98% examples, 313642 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:07:07,446 : INFO : EPOCH 3 - PROGRESS: at 12.41% examples, 310638 words/s, in_qsiz

2021-02-26 22:08:10,393 : INFO : EPOCH - 3 : training on 27374377 raw words (21283801 effective words) took 70.0s, 303896 effective words/s
2021-02-26 22:08:11,409 : INFO : EPOCH 4 - PROGRESS: at 2.05% examples, 279551 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:08:12,415 : INFO : EPOCH 4 - PROGRESS: at 3.83% examples, 294681 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:08:13,419 : INFO : EPOCH 4 - PROGRESS: at 5.88% examples, 305061 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:08:14,441 : INFO : EPOCH 4 - PROGRESS: at 7.94% examples, 314716 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:08:15,447 : INFO : EPOCH 4 - PROGRESS: at 9.68% examples, 321438 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:08:16,485 : INFO : EPOCH 4 - PROGRESS: at 11.49% examples, 323747 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:08:17,491 : INFO : EPOCH 4 - PROGRESS: at 12.96% examples, 326587 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:08:18,498 : INFO : EPOCH 4 - PROGRESS: at 14.69% examples, 3

2021-02-26 22:09:20,052 : INFO : EPOCH 5 - PROGRESS: at 4.47% examples, 340064 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:09:21,067 : INFO : EPOCH 5 - PROGRESS: at 6.67% examples, 339634 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:09:22,099 : INFO : EPOCH 5 - PROGRESS: at 8.61% examples, 339490 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:09:23,124 : INFO : EPOCH 5 - PROGRESS: at 10.23% examples, 340052 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:09:24,134 : INFO : EPOCH 5 - PROGRESS: at 12.02% examples, 340838 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:09:25,137 : INFO : EPOCH 5 - PROGRESS: at 13.51% examples, 341138 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:09:26,192 : INFO : EPOCH 5 - PROGRESS: at 15.39% examples, 340185 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:09:27,249 : INFO : EPOCH 5 - PROGRESS: at 17.11% examples, 340880 words/s, in_qsize 7, out_qsize 0
2021-02-26 22:09:28,271 : INFO : EPOCH 5 - PROGRESS: at 19.06% examples, 342930 words/s, in_qsize 8

(106412714, 136871885)

In [48]:
# Sample five seed words from the vocabulary of vectors9, the model trained
# on the full corpus: the keys at positions 1000, 2000, 3000, 4000 and 5000.
# The key list is built once and sliced; a vocabulary shorter than 5001
# entries would simply give fewer seeds.
seed_word9 = list(vectors9.wv.vocab.keys())[1000:5001:1000]

In [49]:
# Display the sampled seed words
seed_word9

['Conceptual', 'fluent', 'seventh', 'manage', 'publicity']

In [50]:
# Vocabulary size of vectors9; len() on the mapping avoids building a throwaway list of keys
len(vectors9.wv.vocab)

257022

In [51]:
# Show the words whose embeddings in the full-corpus model (vectors9) are closest to 'neuroscience'
print("Most similar to:", 'neuroscience')
# end="\n\n" leaves the same blank separator line a trailing print() would
print(vectors9.wv.most_similar('neuroscience'), end="\n\n")

2021-02-26 22:11:50,646 : INFO : precomputing L2-norms of word weight vectors


Most similar to: neuroscience
[('neurobiology', 0.9721717834472656), ('psychology', 0.971102774143219), ('interdisciplinarity', 0.9698944091796875), ('imaginative', 0.9662922620773315), ('cultivating', 0.9651345014572144), ('neurosciences', 0.9640445709228516), ('metacognition', 0.958501935005188), ('collegiality', 0.9539251327514648), ('psychobiology', 0.9532018899917603), ('biopsychology', 0.9521270990371704)]



In [44]:
# Show the words whose vectors4 embeddings are closest to 'neuroscience'
print("Most similar to:", 'neuroscience')
# end="\n\n" leaves the same blank separator line a trailing print() would
print(vectors4.wv.most_similar('neuroscience'), end="\n\n")

Most similar to: neuroscience
[('biology', 0.9733627438545227), ('furthering', 0.9680370688438416), ('neurobiology', 0.9664344787597656), ('epidemiology', 0.9656774997711182), ('psychology', 0.9603279829025269), ('informatics', 0.9592504501342773), ('genomics', 0.9549347162246704), ('relevance', 0.9538663029670715), ('discovery', 0.9513333439826965), ('STS', 0.9497036933898926)]



In [36]:
# Display seed_word4 (assigned outside this section of the notebook)
seed_word4

['half', 'whether', 'neuroscience', 'Lonza', 'tightly']

In [37]:
# Vocabulary size of vectors4; len() on the mapping avoids building a throwaway list of keys
len(vectors4.wv.vocab)

63538

In [38]:
# Show the words whose vectors4 embeddings are closest to 'neuroscience'
print("Most similar to:", 'neuroscience')
# end="\n\n" leaves the same blank separator line a trailing print() would
print(vectors4.wv.most_similar('neuroscience'), end="\n\n")

2021-02-26 21:50:18,179 : INFO : precomputing L2-norms of word weight vectors


Most similar to: neuroscience
[('furthering', 0.9842066764831543), ('informational', 0.9749946594238281), ('relevance', 0.9741153717041016), ('neurobiology', 0.96800696849823), ('interest', 0.9663718342781067), ('informatics', 0.9641363620758057), ('technological', 0.9639126062393188), ('advancing', 0.9617303609848022), ('playing', 0.9597301483154297), ('sciences', 0.9593894481658936)]

