### Metadata for Syllabi

This notebook explores the Web of Science (WoS) data to see how the texts can be cleaned, while also storing metadata to a file. It should be converted to a script before being run on the full data.


In [1]:
import pandas as pd

In [8]:
import spacy

In [9]:
# Extra stop words on top of spaCy's defaults. clean_text compares these
# against token lemmas, so each entry is written in its lemmatized form.
# Kept as a whitespace-separated block so the list is easy to extend.
my_stop_words = """
    high increase method effect
    analysis time base level associate control group include compare
    system treatment rate protein suggest find provide risk test factor
    gene change show present disease report measure response year identify
    function cancer conclusion expression significant process type activity
    similar sample significantly large determine demonstrate clinical reduce
    different case human follow structure observe state mouse examine relate
    right design population approach role difference indicate health range
    elsevier induce surface condition potential research decrease specific
    development image signal species evaluate outcome reserve important mean
    mechanism low region improve numb individual child develope value
""".split()

In [10]:
# Load the spaCy pipeline registered under the name "en"; `nlp` is shared by
# the cells below.
# NOTE(review): "en" looks like a spaCy v2 shortcut link; newer spaCy releases
# presumably need the full package name (e.g. "en_core_web_sm") - confirm
# against the installed version.
nlp = spacy.load("en")

In [11]:
# Flag every custom stop word in the pipeline's vocabulary so that the
# corresponding lexemes report `is_stop == True`.
for term in my_stop_words:
    nlp.vocab[term].is_stop = True

In [12]:
def clean_text(text, max_len=1500000):
    """Tokenize ``text`` and return the lemmas of the tokens worth keeping.

    The text is truncated to ``max_len`` characters, lowercased and run through
    the module-level ``nlp`` pipeline with the parser, tagger and NER disabled.

    A token is dropped when any of the following holds:

    * spaCy flags it as a stop word, as punctuation, or as number-like;
    * its text is three characters or shorter;
    * its text contains whitespace/control characters, a closing parenthesis,
      a typographic apostrophe, ``@`` or ``https``;
    * its lemma is listed in ``my_stop_words``.

    Parameters
    ----------
    text : str
        Raw text to clean.
    max_len : int, optional
        Maximum number of characters to process. Also assigned to
        ``nlp.max_length`` on every call (a side effect on the shared pipeline).

    Returns
    -------
    list of str
        Lemmas of the retained tokens, in document order.
    """
    # Substrings that disqualify a token when found anywhere in its text.
    unwanted_fragments = ('\n', '’', ')', '\xa0', '\r', '\t', '\v', '@', 'https')
    # Set lookup avoids scanning the whole stop-word list for every token.
    excluded_lemmas = set(my_stop_words)

    nlp.max_length = max_len
    # Slicing is a no-op for texts that are already short enough.
    text = text[:max_len]
    doc = nlp(text.lower(), disable=["parser", "tagger", "ner"])

    kept_lemmas = []
    for token in doc:
        if token.is_stop or token.is_punct or token.like_num:
            continue
        token_text = token.text
        if len(token_text) <= 3:
            continue
        if any(fragment in token_text for fragment in unwanted_fragments):
            continue
        lemma = token.lemma_
        if lemma in excluded_lemmas:
            continue
        # Store the lemmatized form rather than the surface form.
        kept_lemmas.append(lemma)
    return kept_lemmas

In [3]:
# Read the CSV export into a DataFrame; the cells below read its `abstract`
# column. The path is relative to the notebook's working directory.
wos_2010 = pd.read_csv("../data_files_USA/data_2010_US.csv")

In [14]:
# Accumulator for the cleaned abstracts: one list of lemmas per processed row.
# Re-run this cell before re-running the loop below, otherwise entries are
# appended a second time.
cleaned_texts = []

In [15]:
# Clean every abstract in the frame. Rows whose abstract is not a string
# (presumably missing values, which pandas loads as NaN) are skipped
# explicitly. A broad `except AttributeError` would also hide genuine errors
# raised inside clean_text. No `.lower()` is needed here because clean_text
# lowercases its input itself.
for abstract in wos_2010["abstract"]:
    if not isinstance(abstract, str):
        continue
    cleaned_texts.append(clean_text(abstract))

In [16]:
# Drop the reference to the DataFrame; only `cleaned_texts` is used from here
# on, so the frame's memory can be reclaimed.
del wos_2010

In [17]:
# Maps each lemma to the number of times it occurs across all cleaned texts.
word_count = {}

In [18]:
# Tally how often each lemma occurs across all cleaned texts.
# `dict.get` supplies 0 for unseen lemmas, so no separate initialisation
# branch (and no second membership test) is needed.
for text in cleaned_texts:
    for word in text:
        word_count[word] = word_count.get(word, 0) + 1

In [19]:
import operator

# Order the (lemma, count) pairs from most to least frequent, so that slicing
# the front of the list yields the top words. The default ascending sort would
# put the rarest lemmas first.
sorted_words = sorted(word_count.items(), key=operator.itemgetter(1), reverse=True)

In [21]:
# Print the lemma of each of the first 100 entries of `sorted_words` on a
# single space-separated line; the count is not shown.
for word, _count in sorted_words[:100]:
    print(word, end=' ')

study result patient cell high model increase datum method effect analysis time base level associate control group include compare system treatment rate protein suggest find provide risk test factor gene change show present disease report measure response year identify function cancer conclusion expression significant process type activity similar sample significantly large determine demonstrate clinical reduce different case human follow structure observe state mouse examine relate right design population approach role difference indicate health range elsevier induce surface condition potential research decrease specific development image signal species evaluate outcome reserve important mean mechanism low region improve numb individual child develope value 