In [2]:
import os
import string
import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

In [3]:
def process_text(documents, stop_words):
    """
    Clean, tokenize and stem a collection of documents.

    For each document the function:
    - Deletes ASCII punctuation and the digits 0-9. Characters are removed,
      not replaced by a space, so "x-ray" becomes "xray".
    - Tokenizes the result with NLTK's word_tokenize.
    - Drops tokens whose lower-cased form is in stop_words.
    - Stems the remaining tokens with the English Snowball stemmer.
    - Drops tokens whose stem is in stop_words.

    Stop words are checked both before and after stemming. A stop-word list
    normally holds unstemmed words, and stemming changes many of them
    (e.g. "because" -> "becaus"), so checking only the stem would let those
    words through.

    Parameters
    ----------
    documents : iterable of str
        The raw documents. Iterated in order; positional indexing is not
        required.
    stop_words : collection of str
        Words to discard. Membership is tested for every token, so a set is
        the sensible choice.

    Returns
    -------
    processed_text : list of str
        One string per document: its surviving stems joined by single spaces.
    processed_tokens : list of list of str
        One list of surviving stems per document, in the same order.
    """
    # Both objects are loop-invariant, so build them once instead of once per
    # document (table) or once per token (stemmer).
    removal_table = str.maketrans("", "", string.punctuation + '0123456789')
    stemmer = SnowballStemmer("english")

    processed_tokens = []
    processed_text = []
    for document in documents:
        words = word_tokenize(document.translate(removal_table))

        tokens = []
        for word in words:
            # Filter on the surface form first: the stem of a stop word is
            # often not itself in the stop-word list.
            if word.lower() in stop_words:
                continue
            stem = stemmer.stem(word)
            # Keep the post-stemming check so anything whose stem collides
            # with a stop word is still removed.
            if stem not in stop_words:
                tokens.append(stem)

        processed_tokens.append(tokens)
        processed_text.append(' '.join(tokens))

    return processed_text, processed_tokens

In [4]:
# Load the article table. The code below reads the 'ArticleTitle' and
# 'AbstractText' columns; a later cell also reads 'PMID'.
df = pd.read_csv('abstract_table.csv')

#create titles document
titles = df['ArticleTitle']
#create abstract document (rows without an abstract are dropped here only)
text = df['AbstractText'].dropna()
#combine these two into one string per row of df
####May want to do something about weighting but only if there's time
# Missing titles/abstracts become '' so every entry is a str (NaN + str would
# yield a float NaN). The ' ' separator keeps the last title word from fusing
# with the first abstract word once punctuation is stripped downstream.
total_text = (
    df['ArticleTitle'].fillna('') + ' ' + df['AbstractText'].fillna('')
).tolist()

In [5]:
# Build the set of NLTK English stop words, then clean, tokenize and stem
# every combined title/abstract document.
stop_words = set(stopwords.words('english'))
processed_text, processed_tokens = process_text(
    documents=total_text,
    stop_words=stop_words,
)

In [6]:
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer

# Term-count matrix: one row per cleaned document, one column per term.
# min_df=4 discards terms that appear in fewer than four documents.
tf_vectorizer = CountVectorizer(min_df=4)
tf = tf_vectorizer.fit_transform(raw_documents=processed_text)

In [7]:
# Bundle everything needed to reuse the language model:
#   'PMID'  - the PMID column of df (one entry per row of df)
#   'pdf'   - term counts with each row L1-normalized, so a document's term
#             weights sum to 1 (presumably a row with no retained terms stays
#             all-zero - verify if downstream code assumes rows sum to 1)
#   'terms' - list of vocabulary terms, in the column order of 'pdf'
lm_dict = {}
lm_dict['PMID'] = df['PMID']
lm_dict['pdf'] = normalize(tf, axis=1, norm='l1')
# get_feature_names() was removed in scikit-learn 1.2. Prefer the replacement
# and convert its array to a list so 'terms' keeps the type it had before;
# fall back to the old method on versions that predate the replacement.
lm_dict['terms'] = (
    tf_vectorizer.get_feature_names_out().tolist()
    if hasattr(tf_vectorizer, 'get_feature_names_out')
    else tf_vectorizer.get_feature_names()
)

In [8]:
import pickle

# Persist the language-model dictionary to disk in binary pickle format.
with open('language_model.pickle', 'wb') as pickle_file:
    pickle.dump(lm_dict, pickle_file)