In [17]:
from pymongo import MongoClient
import pandas as pd
import en_core_web_sm
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.externals import joblib
import spacy
import re
import string
from multiprocessing import Pool
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# The sentence database

The sentence database will have one doc per sentence, which has the following structure:

        {
        'sentence': sentence,
        'paper_id' : id from origin paper in paper DB,
        'score': sentence_score predicted by classification model,
        'keywords: list of k_important words
        'similar_sentences: ID1, ID2, ...
        'type_entity': [type, word]
        }
        
__To do__:  
1. Calculate features for all sentences.
2. Calculate score for each sentence.
4. Construct list of keywords in sentence.
5. construct list of entity type for each sentence.

In [3]:
client = MongoClient()
db = client.lingbuzz

In [None]:
# db.create_collection('sentences')
# db.create_collection('keywords')

In [4]:
papers = db.get_collection('papers')
sentences = db.get_collection('sentences')
keywords = db.get_collection('keywords')

### Calculate scores

In [19]:
df_papers = pd.DataFrame(columns=['paperID', 'paper'])
for doc in papers.find({'paper':{'$exists': True}}):
    df_papers = df_papers.append({'paperID': doc['_id'], 'paper': doc['paper']}, ignore_index = True)

In [21]:
df_papers.head()

Unnamed: 0,paperID,paper
0,598b44c407d7df07719383e2,ANALYTIC PASSIVES IN CZECH Ludmila Veselovs...
1,598b44c407d7df07719383e5,UNIVERSAL DP-ANALYSIS IN ARTICLELESS LANGUAGE:...
2,598b44c407d7df07719383e8,Strong Pronominals in ASL and LSF* Philippe ...
3,598b44c407d7df07719383f0,THE UNIVERSITY OF CHICAGO INFLECTIONAL DEPEND...
4,598b44c407d7df07719383fc,"Multiple Sluicing, Scope, and Superiority: Con..."


In [15]:
nlp = spacy.load('en_core_web_sm')
top_k_words = joblib.load('top_k_words')
authors = joblib.load('authors')

In [146]:
def count_k_important(sent):
    count = 0
    keywords = []
    for w in sent:
        if w.lower_ in top_k_words:
            count+=1
            keywords.append(w.lower_)
    return count, keywords

def eliminate_non_english_words(s):
    """takes list of words and eliminates all words that contain non-english characters, digits or punctuation"""
    english_words = []
    for word in s:
        if word.lower() in authors:
            english_words.append(word)
        else: 
            try:
                word.encode(encoding='utf-8').decode('ascii')
                # if re.sub('-', '', word).isalpha():
                    # english_words.append(re.sub('[%s]' % re.escape(string.punctuation), '', word))
                word = re.sub('[%s]' % re.escape(string.punctuation), '', word)
                if word.isalpha():
                    english_words.append(word) 
            except UnicodeDecodeError:
                pass
    return ' '.join(english_words)

def calculate_named_entities(sent):
    count = 0
    entities = []
    for ent in sent.ents:
        count+=1
        entities.append((ent.label_, ent.text))
    return count, entities

def calculate_pos(sent):
    n = 0
    v = 0
    a = 0
    for w in sent:
        if w.pos_ == 'VERB':
            v+=1
        if w.pos_ == 'ADJ':
            a += 1
        if w.pos_ == 'NOUN':
            n+=1
    return n, v, a

def calculate_upper(sent):
    counter = -1
    if sent[0].prefix_.islower():
        return counter
    else:
        for w in sent:
            if not w.is_lower:
                counter += 1
        return counter    

def calculate_and_to_db(df):
    """takes a df, calculates features, score, tfidf-vector and stores in MongoDB"""
    for index, row in df.iterrows():
        posi = 0
        for sent in nlp(row['paper']).sents:
            _id = sentences.insert({'sentence': str(sent), 'paperID': row['paperID']})
            out = [len(list(sent))]
            upper = calculate_upper(sent)
            sent = nlp(eliminate_non_english_words(str(sent).split()))
            named_entities, type_entity = calculate_named_entities(sent)
            k_important, keywords = count_k_important(sent)
            pos = 100/len(list(nlp(row['paper']).sents))*posi
            nouns, verbs, adjectives = calculate_pos(sent)
            out+=[named_entities, k_important, pos, upper, nouns, verbs, adjectives]
            # probability of being informative
            score = scorer.predict_proba(out)[0][1]
            sentences.update_one({'_id': _id}, {'$set': {'type_entity': type_entity, 'keywords': keywords, 
                                                                  'score': score}})
            posi += 1
            
            # no match on sentence because I have two different instances of sentence.
            # drop all docs. retrieve the _id when doc is created and match on that one.
            
            
num_partitions = 3 #number of partitions to split dataframe
num_cores = 3 #number of cores on your machine

def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df


In [150]:
parallelize_dataframe(df_papers, calculate_and_to_db)

  "MongoClient opened before fork. Create MongoClient "
  "MongoClient opened before fork. Create MongoClient "
  "MongoClient opened before fork. Create MongoClient "


Process ForkPoolWorker-2:
Traceback (most recent call last):
  File "/home/aleksandra/anaconda3/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/aleksandra/anaconda3/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "/home/aleksandra/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/aleksandra/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "<ipython-input-146-45d5c507bafd>", line 70, in calculate_and_to_db
    pos = 100/len(list(nlp(row['paper']).sents))*posi
  File "/home/aleksandra/anaconda3/lib/python3.6/site-packages/spacy/language.py", line 341, in __call__
    doc = self.make_doc(text)
  File "/home/aleksandra/anaconda3/lib/python3.6/site-packages/spacy/language.py", line 315, in <lambda>
    self.make_doc = lambda text: self.tokenizer(text)


KeyboardInterrupt: 

Process ForkPoolWorker-3:
Traceback (most recent call last):
  File "/home/aleksandra/anaconda3/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/aleksandra/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/aleksandra/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/aleksandra/anaconda3/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "<ipython-input-146-45d5c507bafd>", line 70, in calculate_and_to_db
    pos = 100/len(list(nlp(row['paper']).sents))*posi
  File "/home/aleksandra/anaconda3/lib/python3.6/site-packages/spacy/language.py", line 350, in __call__
    proc(doc)
  File "spacy/syntax/parser.pyx", line 205, in spacy.syntax.parser.Parser.__call__ (spacy/syntax/parser.cpp:7682)
KeyboardInterrupt
Process ForkPoolWorker-1:
Traceback (most r

In [51]:
# The classification model that gives probabilities
scorer = joblib.load('scorer')

In [88]:
df_papers.head(2)

Unnamed: 0,paperID,paper
0,598b44c407d7df07719383e2,ANALYTIC PASSIVES IN CZECH Ludmila Veselovs...
1,598b44c407d7df07719383e5,UNIVERSAL DP-ANALYSIS IN ARTICLELESS LANGUAGE:...


In [None]:
calculate_and_to_db(df_papers.head(1))

In [151]:
for doc in sentences.find()[:5]:
    print(doc)

{'_id': ObjectId('59a85acdb18b146ddb84ff2b'), 'sentence': 'To Appear in H. Harley ed.', 'paperID': ObjectId('598b44c407d7df0771938b4e'), 'type_entity': [['ORG', 'Harley']], 'keywords': ['appear', 'h', 'harley', 'ed'], 'score': 0.9812410356893851}
{'_id': ObjectId('59a85aceb18b146ddb84ff2c'), 'sentence': 'The Proceedings of the Penn/MIT Workshop on Aspect, Argument  Structure, and Events, May 1997, MITWPL  Voice Systems and the Syntax/Morphology Interface\x00  David Embick, University of Pennsylvania/MIT  1  ', 'paperID': ObjectId('598b44c407d7df0771938b4e'), 'type_entity': [['ORG', 'the PennMIT Workshop'], ['ORG', 'Aspect Argument Structure'], ['ORG', 'David Embick University of PennsylvaniaMIT']], 'keywords': ['proceedings', 'workshop', 'aspect', 'argument', 'structure', 'events', 'mitwpl', 'voice', 'systems', 'syntaxmorphology', 'david', 'embick', 'university'], 'score': 0.11010114041868405}
{'_id': ObjectId('59a85aceb18b146ddc84ff2b'), 'sentence': 'Polish Stress: looking for phoneti