In [56]:
import os
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from string import punctuation
import numpy as np
import random
from sklearn.model_selection import cross_val_score

DATASET = "lpi"

abstractFile = "abstracts_data/pubmed_ecology_abstracts.txt"
with open(abstractFile) as f:
    negatives = ' '.join([line.strip() for line in f])
negatives = [paragraph for paragraph in negatives.split('  ') if len(paragraph) > 300]
random.shuffle(negatives)
len(negatives)

35938

In [57]:
if DATASET.lower() == "predicts":
    df = pd.read_csv('abstracts_data/unique_id_fields.csv')
    positives = list(df['Abstract'].dropna())
    positives = [item.lower().replace('abstract', '').replace('unavailable','').strip() for item in positives]
elif DATASET.lower() == "lpi":
    with open("abstracts_data/lpi_abstracts_updated.txt") as f:
        positives = []
        for line in f:
            if line.strip():
                positives.append(line.strip().lower().replace("abstract",""))
len(positives)

1240

In [58]:
def split_data(ids, docs, successes, perc_train, shuffle_data):
    """Split the document and classification data into training and testing sets."""

    ndx_shuffle = list(range(len(docs)))
    # Shuffle lists
    if shuffle_data:
        random.shuffle(ndx_shuffle)

    num_train = int(len(docs) * perc_train)
    docs_train = [docs[i].lower() for i in ndx_shuffle[:num_train]]
    docs_test = [docs[i].lower() for i in ndx_shuffle[num_train:]]
    successes_train = [successes[i] for i in ndx_shuffle[:num_train]]
    successes_test = [successes[i] for i in ndx_shuffle[num_train:]]
    ids_train = [ids[i] for i in ndx_shuffle[:num_train]]
    ids_test = [ids[i] for i in ndx_shuffle[num_train:]]

    return ids_train, ids_test, docs_train, docs_test, successes_train, successes_test


def rm_punctuation(string, replacement='', exclude="'-'"):
    """Remove punctuation from an input string """
    string = string.replace('-', ' ')  # Always replace hyphen with space
    for p in set(list(punctuation)) - set(list(exclude)):
        string = string.replace(p, replacement)

    string = ' '.join(string.split())  # Remove excess whitespace
    return string

def train_classifier(
        docs_success,
        docs_background,
        perc_train=1.0,
        shuffle_data=True,
        ngram_range=(1, 1),
        filter_params={},
        limit_data=None,
        cross_val_folds=10
):
    """
    Train a classifier given the classification data in input_file.

    @param perc_train: percentage of data points to train from input file (between 0 and 1). 0.4 by default.
    @param shuffle_data: whether to shuffle the input data before training. True by default.
    @return: the keywords and their corresponding coefficients as ndarrays, sorted by coefficient value.
    """

    docs = docs_success + docs_background
    successes = [1 for i in range(len(docs_success))] + [0 for j in range(len(docs_background))]
    ids = range(len(docs))

    # Remove punctuation from docs
    docs = [rm_punctuation(str(_)) for _ in docs]

    # Split data into training and test sets
    ids_train, ids_test, docs_train, docs_test, y_train, y_test = \
        split_data(ids, docs, successes, perc_train, shuffle_data)

    # Initialise vectorizer to convert text documents into matrix of token counts
    vect = CountVectorizer(min_df=2, ngram_range=ngram_range, stop_words='english')
    # Extract features from training dataset using sparse vectorizer
    X_train = vect.fit_transform(docs_train)
    print("LENGTH OF TEST DOCS",len(docs_test))

    # Logistic regression classifier
    lr_classifier = LogisticRegression(penalty='l2')
    if cross_val_folds > 0:
        cv_precision = cross_val_score(lr_classifier, X_train, y_train, cv=cross_val_folds, scoring='precision')
        cv_recall = cross_val_score(lr_classifier, X_train, y_train, cv=cross_val_folds, scoring='recall')
        print("%d-fold cross validation scores:\nPrecision:%.4f\nRecall:%.4f" %
              (cross_val_folds, cv_precision.mean(), cv_recall.mean()))
        pd.DataFrame([dict(precision=p, recal=r) for p, r in zip(cv_precision, cv_recall)
                    ]).to_csv("cross_validation_metrics.csv", index=False)

    lr_classifier = LogisticRegression(penalty='l2').fit(X_train, y_train)
    
    def get_top_feats(feature_names, classifier, plot=True, N=10, bar_height=0.5):
        """Sort keywords by their coefficients"""
        sorted_feats = np.argsort(classifier.coef_[0])  # Sorted by coefficients (descending)
        sorted_coeffs = classifier.coef_[0][sorted_feats]

        return sorted_feats, sorted_coeffs

    features = np.array(vect.get_feature_names())
    feat_ids, coeffs = get_top_feats(features, lr_classifier, plot=False, N=20)

    if len(docs_test) > 0:
        X_test = vect.transform(docs_test)
        # Predict test data
        y_test_predicted = lr_classifier.predict(X_test)

        print('Classifier has precision %.3f and recall %.3f' % \
            (metrics.precision_score(y_test, y_test_predicted),
            metrics.recall_score(y_test, y_test_predicted)))

        # Examples of misclassified positives/negatives
        positive_misses = [docs_test[i] for i in range(len(docs_test)) if (y_test[i] and not y_test_predicted[i])]
        negative_misses = [docs_test[i] for i in range(len(docs_test)) if (not y_test[i] and y_test_predicted[i])]

        print('\nSome positive misses:')

        for i in range(5):
            try:
                print(str(i+1) + ') ' + positive_misses[i] + "\n")
            except IndexError:
                break

        print('\nSome negative misses:')
        for i in range(5):
            try:
                print(str(i+1) + ') ' + negative_misses[i] + "\n")
            except IndexError:
                break
        print('')

    return features[feat_ids], coeffs, lr_classifier, vect

In [59]:
features, coefficients, model, vect = train_classifier(
    positives,
    negatives[:2000],
    perc_train=1.0,
    ngram_range=(1, 3),
    cross_val_folds=10
)

LENGTH OF TEST DOCS 0
10-fold cross validation scores:
Precision:0.9483
Recall:0.9556


In [60]:
print("\nBottom features")
for f,c in list(zip(features,coefficients))[:10]:
    print(f,c)
    
print("\nTop features")
for f,c in list(zip(features,coefficients))[-10:]:
    print(f,c)


Bottom features
water -0.912787291803
host -0.740907633041
ecological -0.692803071195
marine -0.626999316566
reported -0.611649865936
concentrations -0.608082138246
health -0.596875865228
development -0.590329188653
paper -0.589969479913
genetic -0.58407576602

Top features
native 0.482693777895
spawning 0.499944187609
trends 0.501204318287
fishing 0.509631048264
habitat 0.51987665516
conservation 0.534402804678
nesting 0.546626345317
population 0.593140078847
decline 0.632559515068
abundance 0.687061499621


In [61]:
# Save model
from sklearn.externals import joblib

joblib.dump(vect, "models/%s_LR_vectorizer.pkl" % (DATASET.lower()), compress=1)
joblib.dump(model, "models/%s_LR_model.pkl" % (DATASET.lower()), compress=1)

['models/lpi_LR_model.pkl']

In [23]:
joblib.load("models/%s_LR_model.pkl" % (DATASET.lower()))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
plos_abstracts = pd.read_csv("PLoS_One/all_abstracts.csv")

In [34]:
plos_abstracts["lpi_probability"] = model.predict_proba(vect.transform(
        [rm_punctuation(str(a)) for a in plos_abstracts.abstract.values]))[:,1]
plos_abstracts.head()

Unnamed: 0.1,Unnamed: 0,abstract,file,lpi_probability
0,0,Invasive candidiases are life threatening oppo...,PLoS_One_2013_Jul_23_8(7)_e69664.txt,0.268799
1,1,Multiple sclerosis (MS) is a chronic inflammat...,PLoS_One_2011_Oct_20_6(10)_e26262.txt,0.000257
2,2,There is a strong ongoing debate on the catego...,PLoS_One_2012_Feb_17_7(2)_e30727.txt,0.002204
3,3,Assay of the T cell response to antigens of My...,PLoS_One_2013_Aug_8_8(8)_e71351.txt,0.006375
4,4,Motor disability is one of the most common def...,PLoS_One_2014_Jan_8_9(1)_e84729.txt,0.010826


In [36]:
plos_abstracts["doi"] = ["http://dx.doi.org/10.1371/journal.pone.%s" % (filename.split("e")[-1].split(".")[0])
                         for filename in plos_abstracts.file.values]
plos_abstracts.head()

Unnamed: 0.1,Unnamed: 0,abstract,file,lpi_probability,doi
0,0,Invasive candidiases are life threatening oppo...,PLoS_One_2013_Jul_23_8(7)_e69664.txt,0.268799,http://dx.doi.org/10.1371/journal.pone.69664
1,1,Multiple sclerosis (MS) is a chronic inflammat...,PLoS_One_2011_Oct_20_6(10)_e26262.txt,0.000257,http://dx.doi.org/10.1371/journal.pone.26262
2,2,There is a strong ongoing debate on the catego...,PLoS_One_2012_Feb_17_7(2)_e30727.txt,0.002204,http://dx.doi.org/10.1371/journal.pone.30727
3,3,Assay of the T cell response to antigens of My...,PLoS_One_2013_Aug_8_8(8)_e71351.txt,0.006375,http://dx.doi.org/10.1371/journal.pone.71351
4,4,Motor disability is one of the most common def...,PLoS_One_2014_Jan_8_9(1)_e84729.txt,0.010826,http://dx.doi.org/10.1371/journal.pone.84729


In [41]:
plos_abstracts[["file", "lpi_probability"]].to_csv("PLoS_abstract_probabilities.csv", index=False)

In [44]:
plos_abstracts.sort_values("lpi_probability").abstract.head(20)

35119     Proteins released in the blood-stream reflect ...
78746     Most biomedical research projects are planned ...
35794     Water transfer engineering has been used succe...
91099     The generation of leachate remains an inevitab...
28008     Monoclonal antibodies (mAbs) are among the lar...
47969     Tyrosine kinase inhibitors (TKIs) are nowadays...
59787     B-cell lymphomas are a species of lymphomas de...
13580     The principal function of red blood cells (RBC...
100234    The development of microfluidic platforms for ...
103882    The viral proteins hijack cellular machinery b...
108608    The insufficient availability of tissue donors...
57755     Successful control of diabetes greatly depends...
89017     On March 11, 2011, an earthquake of magnitude ...
38968     Mine water remains one of the major problems o...
13001     The association of Reactive Oxygen Species (RO...
54452     Apoptosis is an intricate pathway triggered by...
90580     Small chain alkylbenzenes are 

In [45]:
plos_abstracts.sort_values("lpi_probability", ascending=False).abstract.head(20)

71024     The hangul, or Kashmir red deer (Cervus elaphu...
76247     Non-breeding individuals can represent a subst...
99664     Maintaining or increasing the population numbe...
68194     The effects of predation on ungulate populatio...
44837     Typically, fish species may be split into popu...
8035      High fishing pressure can affect the size and ...
116973    Over the past 2 centuries, several fur seal an...
73822     Wetland quantity and quality are a primary con...
22633     In summer and autumn, schools of Atlantic blue...
52111     Estimating and monitoring bird populations are...
86607     The reproductive biology of Atlantic bluefin t...
26404     Knowledge of population and distribution is cr...
6932      Animal distribution shifts in relation to glob...
31089     Observed in many taxa, spatial population sync...
39486     Baltic seals are recovering after a population...
87398     Wildlife populations can compete with humans o...
74183     A central tenet of population 