In [1]:
import os
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from string import punctuation
import numpy as np
import random

DATASET = "lpi"

sourceDir = 'PLoS_One/'
fileList = os.listdir(sourceDir)
len(fileList)

abstractFile = "abstracts_data/pubmed_ecology_abstracts.txt"
with open(abstractFile) as f:
    negatives = ' '.join([line.strip() for line in f])
negatives = [paragraph for paragraph in negatives.split('  ') if len(paragraph) > 300]
random.shuffle(negatives)
len(negatives)

35938

In [2]:
if DATASET.lower() == "predicts":
    df = pd.read_csv('abstracts_data/unique_id_fields.csv')
    positives = list(df['Abstract'].dropna())
    positives = [item.lower().replace('abstract', '').replace('unavailable','').strip() for item in positives]
elif DATASET.lower() == "lpi":
    with open("abstracts_data/lpi_abstracts.txt") as f:
        positives = []
        for line in f:
            if line.strip():
                positives.append(line.strip().lower().replace("abstract",""))
len(positives)

691

In [3]:
def split_data(ids, docs, successes, perc_train, shuffle_data):
    """Split the document and classification data into training and testing sets."""

    ndx_shuffle = list(range(len(docs)))
    # Shuffle lists
    if shuffle_data:
        random.shuffle(ndx_shuffle)

    num_train = int(len(docs) * perc_train)
    docs_train = [docs[i].lower() for i in ndx_shuffle[:num_train]]
    docs_test = [docs[i].lower() for i in ndx_shuffle[num_train:]]
    successes_train = [successes[i] for i in ndx_shuffle[:num_train]]
    successes_test = [successes[i] for i in ndx_shuffle[num_train:]]
    ids_train = [ids[i] for i in ndx_shuffle[:num_train]]
    ids_test = [ids[i] for i in ndx_shuffle[num_train:]]

    return ids_train, ids_test, docs_train, docs_test, successes_train, successes_test


def rm_punctuation(string, replacement='', exclude="'-'"):
    """Remove punctuation from an input string """
    string = string.replace('-', ' ')  # Always replace hyphen with space
    for p in set(list(punctuation)) - set(list(exclude)):
        string = string.replace(p, replacement)

    string = ' '.join(string.split())  # Remove excess whitespace
    return string

def train_classifier(
        docs_success,
        docs_background,
        perc_train=0.7,
        shuffle_data=True,
        ngram_range=(1, 1),
        filter_params={},
        limit_data=None
):
    """
    Train a classifier given the classification data in input_file.

    @param perc_train: percentage of data points to train from input file (between 0 and 1). 0.4 by default.
    @param shuffle_data: whether to shuffle the input data before training. True by default.
    @return: the keywords and their corresponding coefficients as ndarrays, sorted by coefficient value.
    """

    docs = docs_success + docs_background
    successes = [1 for i in range(len(docs_success))] + [0 for j in range(len(docs_background))]
    ids = range(len(docs))

    # Remove punctuation from docs
    docs = [rm_punctuation(str(_)) for _ in docs]

    # Split data into training and test sets
    ids_train, ids_test, docs_train, docs_test, y_train, y_test = \
        split_data(ids, docs, successes, perc_train, shuffle_data)

    # Initialise vectorizer to convert text documents into matrix of token counts
    vect = CountVectorizer(min_df=2, ngram_range=ngram_range, stop_words='english')
    # Extract features from training dataset using sparse vectorizer
    X_train = vect.fit_transform(docs_train)
    print("LENGTH OF TEST DOCS",len(docs_test))

    # Logistic regression classifier
    lr_classifier = LogisticRegression(penalty='l2').fit(X_train, y_train)

    def get_top_feats(feature_names, classifier, plot=True, N=10, bar_height=0.5):
        """Sort keywords by their coefficients"""
        sorted_feats = np.argsort(classifier.coef_[0])  # Sorted by coefficients (descending)
        sorted_coeffs = classifier.coef_[0][sorted_feats]

        return sorted_feats, sorted_coeffs

    features = np.array(vect.get_feature_names())
    feat_ids, coeffs = get_top_feats(features, lr_classifier, plot=False, N=20)

    if len(docs_test) > 0:
        X_test = vect.transform(docs_test)
        # Predict test data
        y_test_predicted = lr_classifier.predict(X_test)

        print('Classifier has precision %.3f and recall %.3f' % \
            (metrics.precision_score(y_test, y_test_predicted),
            metrics.recall_score(y_test, y_test_predicted)))

        # Examples of misclassified positives/negatives
        positive_misses = [docs_test[i] for i in range(len(docs_test)) if (y_test[i] and not y_test_predicted[i])]
        negative_misses = [docs_test[i] for i in range(len(docs_test)) if (not y_test[i] and y_test_predicted[i])]

        print('\nSome positive misses:')

        for i in range(5):
            try:
                print(str(i+1) + ') ' + positive_misses[i] + "\n")
            except IndexError:
                break

        print('\nSome negative misses:')
        for i in range(5):
            try:
                print(str(i+1) + ') ' + negative_misses[i] + "\n")
            except IndexError:
                break
        print('')

    return features[feat_ids], coeffs, lr_classifier, vect

In [4]:
features, coefficients, model, vect = train_classifier(
    positives,
    negatives[:2000],
    perc_train=.7,
    ngram_range=(1, 3)
)

LENGTH OF TEST DOCS 808
Classifier has precision 0.884 and recall 0.957

Some positive misses:
1) 1 the presenceabsence of a species at a particular site is the simplest form of data that can be collected during ecological field studies we used 13 years 1990 1990 of survey data to parameterize a stochastic patch occupancy model for a metapopulation of the yellow bellied marmot in colorado and investigated the significance of particular patches and the influence of site quality network characteristics and regional stochasticity on the metapopulation persistence 2 persistence of the yellow bellied marmot metapopulation was strongly dependent on the high quality colony sites and persistence probability was highly sensitive to small changes in the quality of these sites 3 a relatively small number of colony sites was ultimately responsible for the regional persistence however lower quality satellite sites also made a significant contribution to long term metapopulation persistence especial

In [5]:
print("\nBottom features")
for f,c in list(zip(features,coefficients))[:5]:
    print(f,c)
    
print("\nTop features")
for f,c in list(zip(features,coefficients))[-5:]:
    print(f,c)


Bottom features
water -1.11698966594
health -0.632240832625
paper -0.61989555873
analysis -0.614726106445
control -0.603707585186

Top features
2004 0.419978915629
year 0.467869514905
conservation 0.478065956089
population 0.594435780407
abundance 0.947052859406


In [8]:
import datetime
datetime.datetime.now().strftime("%Y%M%d-%H%M")

'20174917-1249'

In [14]:
# Save model
from sklearn.externals import joblib

joblib.dump(vect, "models/%s_LR_vectorizer.pkl" % (DATASET.lower()), compress=1)
joblib.dump(model, "models/%s_LR_model.pkl" % (DATASET.lower()), compress=1)

['models/lpi_LR_model.pkl']

In [16]:
joblib.load("models/%s_LR_model.pkl" % (DATASET.lower()))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
import pickle
with open("models/%s_LR_vectorizer.pkl" % (DATASET.lower()), 'wb') as f:
    pickle.dump(vect, f)
with open("models/%s_LR_model.pkl" % (DATASET.lower()), 'wb') as f:
    pickle.dump(model, f)

In [13]:
with open("models/%s_LR_model.pkl" % (DATASET.lower()), 'rb') as f:
    model2 = pickle.load(f)

In [10]:
model.predict_proba(vect.transform(["hello world"]))[:,1]

array([ 0.70930308])

In [31]:
def suggest(model,vect):
    suggestions = []
    for fileName in fileList[:1000]:
        with open(os.path.join(sourceDir,fileName)) as f:
            if f.readline().strip().lower()=='introduction':
                candidate = f.readline()
                if len(candidate)>50:
                    prediction = model.predict(vect.transform([rm_punctuation(str(candidate))]))
                    if prediction[0]:
                        suggestions.append(dict(file=fileName, abstract=candidate))
    return suggestions
suggestions = suggest(model,vect)
len(suggestions)


IOError: [Errno 2] No such file or directory: 'PLoS_One/all_abstracts.csv'

In [25]:
pd.DataFrame(suggestions).to_csv('suggested_{0}.csv'.format(DATASET.lower()))