In [148]:
import pickle
from collections import Counter
from nltk.corpus import stopwords
import nltk
import numpy as np
from statistics import mean
import scipy.stats
import nltk
from nltk.tag import pos_tag 
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import json
import sklearn
from sklearn.linear_model import LogisticRegression

eng_stopwords = set(stopwords.words('english'))

DATA_DIR = "../../../../data/"
OUTPUT_DIR = "outputs/"

In [115]:
# Throughout the rest of the code, it is assumed that the keys of the following dictionary
# are global variables, available to freely use. These global variables are modified only in this block. 
json_files = {'abstract_sentences': "AbstractSentences2007.json", 
                'full_text_sentences': "FullTextSentences2007.json", 
                'labels': "labels.json",
                'train_indices': 'train_indices.json',
                'test_indices': 'test_indices.json'}

for varname, j in json_files.items():
    qualified_name = DATA_DIR + j
    file = open(qualified_name, "rb")
    exec(varname + " = json.load(file)")
    
relevant_file_numbers = [x[0] for x in labels]

full_text_sentences = np.array(full_text_sentences)

In [53]:
def keep_word(word):
    return word not in eng_stopwords and word != ',' and word != '.' and word != '\n'

In [54]:
def get_wc_map(full_text_sentences):
    word_counts_map = []
    for document in full_text_sentences:
        cnt = Counter()
        for sentence in document:
            for word in sentence:
                word = word.lower()
                if keep_word(word):
                    cnt[word] += 1
            word_counts_map.append(cnt)
    return word_counts_map

word_counts_map = get_wc_map(full_text_sentences)

In [94]:
def get_doc_scores(full_text_sentences, word_counts_map):
    document_sentence_scores = [] # List of maps from sentence_id to score.
    
    for i, document in enumerate(full_text_sentences):
        sentence_scores = {} # Map for this document.
        document_word_counts = word_counts_map[i] 
        num_doc_words = sum(document_word_counts.values())

        for sentence_id, sentence in enumerate(document):
            sentence_word_freq_sum = 0
            num_words_in_sentence = 0

            for word in sentence:
                word = word.lower()
                if keep_word(word):
                    word_freq = document_word_counts[word] / num_doc_words
                    sentence_word_freq_sum += word_freq
                    num_words_in_sentence += 1

            sentence_score = sentence_word_freq_sum / num_words_in_sentence if num_words_in_sentence != 0 else 0
            sentence_scores[sentence_id] = sentence_score 

        document_sentence_scores.append(sentence_scores)
    return document_sentence_scores

document_sentence_scores = get_doc_scores(full_text_sentences, word_counts_map)

# Experimenting

In [56]:
def sentence_position(document_num, sentence_num):
    return sentence_num / len(full_text_sentences[document_num])

In [57]:
def sentence_length(document_num, sentence_num, mean_sent_length=None, std_dev=5):
    sentence = full_text_sentences[document_num][sentence_num]
    return len(sentence)

In [58]:
def proper_noun(document_num, sentence_num):
    sentence = full_text_sentences[document_num][sentence_num]
    tagged_sent = pos_tag(sentence)
    propernouns = [word for word, pos in tagged_sent if pos == 'NNP']    
    return len(propernouns)

In [102]:
def sentence_freq_score(document_num, sentence_num):
    score = 1000 * document_sentence_scores[document_num][sentence_num]
    return score

In [103]:
features_functions = [sentence_position, sentence_length, proper_noun, sentence_freq_score]

# Neural Net

# Labels

In [116]:
print(document_sentence_scores[28])

file_numbers = [x[0] for x in labels]

# Make sure these are regular lists and not numpy arrays. Things will break if they are numpy arrays.
assert type(train_indices) == list
assert type(test_indices) == list

# print(train_indices)
# print(test_indices)

{0: 0.0, 1: 0.0002698327037236913, 2: 0.00030581039755351685, 3: 0.0012232415902140672, 4: 0.001146788990825688, 5: 0.0007312857332801489, 6: 0.00012742099898063202, 7: 0.0008737439930100481, 8: 0.003475118154017237, 9: 0.0005096839959225281, 10: 6.371049949031601e-05, 11: 0.00027800945232137893, 12: 0.0019113149847094803, 13: 0.0, 14: 0.00019113149847094801, 15: 0.0001529051987767584, 16: 0.0, 17: 0.0, 18: 0.0012592192840438928, 19: 0.0, 20: 0.0004704775346977182, 21: 0.00016989466530750936, 22: 0.0005096839959225281, 23: 0.0015290519877675841, 24: 0.00191131498470948, 25: 0.0011467889908256881, 26: 0.0015290519877675841, 27: 0.002378525314305131, 28: 0.00038226299694189603, 29: 0.00218435998252512, 30: 0.0, 31: 0.0, 32: 0.00021843599825251202, 33: 0.0, 34: 0.0006689602446483181, 35: 0.0024464831804281344, 36: 0.0007645259938837921, 37: 0.0011761938367442955, 38: 0.0012742099898063201, 39: 0.0, 40: 9.556574923547401e-05, 41: 0.0061162079510703364, 42: 0.0, 43: 0.0, 44: 0.0001911314984

In [121]:
# Create feature matrix
def create_ft_matrix(test=True, number=10):
    # Creates an X matrix with the train index file numbers' sentences appearing FIRST<
    # followed by test index file numbers' sentences.
    X = np.zeros((1, 4))
    
    for i in train_indices + test_indices:
        document = full_text_sentences[i]
        for j, sentence in enumerate(document):
            X = np.vstack([X, [function(i, j) for function in features_functions]])
    X = X[1:]
    return X

def get_corr_labels(test=True, number=10):
    relevant_file_numbers = train_indices + test_indices
    corr_labels = []
    
    for file_num, labels_list in labels:
        if file_num in relevant_file_numbers:
            corr_labels.append(labels_list)
    
    return corr_labels

def get_num_sentences(file_numbers):
    total_num_sentences = 0
    for i in file_numbers:
        sentences = full_text_sentences[i]
        total_num_sentences += len(sentences) 
    return total_num_sentences
        

def flatten(lst):
    flattened_list = []
    for sublist in lst:
        for item in sublist:
            flattened_list.append(item)
    return flattened_list

In [126]:
X = create_ft_matrix()
corr_labels = flatten(get_corr_labels())
y = corr_labels

In [124]:
assert X.shape[0] == len(corr_labels)

print(X.shape)

(1352, 4)


In [132]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Number of sentences (i.e. rows) to use for training.
nts = get_num_sentences(train_indices)

X_train, X_test, y_train, y_test = X[0:nts], X[nts:], y[0:nts], y[nts:]

threshold = 0.5
model = LogisticRegression().fit(X_train, y_train)
probabilities = model.predict_proba(X_test)
predictions = [1 if x[1] > 0.15 else 0 for x in probabilities]
print("Accuracy: ", accuracy_score(y_test, predictions))

Accuracy:  0.8238636363636364


In [133]:
print(X_train.shape, X_test.shape, len(y_train), len(y_test))

(648, 4) (704, 4) 648 704


In [137]:
assert len(probabilities) == get_num_sentences(test_indices) # Must have a probability for each sentence.

In [139]:
probabilities[0]

array([0.55162852, 0.44837148])

In [142]:
def generate_output(probabilities):
    output = dict()
    
    used_so_far = 0
    for test_index in test_indices:
        doc = full_text_sentences[test_index]
        ns = len(doc) # number of sentences in this document
        
        output[test_index] = probabilities[used_so_far : used_so_far + ns].tolist()
        used_so_far += ns
    return output

output = generate_output(probabilities)

In [149]:
def save_output(output):
    json.dump(output, open(OUTPUT_DIR + "baseline_probabilities.json", 'w'))

save_output(output)