In [146]:
import pickle
from collections import Counter
from nltk.corpus import stopwords
import nltk



# nltk.download('stopwords')

eng_stopwords = set(stopwords.words('english'))

In [147]:
pickle_files = {'abstract_sentences': "AbstractSentences2007.pkl", 
                'abstracts': "Abstracts2007.pkl",
               'full_text_sentences': "FullTextSentences2007.pkl", 
                'full_text': "FullTexts2007.pkl",
                'trained_vectors': "trainedVectors2007.pkl",
                'labels': "labels.pkl"}

p_abstract_sentences = open("FullTextSentences2007.pkl","rb")

for varname, p in pickle_files.items():
    pickled_opened_file = open(p, "rb")
    exec(varname + " = pickle.load(pickled_opened_file)")
    

In [148]:
def keep_word(word):
    return word not in eng_stopwords and word != ',' and word != '.' and word != '\n'

In [149]:
word_counts_map = []

for document in full_text:
    cnt = Counter()
    for word in document:
        word = word.lower()
        if keep_word(word):
            cnt[word] += 1
    word_counts_map.append(cnt)

In [150]:
document_sentence_scores = []
for i, document in enumerate(full_text_sentences):
    sentence_scores = {}
    document_word_counts = word_counts_map[i]
    num_doc_words = sum(document_word_counts.values())
    
    for sentence_id, sentence in enumerate(document):
        sentence_word_freq_sum = 0
        num_words_in_sentence = 0
        
        for word in sentence:
            word = word.lower()
            if keep_word(word):
                word_freq = document_word_counts[word] / num_doc_words
                sentence_word_freq_sum += word_freq
                num_words_in_sentence += 1
        
        sentence_score = sentence_word_freq_sum / num_words_in_sentence if num_words_in_sentence != 0 else 0
        sentence_scores[sentence_id] = sentence_score
        
    document_sentence_scores.append(sentence_scores)

In [151]:
def get_top_sentences(doc_id, N=5):
    sentence_scores_map = document_sentence_scores[doc_id]
    
    sentence_scores = [(idx,score) for idx, score in sentence_scores_map.items()]
    sentence_scores.sort(key = lambda x: x[1], reverse=True)
    
    sentence_scores = [(full_text_sentences[doc_id][idx], score) for idx, score in sentence_scores]
    
    return sentence_scores[:N]
    
    

# Experimenting

In [152]:
# def func(document_num, sentence_num):
#     return single_number

In [153]:
def sentence_position(document_num, sentence_num):
    return sentence_num / len(full_text_sentences[document_num])

In [154]:
from statistics import mean
import scipy.stats

def sentence_length(document_num, sentence_num, mean_sent_length=None, std_dev=5):
#     if not mean_sent_length:
#         mean_sent_length = mean([len(item) for item in full_text_sentences[document_num]])
#     sentence = full_text_sentences[document_num][sentence_num]
        
#     print(scipy.stats.norm(mean_sent_length, std_dev).pdf(len(sentence)))
    return len(sentence)

In [155]:
import nltk
from nltk.tag import pos_tag 

def proper_noun(document_num, sentence_num):
    sentence = full_text_sentences[document_num][sentence_num]
    tagged_sent = pos_tag(sentence)
    propernouns = [word for word, pos in tagged_sent if pos == 'NNP']
    
    return len(propernouns)

In [156]:
proper_noun(2, 1)

5

In [157]:
def sentence_freq_score(document_num, sentence_num):
    return 1000 * document_sentence_scores[document_num][sentence_num]

In [158]:
features_functions = [sentence_position, sentence_length, proper_noun, sentence_freq_score]

# Neural Net

In [159]:
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

1.11.0


In [160]:
# # Create feature matrix
# X = np.zeros((1, 4))
# for i, document in enumerate(full_text_sentences):
#     print(i)
#     for j, sentence in enumerate(document):
#         X = np.vstack([X, [function(i, j) for function in features_functions]])
# print(X)

In [161]:
# print(sum([len(document) for document in full_text_sentences]))

In [162]:
# X[1:]

In [163]:
# import pickle

# # Saving the objects:
# with open('feature_matrix.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
#     pickle.dump([X], f)

# Labels

In [164]:
labels = pickle.load(open('labels.pkl', 'rb'))
labels[0]

[0,
 [1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]]

In [165]:
file_numbers = [x[0] for x in labels]
file_numbers

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 109,
 110,
 111,
 112,
 114,
 115,
 116,
 117,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,

In [166]:
# Create feature matrix
test = True
def create_ft_matrix(test=True, number=10):
    X = np.zeros((1, 4))
    
    end = number if test else len(file_numbers)
    for i in file_numbers[:end]:
        document = full_text_sentences[i]
        for j, sentence in enumerate(document):
            X = np.vstack([X, [function(i, j) for function in features_functions]])
    X = X[1:]
    return X

def get_corr_labels(test=True, number=10):
    end = number if test else len(labels)
    corr_labels = [x[1] for x in labels[:end]]
    return corr_labels

In [167]:
X = create_ft_matrix(number=1000)
corr_labels = get_corr_labels(number=1000)

In [168]:
flat_list = []
for sublist in corr_labels:
    for item in sublist:
        flat_list.append(item)

print(X.shape)
print(len(flat_list))

import sklearn
from sklearn.linear_model import LogisticRegression

(38841, 4)
38841


In [169]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

y = flat_list
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
threshold = 0.5
model = LogisticRegression().fit(X_train, y_train)
probabilities = model.predict_proba(X_test)
predictions = [1 if x[1] > 0.15 else 0 for x in probabilities]
print("Accuracy: ", accuracy_score(y_test, predictions))

Accuracy:  0.8143785801634807


In [170]:
print(X_train.shape, X_test.shape, len(y_train), len(y_test))

(23304, 4) (15537, 4) 23304 15537


In [171]:
from nltk.translate.bleu_score import sentence_bleu
from pythonrouge.pythonrouge import Pythonrouge

def bleu(lst, target_sentence, weights=(0.25, 0.25, 0.25, 0.25)):
	sent_scores_map = dict()
	max_score = None
	best_sentence = None
	best_idx = None
	
	for i, sentence in enumerate(lst):
	    sentence = [sentence]
	    score = sentence_bleu(sentence, target_sentence, weights)
	    sent_scores_map[i] = score
	    if max_score is None or score > max_score:
	        max_score = score
	        best_sentence = sentence
	        best_idx = i
	return best_idx, sent_scores_map

def untokenize(lst):
	untokenized = []
	for item in lst:
		sentence = ' '.join(item)
		untokenized.append([sentence])
	return untokenized

def rouge(lst, target_sentence, weights=None):
	# The weights parameter is currently ignored
	untokenized_list = untokenize(lst) # list of lists
	target_sentence = [target_sentence]

	sent_scores_map = dict()
	max_score = None
	best_sentence = None
	best_idx = None

	for i, sentence in enumerate(untokenized_list):
		sentence = [[sentence]]
		rouge = Pythonrouge(summary_file_exist=False,
                    summary=target_sentence, reference=sentence,
                    n_gram=2, ROUGE_SU4=True, ROUGE_L=False,
                    recall_only=True, stemming=True, stopwords=True,
                    word_level=True, length_limit=True, length=50,
                    use_cf=False, cf=95, scoring_formula='average',
                    resampling=True, samples=1000, favor=True, p=0.5)
		score = rouge.calc_score()['ROUGE-1']
		sent_scores_map[i] = score
		if max_score is None or score > max_score:
			max_score = score
			best_sentence = sentence
			best_idx = i

	return best_idx, sent_scores_map

def f1_score(bleu_score, rouge_score):
	return float((2*(bleu_score*rouge_score))) / float((bleu_score + rouge_score))

In [172]:
# predictions = model.predict(X_test)

# print("Bleu: ", bleu(list(predictions), y_test))

reference = [['this', 'is', 'a', 'test'], ['this', 'is', 'test']]
candidate = ['this', 'is', 'a', 'test']

print(bleu(reference, candidate))
print()

(0, {0: 1.0, 1: 0.7071067811865475})



Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [173]:
predicted = model.predict(X_test)
print(len([p for p in predictions if p == 1]))
# print(y_test)

3103


In [180]:
# summaries = []
from functools import reduce
def document_indices(doc_id):
    prev_docs = full_text_sentences[0:doc_id]
    prev_docs_length = sum([len(x) for x in prev_docs])
    doc = full_text_sentences[doc_id]
    doc_len = len(doc)
    
    start = prev_docs_length
    end = start + doc_len
    
    return start, end

def flatten(lst):
    flattened_list = []
    for sublist in lst:
        for item in sublist:
            flattened_list.append(item)
    return flattened_list

def average(scores):
    return sum(scores) / len(scores)

def get_average_scores():
    bleu_logistic_scores = []
    bleu_label_scores = []
    rouge_logistic_scores = []
    rouge_label_scores = []
    for k, i in enumerate(file_numbers[:1000]):
        flattened_abstract_sentences = flatten(abstract_sentences[i])
    #     summaries.append(flattened_abstract_sentences)
    #     print(flattened_abstract_sentences)
        reference = [flattened_abstract_sentences]
        start, end = document_indices(i)
        candidate_logistic = flatten([full_text_sentences[i][index] for index in predictions[start:end]])
        candidate_labels = flatten([full_text_sentences[i][index] for index in labels[k][1]])
        best_idx, sent_map = bleu(reference, candidate_logistic)
        bleu_logistic_scores.append(sent_map[0])
        best_idx, sent_map = bleu(reference, candidate_labels)
        bleu_label_scores.append(sent_map[0])
        
#         best_idx, sent_map = rouge(reference, candidate_logistic)
#         rouge_label_scores.append(sent_map[0])
#         best_idx, sent_map = rouge(reference, candidate_labels)
#         rouge_label_scores.append(sent_map[0])

    return average(bleu_logistic_scores), average(bleu_label_scores)#, average(rouge_logistic_scores), average(rouge_label_scores)

In [181]:
get_average_scores()

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


(0.073473928921661, 0.1785201062177081)