In [37]:
import pickle
from collections import Counter
from nltk.corpus import stopwords
import nltk
from statistics import mean
import scipy.stats
import nltk
from nltk.tag import pos_tag 

nltk.download('stopwords')
ENG_STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mudit2103/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
# Loads in the following keys as variable names for the pickled files.
pickle_files = {'abstract_sentences': "AbstractSentences2007.pkl", 
                'abstracts': "Abstracts2007.pkl",
               'full_text_sentences': "FullTextSentences2007.pkl", 
                'full_text': "FullTexts2007.pkl",
                'trained_vectors': "trainedVectors2007.pkl",
                'labels': "labels.pkl"}

for varname, p in pickle_files.items():
    pickled_opened_file = open(p, "rb")
    exec(varname + " = pickle.load(pickled_opened_file)")
    

In [39]:
def keep_word(word):
    """Takes in a word and determines whether or not to keep it."""
    return word not in ENG_STOPWORDS and word != ',' and word != '.' and word != '\n'

In [40]:
def get_wc_map(full_text):
    """Given the full text data, returns a word-count map.
    word_counts_map is a list, ordered by document_id. 
    Elements of the list are counter objects, which are a map
    from the words in the document to the number of times a word
    appears in corresponding document."""
    word_counts_map = []

    for document in full_text:
        cnt = Counter()
        for word in document:
            word = word.lower()
            if keep_word(word):
                cnt[word] += 1
        word_counts_map.append(cnt)
    return word_counts_map

In [41]:
def get_doc_sent_scores(full_text_sentences, word_counts_map):
    """document_sentence_scores is a list, indexed by document ID. 
    an element of the list is a map from sentence ID to sentence score.
    Sentence score is defined as the $[sum( word frequency ) over all words in sentence]$
    divided by the $num_words(sentence)$.
    
    Given full_text_sentences and word_counts_map, returns document_sentence_scores."""
    document_sentence_scores = []  
    for i, document in enumerate(full_text_sentences):  # Iterate through all documents.
        sentence_scores = {}  # Map from sentence_id to sentence_score
        document_word_counts = word_counts_map[i]  # Counter object. Map from word->count(word) in this document.
        num_doc_words = sum(document_word_counts.values())  # Total # of words in document.

        for sentence_id, sentence in enumerate(document):  # Iterate through sentences in document.
            sentence_word_freq_sum = 0
            num_words_in_sentence = 0

            for word in sentence:
                word = word.lower()
                if keep_word(word):
                    word_freq = document_word_counts[word] / num_doc_words  # count(word)/total_words(document)
                    sentence_word_freq_sum += word_freq  
                    num_words_in_sentence += 1  

            # See defn of sentence_score in function comment.
            sentence_score = sentence_word_freq_sum / num_words_in_sentence if num_words_in_sentence != 0 else 0
            sentence_scores[sentence_id] = sentence_score

        document_sentence_scores.append(sentence_scores)
    return document_sentence_scores

In [42]:
word_counts_map = get_wc_map(full_text)
document_sentence_scores = get_doc_sent_scores(full_text_sentences, word_counts_map)

In [43]:
def get_top_sentences(doc_id, N=5):
    sentence_scores_map = document_sentence_scores[doc_id]
    
    sentence_scores = [(idx,score) for idx, score in sentence_scores_map.items()]
    sentence_scores.sort(key = lambda x: x[1], reverse=True)
    
    sentence_scores = [(full_text_sentences[doc_id][idx], score) for idx, score in sentence_scores]
    
    return sentence_scores[:N]
    
    

# Experimenting

In [44]:
def sentence_position(document_num, sentence_num, full_text_sentences):
    """Function which takes in the ID of the document, and the number of the sentence. 
    It returns the position of the sentence within the document, normalized.
    This is to be used as one of the features in the featurized matrix for
    logistic regression."""
    return sentence_num / len(full_text_sentences[document_num])

In [45]:
def sentence_length(document_num, sentence_num, full_text_sentences):
    """Function which takes in document ID, and the number of the sentence.
    Returns the length of the sentence. 
    This function is written as such to match the header and style of the 
    other feature-functions, to allow us to easily use them in the feature matrix."""
    sentence = full_text_sentences[document_num][sentence_num]
    return len(sentence)

In [46]:
def proper_noun(document_num, sentence_num, full_text_sentences):
    """Function which takes in document ID, and the number of the sentence. 
    Returns the number of proper nouns within sentence. 
    Uses nltk's tagging to determine which words are proper nouns."""
    sentence = full_text_sentences[document_num][sentence_num]
    tagged_sent = pos_tag(sentence)
    propernouns = [word for word, pos in tagged_sent if pos == 'NNP']
    
    return len(propernouns)

In [47]:
def sentence_freq_score(document_num, sentence_num, full_text_sentences):
    """Function which takes in document ID, and the number of the sentence. 
    Returns document_sentence_score for the corresponding document and sentence."""
    return 1000 * document_sentence_scores[document_num][sentence_num]

In [48]:
# List of feature-functions, to allow us to easily create the feature matrix.
features_functions = [sentence_position, sentence_length, proper_noun, sentence_freq_score]

# Neural Net

In [3]:
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

print(tf.VERSION)

1.11.0


# Labels

In [18]:
# Indices of files used to generate the labels. 
# Note that not all files were used, since some of them were formatted poorly. 
# The first element of 'labels' contains the indices of the file/document numbers.
file_numbers = [x[0] for x in labels] 

In [19]:
# Create feature matrix
def create_ft_matrix(fts, test=10):
    """Returns the featurized matrix, given the full_text_sentences"""
    X = np.zeros((1, 4))  # Feature matrix. Initialize as a row of zeros.
    
    end = test if test else len(file_numbers)  # Test can be a number for using a subset of documents, else False.
    for i in file_numbers[:end]: 
        document = full_text_sentences[i]
        for j, sentence in enumerate(document):
            X = np.vstack([X, [function(i, j, fts) for function in features_functions]])
    X = X[1:]  # Remove the initial row of zeros.
    return X  # Return the feature matrix.

def get_corr_labels(labels, test=10):
    """Given all the labels information in a list of (file number, actual-labels) tuples, 
    returns  labels (b vector) corresponding to the feature matrix.
    """
    end = test if test else len(labels)
    corr_labels = [x[1] for x in labels[:end]]
    return corr_labels

In [167]:
X = create_ft_matrix(number=1000)
corr_labels = get_corr_labels(number=1000)

In [168]:
flat_list = []
for sublist in corr_labels:
    for item in sublist:
        flat_list.append(item)

print(X.shape)
print(len(flat_list))

import sklearn
from sklearn.linear_model import LogisticRegression

(38841, 4)
38841


In [169]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

y = flat_list
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
threshold = 0.5
model = LogisticRegression().fit(X_train, y_train)
probabilities = model.predict_proba(X_test)
predictions = [1 if x[1] > 0.15 else 0 for x in probabilities]
print("Accuracy: ", accuracy_score(y_test, predictions))

Accuracy:  0.8143785801634807


In [170]:
print(X_train.shape, X_test.shape, len(y_train), len(y_test))

(23304, 4) (15537, 4) 23304 15537


In [171]:
from nltk.translate.bleu_score import sentence_bleu
from pythonrouge.pythonrouge import Pythonrouge

def bleu(lst, target_sentence, weights=(0.25, 0.25, 0.25, 0.25)):
	sent_scores_map = dict()
	max_score = None
	best_sentence = None
	best_idx = None
	
	for i, sentence in enumerate(lst):
	    sentence = [sentence]
	    score = sentence_bleu(sentence, target_sentence, weights)
	    sent_scores_map[i] = score
	    if max_score is None or score > max_score:
	        max_score = score
	        best_sentence = sentence
	        best_idx = i
	return best_idx, sent_scores_map

def untokenize(lst):
	untokenized = []
	for item in lst:
		sentence = ' '.join(item)
		untokenized.append([sentence])
	return untokenized

def rouge(lst, target_sentence, weights=None):
	# The weights parameter is currently ignored
	untokenized_list = untokenize(lst) # list of lists
	target_sentence = [target_sentence]

	sent_scores_map = dict()
	max_score = None
	best_sentence = None
	best_idx = None

	for i, sentence in enumerate(untokenized_list):
		sentence = [[sentence]]
		rouge = Pythonrouge(summary_file_exist=False,
                    summary=target_sentence, reference=sentence,
                    n_gram=2, ROUGE_SU4=True, ROUGE_L=False,
                    recall_only=True, stemming=True, stopwords=True,
                    word_level=True, length_limit=True, length=50,
                    use_cf=False, cf=95, scoring_formula='average',
                    resampling=True, samples=1000, favor=True, p=0.5)
		score = rouge.calc_score()['ROUGE-1']
		sent_scores_map[i] = score
		if max_score is None or score > max_score:
			max_score = score
			best_sentence = sentence
			best_idx = i

	return best_idx, sent_scores_map

def f1_score(bleu_score, rouge_score):
	return float((2*(bleu_score*rouge_score))) / float((bleu_score + rouge_score))

In [172]:
# predictions = model.predict(X_test)

# print("Bleu: ", bleu(list(predictions), y_test))

reference = [['this', 'is', 'a', 'test'], ['this', 'is', 'test']]
candidate = ['this', 'is', 'a', 'test']

print(bleu(reference, candidate))
print()

(0, {0: 1.0, 1: 0.7071067811865475})



Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [173]:
predicted = model.predict(X_test)
print(len([p for p in predictions if p == 1]))
# print(y_test)

3103


In [180]:
# summaries = []
from functools import reduce
def document_indices(doc_id):
    prev_docs = full_text_sentences[0:doc_id]
    prev_docs_length = sum([len(x) for x in prev_docs])
    doc = full_text_sentences[doc_id]
    doc_len = len(doc)
    
    start = prev_docs_length
    end = start + doc_len
    
    return start, end

def flatten(lst):
    flattened_list = []
    for sublist in lst:
        for item in sublist:
            flattened_list.append(item)
    return flattened_list

def average(scores):
    return sum(scores) / len(scores)

def get_average_scores():
    bleu_logistic_scores = []
    bleu_label_scores = []
    rouge_logistic_scores = []
    rouge_label_scores = []
    for k, i in enumerate(file_numbers[:1000]):
        flattened_abstract_sentences = flatten(abstract_sentences[i])
    #     summaries.append(flattened_abstract_sentences)
    #     print(flattened_abstract_sentences)
        reference = [flattened_abstract_sentences]
        start, end = document_indices(i)
        candidate_logistic = flatten([full_text_sentences[i][index] for index in predictions[start:end]])
        candidate_labels = flatten([full_text_sentences[i][index] for index in labels[k][1]])
        best_idx, sent_map = bleu(reference, candidate_logistic)
        bleu_logistic_scores.append(sent_map[0])
        best_idx, sent_map = bleu(reference, candidate_labels)
        bleu_label_scores.append(sent_map[0])
        
#         best_idx, sent_map = rouge(reference, candidate_logistic)
#         rouge_label_scores.append(sent_map[0])
#         best_idx, sent_map = rouge(reference, candidate_labels)
#         rouge_label_scores.append(sent_map[0])

    return average(bleu_logistic_scores), average(bleu_label_scores)#, average(rouge_logistic_scores), average(rouge_label_scores)

In [181]:
get_average_scores()

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


(0.073473928921661, 0.1785201062177081)