<a href="https://colab.research.google.com/github/a-gasior/Text-Summarization/blob/master/Product_Summaries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#Alari Maricq
#Andrew Gasiorowski
#CIS 411

In [0]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# NOTE(review): the rest of this notebook opens relative 'gdrive/...' paths,
# which presumably relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
import numpy as _np
import os
import re
import string
import nltk
from nltk import word_tokenize
nltk.download('punkt')
from sklearn.metrics import pairwise_distances_argmin#_min
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from operator import itemgetter
from collections import Counter

In [0]:
NGRAMS = 2


def make_ngrams(text, ngrams=NGRAMS):
    """Tokenize ``text`` and slide a window of ``ngrams`` tokens across it.

    Args:
        text: string to tokenize with ``word_tokenize``.
        ngrams (opt): window width; defaults to the module-level ``NGRAMS``.

    Returns: list of tuples, one per window position, in order of appearance.
        The list is empty when fewer than ``ngrams`` tokens survive filtering.

    Raises:
        ValueError: if ``ngrams`` is smaller than 1.
    """
    if ngrams < 1:
        raise ValueError("ngrams must be greater than 0")

    # Tokens that occur inside string.punctuation are dropped before windowing.
    tokens = [tok for tok in word_tokenize(text) if tok not in string.punctuation]
    last_start = len(tokens) - ngrams  # start index of the final full window
    return [tuple(tokens[start:start + ngrams]) for start in range(last_start + 1)]


def rouge_comp(tests, gold, ngrams=NGRAMS):
    """Compare generated test summaries to gold standard using ROUGE-N

    Args:
        tests: system generated summary string, or a list of such strings.
        gold: list of gold-standard summary strings.
        ngrams (opt): size of N-Grams to tokenize summaries into.

    Returns: list of floats for the ROUGE recall score of each test summary.
        Every score is 0.0 when the gold standard yields no N-Grams at all.
    """
    if not isinstance(tests, list):
        tests = [tests]

    # The references are the same for every test summary, so tokenize them and
    # compute the recall denominator once instead of once per summary.
    goldngrams = [make_ngrams(x, ngrams) for x in gold]
    gold_sets = [set(x) for x in goldngrams]
    total_gold = sum(len(x) for x in goldngrams)

    scores = []
    for test in tests:
        # The test summary must use the same N as the references; tuples of
        # different lengths can never be equal, which would force a score of 0.
        grams = set(make_ngrams(test, ngrams))
        # NOTE: the numerator counts distinct shared N-Grams per reference,
        # while the denominator counts every N-Gram occurrence in the references.
        matches = sum(len(grams & ref) for ref in gold_sets)
        # No reference N-Grams means nothing could be recalled; avoid dividing by 0.
        scores.append(matches / total_gold if total_gold else 0.0)
    return scores

In [0]:
def _clean_line(line):
  """Remove newline characters and every character that is neither a word character nor whitespace."""
  return re.sub(r'[^\w\s]', '', line.replace('\n', ''))


def get_data(topic_dir, data_root='gdrive/Team Drives/NLP_Text_Summarizer/Data/opinosis'):
  """Load the review sentences and the gold-standard summary lines for one topic.

  Args:
    topic_dir: file name of the topic inside '<data_root>/topics'. The gold
      summaries are looked up in '<data_root>/summaries-gold/<name>', where
      <name> is topic_dir with '.txt.data' removed.
    data_root (opt): directory that contains the 'topics' and 'summaries-gold'
      folders. Defaults to the location used throughout this notebook.

  Returns: tuple (sentences_topic, sentences_goldstnd). The first is one cleaned
    string per line of the topic file; the second is one cleaned string per line
    of every gold summary file, concatenated in sorted file-name order.
  """
  topic_path = data_root + '/topics/' + topic_dir
  gold_dir = data_root + '/summaries-gold/' + topic_dir.replace('.txt.data', '')

  # Undecodable bytes are replaced on read; the replacement character is then
  # removed by _clean_line because it is neither a word character nor whitespace.
  with open(topic_path, errors='replace') as f:
    sentences_topic = [_clean_line(line) for line in f]

  # A topic may have several gold summaries. os.listdir returns names in
  # arbitrary order, so sort them to make the returned list reproducible.
  sentences_goldstnd = []
  for gld_std in sorted(os.listdir(gold_dir)):
    with open(gold_dir + '/' + gld_std, errors='ignore') as f:
      sentences_goldstnd.extend(_clean_line(line) for line in f)

  return sentences_topic, sentences_goldstnd


def get_embedding_from_disk(_file_name):
  """Load one saved embedding array from the notebook's topics_embeddings folder.

  Args:
    _file_name: name of the file inside the embeddings folder (the notebook
      passes names obtained by listing that folder).

  Returns: whatever ``_np.load`` yields for that file — presumably an ndarray
    with one embedding row per sentence of the topic; confirm against the
    code that wrote the files.
  """
  embeddings_dir = 'gdrive/Team Drives/NLP_Text_Summarizer/Data/opinosis/topics_embeddings/'
  return _np.load(embeddings_dir + _file_name)

In [0]:
def summarize_topic(topic_embeddings, sentences, num_clusters):
  """Pick one representative sentence per K-Means cluster of the embeddings.

  Args:
    topic_embeddings: array of embeddings to cluster. Row i is assumed to
      belong to sentences[i] — verify against how the embeddings were built.
    sentences: sequence of sentences indexed by the embedding row numbers.
    num_clusters: number of clusters, and therefore of sentences returned.

  Returns: tuple (best_sentences, closest). ``closest`` holds, for each cluster
    centre, the index of the nearest embedding row; ``best_sentences`` is the
    list of sentences at those indexes.
  """
  # No random_state is set, so repeated calls may select different sentences.
  model = KMeans(n_clusters=num_clusters).fit(topic_embeddings)
  nearest_rows = pairwise_distances_argmin(model.cluster_centers_, topic_embeddings)
  picked = []
  for row in nearest_rows:
    picked.append(sentences[row])
  return picked, nearest_rows


def aggregate_accuracy(acc_list, cluster_range):
  """Average the scores recorded for each cluster count and find the best count.

  Args:
    acc_list: iterable of (num_clusters, score) tuples; a cluster count may
      appear several times (the notebook adds one entry per topic).
    cluster_range: iterable of the cluster counts to report, in output order.

  Returns: tuple (avg_acc_list, max_alpha). ``avg_acc_list`` is a list of
    (num_clusters, average_score) tuples for every count in ``cluster_range``
    that has at least one entry. ``max_alpha`` is the count with the highest
    average; on ties the one that comes first in ``cluster_range`` wins.

  Raises:
    ValueError: if no entry of ``acc_list`` matches a value of ``cluster_range``.
  """
  totals = Counter()
  counts = Counter()
  for tup in acc_list:
    totals[tup[0]] += tup[1]
    counts[tup[0]] += 1

  # Divide by the number of samples actually seen for each count. This uses
  # only the arguments (no notebook globals) and stays correct for ranges
  # that do not start at 1.
  avg_acc_list = [(k, totals[k] / counts[k]) for k in cluster_range if counts[k]]
  if not avg_acc_list:
    raise ValueError("acc_list has no entries for the given cluster_range")

  max_alpha = max(avg_acc_list, key=itemgetter(1))[0]
  return avg_acc_list, max_alpha

def save_plot_to_file(data, file_name, title):
  """Draw a scatter chart of ``data`` and save it as a PNG in the data_out folder.

  Args:
    data: iterable of tuples; it is transposed and the resulting columns are
      passed positionally to ``plt.scatter`` (x and y for 2-tuples such as
      (num_clusters, score)).
    file_name: stem of the output file; the image is written to
      '.../data_out/new_<file_name>.png'.
    title: chart title.
  """
  out_path = 'gdrive/Team Drives/NLP_Text_Summarizer/Data/data_out/new_' + file_name + '.png'
  columns = list(zip(*data))
  plt.scatter(*columns)
  plt.title(title)
  plt.xlabel("Number of Clusters")
  plt.ylabel("ROUGE Score")
  plt.savefig(out_path, bbox_inches='tight')
  # Clear the current figure so the next chart starts from an empty canvas.
  plt.clf()

In [0]:
# Embedding files and topic files are paired by position after sorting, so
# both folders are expected to list the topics in the same order.
embedding_file_list = sorted(os.listdir('gdrive/Team Drives/NLP_Text_Summarizer/Data/opinosis/topics_embeddings'))
topics_list = sorted(os.listdir('gdrive/Team Drives/NLP_Text_Summarizer/Data/opinosis/topics'))

# Train/test split: the first 36 topics are used for training, the rest for testing.
training_topics, testing_topics = topics_list[:36], topics_list[36:]

# Experiment parameters: candidate cluster counts (1..10) and the number of
# repeated clustering runs (1..100) per configuration.
cluster_range = range(1, 11)
validation_range = range(1, 101)
num_validations = max(validation_range)

# Goal: learn the optimal number of clusters (= sentences in the summary).
# A summary should not be longer than 10 sentences, so 1..10 clusters are tried.
# For every training topic and every candidate cluster count the clustering is
# run once per entry of validation_range (K-Means is not deterministic here),
# and the ROUGE scores of those runs are averaged. The per-topic averages are
# then averaged across topics to pick the best cluster count.
# (The original authors report that the optimum turned out to be two.)

acc_list_all_topics = []
for idx, topic in enumerate(training_topics):
    print('summarizing', topic)
    # The embeddings, raw sentences and gold summaries depend only on the
    # topic, so read them from disk once instead of once per clustering run.
    this_topic_embedings = get_embedding_from_disk(embedding_file_list[idx])
    sentences, gold_summaries = get_data(topic)

    acc_dict = dict.fromkeys(cluster_range, 0)
    for num_clusters in cluster_range:
        for this_validation in validation_range:
            best_sentences_train, _closest_idx = summarize_topic(this_topic_embedings, sentences, num_clusters)
            # ROUGE-N score of every selected sentence against the gold summaries
            scores_train = rouge_comp(best_sentences_train, gold_summaries)
            # running total of the mean score (as a percentage) for this cluster count
            acc_dict[num_clusters] += (sum(scores_train)/len(scores_train))*100

    # turn the running totals into averages over the repeated runs
    acc_list_all_topics.extend((key, total/num_validations) for key, total in acc_dict.items())

avg_accs_train, optimal_clust = aggregate_accuracy(acc_list_all_topics, cluster_range)
save_plot_to_file(avg_accs_train, 'training_summary', 'Training Topics')



# With the optimal number of clusters learned above, evaluate the test topics.
# For every test topic the clustering is run once per entry of validation_range.
# K-Means does not always return the same sentences, so we count how often each
# sentence index is selected and keep the optimal_clust most frequent ones as
# the summary. The per-topic metrics are then printed.

acc_list_test = []  # (topic, score) for every test topic
avg_score = 0
# Test topics follow the training topics in the sorted lists, so their
# embedding files start right after the training ones.
test_offset = len(training_topics)
for idx, topic in enumerate(testing_topics):
  this_topic_embedings = get_embedding_from_disk(embedding_file_list[test_offset + idx])
  sentences, gold_summaries = get_data(topic)
  best_indexes_dict = Counter()
  for validation_iter in validation_range:
    this_topic_best_sentences, best_idx = summarize_topic(this_topic_embedings, sentences, optimal_clust)
    # Count one vote per selected sentence. Storing the index itself as the
    # value would make most_common() rank by index instead of by frequency.
    best_indexes_dict.update(int(sent_idx) for sent_idx in best_idx)
  # list of (sentence index, number of votes) tuples, most frequent first
  topic_best_sentence_idx = best_indexes_dict.most_common(optimal_clust)
  best_sentences_test = [sentences[sent_idx] for sent_idx, _votes in topic_best_sentence_idx]
  score = rouge_comp(best_sentences_test, gold_summaries)
  score = (sum(score)/len(score))*100
  acc_list_test.append((topic, score))
  avg_score += score
  print('Topic: ', topic, '\nAccuracy: ', score,'\n\nSystem Generated Summary:\n')
  for sent in best_sentences_test:
    print(sent, end='\n\n')
  print('\nGold Standard Summary:\n')
  for sent in gold_summaries:
    print(sent)
  print('\n\n\n-----------------NEW TOPIC---------------\n\n\n')
# Average over the topics actually evaluated rather than a hard-coded count.
avg_score = avg_score/len(testing_topics) if testing_topics else 0.0
print('Average Rouge Score: ', avg_score)