In [79]:
cd wcep-mds-dataset/experiments

[Errno 2] No such file or directory: 'wcep-mds-dataset/experiments'
/home/aryan/Documents/2-1/CL 2/Project/Midsem_Team 1/wcep-mds-dataset/experiments


In [80]:
import utils
from rouge_score import rouge_scorer

# Read the validation split once and materialise it as a list so records can
# be indexed; judging by the keys printed below, each record is a dict.
val_data = list(utils.read_jsonl_gz('WCEP/val.jsonl.gz'))

# Show which fields the first record exposes.
print(val_data[0].keys())

dict_keys(['id', 'date', 'reference_urls', 'articles', 'summary', 'wiki_links', 'category'])


In [81]:
import pickle
import numpy as np
import pandas as pd
import spacy
import string
import re
import sys
import nltk
from nltk.corpus import stopwords
import random
# Fetch the Punkt sentence-tokenizer data; preProcess loads
# 'tokenizers/punkt/english.pickle' from it via nltk.data.load.
nltk.download('punkt')

# Small English spaCy pipeline. NOTE(review): `nlp` and `stop_words` are not
# referenced by any of the cells visible in this part of the notebook.
nlp = spacy.load("en_core_web_sm") #returns language object
from spacy.lang.en.stop_words import STOP_WORDS as stop_words

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF

# Silence NumPy's floating-point warnings for division by zero and invalid
# results (e.g. 0/0); such operations then yield inf/nan without any message.
np.seterr(divide='ignore', invalid='ignore')

[nltk_data] Downloading package punkt to /home/aryan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'divide': 'ignore', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [82]:
def preProcess(content, title):
    """Normalise an article body and its title for similarity scoring.

    Parameters
    ----------
    content : str
        Raw article text.
    title : str
        Raw article title (used by the callers as the query string).

    Returns
    -------
    lines : list of str
        Sentences of the normalised article text, in document order. They keep
        their casing and punctuation so they can be emitted verbatim.
    title : str
        The title with newlines and digits removed, every other
        non-alphanumeric character replaced by a space, and lower-cased.
    """
    import nltk.data

    # Collapse ", " to "," and expand again so every comma is followed by
    # at least one space.
    data = content.replace(', ', ',').replace(',', ', ')
    data = re.sub(r"\n", " ", data)
    # Map doubled quote marks ('' and ``) to a plain double quote.
    data = re.sub("''", "\"", data)
    data = re.sub("``", "\"", data)
    data = re.sub(" +", " ", data)
    # Strip the dots of "www." / ".com" before the generic period handling
    # below, so they do not get a space inserted after them.
    data = data.replace('www.', '')
    data = data.replace('.com', 'com')
    data = data.replace('.COM', 'COM')
    # Put a space after every period so the tokenizer sees "end.Start" as
    # two separable tokens.
    data = data.replace('.', '. ')
    # Drop every non-ASCII character.
    data = data.encode("ascii", "ignore").decode("ascii")

    # Split the cleaned text into sentences with the Punkt model.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    lines = tokenizer.tokenize(data.strip())

    # Clean the title. The join is essential: in Python 3 `filter` returns a
    # lazy iterator, so wrapping it in str() would produce the text
    # "<filter object at 0x...>" instead of the filtered characters.
    cleaned_title = title.replace('\n', '')
    cleaned_title = re.sub(r'[^a-zA-Z0-9\s]', ' ', cleaned_title)
    cleaned_title = ''.join(ch for ch in cleaned_title if not ch.isdigit())
    cleaned_title = cleaned_title.lower()

    return lines, cleaned_title

In [83]:
def cosineSim(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    The result is ``dot(a, b) / (||a|| * ||b||)``. When either vector has
    zero norm the quotient is undefined (0/0); in that case 0.0 is returned
    instead of NaN so callers can safely compare and take maxima of scores.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

In [84]:
def bestSentence(sentences, query):
    """Pick the sentence most similar to ``query`` and remove it from the pool.

    Similarity is the cosine between the term-count vectors of the sentence
    and the query, built with a ``CountVectorizer`` fitted on that pair.

    Parameters
    ----------
    sentences : list of str
        Candidate sentences. The chosen sentence is removed from this list
        in place, so later selection steps do not pick it again.
    query : str
        Text the sentences are compared against.

    Returns
    -------
    str
        The sentence with the highest similarity; ties go to the earliest.

    Raises
    ------
    ValueError
        If ``sentences`` is empty.
    """
    if not sentences:
        raise ValueError("bestSentence() needs at least one candidate sentence")

    best_sentence = None
    best_score = float("-inf")

    for sent in sentences:
        tf = CountVectorizer().fit_transform([sent, query]).toarray()
        similarity = cosineSim(tf[0], tf[1])

        # A NaN similarity (zero-norm vector) compares False here, so such a
        # sentence never replaces the current best.
        if similarity > best_score:
            best_sentence = sent
            best_score = similarity

    if best_sentence is None:
        # Every similarity was undefined; fall back to document order rather
        # than failing on sentences.remove(None).
        best_sentence = sentences[0]

    sentences.remove(best_sentence)
    return best_sentence

In [85]:
def MMR(Si, query, Sj, lamda):
    """Score candidate ``Si`` with Maximal Marginal Relevance.

    ``score = lamda * sim(Si, query) - (1 - lamda) * max(sim(Si, s) for s in Sj)``

    where ``sim`` is the cosine similarity of term-count vectors.

    Parameters
    ----------
    Si : str
        Candidate sentence being scored.
    query : str
        Text the summary should be relevant to.
    Sj : iterable of str
        Sentences already selected for the summary.
    lamda : float
        Trade-off weight: higher values favour relevance to the query, lower
        values favour novelty with respect to ``Sj``.

    Returns
    -------
    float
        The MMR score, or ``-inf`` when the candidate yields an all-zero
        count vector (no countable tokens), so it is ranked last.
    """
    # Row 0 is the candidate, the last row is the query and everything in
    # between is the summary built so far. The candidate must be part of the
    # fitted documents, otherwise the score cannot depend on it.
    documents = [Si, *Sj, query]
    tf = CountVectorizer().fit_transform(documents).toarray()
    tf_Si, tf_selected, tf_query = tf[0], tf[1:-1], tf[-1]

    if not tf_Si.any():
        return float("-inf")

    def _similarity(u, v):
        # Treat an undefined cosine (zero-norm vector) as "no similarity".
        value = cosineSim(u, v)
        return 0.0 if np.isnan(value) else value

    relevance = _similarity(tf_Si, tf_query)
    # Redundancy is measured only against already-selected sentences; an
    # empty summary contributes no penalty.
    redundancy = max((_similarity(tf_Si, tf_sent) for tf_sent in tf_selected), default=0)

    return lamda * relevance - (1 - lamda) * redundancy

In [86]:
def makeSummary(sentences, best_sentence, query, max_length, lamda):
    """Greedily extend a summary with the highest-scoring MMR sentences.

    Parameters
    ----------
    sentences : list of str
        Remaining candidate sentences. Each selected sentence is removed from
        this list in place.
    best_sentence : str
        Seed sentence; always the first element of the result.
    query : str
        Text passed to ``MMR`` as the relevance target.
    max_length : int
        Maximum number of sentences in the summary (the seed counts as one).
    lamda : float
        Relevance/novelty trade-off forwarded to ``MMR``.

    Returns
    -------
    list of str
        The summary sentences in selection order. It is shorter than
        ``max_length`` when the candidate pool runs out first.
    """
    summary = [best_sentence]

    # Stop when the summary is long enough or no candidates remain; checking
    # the pool explicitly avoids relying on max() raising on an empty dict.
    while len(summary) < max_length and sentences:
        scores = {sent: MMR(sent, query, summary, lamda) for sent in sentences}
        chosen = max(scores, key=scores.get)
        summary.append(chosen)
        sentences.remove(chosen)

    return summary

In [87]:
def processCluster(cluster, no_of_articles=1, max_length=2, lamda=0.5):
    """Build an extractive summary for one cluster of articles.

    A random sample of the cluster's articles is split into sentences, the
    sentence closest to the cluster title seeds the summary, and further
    sentences are added by MMR.

    Parameters
    ----------
    cluster : dict
        Record with an ``'articles'`` list whose items have ``'title'`` and
        ``'text'`` entries. The title of the first article is used as query.
    no_of_articles : int, optional
        How many articles to sample (default 1). Clamped to the number of
        articles in the cluster so sampling cannot fail on small clusters.
    max_length : int, optional
        Maximum number of sentences in the summary (default 2).
    lamda : float, optional
        MMR relevance/novelty trade-off (default 0.5).

    Returns
    -------
    list of str
        The selected summary sentences.
    """
    candidates = cluster['articles']
    raw_title = candidates[0]['title']

    sample_size = min(no_of_articles, len(candidates))
    sampled = random.sample(candidates, sample_size)

    # Pool the sentences of all sampled articles; the title is cleaned from
    # the raw title on every call, which yields the same query each time.
    content = []
    title = raw_title
    for article in sampled:
        sentences, title = preProcess(article['text'], raw_title)
        content += sentences

    # bestSentence removes its pick from `content`, so makeSummary only sees
    # the remaining candidates.
    best_sentence = bestSentence(content, title)
    return makeSummary(content, best_sentence, title, max_length, lamda)
    

In [88]:
def evaluate(summary, gold_summary):
    """Score a generated summary against the reference with ROUGE-1.

    Parameters
    ----------
    summary : iterable of str
        Generated summary sentences; they are joined with single spaces.
    gold_summary : str
        Reference summary.

    Returns
    -------
    dict
        The mapping returned by ``RougeScorer.score``; its ``'rouge1'`` entry
        is also appended as one line to ``random-attempts.txt``.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    # RougeScorer.score expects (target, prediction). Passing them the other
    # way round swaps the reported precision and recall.
    scores = scorer.score(gold_summary, " ".join(summary))

    with open("random-attempts.txt", "a") as f:
        f.write(f"  {scores['rouge1']}\n")

    return scores

In [89]:
def main(num_clusters=500, seed=None):
    """Summarise and evaluate the first clusters of the validation split.

    For each processed cluster a line ``"<index>)   <rouge1 score>"`` is
    appended to ``random-attempts.txt``: the index is written here and the
    score by ``evaluate``.

    Parameters
    ----------
    num_clusters : int, optional
        Maximum number of clusters to process (default 500). Fewer are
        processed when the dataset is smaller.
    seed : int or None, optional
        When given, seeds the ``random`` module so the article sampling in
        ``processCluster`` is repeatable. ``None`` leaves the generator as is.
    """
    if seed is not None:
        random.seed(seed)

    dataset = list(utils.read_jsonl_gz('WCEP/val.jsonl.gz'))

    # Slicing instead of indexing by range() avoids an IndexError when the
    # dataset holds fewer than `num_clusters` records.
    for position, cluster in enumerate(dataset[:num_clusters], start=1):
        summary = processCluster(cluster)
        # Close the file before evaluate() appends to it, so the index and
        # the score end up on the same line in the right order.
        with open("random-attempts.txt", "a") as f:
            f.write(f"{position}) ")
        evaluate(summary, cluster['summary'])


if __name__ == '__main__':
    main()