<a href="https://colab.research.google.com/github/cfong32/key-sentence-extraction/blob/main/exp17_preprocess_tfidf_sbert_chatgptDS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview

In this notebook, we:
1. Download Mohammad's ChatGPT-generated dataset and store it in a dataframe `df`
2. Break down each `article` into `sentences`
3. Compute the TF-IDF and SBERT cosine similarity of every sentence to its source `article`
4. Compute ROUGE of every sentence to the `highlights`, the gold summary
5. Analyze results
    - Verify correlation between TFIDF cosine-similarity and ROUGE
    - Evaluate F1 score of "top-K%-sentence classification"
        - E.g., for an article of 20 sentences, the "top-10%-sentence classification" is to predict the most important 2 key-sentences.

# I. Install and Import

In [None]:
!pip install -q datasets rouge_score sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# import packages

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from datasets import load_dataset
from spacy.lang.en import English
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score
from rouge_score.rouge_scorer import RougeScorer
from sentence_transformers import SentenceTransformer
from itertools import cycle
from functools import partial
from textwrap import wrap
from IPython.display import HTML as html_print
# notebook-wide configuration

# truncated dataframe displays show 4 rows
pd.options.display.min_rows = 4

# register the pandas integration (provides DataFrame.progress_apply) ...
tqdm.pandas()
# ... then rebind `tqdm` so every bar created afterwards defaults to position=0, leave=True
tqdm = partial(tqdm, leave=True, position=0)

# presumably the K (in %) values for the "top-K%-sentence classification"
# described in the Overview; not used by the cells below -- TODO confirm
Ks = [1, 5, 10, 20, 40, 60, 80, 100]

# II. Computation

In [None]:
# workaround for running !wget on a GPU instance:
# force the preferred encoding reported by `locale` to UTF-8.
# The replacement keeps the optional `do_setlocale` argument of the stdlib
# function, so callers using `locale.getpreferredencoding(False)` still work.
import locale
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
# load dataset into a dataframe

!wget -qO "Mohammad_dataset_result(3).json" https://uoguelphca-my.sharepoint.com/:u:/g/personal/mhavaled_uoguelph_ca/ET_4hgQABD5JhtiDtFGUG4gBASpIv6WURezHphMTlPCJqg?download=1
data = json.load(open('Mohammad_dataset_result(3).json'))
df = (pd.DataFrame(data)
      .rename(columns={'paragraphs': 'article'})
      )
df

Unnamed: 0,article,key_sentences
0,Climate change is one of the most pressing iss...,We need to act urgently to reduce our greenhou...
1,Education is the key to unlocking a better fut...,It is crucial that we invest in education at a...
...,...,...
1258,The environment plays a critical role in our l...,It is our responsibility to take care of the e...
1259,Family is an essential component of our lives....,"In such cases, it is important to communicate ..."


In [None]:
# sentence segmentation using spaCy's "sentencizer" pipe
# df['sentences'] ends up holding, for every article, a list of sentence strings

spacy_eng_nlp = English()
spacy_eng_nlp.add_pipe("sentencizer")


def split_into_sentences(row):
    """Return the sentences of `row.article` as a list of plain strings."""
    doc = spacy_eng_nlp(row.article)
    return [str(sent) for sent in doc.sents]


df['sentences'] = df.progress_apply(split_into_sentences, axis=1)
df

100%|██████████| 1260/1260 [00:00<00:00, 2510.97it/s]


Unnamed: 0,article,key_sentences,sentences
0,Climate change is one of the most pressing iss...,We need to act urgently to reduce our greenhou...,[Climate change is one of the most pressing is...
1,Education is the key to unlocking a better fut...,It is crucial that we invest in education at a...,[Education is the key to unlocking a better fu...
...,...,...,...
1258,The environment plays a critical role in our l...,It is our responsibility to take care of the e...,[The environment plays a critical role in our ...
1259,Family is an essential component of our lives....,"In such cases, it is important to communicate ...",[Family is an essential component of our lives...


In [None]:
# label every sentence: 1 if it contains the gold `key_sentences` string, else 0
# every entry of df['is_key'] is a list of ints aligned with df['sentences']

def mark_key_sentences(row):
    """Return one 0/1 flag per sentence of the row (substring match)."""
    return [int(row.key_sentences in sent) for sent in row.sentences]


df['is_key'] = df.progress_apply(mark_key_sentences, axis=1)

# keep only paragraphs where exactly one sentence matched the key sentence
# and which consist of more than one sentence
exactly_one_key = df.is_key.apply(sum) == 1
multiple_sentences = df.is_key.apply(len) > 1
df = df[exactly_one_key & multiple_sentences]

# the previous row labels are kept as an 'index' column
df = df.reset_index()
df

100%|██████████| 1260/1260 [00:00<00:00, 6602.95it/s]


Unnamed: 0,index,article,key_sentences,sentences,is_key
0,0,Climate change is one of the most pressing iss...,We need to act urgently to reduce our greenhou...,[Climate change is one of the most pressing is...,"[0, 0, 0, 1]"
1,1,Education is the key to unlocking a better fut...,It is crucial that we invest in education at a...,[Education is the key to unlocking a better fu...,"[0, 0, 0, 1]"
...,...,...,...,...,...
1241,1258,The environment plays a critical role in our l...,It is our responsibility to take care of the e...,[The environment plays a critical role in our ...,"[0, 0, 0, 1]"
1242,1259,Family is an essential component of our lives....,"In such cases, it is important to communicate ...",[Family is an essential component of our lives...,"[0, 0, 0, 0, 1]"


In [None]:
# TF-IDF (Term Frequency-Inverse Document Frequency) similarity
# 1. fit the vectorizer on all (remaining) articles
# 2. for each row, compare every sentence vector with the vector of its article
# every entry of df['TFIDF_sim'] is an ndarray with one cosine similarity per sentence

articles = df.article.tolist()
tfidf = TfidfVectorizer()
tfidf.fit(articles)


def tfidf_sentence_to_article_sim(row):
    """Cosine similarity of each sentence in `row.sentences` to `row.article`."""
    article_vec = tfidf.transform([row.article])
    sentence_vecs = tfidf.transform(row.sentences)
    # the result has a single row (the article); take it as a 1-D array
    return cosine_similarity(article_vec, sentence_vecs)[0]


df['TFIDF_sim'] = df.progress_apply(tfidf_sentence_to_article_sim, axis=1)
df

100%|██████████| 1243/1243 [00:02<00:00, 481.20it/s]


Unnamed: 0,index,article,key_sentences,sentences,is_key,TFIDF_sim
0,0,Climate change is one of the most pressing iss...,We need to act urgently to reduce our greenhou...,[Climate change is one of the most pressing is...,"[0, 0, 0, 1]","[0.3214220469953667, 0.6107510335598709, 0.474..."
1,1,Education is the key to unlocking a better fut...,It is crucial that we invest in education at a...,[Education is the key to unlocking a better fu...,"[0, 0, 0, 1]","[0.47808946064127095, 0.45111889985171527, 0.4..."
...,...,...,...,...,...,...
1241,1258,The environment plays a critical role in our l...,It is our responsibility to take care of the e...,[The environment plays a critical role in our ...,"[0, 0, 0, 1]","[0.4467579283397811, 0.6413572604770067, 0.556..."
1242,1259,Family is an essential component of our lives....,"In such cases, it is important to communicate ...",[Family is an essential component of our lives...,"[0, 0, 0, 0, 1]","[0.4073544528009398, 0.42108696945190444, 0.57..."


In [None]:
# prediction by TF-IDF: flag (1) the sentence(s) whose cosine similarity
# equals the maximum of the row, everything else is 0

def flag_max_tfidf(sims):
    """Return a 0/1 int array marking where `sims` reaches its maximum."""
    return np.equal(sims, np.max(sims)).astype(int)


df['pred_by_TFIDF'] = df['TFIDF_sim'].map(flag_max_tfidf)
df

Unnamed: 0,index,article,key_sentences,sentences,is_key,TFIDF_sim,pred_by_TFIDF
0,0,Climate change is one of the most pressing iss...,We need to act urgently to reduce our greenhou...,[Climate change is one of the most pressing is...,"[0, 0, 0, 1]","[0.3214220469953667, 0.6107510335598709, 0.474...","[0, 1, 0, 0]"
1,1,Education is the key to unlocking a better fut...,It is crucial that we invest in education at a...,[Education is the key to unlocking a better fu...,"[0, 0, 0, 1]","[0.47808946064127095, 0.45111889985171527, 0.4...","[0, 0, 0, 1]"
...,...,...,...,...,...,...,...
1241,1258,The environment plays a critical role in our l...,It is our responsibility to take care of the e...,[The environment plays a critical role in our ...,"[0, 0, 0, 1]","[0.4467579283397811, 0.6413572604770067, 0.556...","[0, 1, 0, 0]"
1242,1259,Family is an essential component of our lives....,"In such cases, it is important to communicate ...",[Family is an essential component of our lives...,"[0, 0, 0, 0, 1]","[0.4073544528009398, 0.42108696945190444, 0.57...","[0, 0, 1, 0, 0]"


In [None]:
# SBERT sentence-to-article similarity
# every entry of df['SBERT_s2a_sim'] is an ndarray with one cosine similarity per sentence

sbert = SentenceTransformer('all-MiniLM-L6-v2')


def embed_sentences_and_article(row):
    """Encode the sentences followed by the whole article (article is the last row)."""
    texts = row.sentences + [row.article]
    return sbert.encode(texts)


def sbert_sentence_to_article_sim(row):
    """Cosine similarity of every sentence embedding to the article embedding."""
    embeddings = row.sbert_embeddings
    article_emb = embeddings[-1:]       # last row, kept 2-D: x.article encoded
    sentence_embs = embeddings[:-1]     # all other rows: x.sentences encoded
    return cosine_similarity(article_emb, sentence_embs)[0]


df['sbert_embeddings'] = df.progress_apply(embed_sentences_and_article, axis=1)
df['SBERT_s2a_sim'] = df.progress_apply(sbert_sentence_to_article_sim, axis=1)

# the embeddings were only an intermediate result; do not keep them in df
df = df.drop(columns=['sbert_embeddings'])

df

100%|██████████| 1243/1243 [00:15<00:00, 79.87it/s] 
100%|██████████| 1243/1243 [00:00<00:00, 1663.07it/s]


Unnamed: 0,index,article,key_sentences,sentences,is_key,TFIDF_sim,pred_by_TFIDF,SBERT_s2a_sim
0,0,Climate change is one of the most pressing iss...,We need to act urgently to reduce our greenhou...,[Climate change is one of the most pressing is...,"[0, 0, 0, 1]","[0.3214220469953667, 0.6107510335598709, 0.474...","[0, 1, 0, 0]","[0.6913383, 0.67684746, 0.6024083, 0.76702094]"
1,1,Education is the key to unlocking a better fut...,It is crucial that we invest in education at a...,[Education is the key to unlocking a better fu...,"[0, 0, 0, 1]","[0.47808946064127095, 0.45111889985171527, 0.4...","[0, 0, 0, 1]","[0.7876349, 0.5870048, 0.78181076, 0.79647756]"
...,...,...,...,...,...,...,...,...
1241,1258,The environment plays a critical role in our l...,It is our responsibility to take care of the e...,[The environment plays a critical role in our ...,"[0, 0, 0, 1]","[0.4467579283397811, 0.6413572604770067, 0.556...","[0, 1, 0, 0]","[0.8151461, 0.4661282, 0.7739822, 0.8015023]"
1242,1259,Family is an essential component of our lives....,"In such cases, it is important to communicate ...",[Family is an essential component of our lives...,"[0, 0, 0, 0, 1]","[0.4073544528009398, 0.42108696945190444, 0.57...","[0, 0, 1, 0, 0]","[0.75987756, 0.4105369, 0.7154193, 0.5800866, ..."


In [None]:
# prediction by SBERT: flag (1) the sentence(s) whose sentence-to-article
# cosine similarity equals the maximum of the row, everything else is 0

def flag_max_sbert(sims):
    """Return a 0/1 int array marking where `sims` reaches its maximum."""
    return np.equal(sims, np.max(sims)).astype(int)


df['pred_by_SBERT'] = df['SBERT_s2a_sim'].map(flag_max_sbert)
df

Unnamed: 0,index,article,key_sentences,sentences,is_key,TFIDF_sim,pred_by_TFIDF,SBERT_s2a_sim,pred_by_SBERT
0,0,Climate change is one of the most pressing iss...,We need to act urgently to reduce our greenhou...,[Climate change is one of the most pressing is...,"[0, 0, 0, 1]","[0.3214220469953667, 0.6107510335598709, 0.474...","[0, 1, 0, 0]","[0.6913383, 0.67684746, 0.6024083, 0.76702094]","[0, 0, 0, 1]"
1,1,Education is the key to unlocking a better fut...,It is crucial that we invest in education at a...,[Education is the key to unlocking a better fu...,"[0, 0, 0, 1]","[0.47808946064127095, 0.45111889985171527, 0.4...","[0, 0, 0, 1]","[0.7876349, 0.5870048, 0.78181076, 0.79647756]","[0, 0, 0, 1]"
...,...,...,...,...,...,...,...,...,...
1241,1258,The environment plays a critical role in our l...,It is our responsibility to take care of the e...,[The environment plays a critical role in our ...,"[0, 0, 0, 1]","[0.4467579283397811, 0.6413572604770067, 0.556...","[0, 1, 0, 0]","[0.8151461, 0.4661282, 0.7739822, 0.8015023]","[1, 0, 0, 0]"
1242,1259,Family is an essential component of our lives....,"In such cases, it is important to communicate ...",[Family is an essential component of our lives...,"[0, 0, 0, 0, 1]","[0.4073544528009398, 0.42108696945190444, 0.57...","[0, 0, 1, 0, 0]","[0.75987756, 0.4105369, 0.7154193, 0.5800866, ...","[1, 0, 0, 0, 0]"


In [None]:
# accuracy of each method: fraction of rows where the position of the predicted
# sentence (first 1 in the prediction) equals the position of the gold key sentence
gold_position = df['is_key'].map(np.argmax)

for col in ('pred_by_TFIDF', 'pred_by_SBERT'):
    predicted_position = df[col].map(np.argmax)
    accuracy = (predicted_position == gold_position).mean()
    print(f'{col} accuracy:', accuracy)

pred_by_TFIDF accuracy: 0.40466613032984716
pred_by_SBERT accuracy: 0.15526950925181013


In [None]:
# persist the dataframe (sentences, labels, similarities and predictions) as a pickle
df.to_pickle('exp17.dfpkl')

In [None]:
# from google.colab import drive
# drive.mount('/gdrive')
# !cp exp17.dfpkl /gdrive/MyDrive/Shared/