<a href="https://colab.research.google.com/github/cfong32/key-sentence-extraction/blob/main/exp15_tfidf_chatgptDS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install -q datasets rouge_score

In [1]:
# import packages

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from datasets import load_dataset
from spacy.lang.en import English
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score.rouge_scorer import RougeScorer
from itertools import cycle
from textwrap import wrap
from IPython.display import HTML as html_print
# notebook-wide setup: show 4 rows when a long DataFrame display is truncated,
# and register tqdm's progress_* methods (e.g. progress_apply) on pandas objects
pd.options.display.min_rows = 4
tqdm.pandas()

# Calculate TF-IDF Cosine Similarity on Mohammad's ChatGPT-Generated Dataset

In [2]:
# load dataset into a dataframe

!wget -qO "Mohammad_dataset_result(3).json" https://uoguelphca-my.sharepoint.com/:u:/g/personal/mhavaled_uoguelph_ca/ET_4hgQABD5JhtiDtFGUG4gBASpIv6WURezHphMTlPCJqg?download=1
data = json.load(open('Mohammad_dataset_result(3).json'))
df = (pd.DataFrame(data)
      .rename(columns={'paragraphs': 'article'})
      )
df

Unnamed: 0,article,key_sentences
0,Climate change is one of the most pressing iss...,We need to act urgently to reduce our greenhou...
1,Education is the key to unlocking a better fut...,It is crucial that we invest in education at a...
...,...,...
1258,The environment plays a critical role in our l...,It is our responsibility to take care of the e...
1259,Family is an essential component of our lives....,"In such cases, it is important to communicate ..."


In [3]:
# Sentence segmentation.
# df['sentences'] receives, for every row, the list of sentence strings that
# the spaCy "sentencizer" component finds in that row's article.

spacy_eng_nlp = English()
spacy_eng_nlp.add_pipe("sentencizer")


def _split_into_sentences(row):
    """Return the sentences of ``row.article`` as a list of plain strings."""
    doc = spacy_eng_nlp(row.article)
    return [str(sentence) for sentence in doc.sents]


df['sentences'] = df.progress_apply(_split_into_sentences, axis=1)
df

100%|██████████| 1260/1260 [00:00<00:00, 2497.40it/s]


Unnamed: 0,article,key_sentences,sentences
0,Climate change is one of the most pressing iss...,We need to act urgently to reduce our greenhou...,[Climate change is one of the most pressing is...
1,Education is the key to unlocking a better fut...,It is crucial that we invest in education at a...,[Education is the key to unlocking a better fu...
...,...,...,...
1258,The environment plays a critical role in our l...,It is our responsibility to take care of the e...,[The environment plays a critical role in our ...
1259,Family is an essential component of our lives....,"In such cases, it is important to communicate ...",[Family is an essential component of our lives...


In [4]:
# label every sentence: 1 if it contains the annotated key sentence, else 0
df['is_key'] = df.progress_apply(
    lambda row: [int(row.key_sentences in sent) for sent in row.sentences],
    axis=1
)

has_single_key = df.is_key.apply(sum) == 1      # keep those paragraphs which have only one key sentence
is_multi_sentence = df.is_key.apply(len) > 1    # keep those paragraphs which consist of multiple sentences

# .copy() detaches the filtered rows from the unfiltered frame, so later cells
# can add columns to df without pandas raising SettingWithCopyWarning
df = df[has_single_key & is_multi_sentence].copy()
df

100%|██████████| 1260/1260 [00:00<00:00, 22948.06it/s]


Unnamed: 0,article,key_sentences,sentences,is_key
0,Climate change is one of the most pressing iss...,We need to act urgently to reduce our greenhou...,[Climate change is one of the most pressing is...,"[0, 0, 0, 1]"
1,Education is the key to unlocking a better fut...,It is crucial that we invest in education at a...,[Education is the key to unlocking a better fu...,"[0, 0, 0, 1]"
...,...,...,...,...
1258,The environment plays a critical role in our l...,It is our responsibility to take care of the e...,[The environment plays a critical role in our ...,"[0, 0, 0, 1]"
1259,Family is an essential component of our lives....,"In such cases, it is important to communicate ...",[Family is an essential component of our lives...,"[0, 0, 0, 0, 1]"


In [5]:
# TF-IDF (Term Frequency-Inverse Document Frequency) similarity.
# The vectorizer is fitted on the full articles; each sentence is then scored
# by its cosine similarity to the article it belongs to.
# Every entry of df['tfidf_cossim'] is an ndarray with one score per sentence.

articles = df.article.tolist()
tfidf = TfidfVectorizer().fit(articles)


def _sentence_article_similarity(row):
    """Return the cosine similarity of each sentence in ``row`` to ``row.article``."""
    article_vec = tfidf.transform([row.article])
    sentence_vecs = tfidf.transform(row.sentences)
    # cosine_similarity yields a (1, n_sentences) matrix; take its single row
    return cosine_similarity(article_vec, sentence_vecs)[0]


df['tfidf_cossim'] = df.progress_apply(_sentence_article_similarity, axis=1)
df

100%|██████████| 1243/1243 [00:03<00:00, 394.33it/s]


Unnamed: 0,article,key_sentences,sentences,is_key,tfidf_cossim
0,Climate change is one of the most pressing iss...,We need to act urgently to reduce our greenhou...,[Climate change is one of the most pressing is...,"[0, 0, 0, 1]","[0.3214220469953667, 0.6107510335598709, 0.474..."
1,Education is the key to unlocking a better fut...,It is crucial that we invest in education at a...,[Education is the key to unlocking a better fu...,"[0, 0, 0, 1]","[0.47808946064127095, 0.45111889985171527, 0.4..."
...,...,...,...,...,...
1258,The environment plays a critical role in our l...,It is our responsibility to take care of the e...,[The environment plays a critical role in our ...,"[0, 0, 0, 1]","[0.4467579283397811, 0.6413572604770067, 0.556..."
1259,Family is an essential component of our lives....,"In such cases, it is important to communicate ...",[Family is an essential component of our lives...,"[0, 0, 0, 0, 1]","[0.4073544528009398, 0.42108696945190444, 0.57..."


In [8]:
# rank sentences within each article
# predict the one with the highest tfidf_cossim

def cal_ranking(x):
    """Return the normalised rank of every element of ``x``.

    The smallest value maps to 0.0 and the largest to 1.0; the values in
    between are spaced evenly by rank (rank / (len(x) - 1)).

    Parameters
    ----------
    x : 1-D sequence or ndarray of scores.

    Returns
    -------
    ndarray of floats with the same length as ``x``.

    Notes
    -----
    A single-element input would otherwise evaluate 0 / 0 and produce NaN
    (with a RuntimeWarning). Its only element is trivially the highest-ranked
    one, so it is given rank 1.0. An empty input yields an empty array.
    """
    n = len(x)
    if n <= 1:
        # avoid the division by (n - 1) == 0 for a lone element
        return np.ones(n, dtype=float)
    # argsort of argsort converts the scores into 0-based ranks
    return np.argsort(np.argsort(x)) / (n - 1)

# normalised rank of every sentence within its own article
df['rank_by_tfidf'] = df['tfidf_cossim'].map(cal_ranking)


def _mark_top_scoring(scores):
    """Return an int array with 1 wherever ``scores`` equals its maximum, else 0."""
    return (scores == scores.max()).astype(int)


df['pred_by_tfidf'] = df['tfidf_cossim'].map(_mark_top_scoring)
df

Unnamed: 0,article,key_sentences,sentences,is_key,tfidf_cossim,rank_by_tfidf,pred_by_tfidf
0,Climate change is one of the most pressing iss...,We need to act urgently to reduce our greenhou...,[Climate change is one of the most pressing is...,"[0, 0, 0, 1]","[0.3214220469953667, 0.6107510335598709, 0.474...","[0.0, 1.0, 0.3333333333333333, 0.6666666666666...","[0, 1, 0, 0]"
1,Education is the key to unlocking a better fut...,It is crucial that we invest in education at a...,[Education is the key to unlocking a better fu...,"[0, 0, 0, 1]","[0.47808946064127095, 0.45111889985171527, 0.4...","[0.6666666666666666, 0.0, 0.3333333333333333, ...","[0, 0, 0, 1]"
...,...,...,...,...,...,...,...
1258,The environment plays a critical role in our l...,It is our responsibility to take care of the e...,[The environment plays a critical role in our ...,"[0, 0, 0, 1]","[0.4467579283397811, 0.6413572604770067, 0.556...","[0.0, 1.0, 0.6666666666666666, 0.3333333333333...","[0, 1, 0, 0]"
1259,Family is an essential component of our lives....,"In such cases, it is important to communicate ...",[Family is an essential component of our lives...,"[0, 0, 0, 0, 1]","[0.4073544528009398, 0.42108696945190444, 0.57...","[0.0, 0.25, 1.0, 0.75, 0.5]","[0, 0, 1, 0, 0]"


# Analyze Results

## Calculate accuracy

In [9]:
# a prediction is correct when the position of the predicted sentence matches
# the position of the labelled key sentence
predicted_idx = df['pred_by_tfidf'].map(np.argmax)
true_idx = df['is_key'].map(np.argmax)
accuracy = (predicted_idx == true_idx).mean()
print('accuracy:', accuracy)

accuracy: 0.40466613032984716
