<a href="https://colab.research.google.com/github/cfong32/key-sentence-extraction/blob/main/exp14_final_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [86]:
# install necessary programs 
!pip install -q datasets rouge_score
!pip install tqdm
!pip install -q datasets rouge_score sentence-transformers
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.4-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 KB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.4


In [87]:
# import packages
import os
from functools import partial
from itertools import cycle
from textwrap import wrap

import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from IPython.display import HTML as html_print
from rouge_score.rouge_scorer import RougeScorer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from spacy.lang.en import English
from tqdm import tqdm

In [3]:
# Notebook-wide display options and constants.
# Show only 4 rows when pandas truncates a long DataFrame repr.
pd.set_option('display.min_rows', 4)
# Register `progress_apply` on pandas objects. This must run before `tqdm`
# is rebound below, because the partial object has no `.pandas()` method.
tqdm.pandas()
# Rebind `tqdm` so every bar created through this name uses position=0, leave=True.
tqdm = partial(tqdm, position=0, leave=True)

# List of K values: how many sentences to show.
Ks = [1, 5, 10, 20, 40, 60, 80, 100]


In [59]:
def tokenizer(df, column="paragraphs"):
  """Split every article in ``df[column]`` into a list of sentence strings.

  Requires ``tqdm.pandas()`` to have been called so that ``progress_apply``
  is available on pandas objects.

  Args:
    df: DataFrame with one article per row. It is modified in place.
    column: Name of the text column to split. Defaults to "paragraphs",
      which is the column the original implementation always read.

  Returns:
    The same ``df`` object with a "sentences" column added (or overwritten);
    each entry is a list of ``str``, one per sentence found by spaCy's
    sentencizer.
  """
  # Blank English pipeline with only the sentencizer component added.
  spacy_eng_nlp = English()
  spacy_eng_nlp.add_pipe("sentencizer")

  # Apply over the text Series rather than row-wise over the frame: only one
  # column is needed, and an empty frame then yields an empty column instead
  # of the empty DataFrame that a row-wise apply would return.
  df['sentences'] = df[column].progress_apply(
    lambda text: [str(s) for s in spacy_eng_nlp(text).sents]
  )
  return df

In [66]:
def TFIDF_cossim_s2a(df):
  """Score each sentence against its own article with TF-IDF cosine similarity.

  A single TF-IDF vocabulary is fitted on all articles in ``df.paragraphs``.
  For every row, the article and its sentences (``df.sentences``) are then
  projected into that space and compared.

  Returns the same ``df`` with a new 'tfidf_cossim' column; each entry is an
  ndarray holding one similarity value per sentence of that row.
  """
  vectorizer = TfidfVectorizer().fit(df.paragraphs.tolist())

  def sentence_scores(row):
    # Article as a 1-row matrix, sentences as an n-row matrix.
    article_vec = vectorizer.transform([row.paragraphs])
    sentence_vecs = vectorizer.transform(row.sentences)
    # cosine_similarity returns shape (1, n); take the single row.
    return cosine_similarity(article_vec, sentence_vecs)[0]

  df['tfidf_cossim'] = df.progress_apply(sentence_scores, axis=1)
  return df

In [76]:
def BERT_cossim_s2a(df, model_name='all-MiniLM-L6-v2'):
  """Score each sentence against its own article with SBERT cosine similarity.

  Requires ``tqdm.pandas()`` to have been called so that ``progress_apply``
  is available, and expects ``df`` to have 'sentences', 'paragraphs' and
  'summaries' columns.

  Args:
    df: DataFrame with one article per row. It is modified in place.
    model_name: Name of the SentenceTransformer model to load. Defaults to
      'all-MiniLM-L6-v2', the model the original implementation hard-coded.

  Returns:
    The same ``df`` with two new columns:
      * 'sbert_embeddings': per row, the result of encoding the sentences
        followed by the full article and the summary, so the article is at
        index -2 and the summary at index -1.
      * 'SBERT_s2a_sim': per row, an ndarray with one article-vs-sentence
        cosine similarity per sentence.
  """
  # Load the sentence-embedding model once for the whole frame.
  sbert = SentenceTransformer(model_name)

  # One encode call per row: [sentence_0, ..., sentence_n, article, summary].
  df['sbert_embeddings'] = df.progress_apply(
    lambda x: sbert.encode(x.sentences + [x.paragraphs, x.summaries]),
    axis=1
  )

  df['SBERT_s2a_sim'] = df.progress_apply(
    lambda x: (
        cosine_similarity(
            x.sbert_embeddings[[-2]],   # article embedding; list index keeps it 2-D
            x.sbert_embeddings[:-2]     # sentence embeddings (article and summary excluded)
        )[0]
    ),
    axis=1
  )
  return df

In [None]:
# Read the OpenAI API key from the environment rather than hard-coding it in
# the notebook. Set OPENAI_API_KEY before running this cell; if it is unset the
# key is None and the first API call fails with an authentication error.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Embedding model to query. The previous value, "text-davinci-002", is a
# text-completion model and cannot be used with the embeddings endpoint.
model_engine = "text-embedding-ada-002"

def get_embeddings(sentences):
    """Embed text with the OpenAI embeddings endpoint (openai<1.0 API).

    Args:
        sentences: A string, or a list of strings, to embed.

    Returns:
        A NumPy array with one row per input text, in input order; each row
        is the embedding vector returned for that text.

    Raises:
        openai.error.OpenAIError: If the request fails (e.g. missing or
            invalid API key, rate limit, network error).
    """
    # `openai.Embedding.create` is the embeddings call in the pre-1.0 client;
    # the `openai.Model(...).embeddings(...)` form used before does not exist.
    response = openai.Embedding.create(input=sentences, model=model_engine)

    # Each record in response['data'] carries the vector under 'embedding' and
    # its input position under 'index'; sort so rows line up with the inputs.
    records = sorted(response['data'], key=lambda item: item['index'])

    return np.array([item['embedding'] for item in records])

In [12]:
# Fetch the test split of CNN/DailyMail (config '3.0.0') as a Hugging Face
# dataset object.
cnn_dailymail_dataset = load_dataset(
    path='cnn_dailymail',
    name='3.0.0',
    split='test',
)
# Alternative datasets (currently disabled):
#SAMsum_dataset = load_dataset('samsum', split = 'test')
#reddit_dataset = load_dataset('reddit', split = 'test')
#aeslc_dataset = load_dataset('aeslc', split = 'test')

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading metadata: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


In [62]:
# Build a two-column frame (article text, reference summary) from the dataset
# and add the per-article sentence lists in one step.
cnn_dailymail_df = tokenizer(
    pd.DataFrame({
        "paragraphs": cnn_dailymail_dataset['article'],
        "summaries": cnn_dailymail_dataset['highlights'],
    })
)
cnn_dailymail_df

100%|██████████| 11490/11490 [00:45<00:00, 253.07it/s]


Unnamed: 0,paragraphs,summaries,sentences
0,(CNN)The Palestinian Authority officially beca...,Membership gives the ICC jurisdiction over all...,[(CNN)The Palestinian Authority officially bec...
1,(CNN)Never mind cats having nine lives. A stra...,"Theia, a bully breed mix, was apparently hit b...","[(CNN)Never mind cats having nine lives., A st..."
...,...,...,...
11488,"Despite the hype surrounding its first watch, ...",Apple sold more than 61 million iPhones in the...,"[Despite the hype surrounding its first watch,..."
11489,Angus Hawley's brother has spoken of his shock...,Angus Hawley's brother said his late sibling '...,[Angus Hawley's brother has spoken of his shoc...


In [67]:
# Add the per-sentence TF-IDF cosine similarities ('tfidf_cossim' column),
# then display the resulting frame.
cnn_dailymail_df = TFIDF_cossim_s2a(cnn_dailymail_df)
cnn_dailymail_df

100%|██████████| 11490/11490 [01:15<00:00, 151.41it/s]


Unnamed: 0,paragraphs,summaries,sentences,tfidf_cossim
0,(CNN)The Palestinian Authority officially beca...,Membership gives the ICC jurisdiction over all...,[(CNN)The Palestinian Authority officially bec...,"[0.394838255251771, 0.2184224416188325, 0.5160..."
1,(CNN)Never mind cats having nine lives. A stra...,"Theia, a bully breed mix, was apparently hit b...","[(CNN)Never mind cats having nine lives., A st...","[0.0910601549035593, 0.29696905043672805, 0.43..."
...,...,...,...,...
11488,"Despite the hype surrounding its first watch, ...",Apple sold more than 61 million iPhones in the...,"[Despite the hype surrounding its first watch,...","[0.3847239488347531, 0.20526128662452667, 0.59..."
11489,Angus Hawley's brother has spoken of his shock...,Angus Hawley's brother said his late sibling '...,[Angus Hawley's brother has spoken of his shoc...,"[0.6310895042459086, 0.3007128522327899, 0.187..."


In [85]:
# Add the SBERT embeddings and sentence-to-article similarities
# ('sbert_embeddings' and 'SBERT_s2a_sim' columns), then display the frame.
# NOTE(review): the recorded run of this cell was interrupted (KeyboardInterrupt)
# at about 17% after roughly 2.5 hours, so the saved output does not show these
# columns. Presumably it needs a GPU runtime or a smaller sample; confirm.
cnn_dailymail_df = BERT_cossim_s2a(cnn_dailymail_df)
cnn_dailymail_df

 17%|█▋        | 1915/11490 [2:29:19<12:26:39,  4.68s/it]


KeyboardInterrupt: ignored