In [1]:
!pip install -q transformers datasets rouge-score


# Install the large English spaCy model. (On Colab, the runtime needs to be restarted afterwards.)
!python -m spacy download en_core_web_lg


Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [2]:
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy
from tqdm.notebook import tqdm # displays a progress bar

import tensorflow_hub as hub # hosts pre-trained machine learning models
from tensorflow import keras
import tensorflow as tf
from IPython.display import Image
import matplotlib.pyplot as plt  # NOTE: duplicate of the matplotlib import above


# Load the large English spaCy pipeline; it is used below for sentence segmentation (`.sents`).
nlp = spacy.load('en_core_web_lg')

In [3]:
# Fetch the whole CNN/DailyMail test split in a single batch (batch_size=-1)
# and convert the result to NumPy containers.
cnn_ds = tfds.as_numpy(
    tfds.load(name='cnn_dailymail', split='test', batch_size=-1)
)

In [4]:
cnn_df = pd.DataFrame(cnn_ds)
# The text columns arrive as bytes; decode them to str before any text processing.
cnn_df.highlights = cnn_df.highlights.apply(lambda raw: raw.decode('utf-8'))
# Highlights are newline-separated. Join them with a space so that the last word
# of one highlight and the first word of the next do not fuse into a single token.
cnn_df["summary"] = cnn_df.highlights.apply(lambda text: " ".join(text.split("\n")))
cnn_df.article = cnn_df.article.apply(lambda raw: raw.decode('utf-8'))
# Number of sentences per article according to the spaCy sentence segmenter.
cnn_df["art_sents"] = cnn_df.article.apply(lambda text: sum(1 for _ in nlp(text).sents))
cnn_df

In [5]:
# Measure article length in whitespace-delimited words so the numbers match the
# "words" unit reported in the title (len(text) alone would count characters).
art_lengths = [len(text.split()) for text in cnn_df.article]
plt.hist(art_lengths);
plt.xlabel("Words per article")
plt.ylabel("Number of articles")
plt.title("Histogram of Article Lengths. Mean = " + str(round(sum(art_lengths)/ len(art_lengths),1)) + " words")

In [6]:
# Measure summary length in whitespace-delimited words so the numbers match the
# "words" unit reported in the title (len(text) alone would count characters).
sum_lengths = [len(text.split()) for text in cnn_df.summary]
plt.hist(sum_lengths);
plt.xlabel("Words per summary")
plt.ylabel("Number of summaries")
plt.title("Histogram of Summary Lengths. Mean = " + str(round(sum(sum_lengths)/ len(sum_lengths),1)) + " words")

In [7]:
# Distribution of the per-article sentence counts computed earlier (art_sents),
# with the mean rounded to the nearest whole sentence for the title.
sents_per_article = cnn_df.art_sents
mean_sents = round(sum(sents_per_article) / len(sents_per_article))
plt.hist(sents_per_article);
plt.title("Histogram of # Sentences per Article . Mean = " + str(mean_sents))

In [8]:
import os

In [9]:
# Materialise both splits as DataFrames (batch_size=-1 loads each split in one batch).
test_df = pd.DataFrame(
    tfds.as_numpy(tfds.load('cnn_dailymail', split='test', batch_size=-1))
)
train_df = pd.DataFrame(
    tfds.as_numpy(tfds.load('cnn_dailymail', split='train', batch_size=-1))
)

# The text columns hold bytes; decode each of them to str in both frames.
for _frame in (test_df, train_df):
    for _column in ("highlights", "article"):
        _frame[_column] = _frame[_column].apply(lambda raw: raw.decode('utf-8'))

test_df.shape, train_df.shape

In [10]:
import json

# Read the pre-exported splits. The context managers guarantee that each file
# handle is closed as soon as its JSON payload has been parsed.
with open("/kaggle/input/datatest/test.json", encoding="utf-8") as f1:
    dict1 = json.load(f1)
test_df = pd.DataFrame.from_dict(dict1)

# NOTE(review): this frame is bound to `train_df1`, while later cells read
# `train_df` — confirm which training frame is intended downstream.
with open("/kaggle/input/datatest/train.json", encoding="utf-8") as f2:
    dict2 = json.load(f2)
train_df1 = pd.DataFrame.from_dict(dict2)

In [11]:
# train_df = train_df1[:50000]

In [12]:
import os

# Create the output directory tree; exist_ok makes the cell safe to re-run.
for _out_dir in ("data/", "data/test", "data/train"):
    os.makedirs(_out_dir, exist_ok=True)

# test_df.to_json("data/test/test.json")
# train_df.to_json("data/train/train.json")

In [13]:
def get_dicts(df, folder="test", min_sent_length=14):
  """Split every article in ``df`` into sentences and index them.

  Args:
    df: DataFrame with ``article`` and ``highlights`` columns; its index
      values are used as document ids.
    folder: Unused; kept so existing callers that pass it keep working.
    min_sent_length: A sentence is kept only when ``len(sent)`` (the number
      of tokens in the spaCy span) is strictly greater than this value.

  Returns:
    A ``(doc_dict, sents_list)`` tuple where ``doc_dict`` maps each document
    id to ``{"article": ..., "highlight": ...}`` and ``sents_list`` is a list
    of ``{"sentid", "docid", "text"}`` records, with ``sentid`` a running
    counter over all kept sentences across documents.
  """
  doc_dict = {
      idx: {"article": df.article[idx], "highlight": df.highlights[idx]}
      for idx in df.index
  }

  sents_list = []
  for doc_id, entry in tqdm(doc_dict.items()):
    for sent in nlp(entry["article"]).sents:
      if len(sent) > min_sent_length:
        # The next sentence id equals the number of sentences kept so far.
        sents_list.append(
            {"sentid": len(sents_list), "docid": doc_id, "text": str(sent)}
        )

  return doc_dict, sents_list

# Build the document lookup and the flat sentence list for each split.
test_doc_dict, test_sents_list = get_dicts(df=test_df)
train_doc_dict, train_sents_list = get_dicts(df=train_df)

In [14]:
import json

In [15]:
folder_path = "data/train"
file_name = "train_doc_dict.json"
file_path = os.path.join(folder_path, file_name)
# Write to the path assembled above rather than repeating it as a literal,
# so changing folder_path/file_name actually changes the output location.
with open(file_path, 'w') as f:
    json.dump(train_doc_dict, f)

In [16]:
# Persist the flat list of training sentences as JSON.
train_sents_path = "data/train/train_sents_list.json"
with open(train_sents_path, 'w') as sents_file:
    json.dump(train_sents_list, sents_file)

In [17]:
# The doc dicts are plain dicts and the sentence tables are plain lists, so they
# have no DataFrame-style `.to_json` method; serialise them with json.dump.
_json_outputs = {
    "data/test/test_doc_dict.json": test_doc_dict,
    "data/test/test_sents_list.json": test_sents_list,
    "data/train/train_doc_dict.json": train_doc_dict,
    "data/train/train_sents_list.json": train_sents_list,
}
for _out_path, _payload in _json_outputs.items():
    with open(_out_path, 'w') as _out_file:
        json.dump(_payload, _out_file)

In [18]:
from rouge_score import rouge_scorer
# Module-level scorer shared by get_rougue_score: configured for ROUGE-1 and
# ROUGE-L with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

def get_rougue_score(text, highlights, metric="rougeL"):
  """Return the best F-measure of ``text`` against any of ``highlights``.

  Each highlight is scored against ``text`` with the module-level ``scorer``
  and the ``fmeasure`` of the requested ``metric`` is taken. The result is 0
  when ``highlights`` is empty or no score exceeds 0.
  """
  candidate_scores = [
      scorer.score(text, h_text)[metric].fmeasure for h_text in highlights
  ]
  # Seeding the candidates with 0 reproduces the "start at 0, keep strictly
  # larger scores" behaviour, including the 0 result for an empty input.
  return max([0] + candidate_scores)


def get_label(sent, doc_dict,  score_threshold = 0.55):
  """Label one sentence as summary-worthy (1) or not (0).

  Args:
    sent: Sentence record with at least ``"docid"`` and ``"text"`` keys.
    doc_dict: Mapping from document id to a dict holding ``"article"`` and
      a newline-separated ``"highlight"`` string.
    score_threshold: Minimum best ROUGE score (as returned by
      ``get_rougue_score``) for the sentence to be labelled 1.

  Returns:
    A ``(sentence, article, label)`` tuple where ``label`` is 0 or 1.

  Raises:
    KeyError: If the document id is found neither as-is nor as a string.
  """
  doc_id, sentence = sent["docid"], sent["text"]
  try:
    entry = doc_dict[doc_id]
  except KeyError:
    # JSON object keys are always strings, so a doc_dict reloaded from disk is
    # keyed by "123" even when the sentence records still carry the int 123.
    entry = doc_dict[str(doc_id)]

  highlights = entry["highlight"].split("\n")
  best_score = get_rougue_score(sentence, highlights)
  # Normalize label to 0/1 based on the ROUGE score threshold
  label = 0 if best_score < score_threshold else 1
  return (sentence, entry["article"], label)

def sub_sample(sents_batch, doc_dict, neg_multiplier=2, random_state=None):
  """Label a batch of sentences and down-sample the negatives.

  Args:
    sents_batch: Iterable of sentence records accepted by ``get_label``.
    doc_dict: Document lookup passed through to ``get_label``.
    neg_multiplier: Number of negatives to draw per positive sentence.
    random_state: Optional seed forwarded to ``DataFrame.sample`` so the
      negative sample can be reproduced. ``None`` keeps the sampling unseeded.

  Returns:
    A ``(pos_df, sub_neg_df)`` tuple of DataFrames with columns ``sents``,
    ``docs`` and ``y``: all positives, and the sampled negatives.
  """
  # get_label yields (sentence, article, label) tuples, which map directly
  # onto the three columns.
  sub_df = pd.DataFrame(
      [get_label(x, doc_dict) for x in sents_batch],
      columns=["sents", "docs", "y"],
  )
  pos_df = sub_df[sub_df.y == 1]
  neg_df = sub_df[sub_df.y == 0]

  print("Negative sample size:", len(neg_df))
  print("Positive sample size:", len(pos_df))

  # Sampling without replacement cannot return more rows than exist, so cap the
  # request at the number of available negatives instead of raising.
  n_neg = min(len(neg_df), len(pos_df) * neg_multiplier)
  sub_neg_df = neg_df.sample(n_neg, random_state=random_state)

  return pos_df, sub_neg_df

In [19]:
# Use a context manager so the file handle is closed once the JSON is parsed.
with open("/kaggle/input/dataset/train_doc_dict.json", encoding="utf-8") as f1:
    train_doc_dict = json.load(f1)
# train_doc_dict = pd.DataFrame.from_dict(dict1, orient = 'index')

# f2 = open("/kaggle/input/dataset/train_sents_list.json")
# dict2= json.load(f2)
# train_sents_list = pd.DataFrame(dict2)

In [20]:
# Use a context manager so the file handle is closed once the JSON is parsed.
with open("/kaggle/input/dataset/train_sents_list.json", encoding="utf-8") as f2:
    train_sents_list = json.load(f2)
# train_sents_list = pd.DataFrame(dict2)

In [21]:
# Label every training sentence and keep all positives plus a sample of negatives.
train_pos_df, train_sub_neg_df = sub_sample(
    sents_batch=train_sents_list,
    doc_dict=train_doc_dict,
)


Negative sample size: 1238943
Positive sample size: 39878


In [22]:
# test_pos_df, test_sub_neg_df = sub_sample(test_sents_list, test_doc_dict)
# test_bdf = test_pos_df.append(test_sub_neg_df)


In [23]:
# Display the training sentences that sub_sample labelled positive (y == 1).
train_pos_df

Unnamed: 0,sents,docs,y
3,The diocese announced on Monday that Bishop Jo...,"By. Associated Press. PUBLISHED:. 14:11 EST, 2...",1
4,The diocese says he contracted the infection t...,"By. Associated Press. PUBLISHED:. 14:11 EST, 2...",1
24,Police said Eccleston-Todd had drunk at least ...,A drunk driver who killed a young woman in a h...,1
25,He was found guilty of causing death by danger...,A drunk driver who killed a young woman in a h...,1
64,As such EU leaders must be ready to accept san...,(CNN) -- With a breezy sweep of his pen Presid...,1
...,...,...,...
1278539,Former Pakistan Prime Minister Benazir Bhutto ...,(CNN) -- A close aide to Pakistan's Taliban ch...,1
1278548,"The charity's latest project, Bright Pink Lips...",Krystal Barter created the Pink Hope charity i...,1
1278787,They were arrested after a devastated Alan and...,A Florida man and his girlfriend allegedly bro...,1
1278789,According to WPTV the pair admitted to killing...,A Florida man and his girlfriend allegedly bro...,1


In [24]:
# Display the sampled negative training sentences (y == 0) returned by sub_sample.
train_sub_neg_df

Unnamed: 0,sents,docs,y
557497,Getting water to every person on the planet wi...,London (CNN) -- When one looks back at humanit...,0
135480,TV: Shazam's biggest small screen venture to d...,If you've ever fallen in love with a piece of ...,0
399446,BJP spokesman Mukhtar Abbas Naqvi said: 'He [P...,"By. Piyush Srivastava. Updated:. 16:41 EST, 22...",0
413273,‘The prosecution accept Anne Pollen had intend...,By. Chris Pleasance. Ann Pollen has admitted k...,0
1275232,"All the girls must be well-behaved, and they p...",A cropped version of the controversial image. ...,0
...,...,...,...
836793,"Major Stallard launched Operation Olympian, so...",The story of the greatest escape of World War...,0
231387,"""These online activities make them more mature...","Beijing, China (CNN) -- Lazy, promiscuous, con...",0
1081154,"However, it said mayor Guo Jinlong had stepped...",Hong Kong (CNN) -- More rain was forecast for ...,0
522113,Mrs Justice Pauffley concluded: ‘The details w...,By. Sam Greenhill. High Court judge Mrs Justic...,0


In [25]:
# Stack the positive rows and the sampled negative rows into one training frame.
train_bdf = pd.concat(objs=[train_pos_df, train_sub_neg_df])

In [26]:
# test_bdf

In [27]:
# Put the rows back into ascending index order, interleaving positives and negatives.
train_bdf = train_bdf.sort_index(ascending=True)

In [28]:
# Display the combined positive + sampled-negative training frame after sorting by index.
train_bdf

Unnamed: 0,sents,docs,y
3,The diocese announced on Monday that Bishop Jo...,"By. Associated Press. PUBLISHED:. 14:11 EST, 2...",1
4,The diocese says he contracted the infection t...,"By. Associated Press. PUBLISHED:. 14:11 EST, 2...",1
9,A criminal complaint unsealed in U.S. District...,(CNN) -- Ralph Mata was an internal affairs li...,0
14,"Mata, according to the complaint, then used co...",(CNN) -- Ralph Mata was an internal affairs li...,0
24,Police said Eccleston-Todd had drunk at least ...,A drunk driver who killed a young woman in a h...,1
...,...,...,...
1278789,According to WPTV the pair admitted to killing...,A Florida man and his girlfriend allegedly bro...,1
1278794,Charged: Adams and Curran are believed to have...,A Florida man and his girlfriend allegedly bro...,0
1278807,The 'star' of the show: Executive producer Mar...,Chicago is no stranger to corruption so perhap...,0
1278815,Producers Levin and Benjamin are both represen...,Chicago is no stranger to corruption so perhap...,1


In [29]:
# test_bdf.to_json("data/test_bdf.json")

In [30]:
# Persist the labelled training frame as JSON.
train_bdf.to_json(path_or_buf="data/train_bdf.json")
