In [1]:
# Uncomment if notebook is run in Colab
# %%capture
# !pip install datasets
# !pip install rouge-score

In [2]:
import datasets
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt
import numpy as np
import random
from datasets import load_dataset, load_metric
from IPython.display import display, HTML

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Module-level ROUGE metric object; rouge_2_recall below calls rouge.compute on it.
# NOTE(review): load_metric is deprecated in newer `datasets` releases (moved to the
# separate `evaluate` package) -- confirm against the installed version.
rouge = load_metric('rouge', seed=42)

# Helper functions

def show_random_elements(dataset, num_examples=3):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: object supporting len(), indexing with a list of row positions,
            and a `features` mapping of column name -> feature type
            (presumably a `datasets.Dataset` -- confirm against callers).
        num_examples: number of distinct rows to show; must not exceed len(dataset).

    Raises:
        AssertionError: if more rows are requested than the dataset contains.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so no redraw-on-collision loop is needed.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Replace integer label ids with their class names for readability.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

def rouge_2_recall(target_text_1, target_text_2):
    """Return the averaged ROUGE-2 recall between two collections of texts.

    `target_text_1` is passed to the metric as the references and `target_text_2`
    as the predictions (presumably lists of strings -- confirm against callers).
    The result is the arithmetic mean of the low, mid and high ROUGE-2 recall
    values reported by the module-level `rouge` metric, rounded to one decimal.
    """
    rouge2_scores = rouge.compute(predictions=target_text_2, references=target_text_1)['rouge2']
    recall_values = (
        rouge2_scores.low.recall,
        rouge2_scores.mid.recall,
        rouge2_scores.high.recall,
    )
    return round(sum(recall_values) / 3, 1)

def clean_string(string):
    """Normalise a text: drop every '*', lowercase it, and strip trailing whitespace.

    Args:
        string: the text to normalise.

    Returns:
        The normalised text. Leading whitespace is kept; only the right side is stripped.
    """
    # A literal replace avoids the invalid '\*' escape of the former non-raw regex pattern.
    return string.replace('*', '').lower().rstrip()

def remove_duplicate_sets_from_list(candidate_duplicates):
    """Collapse groups that contain the same members into a single group.

    Each item of `candidate_duplicates` is treated as a set: the order of its
    members and repeated members are ignored when comparing groups.

    Args:
        candidate_duplicates: iterable of iterables of hashable members.

    Returns:
        A list of lists, one per distinct group, in order of first appearance.
        The order of members inside each returned list is not specified.
    """
    # dict.fromkeys de-duplicates the frozensets while keeping first-seen order.
    unique_groups = dict.fromkeys(frozenset(item) for item in candidate_duplicates)
    return [list(group) for group in unique_groups]

## **Download Webis-TLDR-17**

- No train-val-test split is provided anywhere for Webis-TLDR-17
- We download Webis-TLDR-17 from Hugging Face datasets 
- the split='train' downloads the whole dataset

In [4]:
webis_tldr = load_dataset('reddit', split='train')

Found cached dataset reddit (C:/Users/Anna/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e)


In [5]:
webis_tldr

Dataset({
    features: ['author', 'body', 'normalizedBody', 'subreddit', 'subreddit_id', 'id', 'content', 'summary'],
    num_rows: 3848330
})

In [6]:
# 3 random examples from the Webis-TLDR-17 dataset:

show_random_elements(webis_tldr)

Unnamed: 0,author,body,normalizedBody,subreddit,subreddit_id,id,content,summary
0,[deleted],"Excuse me if this isn't the proper thread to post this kind of question, first of all. If that's the case, please point me in the right direction. \n\n**Question:**\n\nI'm looking to get into riding, I have minimal experience (I used to work at motorcycle shop and from time to time would have to ride bikes on our property or at our training course, auction, etc) \n\nAnyway, I'm 26, 6'2"" ~175lbs, and I'm interested in getting into riding, I have a preference of sportbikes. My question to you fine people is what's a good (used) bike that would be good for a beginner? I've heard people say go for it and start on a 600, others say start smaller, I'm very intrigued by the CBR500R, though I feel like that may be a little out of budget considering they're hard to find used. \n\n\nAny help would be appreciated, thanks!\n\nTLDR: What bike should a beginner get? \n","Excuse me if this isn't the proper thread to post this kind of question, first of all. If that's the case, please point me in the right direction. \n Question: \n I'm looking to get into riding, I have minimal experience (I used to work at motorcycle shop and from time to time would have to ride bikes on our property or at our training course, auction, etc) \n Anyway, I'm 26, 6'2"" ~175lbs, and I'm interested in getting into riding, I have a preference of sportbikes. My question to you fine people is what's a good (used) bike that would be good for a beginner? I've heard people say go for it and start on a 600, others say start smaller, I'm very intrigued by the CBR500R, though I feel like that may be a little out of budget considering they're hard to find used. \n Any help would be appreciated, thanks! \n TLDR: What bike should a beginner get? \n",motorcycles,t5_2qi6d,t3_3bl7xa,"Excuse me if this isn't the proper thread to post this kind of question, first of all. If that's the case, please point me in the right direction. 
\n Question: \n I'm looking to get into riding, I have minimal experience (I used to work at motorcycle shop and from time to time would have to ride bikes on our property or at our training course, auction, etc) \n Anyway, I'm 26, 6'2"" ~175lbs, and I'm interested in getting into riding, I have a preference of sportbikes. My question to you fine people is what's a good (used) bike that would be good for a beginner? I've heard people say go for it and start on a 600, others say start smaller, I'm very intrigued by the CBR500R, though I feel like that may be a little out of budget considering they're hard to find used. \n Any help would be appreciated, thanks!",What bike should a beginner get?
1,[deleted],"So, I've been getting into shape and rather passively participating in /r/GetMotivated. I've also been trying to get out and talk to women despite my anxiety and everything's going pretty well.\n\nToday, I had a bunch of shit to do, and 45 minutes before the post office closed, I decided to head out and photocopy some stuff that I had to mail out. I got everything together, went out to the garage, and my bike tire was flat. Tried to reinflate, no dice.\n\nSo what did I do, GetMotivated? I did what I've been doing on those stupid gym hamster wheel things, only outside. I ran my fucking ass to the library, walked over to the post office, and then ran home so I could get ready to go to work.\n\nI feel like a wizard. And you can too.\n\n**TL;DR:** Running is a mode of transportation, not just an exercise. Do it.","So, I've been getting into shape and rather passively participating in /r/GetMotivated. I've also been trying to get out and talk to women despite my anxiety and everything's going pretty well. \n Today, I had a bunch of shit to do, and 45 minutes before the post office closed, I decided to head out and photocopy some stuff that I had to mail out. I got everything together, went out to the garage, and my bike tire was flat. Tried to reinflate, no dice. \n So what did I do, GetMotivated? I did what I've been doing on those stupid gym hamster wheel things, only outside. I ran my fucking ass to the library, walked over to the post office, and then ran home so I could get ready to go to work. \n I feel like a wizard. And you can too. \n TL;DR: Running is a mode of transportation, not just an exercise. Do it. \n",GetMotivated,t5_2rmfx,t3_ycdpu,"So, I've been getting into shape and rather passively participating in /r/GetMotivated. I've also been trying to get out and talk to women despite my anxiety and everything's going pretty well. 
\n Today, I had a bunch of shit to do, and 45 minutes before the post office closed, I decided to head out and photocopy some stuff that I had to mail out. I got everything together, went out to the garage, and my bike tire was flat. Tried to reinflate, no dice. \n So what did I do, GetMotivated? I did what I've been doing on those stupid gym hamster wheel things, only outside. I ran my fucking ass to the library, walked over to the post office, and then ran home so I could get ready to go to work. \n I feel like a wizard. And you can too.","Running is a mode of transportation, not just an exercise. Do it."
2,Typhun,"I went with a friend to a party, it was her group of friends whom I had never met. I am pretty damned introverted, but I do enjoy going to parties. The place was jam packed with the usual loud music, drinking games, and all that fun stuff. However, I noticed things were a bit off rather early. Somebody was smoking something, and I knew it wasn't weed. I never found out what somebody was smoking, because other sights caught my attention. Sights like people doing lines off of a piano. Again, very introverted guy; I'd never seen anybody actually do cocaine before so I was quite shocked, but that wasn't the thing that was strangest for me that night. I was walking by a room, and I saw a door open and close. I saw a naked girl and a bunch of naked dudes all around her, she with a huge fucking grin on her face. The door closes. I opened it back up to confirm what I just saw. Yeeeep, it was a gangbang going on in the side room, and the girl looked like she was enjoying it. I didn't feel inclined to participate or intervene, because it looked like everybody was consenting adult. Then the loads of thugs started rolling in, so me and my friend got out of there. \n\nTLDR: Saw people doing lines of cocaine and a gangbang at a party.","I went with a friend to a party, it was her group of friends whom I had never met. I am pretty damned introverted, but I do enjoy going to parties. The place was jam packed with the usual loud music, drinking games, and all that fun stuff. However, I noticed things were a bit off rather early. Somebody was smoking something, and I knew it wasn't weed. I never found out what somebody was smoking, because other sights caught my attention. Sights like people doing lines off of a piano. Again, very introverted guy; I'd never seen anybody actually do cocaine before so I was quite shocked, but that wasn't the thing that was strangest for me that night. I was walking by a room, and I saw a door open and close. 
I saw a naked girl and a bunch of naked dudes all around her, she with a huge fucking grin on her face. The door closes. I opened it back up to confirm what I just saw. Yeeeep, it was a gangbang going on in the side room, and the girl looked like she was enjoying it. I didn't feel inclined to participate or intervene, because it looked like everybody was consenting adult. Then the loads of thugs started rolling in, so me and my friend got out of there. \n TLDR: Saw people doing lines of cocaine and a gangbang at a party. \n",AskReddit,t5_2qh1i,c2xme7k,"I went with a friend to a party, it was her group of friends whom I had never met. I am pretty damned introverted, but I do enjoy going to parties. The place was jam packed with the usual loud music, drinking games, and all that fun stuff. However, I noticed things were a bit off rather early. Somebody was smoking something, and I knew it wasn't weed. I never found out what somebody was smoking, because other sights caught my attention. Sights like people doing lines off of a piano. Again, very introverted guy; I'd never seen anybody actually do cocaine before so I was quite shocked, but that wasn't the thing that was strangest for me that night. I was walking by a room, and I saw a door open and close. I saw a naked girl and a bunch of naked dudes all around her, she with a huge fucking grin on her face. The door closes. I opened it back up to confirm what I just saw. Yeeeep, it was a gangbang going on in the side room, and the girl looked like she was enjoying it. I didn't feel inclined to participate or intervene, because it looked like everybody was consenting adult. Then the loads of thugs started rolling in, so me and my friend got out of there.",Saw people doing lines of cocaine and a gangbang at a party.


In [7]:
webis_tldr[8200]

{'author': 'ordig',
 'body': 'Jack Kerouac spent 7 years on the road. He wrote "On The Road" in 3 weeks on a single roll of paper. \n\nHe was not a professional writer. But, he had a story to tell. \n\nHe was not rationalizing his life decisions in writing.  He was not telling a story about himself. He was telling a story about the world his decisions led him to, who they him led to, and perhaps in so doing, why he was led to make them in the first place.\n\ntldr: Don\'t write a story about yourself. Its boring.',
 'normalizedBody': 'Jack Kerouac spent 7 years on the road. He wrote "On The Road" in 3 weeks on a single roll of paper. \n He was not a professional writer. But, he had a story to tell. \n He was not rationalizing his life decisions in writing.  He was not telling a story about himself. He was telling a story about the world his decisions led him to, who they him led to, and perhaps in so doing, why he was led to make them in the first place. \n tldr: Don\'t write a story ab

In [8]:
# Convert the loaded dataset to a pandas DataFrame; every later step works on this frame.
webis_tldr_df = webis_tldr.to_pandas()

# **Step 1:** inspect Webis-TLDR-17 for duplicates of the source texts ('content' column)

In [9]:
webis_tldr_df.iloc[20094] # Random element

author                                               Nightshade3312
body              If it has a whammy bar setup, or if the string...
normalizedBody    If it has a whammy bar setup, or if the string...
subreddit                                                    Guitar
subreddit_id                                               t5_2qi79
id                                                          c49hw3s
content           If it has a whammy bar setup, or if the string...
summary           if your strings are fed through the back of th...
Name: 20094, dtype: object

In [10]:
len(webis_tldr_df['content'].value_counts()) # Find number of unique values 

3807923

In [11]:
len(webis_tldr_df)

3848330

- The value 3,807,923 is smaller than the number of examples in the dataset (3,848,330).
- This indicates that there are duplicates, for the column 'content', in the dataset.
- 3,848,330-3,807,923=40,407 *exact* duplicates that should be removed

In [None]:
## Attention! Next cell can take hours to run! - You can skip to exact_duplicates_df = pd.read_csv("webis_tldr_exact_duplicates_df.csv", sep="\t")

In [16]:
# Find the indices of the webis_tldr_df rows whose 'content' exactly matches an earlier row
# and store them in the *exact_duplicates_texts_indices* variable

# Count every distinct 'content' value once; the result is reused for the filter below
# instead of scanning the whole column a second time.
content_counts = webis_tldr_df['content'].value_counts()

# Identify exact duplicates in the 'content' column
# 'exact_duplicates' is a Series containing the duplicated contents along with their counts
exact_duplicates = content_counts[content_counts > 1]

exact_duplicates_df = pd.DataFrame({'value': exact_duplicates.index, 'occurencies_count': exact_duplicates.values})

# Group the row indices of all duplicated texts in a single pass over the duplicated rows
# (rather than comparing the full column against each duplicated value in turn).
duplicated_contents = webis_tldr_df.loc[webis_tldr_df['content'].duplicated(keep=False), 'content']

indices_by_content = {}
for row_index, text in duplicated_contents.items():
    indices_by_content.setdefault(text, []).append(row_index)

# One list of row indices per duplicated text, in the same order as exact_duplicates_df
exact_duplicates_texts_indices_lists = [indices_by_content[text] for text in exact_duplicates_df['value']]

# for the *exact_duplicates_texts_indices* we keep all the elements that are
# duplicates of the first element in each list,
# each first element ("original" element) index is not stored in exact_duplicates_texts_indices since
# it itself is not a duplicate
exact_duplicates_texts_indices = [
    row_index
    for occurrence_indices in exact_duplicates_texts_indices_lists
    for row_index in occurrence_indices[1:]
]

# Every duplicated text contributes (occurrences - 1) indices.
assert len(exact_duplicates_texts_indices) == int(exact_duplicates.sum()) - len(exact_duplicates)

In [21]:
len(exact_duplicates_texts_indices) 

40407

In [17]:
exact_duplicates_df

Unnamed: 0,value,occurencies_count
0,What does,134
1,Be sure to explain in detail with line breaks.,104
2,Can someone,79
3,Are you looking for a fun server to play on?\n...,76
4,Thanks for the,66
...,...,...
30961,Hoping to get some advice here. My husband and...,2
30962,Hello! I want to stick an Ardunio in an airsof...,2
30963,About a week ago my girlfriend decided she wan...,2
30964,"Alright, loseit, here we are. I've been at thi...",2


In [19]:
exact_duplicates_df[5:15]

Unnamed: 0,value,occurencies_count
5,"I am looking for a Sub, but foremost, a friend...",56
6,when i walked in my bedroom home early from wo...,53
7,"I'm looking for a Sub, but foremost, a friend ...",52
8,Avoid that Kingston SSD. See [this thread]( as...,52
9,Upvote for the,47
10,Consider this a friendly warning/guideline:\nR...,46
11,Posted on /r/,45
12,what does,44
13,General Information' | So 2nd order so far wit...,44
14,Where's the,42


In [20]:
# Cache the duplicate-value table as a tab-separated file so the detection cell can be skipped.
# The DataFrame index is written as well (index=False is not passed), so the first column
# of the file is the unnamed index.
exact_duplicates_df.to_csv("webis_tldr_exact_duplicates_df.csv", sep="\t")

In [13]:
# Reload the cached duplicate-value table.
# - index_col=0 restores the index that to_csv wrote as the first column; without it the
#   frame gains a spurious 'Unnamed: 0' column.
# - keep_default_na=False keeps texts such as "NA" or "null" as strings instead of NaN.
# Only exact_duplicates_df is restored here; exact_duplicates_texts_indices is still
# produced by the duplicate-detection cell.
exact_duplicates_df = pd.read_csv(
    "webis_tldr_exact_duplicates_df.csv",
    sep="\t",
    index_col=0,
    keep_default_na=False,
)

# **Step 2:** inspect dataset for problematic source texts ('content' column)

In [16]:
# Find the indices of the 'documents' that are empty or not text (e.g., punctuation marks only)

# A regular expression that describes text: one or more characters from a-z / 1-9
# (case-insensitive) followed by one more character of any kind.
# NOTE(review): the class is 1-9, so the digit 0 on its own does not count as text -- confirm intended.
text_pattern = re.compile("([a-z1-9])+.", re.IGNORECASE)

# A single pass over the column; an empty string never matches the pattern, so empty
# documents are flagged by the same test. Non-string values are flagged as well.
# Assumes webis_tldr_df has the default 0..n-1 index, so labels equal row positions.
not_useful_texts_indices = [
    i
    for i, text in webis_tldr_df['content'].items()
    if not isinstance(text, str) or not text_pattern.search(text)
]

In [17]:
len(not_useful_texts_indices)

59

# **Step 3:** inspect dataset for problematic summaries ('summary' column)

In [19]:
# Prepare a dataframe to examine the 'summary' column values:
# 'original_index' keeps each row's index in webis_tldr_df, 'text' holds the raw summary.
webis_tldr_targets = webis_tldr_df['summary']
webis_tldr_targets_df = pd.DataFrame({'original_index': webis_tldr_targets.index, 'text': webis_tldr_targets.values})

In [20]:
# Normalise every summary with clean_string: remove the special character * (it appears
# often in the original 'summary' field but offers no practical value), lowercase the
# text, and strip trailing whitespace. The result is stored in the 'clean_text' column.
webis_tldr_targets_df['clean_text'] = webis_tldr_targets_df['text'].apply(clean_string)

In [21]:
webis_tldr_targets_df

Unnamed: 0,original_index,text,clean_text
0,0,Shifting seasonal time is no longer worth it.,shifting seasonal time is no longer worth it.
1,1,Personal opinions 'n shit.,personal opinions 'n shit.
2,2,insults and slack ass insight. \n Wall Street ...,insults and slack ass insight. \n wall street ...
3,3,"Yes, Joysticks in modern games have apparently...","yes, joysticks in modern games have apparently..."
4,4,Class only items dropped from high-lvl monsters.,class only items dropped from high-lvl monsters.
...,...,...,...
3848325,3848325,"hate my own feet, and don't know how to give a...","hate my own feet, and don't know how to give a..."
3848326,3848326,"want to win cash prize, need answer for radio ...","want to win cash prize, need answer for radio ..."
3848327,3848327,"want cash prize, need answer for radio contest...","want cash prize, need answer for radio contest..."
3848328,3848328,my xbox has died only a few days before launch...,my xbox has died only a few days before launch...


Next: Find the indices of the items that are not useful (not informative);
*   nonsensical tldrs (e.g., punctuation marks only),
*   tldrs that clearly are not a summary (e.g., "see title")

In [22]:
# Find the indices of the items that are not useful (not informative);
#   - nonsensical tldrs (e.g., punctuation marks only),
#   - tldrs that clearly are not a summary (e.g., "see title")

summaries = webis_tldr_df['summary']

# Find the indices of the TLDRs that are empty or not text (e.g., punctuation marks only)

# A regular expression that describes text: one or more characters from a-z / 1-9
# (case-insensitive) followed by one more character of any kind.
# NOTE(review): the class is 1-9, so the digit 0 on its own does not count as text -- confirm intended.
text_pattern = re.compile("([a-z1-9])+.", re.IGNORECASE)

# A single pass over the column; an empty string never matches the pattern, so empty
# summaries are flagged by the same test. Non-string values are flagged as well.
# Assumes webis_tldr_df has the default 0..n-1 index, so labels equal row positions.
not_useful_tldrs_indices = [
    i
    for i, summary in summaries.items()
    if not isinstance(summary, str) or not text_pattern.search(summary)
]

# Find the indices of the TLDRs that are not useful, e.g., "see title"

not_useful_tldrs = ['title', 'title.',
                    'see title', 'see title.',
                    'read title', 'read title.',
                    'at bottom', 'at bottom.',
                    'at the bottom', 'at the bottom.',
                    'at the end', 'at the end.',
                    'version:']

# NOTE(review): the comparison uses the raw summary, so it is case- and whitespace-sensitive
# (e.g. "See title" is not matched) -- confirm whether the cleaned text should be used instead.
not_useful_tldrs_indices.extend(
    i for i, summary in summaries.items() if summary in not_useful_tldrs
)

# Show every flagged summary for manual inspection.
for summary in summaries.loc[not_useful_tldrs_indices]:
    print(summary)

D
s
P
s
s
σάτιρα δεν χωράει όρια.
D
s
1
S
c
s
s
P
هي ^^^^^^^كطوع ^^^^^^^المشعلادن
οι απόψεις που σε ενοχλούν δεν έχουν να κάνουν με την λογική, αλλά με την επιβίωση της κοινωνίας και των ατόμων της. Οι περισσότεροι άνθρωποι που τις πιστεύουν και προσπαθούν να τις επιβάλλουν απλά μεγάλωσαν έτσι και πιθανότατα θα έκανες το ίδιο αν ήσουν στη θέση τους. Δεν είναι ανάγκη να το αποδεχθείς, αντίθετα πρέπει να το πολεμήσεις πολιτικά, αλλά καλό θα ήταν να δείξεις κατανόηση.
ಠ_ಠ
s
D
s
s
s
s
s
s
s
s
S
s
D
P
C
6
D
p
K
2
s
3
P
D
2
s
s
s
s
1
P
大山比我好。
k
s
s
D
P
p
ಠ_ಠ
s
ಠ_ಠ
مص بلدي بالز
ಠ_ಠ
D
p
P
s
P
3
s
s
3
3
S
Ως φιλελεύθερους εννοούμε κυρίως τους  ύστερους κλασσικούς φιλελεύθερους  κυρίως καπιταλιστές της ελεύθερης αγοράς (τύπου νεοφιλελεύθεροι) αλλά για να είμαστε ακριβείς φιλελεύθεροι είναι και οι υπόλοιποι σύγχρονοι οπαδοί του καπιταλισμού. Φιλελεύθεροι δεν είναι με αυτήν την στενή έννοια οι αστοί διαφωτιστές που προηγούνται του καπιταλισμού. Είναι μέν κλασσικοί φιλελεύθεροι με την ευρύτερη έννο

# **Step 4:** Aggregate all the indices that should be removed, found so far

In [None]:
len(exact_duplicates_texts_indices)

In [24]:
len(not_useful_texts_indices)

59

In [25]:
len(not_useful_tldrs_indices)

2497

In [None]:
# Aggregate all the indices that should be removed.
# The same row can be flagged by more than one step (e.g. a duplicated text whose summary
# is also not useful), so the three lists are merged through a set: each index appears
# once and len(indices_to_remove) equals the number of rows that will be dropped.
# exact_duplicates_texts_indices comes from the duplicate-detection cell; reloading the
# cached CSV alone does not define it.
indices_to_remove = sorted(
    set(exact_duplicates_texts_indices)
    | set(not_useful_texts_indices)
    | set(not_useful_tldrs_indices)
)

In [None]:
len(indices_to_remove)

# **Step 5:** Remove the indices & inspect the rest of the dataset for duplicates of summaries (column 'summary' -> 'clean_text')