In [1]:
# Uncomment if notebook is run in Colab
# %%capture
# !pip install datasets
# !pip install rouge-score

In [2]:
import datasets
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt
import numpy as np
import random
from datasets import load_dataset, load_metric
from IPython.display import display, HTML

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Module-level ROUGE metric object (created with seed=42); rouge_2_recall() below calls rouge.compute().
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in favour of the
# separate `evaluate` package -- confirm the pinned `datasets` version still provides it.
rouge = load_metric('rouge', seed=42)

# Helper functions

def show_random_elements(dataset, num_examples=3):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: Object supporting `len()`, indexing with a list of integer
            positions, and a `.features` mapping of column name -> feature type
            (presumably a `datasets.Dataset` -- confirm against callers).
        num_examples: How many distinct rows to show (default 3).

    Raises:
        AssertionError: If `num_examples` exceeds the number of rows in `dataset`.
    """
    dataset_size = len(dataset)
    assert num_examples <= dataset_size, "Can't pick more elements than there are in the dataset."

    # Rejection sampling: keep drawing positions until enough distinct ones are collected.
    chosen_rows = []
    while len(chosen_rows) < num_examples:
        candidate = random.randint(0, dataset_size - 1)
        if candidate not in chosen_rows:
            chosen_rows.append(candidate)

    table = pd.DataFrame(dataset[chosen_rows])

    # Show class-label columns by their label names instead of integer ids.
    for column, feature in dataset.features.items():
        if not isinstance(feature, datasets.ClassLabel):
            continue
        table[column] = table[column].transform(lambda label_id: feature.names[label_id])

    display(HTML(table.to_html()))

def rouge_2_recall(target_text_1, target_text_2):
    """Return an averaged ROUGE-2 recall between two collections of texts.

    `target_text_1` is passed to the module-level `rouge` metric as the references
    and `target_text_2` as the predictions. The result is the mean of the `low`,
    `mid` and `high` recall values of the 'rouge2' entry, rounded to one decimal.
    """
    rouge2_scores = rouge.compute(predictions=target_text_2, references=target_text_1)['rouge2']

    # Same left-to-right summation order as before, so the float result is unchanged.
    recall_total = rouge2_scores.low.recall + rouge2_scores.mid.recall + rouge2_scores.high.recall

    return round(recall_total / 3, 1)

def clean_string(string):
    """Normalise a text for comparison.

    Removes every '*' character (markdown emphasis markers), lowercases the text
    and strips trailing whitespace. Leading whitespace is left untouched.

    Args:
        string: The text to normalise.

    Returns:
        The normalised text.
    """
    # A plain str.replace avoids the former non-raw '\*' regex pattern, which is an
    # invalid escape sequence in a normal string literal (SyntaxWarning on Python 3.12+).
    return string.replace('*', '').lower().rstrip()

def remove_duplicate_sets_from_list(candidate_duplicates):
    """Collapse groups that contain the same members into a single group.

    Each item of `candidate_duplicates` is treated as a set: the order of its
    elements and any repeated elements are ignored. Groups that are equal as sets
    are returned only once.

    Args:
        candidate_duplicates: Iterable of iterables of hashable elements.

    Returns:
        A list of lists, one per distinct set, in order of first appearance in
        `candidate_duplicates`. The order of elements inside each list is not
        guaranteed.
    """
    # dict.fromkeys de-duplicates the frozensets in one pass while keeping
    # first-seen order, which makes the output order reproducible.
    unique_groups = dict.fromkeys(frozenset(item) for item in candidate_duplicates)
    return [list(group) for group in unique_groups]

## **Download Webis-TLDR-17**

- No train-val-test split is provided anywhere for Webis-TLDR-17
- We download Webis-TLDR-17 from Hugging Face datasets 
- Passing `split='train'` loads the whole dataset, because it is published as a single split

In [5]:
# Load the Hugging Face 'reddit' dataset (Webis-TLDR-17); split='train' is the entire corpus.
webis_tldr = load_dataset('reddit', split='train')

Using the latest cached version of the module from C:\Users\Anna\.cache\huggingface\modules\datasets_modules\datasets\reddit\bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e (last modified on Sun Nov 19 23:04:39 2023) since it couldn't be found locally at reddit., or remotely on the Hugging Face Hub.
Found cached dataset reddit (C:/Users/Anna/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e)


In [6]:
# Show the dataset summary (column names and number of rows).
webis_tldr

Dataset({
    features: ['author', 'body', 'normalizedBody', 'subreddit', 'subreddit_id', 'id', 'content', 'summary'],
    num_rows: 3848330
})

In [11]:
# Show 3 random examples (the default num_examples) from the full Webis-TLDR-17 dataset,
# i.e. drawn from all subreddits:

show_random_elements(webis_tldr)

Unnamed: 0,author,body,normalizedBody,subreddit,subreddit_id,id,content,summary
0,TheFurien,I worked as a security guard for a transportation company for 3 years before I got my promotion. I was attending college at the time and really hated doing security work. But then I was offered a postion as a dispatcher and slowly climbed the ranks of the company and now I'm a north bound planner and driver manager. I absolutely love my job. \nTL;DR: It's up to you to accept what you think you'll love to do. You'll be in the field for the rest of your life so make sure it counts. :),I worked as a security guard for a transportation company for 3 years before I got my promotion. I was attending college at the time and really hated doing security work. But then I was offered a postion as a dispatcher and slowly climbed the ranks of the company and now I'm a north bound planner and driver manager. I absolutely love my job. \nTL;DR: It's up to you to accept what you think you'll love to do. You'll be in the field for the rest of your life so make sure it counts. :) \n,AskReddit,t5_2qh1i,c7yf11r,I worked as a security guard for a transportation company for 3 years before I got my promotion. I was attending college at the time and really hated doing security work. But then I was offered a postion as a dispatcher and slowly climbed the ranks of the company and now I'm a north bound planner and driver manager. I absolutely love my job.,It's up to you to accept what you think you'll love to do. You'll be in the field for the rest of your life so make sure it counts. :)
1,Pirates_Smile,"**REALITY101** I honestly hope he can find his voice. His art is all over the place and not to be blunt, but, it doesn't say ANYTHING to me. Especially his photography. It honestly comes off as ""stock"" image photography. I agree with so many below me who have pointed out he needs to get a steady, paying job to support his art, not vice-versa.\n\n I also understand the angst that comes from an art student who has sold the farm, so to speak, to get into most of these art programs around the country. I honestly think MOST (really mean **ALL**) Art schools are some of the biggest impending financial scams being perpetrated on a HUGE demographic of seriously deluded people who have but a modicum of actual artistic talent or voice. And that's a shame. Brooks Institute, Savannah College of Art and Design, Art Institute of___________ are all predatory programs that are using people like your brother to pay for THEIR Beamers. I know this for a fact because I had a full-ride to Savannah's College of Art and Design and after my junior year I dropped out after a great discussion with one of my professors. Haven't looked back since.\n\nMy advice: Get a steady, bill-paying job doing something that doesn't kill him or his will to create, and keep working on his painting voice in his spare time. He's getting so much better and almost there.\n\n**TL;DR** Get a day job to keep food on the table and Netflicks going, Show and sell your work on the weekends you HIPPIE. Good luck.","REALITY101 I honestly hope he can find his voice. His art is all over the place and not to be blunt, but, it doesn't say ANYTHING to me. Especially his photography. It honestly comes off as ""stock"" image photography. I agree with so many below me who have pointed out he needs to get a steady, paying job to support his art, not vice-versa. 
\n I also understand the angst that comes from an art student who has sold the farm, so to speak, to get into most of these art programs around the country. I honestly think MOST (really mean ALL ) Art schools are some of the biggest impending financial scams being perpetrated on a HUGE demographic of seriously deluded people who have but a modicum of actual artistic talent or voice. And that's a shame. Brooks Institute, Savannah College of Art and Design, Art Institute of ___ are all predatory programs that are using people like your brother to pay for THEIR Beamers. I know this for a fact because I had a full-ride to Savannah's College of Art and Design and after my junior year I dropped out after a great discussion with one of my professors. Haven't looked back since. \n My advice: Get a steady, bill-paying job doing something that doesn't kill him or his will to create, and keep working on his painting voice in his spare time. He's getting so much better and almost there. \n TL;DR Get a day job to keep food on the table and Netflicks going, Show and sell your work on the weekends you HIPPIE. Good luck. \n",pics,t5_2qh0u,cac5d7s,"REALITY101 I honestly hope he can find his voice. His art is all over the place and not to be blunt, but, it doesn't say ANYTHING to me. Especially his photography. It honestly comes off as ""stock"" image photography. I agree with so many below me who have pointed out he needs to get a steady, paying job to support his art, not vice-versa. \n I also understand the angst that comes from an art student who has sold the farm, so to speak, to get into most of these art programs around the country. I honestly think MOST (really mean ALL ) Art schools are some of the biggest impending financial scams being perpetrated on a HUGE demographic of seriously deluded people who have but a modicum of actual artistic talent or voice. And that's a shame. 
Brooks Institute, Savannah College of Art and Design, Art Institute of ___ are all predatory programs that are using people like your brother to pay for THEIR Beamers. I know this for a fact because I had a full-ride to Savannah's College of Art and Design and after my junior year I dropped out after a great discussion with one of my professors. Haven't looked back since. \n My advice: Get a steady, bill-paying job doing something that doesn't kill him or his will to create, and keep working on his painting voice in his spare time. He's getting so much better and almost there.","Get a day job to keep food on the table and Netflicks going, Show and sell your work on the weekends you HIPPIE. Good luck."
2,puromyc1n,"I agree. But I dont see it happening, at least not any time soon.\n\nThe problem is twofold:\n\n1) The scale of nerfs required would unbalance the game so badly it would go back to nearly beta state\n\n2) That scale of change to the meta would wreak absolute havok on the SPL and professional gameplay.\n\n\n**The current burst style of this game creates problems further than that. The reason there ""arent enough items"" isnt completely because there arent enough items. Its because all we look for is damage or damage mitigation for as little gold as possible.** \n\nItems are barely bought for passives alone, and the items that are bought for passives is because the passive only increases the actual item attributes (e.g. tahuti=power, sov=protection).\n\nWhen do you see someone who knows what theyre doing buy Pythagorems piece or celestial helm or soul eater or hide of nimean or ichaival etc etc etc? Never.\n\nSame problem with actives, which are also unbalanced. Right now beads is a must have for almost everyone regardless of mobility, it is hands down better than every active except maybe hog 3. \n\n**TL;DR: The Burst issue is also the direct reason behind the ""cookie cutter build or gg"" problem, the lack of items/garbage items problem as well. In an effort to become faster paced and more action packed, the game has lost its depth and inadvertently simplified itself into an arms race of highest damage + best cc=win.**","I agree. But I dont see it happening, at least not any time soon. \n The problem is twofold: \n 1) The scale of nerfs required would unbalance the game so badly it would go back to nearly beta state \n 2) That scale of change to the meta would wreak absolute havok on the SPL and professional gameplay. \n The current burst style of this game creates problems further than that. The reason there ""arent enough items"" isnt completely because there arent enough items. 
Its because all we look for is damage or damage mitigation for as little gold as possible. \n Items are barely bought for passives alone, and the items that are bought for passives is because the passive only increases the actual item attributes (e.g. tahuti=power, sov=protection). \n When do you see someone who knows what theyre doing buy Pythagorems piece or celestial helm or soul eater or hide of nimean or ichaival etc etc etc? Never. \n Same problem with actives, which are also unbalanced. Right now beads is a must have for almost everyone regardless of mobility, it is hands down better than every active except maybe hog 3. \n TL;DR: The Burst issue is also the direct reason behind the ""cookie cutter build or gg"" problem, the lack of items/garbage items problem as well. In an effort to become faster paced and more action packed, the game has lost its depth and inadvertently simplified itself into an arms race of highest damage + best cc=win. \n",Smite,t5_2stl8,cltjh6q,"I agree. But I dont see it happening, at least not any time soon. \n The problem is twofold: \n 1) The scale of nerfs required would unbalance the game so badly it would go back to nearly beta state \n 2) That scale of change to the meta would wreak absolute havok on the SPL and professional gameplay. \n The current burst style of this game creates problems further than that. The reason there ""arent enough items"" isnt completely because there arent enough items. Its because all we look for is damage or damage mitigation for as little gold as possible. \n Items are barely bought for passives alone, and the items that are bought for passives is because the passive only increases the actual item attributes (e.g. tahuti=power, sov=protection). \n When do you see someone who knows what theyre doing buy Pythagorems piece or celestial helm or soul eater or hide of nimean or ichaival etc etc etc? Never. \n Same problem with actives, which are also unbalanced. 
Right now beads is a must have for almost everyone regardless of mobility, it is hands down better than every active except maybe hog 3.","The Burst issue is also the direct reason behind the ""cookie cutter build or gg"" problem, the lack of items/garbage items problem as well. In an effort to become faster paced and more action packed, the game has lost its depth and inadvertently simplified itself into an arms race of highest damage + best cc=win."


In [9]:
# Inspect one hard-coded example (position 8200) with all of its fields.
webis_tldr[8200]

{'author': 'ordig',
 'body': 'Jack Kerouac spent 7 years on the road. He wrote "On The Road" in 3 weeks on a single roll of paper. \n\nHe was not a professional writer. But, he had a story to tell. \n\nHe was not rationalizing his life decisions in writing.  He was not telling a story about himself. He was telling a story about the world his decisions led him to, who they him led to, and perhaps in so doing, why he was led to make them in the first place.\n\ntldr: Don\'t write a story about yourself. Its boring.',
 'normalizedBody': 'Jack Kerouac spent 7 years on the road. He wrote "On The Road" in 3 weeks on a single roll of paper. \n He was not a professional writer. But, he had a story to tell. \n He was not rationalizing his life decisions in writing.  He was not telling a story about himself. He was telling a story about the world his decisions led him to, who they him led to, and perhaps in so doing, why he was led to make them in the first place. \n tldr: Don\'t write a story ab

In [8]:
# Convert the whole dataset to a pandas DataFrame; the steps below operate on this frame.
webis_tldr_df = webis_tldr.to_pandas()

# **Step 1:** inspect Webis-TLDR-17 for duplicates of the source texts ('content' column)

In [12]:
webis_tldr_df.iloc[20094]  # Arbitrary hard-coded row (not randomly drawn), shown as a spot check

author                                               Nightshade3312
body              If it has a whammy bar setup, or if the string...
normalizedBody    If it has a whammy bar setup, or if the string...
subreddit                                                    Guitar
subreddit_id                                               t5_2qi79
id                                                          c49hw3s
content           If it has a whammy bar setup, or if the string...
summary           if your strings are fed through the back of th...
Name: 20094, dtype: object

In [14]:
# Number of distinct source texts in the 'content' column
webis_tldr_df['content'].nunique()

3807923

In [15]:
# Total number of rows, for comparison with the number of distinct 'content' values
len(webis_tldr_df)

3848330

- The value 3,807,923 is smaller than the number of examples in the dataset (3,848,330).
- This indicates that there are duplicates, for the column 'content', in the dataset.
- 3,848,330-3,807,923=40,407 *exact* duplicates that should be removed

In [16]:
# Find the row indices of webis_tldr_df whose 'content' exactly matches another row
# and store the redundant ones in the *exact_duplicates_texts_indices* variable.

# Count each distinct 'content' value once and reuse the result; value_counts()
# over the full frame is expensive, so it should not be evaluated twice.
content_counts = webis_tldr_df['content'].value_counts()

# 'exact_duplicates' is a Series holding every 'content' value that occurs more than
# once, together with its number of occurrences.
exact_duplicates = content_counts[content_counts > 1]

exact_duplicates_df = pd.DataFrame({'value': exact_duplicates.index, 'occurencies_count': exact_duplicates.values})

# exact_duplicates_df['occurencies_count'].sum()

# Keep only the rows whose text is duplicated, then group their row indices by text
# in a single pass. This replaces one full-frame comparison per duplicated text.
duplicated_content = webis_tldr_df.loc[webis_tldr_df['content'].isin(exact_duplicates.index), 'content']

indices_by_content = {}
for row_index, content_value in zip(duplicated_content.index.tolist(), duplicated_content.tolist()):
    indices_by_content.setdefault(content_value, []).append(row_index)

# One list of row indices per duplicated text, in the same order as exact_duplicates_df;
# inside each list the indices follow the row order of webis_tldr_df.
exact_duplicates_texts_indices_lists = [
    indices_by_content[value] for value in exact_duplicates_df['value'].to_list()
]

# For *exact_duplicates_texts_indices* we keep all the elements that are
# duplicates of the first element in each list. The index of each first
# ("original") element is not stored, since it is not itself a duplicate.
exact_duplicates_texts_indices = [
    row_index
    for occurrence_indices in exact_duplicates_texts_indices_lists
    for row_index in occurrence_indices[1:]
]

In [None]:
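# Unused alternative (kept for reference): builds the per-text index lists with one groupby.
# NOTE: groupby sorts the groups by 'content', so the resulting list order would differ from
# the row order of exact_duplicates_df.
# exact_duplicates_texts_indices_lists = (webis_tldr_df[webis_tldr_df['content'].isin(exact_duplicates_df['value'])]
#                                        .groupby('content')
#                                        .apply(lambda x: x.index.tolist())
#                                        .tolist())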

In [17]:
# Duplicated 'content' values with their occurrence counts
exact_duplicates_df

Unnamed: 0,value,occurencies_count
0,What does,134
1,Be sure to explain in detail with line breaks.,104
2,Can someone,79
3,Are you looking for a fun server to play on?\n...,76
4,Thanks for the,66
...,...,...
30961,Hoping to get some advice here. My husband and...,2
30962,Hello! I want to stick an Ardunio in an airsof...,2
30963,About a week ago my girlfriend decided she wan...,2
30964,"Alright, loseit, here we are. I've been at thi...",2


In [19]:
# Rows 5-14 of the duplicates table
exact_duplicates_df[5:15]

Unnamed: 0,value,occurencies_count
5,"I am looking for a Sub, but foremost, a friend...",56
6,when i walked in my bedroom home early from wo...,53
7,"I'm looking for a Sub, but foremost, a friend ...",52
8,Avoid that Kingston SSD. See [this thread]( as...,52
9,Upvote for the,47
10,Consider this a friendly warning/guideline:\nR...,46
11,Posted on /r/,45
12,what does,44
13,General Information' | So 2nd order so far wit...,44
14,Where's the,42


In [20]:
# Persist the duplicates table. NOTE: the file is tab-separated (sep="\t") despite its
# .csv extension, so it must be read back with the same separator.
exact_duplicates_df.to_csv("webis_tldr_exact_duplicates_df.csv", sep="\t")

In [21]:
# Sanity check: should equal total rows minus distinct 'content' values (3,848,330 - 3,807,923 = 40,407)
len(exact_duplicates_texts_indices) 

40407