In [None]:
%%capture
!pip install datasets
!pip install rouge-score

In [None]:
from datasets import load_dataset
import pandas as pd
import re

In [None]:
# Helper function

def clean_string(string):
    string = re.sub('\*', '', string).lower().rstrip()
    return string

In [4]:
webis_tldr = load_dataset('reddit', split='train')
reddit_tifu = load_dataset('reddit_tifu', 'long', split='train')

Downloading builder script:   0%|          | 0.00/4.38k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.83k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.14k [00:00<?, ?B/s]

Downloading and preparing dataset reddit/default to /root/.cache/huggingface/datasets/reddit/default/1.0.0/98ba5abea674d3178f7588aa6518a5510dc0c6fa8176d9653a3546d5afcb3969...


Downloading data:   0%|          | 0.00/3.14G [00:00<?, ?B/s]



Computing checksums: 100%|##########| 1/1 [00:10<00:00, 10.03s/it]

Generating train split:   0%|          | 0/3848330 [00:00<?, ? examples/s]

Dataset reddit downloaded and prepared to /root/.cache/huggingface/datasets/reddit/default/1.0.0/98ba5abea674d3178f7588aa6518a5510dc0c6fa8176d9653a3546d5afcb3969. Subsequent calls will reuse this data.


Downloading builder script:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.92k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

Downloading and preparing dataset reddit_tifu/long to /root/.cache/huggingface/datasets/reddit_tifu/long/1.1.0/1c73fb08807b54ec26b025829b2a3d90c6f7466dac20801c825571af9514c049...


Downloading data:   0%|          | 0.00/671M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/42139 [00:00<?, ? examples/s]

Dataset reddit_tifu downloaded and prepared to /root/.cache/huggingface/datasets/reddit_tifu/long/1.1.0/1c73fb08807b54ec26b025829b2a3d90c6f7466dac20801c825571af9514c049. Subsequent calls will reuse this data.


Select the slices of the initial datasets that correspond to:
* Webis-TLDR-17: tifu subreddit only, filtered, duplicates removed
* Reddit TIFU: filtered, duplicates removed

In [6]:
reddit_tifu_indices_to_keep = []
with open("/content/drive/MyDrive/NLP-AbstractiveSummarization/datasets/filtering & overlap examination of  Webis-TLDR-17 & Reddit TIFU/reddit_tifu_indices_to_keep.txt") as file:
    for line in file: 
        line = line.strip() #or some other preprocessing
        reddit_tifu_indices_to_keep.append(int(line))

webis_tldr_tifu_indices_to_keep = []
with open("/content/drive/MyDrive/NLP-AbstractiveSummarization/datasets/filtering & overlap examination of  Webis-TLDR-17 & Reddit TIFU/webis_tldr_tifu_indices_to_keep.txt") as file:
    for line in file: 
        line = line.strip() #or some other preprocessing
        webis_tldr_tifu_indices_to_keep.append(int(line))

webis_tldr_tifu_clean =  webis_tldr.select(webis_tldr_tifu_indices_to_keep)
reddit_tifu_clean =  reddit_tifu.select(reddit_tifu_indices_to_keep)

In [7]:
# Sanity check

print("webis_tldr_tifu_indices_to_keep: ", len(webis_tldr_tifu_indices_to_keep))
print("reddit_tifu_indices_to_keep: ", len(reddit_tifu_indices_to_keep))

webis_tldr_tifu_indices_to_keep:  51464
reddit_tifu_indices_to_keep:  41923


In [8]:
webis_tldr_tifu_clean

Dataset({
    features: ['author', 'body', 'normalizedBody', 'subreddit', 'subreddit_id', 'id', 'content', 'summary'],
    num_rows: 51464
})

In [9]:
reddit_tifu_clean

Dataset({
    features: ['ups', 'num_comments', 'upvote_ratio', 'score', 'documents', 'tldr', 'title'],
    num_rows: 41923
})

* Define the targets (summary) field for each of the datasets
* Filter the fields & add a 'text_clean' column to the dataframes

In [10]:
webis = webis_tldr_tifu_clean
tifu = reddit_tifu_clean

webis_df = webis.to_pandas()
tifu_df = tifu.to_pandas()

webis_targets = webis_df['summary']
tifu_targets = tifu_df['tldr']

webis_targets_df = pd.DataFrame({'index': webis_targets.index, 'text': webis_targets.values})
tifu_targets_df = pd.DataFrame({'index': tifu_targets.index, 'text': tifu_targets.values})

webis_targets_df['text_uncased'] = webis_targets_df.apply(lambda x: x['text'].lower(), axis=1)
tifu_targets_df['text_uncased'] = tifu_targets_df.apply(lambda x: x['text'].lower(), axis=1)

# From the manual inspection of the datasets it was observed that Webis-TLDR-17 often contains 
# edits, upadtes and resolutions in the end of the 'summary' fields, while Reddit TIFU always omits them.
# So we remove these part of the tldr summaries for Webis-TLDR-17 to make comparisons with 
# Reddit TIFU elements easier & because the edits, updates & resolutions do not 
# serve well the concept of the tldr summary of the source text

webis_targets_df['text_clean'] = webis_targets_df.apply(lambda x: clean_string(re.split('\n edit|\n update|\n resolution', x['text_uncased'], maxsplit=2)[0].rstrip()), axis=1)
tifu_targets_df['text_clean'] = tifu_targets_df.apply(lambda x: clean_string(x['text_uncased']), axis=1)

tifu_targets_list = tifu_targets_df['text_clean'].tolist()
webis_targets_list = webis_targets_df['text_clean'].tolist()

In [11]:
tifu_in_webis_indices_lists = []

# Compare every tldr summary in Reddit TIFU(clean) with every summary in Webis-TLDR-17(clean,tifu subreddit)
# *tifu_in_webis_indices* will store the occurencies of Reddit TIFU elements in Webis-TLDR-17 
# i.e. *tifu_in_webis_indices* stores Webis-TLDR-17(clean) indices

for element in tifu_targets_list:
    match = webis_targets_df['text_clean'].eq(element)
    matching_indices_df = pd.DataFrame({'indx': match.index, 'bool': match.values})
    matching_positions_list = matching_indices_df.index[matching_indices_df['bool'] == True].tolist()
    if len(matching_positions_list) > 0 :
        tifu_in_webis_indices_lists.append(matching_positions_list)

# Flatten list of lists
tifu_in_webis_indices_list = [element for sublist in tifu_in_webis_indices_lists for element in sublist]

# Remove duplicates from list
tifu_in_webis_indices = list(dict.fromkeys(tifu_in_webis_indices_list))

# Select the Webis-TLDR-17 indices to keep
indices_to_keep = []
all_webis_indices = []
all_webis_indices.extend(range(0,len(webis_df)))

webis_tifu_clean_indices_to_keep = [element for element in all_webis_indices if element not in tifu_in_webis_indices]

# with open('/content/drive/MyDrive/NLP-Abstractive Summarization/datasets/filtering & overlap examination of  Webis-TLDR-17 & Reddit TIFU/webis_tifu_clean_indices_to_keep.txt', 'w') as f:
#     for item in webis_tifu_clean_indices_to_keep:
#         f.write("%s\n" % item)

In [12]:
webis_tifu_clean_indices_to_discard = [element for element in all_webis_indices if element in tifu_in_webis_indices]

In [14]:
webis_targets_df

Unnamed: 0,index,text,text_uncased,text_clean
0,0,"For those keeping count, I had a dream-within-...","for those keeping count, i had a dream-within-...","for those keeping count, i had a dream-within-..."
1,1,I'm not sure I'd advise sticking 74 magnets up...,i'm not sure i'd advise sticking 74 magnets up...,i'm not sure i'd advise sticking 74 magnets up...
2,2,"Dutch ovened myself, barfed. I'm also probably...","dutch ovened myself, barfed. i'm also probably...","dutch ovened myself, barfed. i'm also probably..."
3,3,"Thumb slipped, junk whipped, everything dripped.","thumb slipped, junk whipped, everything dripped.","thumb slipped, junk whipped, everything dripped."
4,4,I'm a fucking moron,i'm a fucking moron,i'm a fucking moron
...,...,...,...,...
51459,51459,b] I tried to show my nephews how good carrots...,b] i tried to show my nephews how good carrots...,b] i tried to show my nephews how good carrots...
51460,51460,moral of the story is: Don't send girls who yo...,moral of the story is: don't send girls who yo...,moral of the story is: don't send girls who yo...
51461,51461,showed shy friend massive porn collection. Sho...,showed shy friend massive porn collection. sho...,showed shy friend massive porn collection. sho...
51462,51462,read it and be warned.,read it and be warned.,read it and be warned.


In [15]:
for item in webis_tifu_clean_indices_to_discard:
  print(webis_targets_df.iloc[item])

[1;30;43mΗ έξοδος ροής περικόπηκε στις τελευταίες 5000 γραμμές.[0m
index                                                       49692
text            I fucked up by calling one of my best friend's...
text_uncased    i fucked up by calling one of my best friend's...
text_clean      i fucked up by calling one of my best friend's...
Name: 49692, dtype: object
index                                                       49694
text            Took my 6th drive exam, woman used the emergen...
text_uncased    took my 6th drive exam, woman used the emergen...
text_clean      took my 6th drive exam, woman used the emergen...
Name: 49694, dtype: object
index                                           49701
text            Fell on a treadmill, flashed the gym.
text_uncased    fell on a treadmill, flashed the gym.
text_clean      fell on a treadmill, flashed the gym.
Name: 49701, dtype: object
index                                                       49704
text            Dad wants to show me a p

* Additionally, the same method we applied in filtering the Reddti TIFU & Webis-TLDR-17 datasets (find candidate duplicates from the tldr summaries & then confirm they are actual duplicates by computing the ROUGE-2 recall similarity of the source text, as described in *Zhang, J., Zhao, Y., Saleh, M., & Liu, P. (2020, November). Pegasus: Pre-training with extracted gap-sentences for abstractive summarization. In International Conference on Machine Learning (pp. 11328-11339). PMLR.*) could also be applied here for extra validation,
* but for these similarity computations would be very computationally expensive & take a very long time


* webis_tifu_clean_indices_to_keep.txt will be used to remove elements that appear in both Reddit TIFU & Webis-TLDR-17 occurencies from Webis-TLDR-17,
* that is all we need, as we will not be removing the common elements from both datasets (just from Webis-TLDR-17)
* the following code is just a sanity check

In [None]:
# Compare every tldr summary in Webis-TLDR-17(clean,tifu subreddit) with every summary in Reddit TIFU(clean)
# *webis_in_tifu_indices* will store the occurencies of Webis-TLDR-17 elements in Reddit TIFU
# i.e. *webis_in_tifu_indices* stores Reddit TIFU(clean) indices

webis_in_tifu_indices_lists = []

for element in webis_targets_list:
    match = tifu_targets_df['text_clean'].eq(element)
    matching_indices_df = pd.DataFrame({'indx': match.index, 'bool': match.values})
    matching_positions_list = matching_indices_df.index[matching_indices_df['bool'] == True].tolist()
    if len(matching_positions_list) > 0 :
        webis_in_tifu_indices_lists.append(matching_positions_list)

# Flatten list of lists
webis_in_tifu_indices_list = [element for sublist in webis_in_tifu_indices_lists for element in sublist]

# Remove duplicates from list
webis_in_tifu_indices = list(dict.fromkeys(webis_in_tifu_indices_list))