In [1]:
# Uncomment if notebook is run in Colab
# %%capture
# !pip install datasets
# !pip install rouge-score

In [2]:
import datasets
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt
import numpy as np
import random
from datasets import load_dataset, load_metric
from IPython.display import display, HTML

import warnings
warnings.filterwarnings('ignore')

In [3]:
rouge = load_metric('rouge', seed=42)

# Helper functions

def show_random_elements(dataset, num_examples=3):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

def rouge_2_recall(target_text_1, target_text_2):
    rouge_output = rouge.compute(predictions=target_text_2, references=target_text_1)
    ROUGE_2_recall = \
    round((rouge_output['rouge2'].low.recall \
           + rouge_output['rouge2'].mid.recall \
           + rouge_output['rouge2'].high.recall)/3, 1)
    
    return ROUGE_2_recall

def clean_string(string):
    string = re.sub('\*', '', string).lower().rstrip()
    return string

def remove_duplicate_sets_from_list(candidate_duplicates):
    candidate_duplicates_sets = list(map(set, candidate_duplicates))
    unique_sets = list(set(frozenset(item) for item in candidate_duplicates))
    candidate_duplicates_unique_sets = [set(item) for item in set(frozenset(item) for item in unique_sets)]
    
    candidate_duplicates_lists = []
    for item in candidate_duplicates_unique_sets:
        candidate_duplicates_lists.append(list(item))
        
    return candidate_duplicates_lists

## **Download Reddit-TIFU**

- No train-val-test split for this dataset is provided or mentioned anywhere 
- We download Reddit TIFU from Hugging Face datasets 
- the split='train' downloads the whole dataset

In [4]:
reddit_tifu = load_dataset('reddit_tifu', 'long', split='train')

Found cached dataset reddit_tifu (C:/Users/Anna/.cache/huggingface/datasets/reddit_tifu/long/1.1.0/3136b11fbef3f2517de1d720621af110bd29e6083aebeab0d8ec198c9f95dc95)


In [5]:
reddit_tifu

Dataset({
    features: ['ups', 'num_comments', 'upvote_ratio', 'score', 'documents', 'tldr', 'title'],
    num_rows: 42139
})

In [6]:
reddit_tifu[8200]

{'ups': 0.0,
 'num_comments': 0.0,
 'upvote_ratio': 0.5,
 'score': 0.0,
 'documents': 'so this happened last week. \ni am a college junior and i am in a business communications course which is probably the most time consuming class i\'ve taken in college. \n\nanyways... the way this course is designed is your group gets a real live client (a local organization) and you do some sort of consulting for them. you always get a project manager (usually a ta) who you report everything you do to, including a prescreen of the final presentation to our client -- which is where the fuck up happens. \n\ngearing up for our final presentation, our project manager (who we will call gabe for the rest of this story) asked us to do the presentation for him before we present to the client. we all have extremely busy schedules, so the only time and place that worked for gabe and the team was 9pm in one of the reservable group rooms at the library. gabe had requested that we all show up in business profess

In [7]:
# 3 random examples from the Reddit TIFU dataset:

show_random_elements(reddit_tifu)

Unnamed: 0,ups,num_comments,upvote_ratio,score,documents,tldr,title
0,2185.0,554.0,0.94,2185.0,"so, some background information. i am a recent grad still looking for their first real salaried job. i'm an industrial designer, and i really want to work in the toy industry and be a toy designer, and maybe even a brand manager one day. the thing about the toy industry is that it is very small, and very competitive. with the two major companies (hasbro and mattel) having their pick of the litter when it comes to talent, finding openings there is tough. \n\nso rewind to graduation, i got a design internship at a major toy company. after the term of the internship was over they weren't in a position to hire anyone so they released me in to the world to find a real job. so, the office at this company was pretty casual. one of the female designers would sometimes wear fairly formal blouses and dresses, but the rest of the team would dress very casually. the most senior designer would regularly wear cargo shorts and a t-shirt to work, and the design manager (everyone's boss who reports directly to the ceo) would regularly wear t-shirts, torn, ripped jeans, and skater shoes. everyone else was fairly casual, but fell somewhere inbetween. i regularly dressed nicer than the average (button ups, khakis, ect) if for nothing else but to impress and not be the lowest common denominator. \n\nfast forward a few months, i've been working with a recruiter who specializes in the toy industry, and i had a very promising opportunity at a small company. several rounds of phone interviews went stellar, and they finally flew me out to atlanta for the big interview. now, somewhere along the way i remember either reading or being taught that wearing what you might normally wear to work to a job interview is usually acceptable. i have interviewed at other toy companies in person with this strategy and never seemed to have an issue (but never got the job either). not wanting to seem stiff, i left the blazer at home and hit the airport wearing pretty much exactly what i would regularly wear as an intern. button up (with a small, fun but not casual repeating print of cactuses) khakis and some dope sneaks for some flair. \n\nso i walk into their reception area and look around and think to my self, *wow, it is really nice in here*. i instantly become self conscious because the office is way more formal than i was expecting from all my previous interviewing and work experience in the industry. i go into the interview desperately hoping i just come off as the young, energetic, fun designer they are looking for. i meet the design manager who will be my boss for the first time in person (i would have been the 3rd member of the team), and we instantly hit it off. the interview is going very well and i never get any vibes from him that my clothes are unacceptable (he was wearing jeans). the interview is going super well, and the language starts to change from ""if you start"" to ""when you start"" and i am elated. he loves my work, my personality, and i like the product they make and immediately see many ways to help them develop it further (which is why they were hiring)\n\nthen a knock comes on the door. it's the coo. i knew he was going to be attending the interview at some point, but it had been almost an hour and a half before he shows up. in his words, he was checking for ""corporate and office fit"" and was leaving the talent and personality evaluation to the other guy since i would be working with him directly. he was wearing a purple long sleeve formal button up, with slacks and dress shoes. this portion of the interview goes well, and the design manager is visibly excited, telling the coo about me, and what i brought to the table (figuratively and literally). we wrap up after about another additional hour (2.5 hours total!) and i head back to the airport thinking i nailed it. we even talked about areas of atlanta i could look at moving to, moving assistance, and temporary housing while i looked for a place. they tell me they are interviewing another guy and i will know in about a week. \n\nso i get a call from my recruiter expecting good news. he tells me they went with the other guy and i'm crushed. he tells me they just wanted someone with a little more real world experience (i've heard this several times and it is beyond frustrating), and he leaves it at that. he tells me the other guy isn't a slam dunk because he has a small family to move and they weren't sure if he would accept, and i would be the guy if it falls through. it didn't fall through.\n\nit isn't until a few days later when i'm chatting with my recruiter about other possibilities and asking him to critique me as a candidate based on the feed back he has been getting (i had interviewed with 5 companies at this point, get furthest with the atlanta guys) and he drops the bomb that they thought i was dressed poorly! he quoted them as saying i ""looked like i was dressed to go to the mall"" and i wanted to shoot myself for not dressing up more. i could tell this was all coming from the coo and he most likely vetoed me because i didn't impress him with my atire. i was, and am still (it's been 2 months) crushed. and unemployed.\n\n \n\nedit: several people are commenting about the ""wear what you might wear to work"" idea. this came from my vocational background. i went to school to be an automotive technician before i went to art school for design. the idea behind that thinking is to ""look ready to start working immediatley"". obviously i realize now that this isn't appropriate for a corporate setting, but that was the idea behind it anyway.","the toy industry is laid back overall, and i assumed they would all be that way, but was rejected for not dressing formally enough in an interview by a c-level employee of the company i was interviewing for, even though i was a slam dunk with the guy i would have worked directly under.",wearing the wrong thing to a big job interview
1,312.0,78.0,0.96,312.0,"this was today. this fateful morning to be exact.\n\nit was gorgeous out and i decided to take the dog for an ol' walk. about a mile east of my house, on my very street, i happened to look down and see what looked like the arm of a baby doll lying on the side of the road. when i kicked it, the hilarious truth was revealed. this wasn't a doll arm - it was a dildo. a big, fat, flesh-toned, veiny dildo.\n\ni live in the rural northeast - not a city - an don't often see trash on the road, let alone a trash penis. a thought crossed my mind at that moment - two of my friends' birthdays are coming up and they live together. what if i spray painted this dildo, put googly eyes and a cape on it, and dubbed it ""dilbert the dildo""? funny apartment knickknack, no?\n\nnot wanting to be seen carrying what looks like andre the giant's wang, i slip the penis between my left arm and body, under the armpit and covered by the sweatshirt. you all know the maneuver, we've all done it while trying to hide something. so i carry on my merry way.\n\na few minutes later, i run into my neighbor - let's call him ed because he looks exactly like an ed. he's about my age, and my brother and i used to hang out with him as kids. i never liked him all that much - he was an ok guy but we never had anything in common. i'll always remember ed's shiny, fleshy body waddling around the pool. the epitome of ""the chubby kid"".\n\nso we say hi, exchange pleasantries, not much to say to one another like always. i'm a little nervous he'll spot dilbert under my arm but all seems to go well. not long after, he says he has to get going and i say the same.\n\nnow here's the fuckup. i'm holding my dog's leash with my right arm, so he sticks out his left hand to shake before we part. shaking hands is kind of one of those instinctual things for me, so when someone offers their hand i mindlessly stick out mine.\n\nmomentarily forgetting about dilbert.\n\n*plop*\n\nthe penis hits the hot asphalt with a soft bounce, rolls a bit, and finally lands veiny side up in full view. we both stare at it for a second which seems like eternity. my dog tries to investigate it but i shamefully pull her away. ed says something like ""well.. i should... head out"".\n\nhe walks away, head held low.\n\nbut, goddamn it reddit, i picked that dildo right back up and headed home, because as mortified as i was, dilbert must live on.",i picked up the bone.,picking up a road dildo.
2,3.0,17.0,0.53,3.0,"after eloquently preparing a text based choke slam to a friend of mine in the notes application, i pasted it to the wrong message. my recover was less than smooth.\n\nhttp://imgur.com/symbxru",my copypasta game is subpar; i texted crush a hard-worded message.,texting the wrong person.


# **Step 1:** inspect Reddit-TIFU for duplicates of the source texts ('documents' column)

In [8]:
reddit_tifu_df = reddit_tifu.to_pandas()

In [9]:
reddit_tifu_df.iloc[20094] # Random element

ups                                                           5.0
num_comments                                                  6.0
upvote_ratio                                                 0.87
score                                                         5.0
documents       earlier this week*\n\nso, i have this intervie...
tldr            had an interview. forgot interviewers name. ca...
title                           asking an interviewer for a name.
Name: 20094, dtype: object

In [10]:
len(reddit_tifu_df['documents'].value_counts())

42101

- The value 42101 is smaller than the number of examples in the dataset (42139).
- This indicates that there are duplicates, for the column 'documents', in the dataset.
- 42139-42101=38 *exact* duplicates that should be removed

In [11]:
# Find the indices of the reddit_tifu_df of the exact matches for the column 'documents'
# store them in the *exact_duplicates_texts_indices* variable  

# Count the values of the field 'documents' that occur more than once 
# print(len(reddit_tifu_df['documents'].value_counts()[reddit_tifu_df['documents'].value_counts() > 1]))

# Identify exact duplicates in the 'documents' column
# 'exact_duplicates', will store a Series containing the exact duplicate documents along with their counts
exact_duplicates = reddit_tifu_df['documents'].value_counts()[reddit_tifu_df['documents'].value_counts() > 1]

exact_duplicates_df = pd.DataFrame({'value': exact_duplicates.index, 'occurencies_count': exact_duplicates.values})

# exact_duplicates_df['occurencies_count'].sum()

exact_duplicates_texts_indices_lists = []

for element in exact_duplicates_df['value'].to_list():
    element_occurence_indices = reddit_tifu_df.index[reddit_tifu_df['documents'] == element].tolist()
    exact_duplicates_texts_indices_lists.append(element_occurence_indices)

# for the *exact_duplicates_texts_indices* we keep all the elements that are
# duplicates of the first element in each list,
# each first element ("original" element) index is not stored in exact_duplicates_texts_indices since
# it itself is not a duplicate

exact_duplicates_texts_indices = []

for element in exact_duplicates_texts_indices_lists:
    for i in range(1, len(element)):
        exact_duplicates_texts_indices.append(element[i])

In [12]:
exact_duplicates_df

Unnamed: 0,value,occurencies_count
0,so this happened last week. \ni am a college j...,8
1,so this happened last week. \ni am a college j...,5
2,so this happened last week. \ni am a college j...,4
3,so this happened last week. \ni am a college j...,4
4,today i was invited to a mavericks game by my ...,2
5,"so this date backs to a couple of days ago, bu...",2
6,obligatory this didn't happen today. this happ...,2
7,a little bit of context for this. i am a 16 ye...,2
8,so i'm a young male and therefore an avid tind...,2
9,"this happened two days ago, and the only reaso...",2


In [13]:
len(exact_duplicates_texts_indices) 

38

The lenght of the *exact_duplicates_texts_indices* list confirms our initial finding; "42139-42101=38 exact duplicates that should be removed"

# **Step 2:** inspect dataset for problematic source texts ('documents' column)

In [14]:
not_useful_texts_indices = []

# Find the indices of the 'documents' that are empty or not text (e.g., punctuation marks only)

''' a regular expression that describes text: '''
text_pattern = re.compile("([a-z1-9])+.", re.IGNORECASE)

for i in range(len(reddit_tifu_df)):
    if len(reddit_tifu_df['documents']) == 0 or not(text_pattern.search(reddit_tifu_df['documents'].loc[i])):
        not_useful_texts_indices.append(i)

# **Step 3:** inspect dataset for problematic summaries ('tldr' column)

In [15]:
# Prepare a dataframe to examine the 'tldr' column values

reddit_tifu_targets = reddit_tifu_df['tldr']
reddit_tifu_targets_df = pd.DataFrame({'original_index': reddit_tifu_targets.index, 'text': reddit_tifu_targets.values})

In [16]:
# Remove the special character * that appears often in the original 'tldr' field
# but offers no practical value 
reddit_tifu_targets_df['clean_text'] = reddit_tifu_targets_df['text'].apply(clean_string)

In [17]:
reddit_tifu_targets_df

Unnamed: 0,original_index,text,clean_text
0,0,confuse a 5th grade girl for a boy in front of...,confuse a 5th grade girl for a boy in front of...
1,1,"i found my estranged dad, thought i loved him ...","i found my estranged dad, thought i loved him ..."
2,2,had my balls burned by sauron and was left dev...,had my balls burned by sauron and was left dev...
3,3,peppermint + bath = burning cold ladybits.,peppermint + bath = burning cold ladybits.
4,4,"got too high and too hot in the bath, almost c...","got too high and too hot in the bath, almost c..."
...,...,...,...
42134,42134,forgot my quarter for lunch at school for a we...,forgot my quarter for lunch at school for a we...
42135,42135,girlfriend prefers clean shaven groin. i try t...,girlfriend prefers clean shaven groin. i try t...
42136,42136,today i broke a window that costs more then i ...,today i broke a window that costs more then i ...
42137,42137,i invited over new girlfriend for dinner to sp...,i invited over new girlfriend for dinner to sp...


Next: Find the indices of the items that are not useful (not informative);
*   nonsensical tldrs (e.g., punctuation marks only),
*   tldrs that clearly are not a summary (e.g., "see title") 

In [18]:
# Find the indices of the items that are not useful (not informative);
#   - nonsensical tldrs (e.g., punctuation marks only),
#   - tldrs that clearly are not a summary (e.g., "see title") 

not_useful_tldrs_indices = []

# Find the indices of the TLDRs that empty or not text (e.g., punctuation marks only)

''' a regular expression that describes text: '''
text_pattern = re.compile("([a-z1-9])+.", re.IGNORECASE)

for i in range(len(reddit_tifu_df)):
    if len(reddit_tifu_df['tldr']) == 0 or not(text_pattern.search(reddit_tifu_df['tldr'].loc[i])):
        not_useful_tldrs_indices.append(i)

# Find the indices of the TLDRs that are not useful, e.g., "see title"

not_useful_tldrs = ['title', 'title.',
                    'see title', 'see title.',
                    'in the title', 'in the title.',
                    'read title', 'read title.', 'read the title',
                    'read up', 'read up!',
                    'read it', 'read it!',
                    'at bottom', 'at bottom.',
                    'at the bottom', 'at the bottom.',
                    'at the end', 'at the end.',
                    'at the top', 'at the top.',
                    'version:',
                    'upvote', 'upvote.', 'upvote!',
                    'mandatory summary/question!']

for i in range(len(reddit_tifu_df)):
    if reddit_tifu_df.loc[i]['tldr'] in not_useful_tldrs:
        not_useful_tldrs_indices.append(i)

for indx in not_useful_tldrs_indices:
    print(reddit_tifu_df.loc[indx]['tldr'])

?
---------
**
( ͡° ͜ʖ ͡°)
,
**
**
~~
k
**
**
**
(( ͡° ͜ʖ ͡°)͜ʖ( ͡° ͜ʖ ͡°))*
-
**:
**
**
**
**
--
"
**
???
**
)**
*
:
*
**:
**
:
⬆️
**
:
,
;
/╲/( ͡° ͡° ͜ʖ ͡° ͡°)/\╱\
**
,
;
?**
**
**
]
*
**
-
**
*
**
**
:
;
**
**
:
**
:
**
**
.
**
;
/
**
;
*
💨 💨 🐝💦💦💻 😯😐
?
*
*
:
)
,
,
'
'
.**
'
]
.**
**
read the title
at the bottom
read the title
read the title
see title
version:
title.
at the bottom.
version:
at the end.
see title
read the title
at the bottom.
see title.
title.
see title
at the bottom.
at the bottom.
at bottom.
at the bottom.
see title
at the bottom.
title
at bottom.
title.
read the title
read title
at the bottom.
title
title
at the bottom.
at the bottom.
at the bottom
title
at the end.
title.
at bottom.
at the bottom
at bottom
at the bottom
at bottom.
at the bottom.
at the bottom
at the end.


# **Step 4:** Aggregate all the indices that should be removed, found so far

In [19]:
len(exact_duplicates_texts_indices)

38

In [20]:
len(not_useful_texts_indices)

1

In [21]:
len(not_useful_tldrs_indices)

126

In [22]:
# Aggregate all the indices that should be removed
indices_to_remove = exact_duplicates_texts_indices + not_useful_texts_indices + not_useful_tldrs_indices

In [23]:
len(indices_to_remove)

165

# **Step 5:** Remove the indices & inspect the rest of the dataset for duplicates of summaries (column 'tldr' -> 'clean_text')

In [24]:
# Select the reddit_tifu indices to keep by removing the indices to remove

all_indices = []
all_indices.extend(range(0,42139))

indices_to_keep = [x for x in all_indices if x not in indices_to_remove]

**One more step: search for candidate duplicates using the values of the column 'tldr'**

In [25]:
reddit_tifu_targets_df = reddit_tifu_targets_df.iloc[indices_to_keep]

len(reddit_tifu_targets_df)

41974

In [26]:
reddit_tifu_targets_df.reset_index(drop=True, inplace=True)

In [27]:
reddit_tifu_targets_df

Unnamed: 0,original_index,text,clean_text
0,0,confuse a 5th grade girl for a boy in front of...,confuse a 5th grade girl for a boy in front of...
1,1,"i found my estranged dad, thought i loved him ...","i found my estranged dad, thought i loved him ..."
2,2,had my balls burned by sauron and was left dev...,had my balls burned by sauron and was left dev...
3,3,peppermint + bath = burning cold ladybits.,peppermint + bath = burning cold ladybits.
4,4,"got too high and too hot in the bath, almost c...","got too high and too hot in the bath, almost c..."
...,...,...,...
41969,42134,forgot my quarter for lunch at school for a we...,forgot my quarter for lunch at school for a we...
41970,42135,girlfriend prefers clean shaven groin. i try t...,girlfriend prefers clean shaven groin. i try t...
41971,42136,today i broke a window that costs more then i ...,today i broke a window that costs more then i ...
41972,42137,i invited over new girlfriend for dinner to sp...,i invited over new girlfriend for dinner to sp...


In [28]:
reddit_tifu_targets_df.iloc[20005]['text']

': friends mom caught us with pot, parents got disappointed in me, forced to study the whole day **'

In [29]:
reddit_tifu_targets_df.iloc[20005]['original_index']

20098

# **Step 6**:  Inspect the rest of the dataset for duplicates of summaries (column 'tldr' -> 'clean_text')

* Next we look for candidate duplicates based on the values of the column 'tldr',
* we call them 'candidate' duplicates because: identical values in the 'tldr' column do not necessarily indicate a duplicate element in the Reddit TIFU

In [30]:
# E.g., the following two elements of Reddit TIFU,
# have the same 'tldr' but are not duplicates

print("\n**Reddit TIFU indx 20074**")
print(f"TLDR SUMMARY: {reddit_tifu_df.loc[20074]['tldr']}")
print(f"SOURCE TEXT: {reddit_tifu_df.loc[20074]['documents']}")

print("\n**Reddit TIFU indx 23123**")
print(f"TLDR SUMMARY: {reddit_tifu_df.loc[23123]['tldr']}")
print(f"SOURCE TEXT: {reddit_tifu_df.loc[23123]['documents']}")


**Reddit TIFU indx 20074**
TLDR SUMMARY: think before you speak
SOURCE TEXT: yipee, this just happened (+5 tifu points)

i was watching the anzac ceremony in gallipoli on tv, like a new zealander should, but the volume was on 2. usually my family listens to the tv on volume 8-11 so my ears struggled to pick up the sounds.

i was not in the vicinity of the remote so i was unable to do it myself.

heres the fu:

i said "its very quiet" while they were playing the national anthem of turkey.

in my head i was wanting to hear the national anthem as i had never heard it before. instead it sounded like an offensive joke because there was a silence while the anthems play. awkward looks ensued.

**Reddit TIFU indx 23123**
TLDR SUMMARY: think before you speak
SOURCE TEXT: little background info: sometimes i blurt out things before i realize it wasn't a good idea to say it.

anyway were standing around and my friend was talking about what this guy could play when he uses the wah wah pedal ( http

"think before you speak" seems to be a popular conclusion :)

In [31]:
candidate_duplicates = []

reddit_tifu_targets_list = reddit_tifu_targets_df['clean_text'].tolist()

for indx in range(len(reddit_tifu_targets_df.index)):
    target = reddit_tifu_targets_list[indx]
    match = reddit_tifu_targets_df['clean_text'].eq(target)
    matching_indices_df = pd.DataFrame({'indx': match.index, 'bool': match.values})

    ''' matching_positions_list stores lists of all the positions(indices) that 
    match the target, the target position itself included.
    matching_positions_list contains repetitions (of the same set) and should 
    be filtered '''
    matching_positions_list = matching_indices_df.index[matching_indices_df['bool'] == True].tolist()
    if len(matching_positions_list) > 1 :
        candidate_duplicates.append(matching_positions_list)

''' candidate duplicates contains repetitions of the same set '''
candidate_duplicates_sets = list(map(set, candidate_duplicates))
unique_sets = list(set(frozenset(item) for item in candidate_duplicates))
candidate_duplicates_unique_sets = [set(item) for item in set(frozenset(item) for item in unique_sets)]

candidate_duplicates_lists = []
for item in candidate_duplicates_unique_sets:
    candidate_duplicates_lists.append(list(item))

* After finding the candidate duplicates based on the 'tldr' column
* we compare the corresponding source texts ('documents' column) to figure out if they are actual duplicates
* to compare the source texts for similarity ROUGE-2 recall is used
* two texts are considered duplicates if ROUGE-2 recall > 0.8 
* this way of computing similarity is based on the approach used in *Zhang, J., Zhao, Y., Saleh, M., & Liu, P. (2020, November). Pegasus: Pre-training with extracted gap-sentences for abstractive summarization. In International Conference on Machine Learning (pp. 11328-11339). PMLR.*

In [32]:
duplicates_tldrs_indices = []

for element in candidate_duplicates_lists:
    for i in range(1, len(element)):
        target_1 = reddit_tifu_df.loc[reddit_tifu_targets_df.loc[element[0]]['original_index']]['documents']
        target_2 = reddit_tifu_df.loc[reddit_tifu_targets_df.loc[element[i]]['original_index']]['documents']
      
    if rouge_2_recall([target_1], [target_2])>=0.8:        
        duplicates_tldrs_indices.append(reddit_tifu_targets_df.loc[element[i]]['original_index'])

In [33]:
# Sanity check: there should be no common elements in the two lists
set(duplicates_tldrs_indices) & set(indices_to_remove)

set()

# **Reddit TIFU indices that correspond to duplicates**

In [34]:
reddit_tifu_duplicates_indices = exact_duplicates_texts_indices + duplicates_tldrs_indices

In [35]:
len(reddit_tifu_duplicates_indices)

89

## **Reddit TIFU indices that will be removed from the dataset (duplicates + not useful)**

In [36]:
reddit_tifu_indices_to_remove = exact_duplicates_texts_indices \
                                + duplicates_tldrs_indices \
                                + not_useful_texts_indices \
                                + not_useful_tldrs_indices

In [37]:
len(reddit_tifu_indices_to_remove)

216

In [38]:
# Select the reddit_tifu indices to keep by removing the indices to remove

all_reddit_tifu_indices = []
all_reddit_tifu_indices.extend(range(len(reddit_tifu_df)))

reddit_tifu_indices_to_keep = [element for element in all_reddit_tifu_indices if element not in reddit_tifu_indices_to_remove]

In [39]:
len(reddit_tifu_indices_to_keep)

41923

In [40]:
with open('reddit_tifu_indices_to_keep.txt', 'w') as f:
    for item in reddit_tifu_indices_to_keep:
        f.write("%s\n" % item)