In [1]:
%%capture
!pip install datasets
!pip install rouge-score

In [2]:
import datasets
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt
import numpy as np
import random
from datasets import load_dataset, load_metric
from IPython.display import display, HTML

import warnings
warnings.filterwarnings('ignore')

In [3]:
rouge = load_metric('rouge', seed=42)

# Helper functions

def show_random_elements(dataset, num_examples=3):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

def rouge_2_recall(target_text_1, target_text_2):
    rouge_output = rouge.compute(predictions=target_text_2, references=target_text_1)
    ROUGE_2_recall = \
    round((rouge_output['rouge2'].low.recall \
           + rouge_output['rouge2'].mid.recall \
           + rouge_output['rouge2'].high.recall)/3, 1)
    
    return ROUGE_2_recall

def clean_string(string):
    string = re.sub('\*', '', string).lower().rstrip()
    return string

def remove_duplicate_sets_from_list(candidate_duplicates):
    candidate_duplicates_sets = list(map(set, candidate_duplicates))
    unique_sets = list(set(frozenset(item) for item in candidate_duplicates))
    candidate_duplicates_unique_sets = [set(item) for item in set(frozenset(item) for item in unique_sets)]
    
    candidate_duplicates_lists = []
    for item in candidate_duplicates_unique_sets:
        candidate_duplicates_lists.append(list(item))
        
    return candidate_duplicates_lists

## **Download Reddit TIFU**

- No train-val-test split for this dataset is provided or mentioned anywhere 
- We download Reddit TIFU from Hugging Face datasets 
- the split='train' downloads the whole dataset

In [4]:
reddit_tifu = load_dataset('reddit_tifu', 'long', split='train')

Found cached dataset reddit_tifu (C:/Users/Anna/.cache/huggingface/datasets/reddit_tifu/long/1.1.0/3136b11fbef3f2517de1d720621af110bd29e6083aebeab0d8ec198c9f95dc95)


In [5]:
reddit_tifu

Dataset({
    features: ['ups', 'num_comments', 'upvote_ratio', 'score', 'documents', 'tldr', 'title'],
    num_rows: 42139
})

In [6]:
reddit_tifu[8200]

{'ups': 0.0,
 'num_comments': 0.0,
 'upvote_ratio': 0.5,
 'score': 0.0,
 'documents': 'so this happened last week. \ni am a college junior and i am in a business communications course which is probably the most time consuming class i\'ve taken in college. \n\nanyways... the way this course is designed is your group gets a real live client (a local organization) and you do some sort of consulting for them. you always get a project manager (usually a ta) who you report everything you do to, including a prescreen of the final presentation to our client -- which is where the fuck up happens. \n\ngearing up for our final presentation, our project manager (who we will call gabe for the rest of this story) asked us to do the presentation for him before we present to the client. we all have extremely busy schedules, so the only time and place that worked for gabe and the team was 9pm in one of the reservable group rooms at the library. gabe had requested that we all show up in business profess

In [7]:
# 3 random examples from the Reddit TIFU dataset:

show_random_elements(reddit_tifu)

Unnamed: 0,ups,num_comments,upvote_ratio,score,documents,tldr,title
0,44.0,9.0,0.84,44.0,"tifu by trying to download diablo iii at work\n\nit started out like any other friday, with the regular flow of emails and support tickets getting handled fairly early on in the day. with these distractions out of the way i set about my normal course of action for the end of the work week: browsing reddit and watching the occasional video on youtube. \n\nby midday myself and the other support team members noticed that a large number of cars were missing from the parking lot. one coworker had taken the day off to extend his weekend playing the new wow expansion, and it appeared that every other employee except the support team and head software engineer had gotten a head start on the long holiday weekend. \n\nin a stroke of genius, i came to the sudden realization that diablo iii could be played on mac os x (our office is primarily mac based). i shared this information with my boss (who also happens to own diablo and is an avid work redditor), and half-jokingly suggested we both download a copy to our systems since our cubicles face out towards the hall, meaning our monitors are not in the public eye. \n\nhe was initially hesitant, as the head software engineer also oversees our servers, allowing him to monitor network activity if he so chose. we came to the conclusion that this was an unlikely possibility, as he has much more important tasks to take care of in his position. \n\nat this point, i eagerly began downloading battlenet to my computer, and my boss agreed to join me, opting to wait for my download to complete so that the could copy the directory files to his system so that we could minimize bandwidth usage. \n\nthings were going smoothly. \n\n2gb downloaded. \n\n4gb. \n\n6gb. \n\n8gb. \n\naround the 10gb mark, i get a message from a web developer (who i used to work with in support) in the room next door who had returned from lunch.\n\n""stop whatever you’re downloading”. “quick, stop now”. \n\nin mere seconds, the door to the support room opens, and low and behold, the head software engineer walks in with a slight smirk on his face. \n\n“hey, is anyone downloading a large file in here?” he asked.\n\n“uhhhhhhhhh….” was all i could muster in a high-pitched tone. “maybe”.\n\t\nhe followed with “is it work related?” \n\n“uhhhhhhhhh….” i replied again. “no….”\n\n“i don’t care”, he said. “but can you download on the wifi? i’m doing some work with the server”.\n\nahhh. sweet relief. i promptly uninstalled the portion i had downloaded to get rid of the evidence, and we had a good laugh in the support room. \n\n \n\t \nedit: fixed the tl;dr.\n\nupdate:\n\nmere minutes later. in the support room we somehow get on the topic of online and in person dating, as several guys on the team are single and one was flipping through tinder. my boss got on his case about going out to a bar with us so the tinder team member could meet people in person. \n\nwhen he said he couldn’t go, a joke was made about to chance of finding two girls, and me ending up with both of them since he couldn’t make it. another coworker chimes in with:\n\n“if there are ever groups of girls when you two are at a bar. call me.”\n\nanother joke is cracked as the conversation volume begins to escalate about the chances of a foursome being ruined if i call him. cue a message from friendly web developer next door:\n\n“we can hear every world ya’ll are saying. fyi”\n\nam i bad at my job? probably.","tried to download diablo iii at work, used all the office bandwidth.",trying to download diablo iii at work
1,206.0,46.0,0.95,206.0,"so, my tifu was actually last night. we were in a nerf war with the kids, the bullets were flying all over. we were in the kitchen, and i pointed my gun in her general direction without really looking and pulled the trigger - and then she hit the floor screaming. i had hit her squarely in the right eye from a distance of 10 feet with a nerf mega bullet. once she was able to open it, we saw that her normally blue eye was 1/2 dark - her eye was bleeding internally. fuck.\n3 hours in the er later, it appears to be a ruptured blood vessel. she's going to an ophthalmologist later today to get a better assessment of the situation. the eye already looks better this morning than it did last night, but we don't know for sure yet.\n\ni feel terrible about this.\n\n[picture of her eyes from last night](http://i.imgur.com/apnmyiq.jpg)","i accidentally shot my wife in the eye with a nerf gun from about 10 feet, causing her eye to bleed internally.",accidentally shooting my wife in the eye with a nerf gun
2,3.0,4.0,0.72,3.0,"before i start, i'm just going to say that i don't know if any of my coworkers browse reddit. they probably don't, but if they do then posting this is a bad idea but i will do so regardless. so if any of my coworkers happen to be reading this: fuck you.\n\nthis didn't happen today, but rather it happened on saturday and i had a talk with my manager about it today.\n\nalright, so i recently started a new job at a youth camp in my area. they have educational groups during the week and on the weekends it's church groups. i work the weekend shift, so i deal with the church peeps. saturday was my third day on the job, and i met some more of my coworkers (previously only met 3, which all seemed pretty chill). given that the 3 i already met were pretty chill i assumed that everyone else was as well since they're all friends, but man was i wrong...\n\nthe groups often play music in the dining hall while they eat, but since i work while church groups are there they always play gospel music. i went into the dining hall to get some tea and snacks while the group was in there eating dinner, and they were listening to a pretty wide variety of music. when i went back into the staff area i walked in the door and said ""man, first they played rap, then gospel and now rock. what's up with this group?"" there was nothing wrong with what i said, i was just a little surprised. but the concern was that one of the group members could have heard me (which i doubt. but even if they did, there was nothing offensive about it). i can understand the concern that if they did it might have been taken offensively, but my coworkers blew it out of proportion. when i met with my manager today i was told that i claimed they were evidently ""faking being christian"". uh, no. i don't give a fuck what they are, i'm not even religious. why would i judge how christian or not christian they are when i'm not even a christian?\n\nmoving on, while the staff were eating dinner someone else was making a joke about watching porn on a computer but everyone else only seeing a black screen. i commented that if you take the top layer of the computer screen off, cut it, and paste it onto some glasses then nobody can see what's on the screen unless they wear the glasses. but how did it pop up on the paper i got from my manager today? ""it was reported that during the staff meal the conversation in the conference room that ihatemycoworkers5eva used an innuendo about a darkened computer screen to discuss an act of masturbation."" but it gets even more pathetic...\n\nthey were discussing shaving at one point and i made a joke about men shaving their pits. on the sheet? ""it was also reported that ihatemycoworkers5eva was questioning the fact of armpit shaving.""\n\ni don't think my coworkers like me, which i don't understand because the other staff that met me told my manager that they thought i was a really nice guy. but no, they wanted to get offended over literally nothing and report me for stuff i didn't even do, or take light jokes and blow them out of proportion. the talk with my manager today was to say that it's policy that i not ""speak in a loud and self-opinionated manner about any of the groups music preferences or defame their character in any manner"" and also that i'm not allowed to discuss anything inappropriate that may make others uncomfortable.\n\ni get to keep my job, but i had to sign some paper saying it all happened - figured it wouldn't even be worth my time to refute it, it would probably cause more problems with my coworkers. if anything else like this happens i'll either be suspended or terminated. not sure which. i'm just really mad about this right now, i was hoping that everyone would be chill at this job. i'm not going to quit, it's still a great job, but i'm definitely not talking to any of my coworkers ever again unless it is absolutely necessary to do so to complete my daily tasks. but hey, maybe i'll get lucky and they'll all quit before summer, they are students after all.\n\n*",my coworkers are easily offended and blow things extremely out of proportion.,trying to get along with my coworkers


# **Step 1:** inspect Reddit TIFU for duplicates of the source texts ('documents' column)

In [8]:
reddit_tifu_df = reddit_tifu.to_pandas()

In [9]:
reddit_tifu_df.iloc[20094]

ups                                                           5.0
num_comments                                                  6.0
upvote_ratio                                                 0.87
score                                                         5.0
documents       earlier this week*\n\nso, i have this intervie...
tldr            had an interview. forgot interviewers name. ca...
title                           asking an interviewer for a name.
Name: 20094, dtype: object

In [10]:
len(reddit_tifu_df['documents'].value_counts())

42101

- The value 42101 is smaller than the number of examples in the dataset (42139).
- This indicates that there are duplicates, for the column 'documents', in the dataset.
- 42139-42101=38 exact duplicates that should be removed

In [11]:
# Find the indices of the reddit_tifu_df of the exact matches for the column 'documents'
# will be stored in the *exact_duplicates_texts_indices* variable  

# Count the values of the field 'documents' that occur more than once 
# print(len(reddit_tifu_df['documents'].value_counts()[reddit_tifu_df['documents'].value_counts() > 1]))

exact_duplicates = reddit_tifu_df['documents'].value_counts()[reddit_tifu_df['documents'].value_counts() > 1]

exact_duplicates_df = pd.DataFrame({'value': exact_duplicates.index, 'occurencies_count': exact_duplicates.values})

# exact_duplicates_df['occurencies_count'].sum()

exact_duplicates_texts_indices_lists = []

for element in exact_duplicates_df['value'].to_list():
    element_occurence_indices = reddit_tifu_df.index[reddit_tifu_df['documents'] == element].tolist()
    exact_duplicates_texts_indices_lists.append(element_occurence_indices)

# for the *exact_duplicates_texts_indices* we keep all the elements that are
# duplicates of the first element in each list, 
# each first element index is not stored in exact_duplicates_texts_indices since
# it itself is not a duplicate

exact_duplicates_texts_indices = []

for element in exact_duplicates_texts_indices_lists:
    for i in range(1, len(element)):
        exact_duplicates_texts_indices.append(element[i])

In [12]:
exact_duplicates_df

Unnamed: 0,value,occurencies_count
0,so this happened last week. \ni am a college j...,8
1,so this happened last week. \ni am a college j...,5
2,so this happened last week. \ni am a college j...,4
3,so this happened last week. \ni am a college j...,4
4,today i was invited to a mavericks game by my ...,2
5,"so this date backs to a couple of days ago, bu...",2
6,obligatory this didn't happen today. this happ...,2
7,a little bit of context for this. i am a 16 ye...,2
8,so i'm a young male and therefore an avid tind...,2
9,"this happened two days ago, and the only reaso...",2


# **Step 2:** inspect dataset for problematic source texts ('documents' column)

In [13]:
not_useful_texts_indices = []

# Find the indices of the 'documents' that empty or not text (e.g., punctuation marks only)

''' a regular expression that describes text: '''
text_pattern = re.compile("([a-z1-9])+.", re.IGNORECASE)

for i in range(len(reddit_tifu_df)):
    if len(reddit_tifu_df['documents']) == 0 or not(text_pattern.search(reddit_tifu_df['documents'].loc[i])):
        not_useful_texts_indices.append(i)

# **Step 3:** inspect dataset for problematic summaries ('tldr' column)

In [14]:
# Prepare a dataframe to examine the 'tldr' column values

reddit_tifu_targets = reddit_tifu_df['tldr']
reddit_tifu_targets_df = pd.DataFrame({'original_index': reddit_tifu_targets.index, 'text': reddit_tifu_targets.values})

# Remove the special character * that appears often in the original 'tldr' field
# but offers no practical value 
reddit_tifu_targets_df['clean_text'] = reddit_tifu_targets_df.apply(lambda x: clean_string(x['text']), axis=1)

In [15]:
reddit_tifu_targets_df

Unnamed: 0,original_index,text,clean_text
0,0,confuse a 5th grade girl for a boy in front of...,confuse a 5th grade girl for a boy in front of...
1,1,"i found my estranged dad, thought i loved him ...","i found my estranged dad, thought i loved him ..."
2,2,had my balls burned by sauron and was left dev...,had my balls burned by sauron and was left dev...
3,3,peppermint + bath = burning cold ladybits.,peppermint + bath = burning cold ladybits.
4,4,"got too high and too hot in the bath, almost c...","got too high and too hot in the bath, almost c..."
...,...,...,...
42134,42134,forgot my quarter for lunch at school for a we...,forgot my quarter for lunch at school for a we...
42135,42135,girlfriend prefers clean shaven groin. i try t...,girlfriend prefers clean shaven groin. i try t...
42136,42136,today i broke a window that costs more then i ...,today i broke a window that costs more then i ...
42137,42137,i invited over new girlfriend for dinner to sp...,i invited over new girlfriend for dinner to sp...


Find the indices of the items that are not useful (not informative);
*   nonsensical tldrs (e.g., punctuation marks only),
*   tldrs that clearly are not a summary (e.g., "see title") 

In [16]:
# Find the indices of the items that are not useful (not informative);
#   - nonsensical tldrs (e.g., punctuation marks only),
#   - tldrs that clearly are not a summary (e.g., "see title") 

not_useful_tldrs_indices = []

# Find the indices of the TLDRs that empty or not text (e.g., punctuation marks only)

''' a regular expression that describes text: '''
text_pattern = re.compile("([a-z1-9])+.", re.IGNORECASE)

for i in range(len(reddit_tifu_df)):
    if len(reddit_tifu_df['tldr']) == 0 or not(text_pattern.search(reddit_tifu_df['tldr'].loc[i])):
        not_useful_tldrs_indices.append(i)

# Find the indices of the TLDRs that are not useful, e.g., "see title"

not_useful_tldrs = ['title', 'title.',
                    'see title', 'see title.',
                    'read title', 'read title.',
                    'at bottom', 'at bottom.',
                    'at the bottom', 'at the bottom.',
                    'at the end', 'at the end.',
                    'version:']

for i in range(len(reddit_tifu_df)):
    if reddit_tifu_df.loc[i]['tldr'] in not_useful_tldrs:
        not_useful_tldrs_indices.append(i)

for indx in not_useful_tldrs_indices:
    print(reddit_tifu_df.loc[indx]['tldr'])

?
---------
**
( ͡° ͜ʖ ͡°)
,
**
**
~~
k
**
**
**
(( ͡° ͜ʖ ͡°)͜ʖ( ͡° ͜ʖ ͡°))*
-
**:
**
**
**
**
--
"
**
???
**
)**
*
:
*
**:
**
:
⬆️
**
:
,
;
/╲/( ͡° ͡° ͜ʖ ͡° ͡°)/\╱\
**
,
;
?**
**
**
]
*
**
-
**
*
**
**
:
;
**
**
:
**
:
**
**
.
**
;
/
**
;
*
💨 💨 🐝💦💦💻 😯😐
?
*
*
:
)
,
,
'
'
.**
'
]
.**
**
at the bottom
see title
version:
title.
at the bottom.
version:
at the end.
see title
at the bottom.
see title.
title.
see title
at the bottom.
at the bottom.
at bottom.
at the bottom.
see title
at the bottom.
title
at bottom.
title.
read title
at the bottom.
title
title
at the bottom.
at the bottom.
at the bottom
title
at the end.
title.
at bottom.
at the bottom
at bottom
at the bottom
at bottom.
at the bottom.
at the bottom
at the end.


# **Step 4:** Aggregate all the indices that should be removed, found so far

In [17]:
len(exact_duplicates_texts_indices)

38

In [18]:
len(not_useful_texts_indices)

1

In [19]:
len(not_useful_tldrs_indices)

121

In [20]:
# Aggregate all the indices that should be removed

indices_to_remove = exact_duplicates_texts_indices + not_useful_texts_indices + not_useful_tldrs_indices

In [21]:
len(indices_to_remove)

160

# **Step 5:** Remove the indices & inspect the rest of the dataset for duplicates of summaries (column 'tldr' -> 'clean_text')

In [22]:
# Select the reddit_tifu indices to keep by removing the indices to remove

all_indices = []
all_indices.extend(range(0,42139))

indices_to_keep = [x for x in all_indices if x not in indices_to_remove]

One more step: search for candidate duplicated using the values of the column 'tldr'

In [23]:
reddit_tifu_targets_df = reddit_tifu_targets_df.iloc[indices_to_keep]

len(reddit_tifu_targets_df)

41979

In [24]:
reddit_tifu_targets_df.reset_index(drop=True, inplace=True)

In [25]:
reddit_tifu_targets_df

Unnamed: 0,original_index,text,clean_text
0,0,confuse a 5th grade girl for a boy in front of...,confuse a 5th grade girl for a boy in front of...
1,1,"i found my estranged dad, thought i loved him ...","i found my estranged dad, thought i loved him ..."
2,2,had my balls burned by sauron and was left dev...,had my balls burned by sauron and was left dev...
3,3,peppermint + bath = burning cold ladybits.,peppermint + bath = burning cold ladybits.
4,4,"got too high and too hot in the bath, almost c...","got too high and too hot in the bath, almost c..."
...,...,...,...
41974,42134,forgot my quarter for lunch at school for a we...,forgot my quarter for lunch at school for a we...
41975,42135,girlfriend prefers clean shaven groin. i try t...,girlfriend prefers clean shaven groin. i try t...
41976,42136,today i broke a window that costs more then i ...,today i broke a window that costs more then i ...
41977,42137,i invited over new girlfriend for dinner to sp...,i invited over new girlfriend for dinner to sp...


In [26]:
reddit_tifu_targets_df.iloc[20005]['text']

'had an interview. forgot interviewers name. called expecting a receptionist. interviewer picked up. thought i was talking to receptionist until the very end.'

In [27]:
reddit_tifu_targets_df.iloc[20005]['original_index']

20094

* Next we look for candidate duplicates based on the values of the column 'tldr',
* we call them 'candidate' duplicates because: identical values in the 'tldr' column do not necessarily indicate a duplicate element in the Reddit TIFU

In [28]:
# E.g., the following two elements of Reddit TIFU,
# have the same 'tldr' but are not duplicates

print("\n**Reddit TIFU indx 20074**")
print(f"TLDR SUMMARY: {reddit_tifu_df.loc[20074]['tldr']}")
print(f"SOURCE TEXT: {reddit_tifu_df.loc[20074]['documents']}")

print("\n**Reddit TIFU indx 23123**")
print(f"TLDR SUMMARY: {reddit_tifu_df.loc[23123]['tldr']}")
print(f"SOURCE TEXT: {reddit_tifu_df.loc[23123]['documents']}")


**Reddit TIFU indx 20074**
TLDR SUMMARY: think before you speak
SOURCE TEXT: yipee, this just happened (+5 tifu points)

i was watching the anzac ceremony in gallipoli on tv, like a new zealander should, but the volume was on 2. usually my family listens to the tv on volume 8-11 so my ears struggled to pick up the sounds.

i was not in the vicinity of the remote so i was unable to do it myself.

heres the fu:

i said "its very quiet" while they were playing the national anthem of turkey.

in my head i was wanting to hear the national anthem as i had never heard it before. instead it sounded like an offensive joke because there was a silence while the anthems play. awkward looks ensued.

**Reddit TIFU indx 23123**
TLDR SUMMARY: think before you speak
SOURCE TEXT: little background info: sometimes i blurt out things before i realize it wasn't a good idea to say it.

anyway were standing around and my friend was talking about what this guy could play when he uses the wah wah pedal ( http

In [29]:
candidate_duplicates = []

reddit_tifu_targets_list = reddit_tifu_targets_df['clean_text'].tolist()

for indx in range(len(reddit_tifu_targets_df.index)):
    target = reddit_tifu_targets_list[indx]
    match = reddit_tifu_targets_df['clean_text'].eq(target)
    matching_indices_df = pd.DataFrame({'indx': match.index, 'bool': match.values})

    ''' matching_positions_list stores lists of all the positions(indices) that 
    match the target, the target position itself included.
    matching_positions_list contains repetitions (of the same set) and should 
    be filtered '''
    matching_positions_list = matching_indices_df.index[matching_indices_df['bool'] == True].tolist()
    if len(matching_positions_list) > 1 :
        candidate_duplicates.append(matching_positions_list)

''' candidate duplicates contains repetitions of the same set '''
candidate_duplicates_sets = list(map(set, candidate_duplicates))
unique_sets = list(set(frozenset(item) for item in candidate_duplicates))
candidate_duplicates_unique_sets = [set(item) for item in set(frozenset(item) for item in unique_sets)]

candidate_duplicates_lists = []
for item in candidate_duplicates_unique_sets:
    candidate_duplicates_lists.append(list(item))

* After finding the candidate duplicates based on the 'tldr' column
* we compare the corresponding source texts ('documents' column) to figure out if they are actual duplicates
* to compare the source texts for similarity ROUGE-2 recall is used
* two texts are considered duplicates if ROUGE-2 recall > 0.8 
* this way of computing similarity is based on the approach used in *Zhang, J., Zhao, Y., Saleh, M., & Liu, P. (2020, November). Pegasus: Pre-training with extracted gap-sentences for abstractive summarization. In International Conference on Machine Learning (pp. 11328-11339). PMLR.*

In [30]:
duplicates_tldrs_indices = []

for element in candidate_duplicates_lists:
    for i in range(1, len(element)):
        target_1 = reddit_tifu_df.loc[reddit_tifu_targets_df.loc[element[0]]['original_index']]['documents']
        target_2 = reddit_tifu_df.loc[reddit_tifu_targets_df.loc[element[i]]['original_index']]['documents']
      
    if rouge_2_recall([target_1], [target_2])>=0.8:        
        duplicates_tldrs_indices.append(reddit_tifu_targets_df.loc[element[i]]['original_index'])

In [31]:
# Sanity check: there should be no common elements in the two lists
set(duplicates_tldrs_indices) & set(indices_to_remove)

set()

# **Reddit TIFU indices that correspond to duplicates**

In [32]:
reddit_tifu_duplicates_indices = exact_duplicates_texts_indices + duplicates_tldrs_indices

In [33]:
len(reddit_tifu_duplicates_indices)

90

## **Reddit TIFU indices that will be removed from the dataset (duplicates + not useful)**

In [34]:
reddit_tifu_indices_to_remove = exact_duplicates_texts_indices \
                                + duplicates_tldrs_indices \
                                + not_useful_texts_indices \
                                + not_useful_tldrs_indices

In [35]:
len(reddit_tifu_indices_to_remove)

212

In [36]:
# Select the reddit_tifu indices to keep by removing the indices to remove

all_reddit_tifu_indices = []
all_reddit_tifu_indices.extend(range(len(reddit_tifu_df)))

reddit_tifu_indices_to_keep = [element for element in all_reddit_tifu_indices if element not in reddit_tifu_indices_to_remove]

In [37]:
len(reddit_tifu_indices_to_keep)

41927

In [38]:
with open('reddit_tifu_indices_to_keep.txt', 'w') as f:
    for item in reddit_tifu_indices_to_keep:
        f.write("%s\n" % item)