In [1]:
import requests
import time 
import json
from tqdm.notebook import tqdm
import pandas as pd

def getAllThreadsFromQuery(query):
    """Run a submission search and return the entries of its result list.

    Args:
        query: Complete request URL, passed unchanged to ``requests.get``.

    Returns:
        The value stored under the ``'data'`` key of the JSON response.

    The HTTP status code is not inspected here, so a non-JSON error page
    surfaces as whatever exception ``Response.json()`` raises.
    """
    payload = requests.get(query).json()
    return payload['data']

def getTop100ScoringCommentsForThread(submissionID):
    """Fetch up to 100 comments of one submission, sorted by score.

    Args:
        submissionID: Submission id string, inserted into the ``link_id``
            query parameter.

    Returns:
        The ``'data'`` list of the JSON response, or an empty list when the
        API answers with a status code other than 200.
    """
    request = requests.get("https://api.pushshift.io/reddit/comment/search?link_id=" + submissionID + "&subreddit=explainlikeimfive&sort_type=score&size=100")
    if request.status_code != 200:
        # Gateway errors carry no JSON body; report and skip this thread the
        # same way getAllAnswersFromAuthorUnderComment does, instead of
        # letting .json() raise and abort the whole crawl.
        print("Error code: " + str(request.status_code) + " submissionID: " + submissionID)
        return []
    return request.json()['data']

def getAllAnswersFromAuthorUnderComment(parentID, author):
    """Look up the replies a given author wrote directly under a comment.

    Args:
        parentID: Id of the parent comment (``parent_id`` query parameter).
        author: User name the replies must come from (``author`` parameter).

    Returns:
        The ``'data'`` list of the JSON response. When the status code is not
        200, a message is printed and an empty list is returned instead.
    """
    endpoint = "https://api.pushshift.io/reddit/comment/search/?parent_id=" + parentID + "&author=" + author
    response = requests.get(endpoint)
    if response.status_code == 200:
        return response.json()['data']
    # Failed lookups are reported and treated as "no reply" rather than raised,
    # so one bad response does not stop the crawl.
    print("Error code: " + str(response.status_code) + " parentID: " + parentID + " author: " + author)
    return []

def getIdDictForURL(url, searched_ids):
    """Collect the not-yet-seen submissions returned by one search URL.

    Args:
        url: Submission search URL handed to ``getAllThreadsFromQuery``.
        searched_ids: List of submission ids that were already handled. It is
            extended in place with every id accepted by this call.

    Returns:
        A dict mapping submission id to the submission record. Submissions
        whose author contains ``"Modteam"`` and submissions whose id is
        already in ``searched_ids`` are left out.

    Before returning, the function sleeps for whatever is left of one second
    since the request came back (API cooldown).
    """
    threads = getAllThreadsFromQuery(url)  # 1 request
    request_done = time.time()
    fresh_threads = {}
    for submission in threads:
        if "Modteam" in submission['author']:
            continue
        thread_id = submission['id']
        if thread_id in searched_ids:
            continue
        searched_ids.append(thread_id)
        fresh_threads[thread_id] = submission
    spent = time.time() - request_done
    time.sleep(max(1 - spent, 0))  # API cooldown
    return fresh_threads

class _RequestCooldown:
    """Spaces out API calls: each call starts at least `seconds` after the previous one returned."""

    def __init__(self, seconds):
        self.seconds = seconds
        # Monotonic timestamp at which the previous call finished; None until
        # the first call has been made.
        self.last_finished = None

    def call(self, func, *args):
        """Wait out the remaining cooldown (if any), then return ``func(*args)``."""
        if self.last_finished is not None:
            elapsed = time.monotonic() - self.last_finished
            time.sleep(max(self.seconds - elapsed, 0))
        try:
            return func(*args)
        finally:
            self.last_finished = time.monotonic()


def _collectDialogue(submission, topComment, cooldown):
    """Follow the alternating explainee/explainer reply chain below one comment.

    Returns the turns as a list: the submission, the comment, then one reply
    per step for as long as the expected author answered. Only the first reply
    returned by the API is followed at every step.
    """
    # Index 0 is the explainee (submission author), index 1 the explainer
    # (comment author). The turn at list position p is expected from
    # speakers[p % 2], so the first reply (position 2) must be the explainee's.
    speakers = (submission['author'], topComment['author'])
    turns = [submission, topComment]
    while True:
        expected_author = speakers[len(turns) % 2]
        replies = cooldown.call(getAllAnswersFromAuthorUnderComment, turns[-1]['id'], expected_author)  # 1 request
        if not replies:
            return turns
        turns.append(replies[0])


def _saveCheckpoint(dataset, path):
    """Write the dataset as JSON to `path` without ever leaving a truncated file behind."""
    import os

    # Dump to a sibling file first and swap it in afterwards, so an interrupt
    # during the dump cannot destroy the previous checkpoint.
    temp_path = path + '.tmp'
    with open(temp_path, 'w') as f:
        json.dump(dataset, f)
    os.replace(temp_path, path)


def extractDatapointsFromURL(url, dataset, searched_ids, skip_first=32, min_turns=4,
                             checkpoint_path='eli5_explanation_dataset_improved.json', cooldown=1):
    """Crawl the submissions of one search URL and append the dialogues found to `dataset`.

    For every submission returned by ``getIdDictForURL`` the top-scoring
    comments are fetched, and below each of them the reply chain alternating
    between the submission author (explainee) and the comment author
    (explainer) is followed. A chain is stored as one datapoint: a list of
    records starting with the submission, then the comment, then the replies.

    Args:
        url: Submission search URL.
        dataset: List of datapoints; extended in place.
        searched_ids: List of already handled submission ids; extended in
            place by ``getIdDictForURL``.
        skip_first: Number of leading submissions of this URL to skip. The
            default of 32 keeps the previously hard-coded behavior; pass 0 to
            process every submission.
        min_turns: Minimum number of turns (submission and comment included)
            a chain needs in order to be stored.
        checkpoint_path: JSON file that receives the whole dataset after every
            stored datapoint, so a crash loses at most the current chain.
        cooldown: Minimum number of seconds between the end of one API
            request and the start of the next.

    Returns:
        None. Results are delivered through `dataset` and the checkpoint file.
    """
    print("Starting extraction for url: " + url)
    id_dict = getIdDictForURL(url, searched_ids)  # 1 request, sleeps afterwards
    limiter = _RequestCooldown(cooldown)
    for position, submissionID in enumerate(tqdm(id_dict)):
        if position < skip_first:
            continue
        submission = id_dict[submissionID]
        comment_list = limiter.call(getTop100ScoringCommentsForThread, submissionID)  # 1 request
        for topComment in comment_list:
            datapoint = _collectDialogue(submission, topComment, limiter)
            # A chain without any reply (two turns) is never stored, whatever
            # min_turns is.
            if len(datapoint) > 2 and len(datapoint) >= min_turns:
                dataset.append(datapoint)
                _saveCheckpoint(dataset, checkpoint_path)

def extractExplanation(dialogueId, data):
    """Turn a single dialogue into a DataFrame with one row per turn.

    Args:
        dialogueId: Value written to the ``dialogue_id`` column of every row.
        data: Sequence of turn records. The first record must provide
            ``'title'``, every later one ``'body'``; all of them must provide
            ``'id'``, ``'score'`` and ``'permalink'``.

    Returns:
        A DataFrame with the columns ``dialogue_id``, ``post_unique_id``,
        ``turn_id``, ``turn_text``, ``turn_score`` and ``url``. Turn ids start
        at 1; odd turns are labelled ``explainee``, even turns ``explainer``.
    """
    rows = []
    for turnId, element in enumerate(data, start=1):
        role = 'explainee' if turnId % 2 == 1 else 'explainer'
        # Only the opening turn is a submission and therefore has a title.
        content = element['title'] if turnId == 1 else element['body']
        rows.append([
            dialogueId,
            element['id'],
            turnId,
            {'author': role, 'text': content},
            element['score'],
            "https://www.reddit.com" + element['permalink'],
        ])
    return pd.DataFrame(rows, columns=['dialogue_id', 'post_unique_id', 'turn_id', 'turn_text', 'turn_score', 'url'])

def datasetToDataFrame(dataset):
    """Flatten every dialogue of the dataset into one DataFrame, one row per turn.

    Args:
        dataset: Sequence of dialogues, each a sequence of turn records. The
            first record of a dialogue must provide ``'title'``, the others
            ``'body'``; all must provide ``'id'``, ``'score'``, ``'permalink'``.

    Returns:
        A DataFrame with the columns ``dialogue_id`` (position of the dialogue
        in `dataset`, starting at 0), ``post_unique_id``, ``turn_id``
        (starting at 1 within each dialogue), ``turn_text``, ``turn_score``
        and ``url``.
    """
    def build_row(dialogueId, turnId, element):
        # Odd turns belong to the explainee, even turns to the explainer;
        # only the opening turn is a submission with a title.
        speaker = 'explainee' if turnId % 2 == 1 else 'explainer'
        utterance = element['title'] if turnId == 1 else element['body']
        return [dialogueId, element['id'], turnId, {'author': speaker, 'text': utterance},
                element['score'], "https://www.reddit.com" + element['permalink']]

    dataframeData = [
        build_row(dialogueId, turnId, element)
        for dialogueId, data in enumerate(dataset)
        for turnId, element in enumerate(data, start=1)
    ]
    return pd.DataFrame(dataframeData, columns=['dialogue_id', 'post_unique_id', 'turn_id', 'turn_text', 'turn_score', 'url'])

def saveDatasetAsCsv(dataset):
    """Export the dataset as a flat table, once as JSON and once as CSV.

    The table comes from ``datasetToDataFrame``. It is written to
    ``eli5_explanation_dataset.json`` first and to
    ``eli5_explanation_dataset.csv`` second, both in the working directory.
    """
    frame = datasetToDataFrame(dataset)
    exports = (
        (frame.to_json, 'eli5_explanation_dataset.json'),
        (frame.to_csv, 'eli5_explanation_dataset.csv'),
    )
    for write, target in exports:
        write(target)
    
def saveDatasetAsParsedJson(dataset):
    """Write all turns of all dialogues as one flat JSON list.

    Every turn becomes a list
    ``[dialogue_id, post_id, turn_id, {'author', 'text'}, score, url]``, where
    the dialogue id is the position of the dialogue in `dataset` (from 0) and
    the turn id counts from 1 within the dialogue. The result is written to
    ``eli5_explanation_dataset_parsed.json`` in the working directory.
    """
    jsonData = []
    for dialogueId, data in enumerate(dataset):
        for turnId, element in enumerate(data, start=1):
            # Odd turns are the explainee's, even turns the explainer's.
            speaker = 'explainee' if turnId % 2 == 1 else 'explainer'
            # Only the opening turn is a submission with a title.
            utterance = element['title'] if turnId == 1 else element['body']
            jsonData.append([
                dialogueId,
                element['id'],
                turnId,
                {'author': speaker, 'text': utterance},
                element['score'],
                "https://www.reddit.com" + element['permalink'],
            ])
    with open('eli5_explanation_dataset_parsed.json', 'w') as out_file:
        json.dump(jsonData, out_file)

def saveDatasetAsParsedGroupedJson(dataset):
    """Write the dataset as a JSON list holding one list of turns per dialogue.

    Every turn becomes a list
    ``[dialogue_id, post_id, turn_id, {'author', 'text'}, score, url]``, where
    the dialogue id is the position of the dialogue in `dataset` (from 0) and
    the turn id counts from 1 within the dialogue. The result is written to
    ``eli5_explanation_dataset_parsed_grouped.json`` in the working directory.
    """
    def parse_turn(dialogueId, turnId, element):
        # Odd turns are the explainee's, even turns the explainer's; only the
        # opening turn is a submission with a title.
        speaker = 'explainee' if turnId % 2 == 1 else 'explainer'
        utterance = element['title'] if turnId == 1 else element['body']
        return [dialogueId, element['id'], turnId, {'author': speaker, 'text': utterance},
                element['score'], "https://www.reddit.com" + element['permalink']]

    jsonData = [
        [parse_turn(dialogueId, turnId, element) for turnId, element in enumerate(data, start=1)]
        for dialogueId, data in enumerate(dataset)
    ]
    with open('eli5_explanation_dataset_parsed_grouped.json', 'w') as out_file:
        json.dump(jsonData, out_file)
            
def getAllUrlsForYear(year):
    """Build the twelve monthly submission-search URLs for one year.

    Args:
        year: Calendar year as an integer.

    Returns:
        A list of 12 URLs, January first. Each one requests up to 100
        submissions of the subreddit between the first day of a month and the
        first day of the following month, sorted by number of comments in
        descending order. The December window ends on January 1st of
        ``year + 1``.
    """
    urlYear = str(year)
    nextYear = str(year + 1)
    base = "https://api.pushshift.io/reddit/search/submission/?subreddit=explainlikeimfive&after="
    # January to November: both window edges fall in the same year.
    urlList = [
        base + urlYear + "-" + str(month).zfill(2) + "-01&before=" + urlYear + "-" + str(month + 1).zfill(2) + "-01&sort_type=num_comments&size=100&sort=desc"
        for month in range(1, 12)
    ]
    # December: the window closes in the following year.
    urlList.append(base + urlYear + "-12-01&before=" + nextYear + "-01-01&sort_type=num_comments&size=100&sort=desc")
    return urlList
    

# Do not truncate long cell contents when DataFrames are displayed, so the
# full text of every turn stays readable.
pd.set_option('display.max_colwidth', None)

In [None]:
eli5_2020_top100_num_comments_url = "https://api.pushshift.io/reddit/search/submission/?subreddit=explainlikeimfive&after=2020-01-01&sort_type=num_comments&size=100&sort=desc"
eli5_2020_top100_score_url = "https://api.pushshift.io/reddit/search/submission/?subreddit=explainlikeimfive&after=2020-01-01&sort_type=score&size=100&sort=desc"

# Resume from the checkpoint of an earlier run if there is one. The first
# record of every stored dialogue is its submission, whose id marks the thread
# as already handled.
try:
    with open('eli5_explanation_dataset_improved.json', 'r') as f:
        dataset = json.load(f)
    searched_ids = [dialogue[0]['id'] for dialogue in dataset]
except FileNotFoundError:
    # First run: nothing to resume from.
    dataset = []
    searched_ids = []
except (OSError, ValueError, LookupError, TypeError) as error:
    # Unreadable or malformed checkpoint. Keep the best-effort behavior of
    # starting over, but say so, because the next checkpoint written by the
    # extraction replaces the file.
    print("Could not reuse checkpoint 'eli5_explanation_dataset_improved.json' (" + repr(error) + "); starting with an empty dataset, the file will be overwritten by the next checkpoint")
    dataset = []
    searched_ids = []

extractDatapointsFromURL(eli5_2020_top100_num_comments_url, dataset, searched_ids)
extractDatapointsFromURL(eli5_2020_top100_score_url, dataset, searched_ids)

In [2]:
# Resume from the checkpoint of an earlier run if there is one. The first
# record of every stored dialogue is its submission, whose id marks the thread
# as already handled.
try:
    with open('eli5_explanation_dataset_improved.json', 'r') as f:
        dataset = json.load(f)
    searched_ids = [dialogue[0]['id'] for dialogue in dataset]
except FileNotFoundError:
    # First run: nothing to resume from.
    dataset = []
    searched_ids = []
except (OSError, ValueError, LookupError, TypeError) as error:
    # Unreadable or malformed checkpoint. Keep the best-effort behavior of
    # starting over, but say so, because the next checkpoint written by the
    # extraction replaces the file.
    print("Could not reuse checkpoint 'eli5_explanation_dataset_improved.json' (" + repr(error) + "); starting with an empty dataset, the file will be overwritten by the next checkpoint")
    dataset = []
    searched_ids = []

urls = getAllUrlsForYear(2021)

# The list starts with January at index 0, so urls[9:] covers October through
# December only. Widen the slice to crawl earlier months.
for url in urls[9:]:
    extractDatapointsFromURL(url, dataset, searched_ids)

Starting extraction for url: https://api.pushshift.io/reddit/search/submission/?subreddit=explainlikeimfive&after=2021-09-01&before=2021-10-01&sort_type=num_comments&size=100&sort=desc


  0%|          | 0/99 [00:00<?, ?it/s]

Error code: 502 parentID: hd4vcuk author: Nerdferd
Error code: 504 parentID: hd4jv7e author: Nerdferd
Error code: 504 parentID: hd58a60 author: Nerdferd
Error code: 504 parentID: hd5jrq9 author: Nerdferd
Error code: 504 parentID: hd6ahzi author: Nerdferd
Error code: 504 parentID: hd5sgxd author: Nerdferd
Error code: 502 parentID: hd5wgxa author: Nerdferd
Starting extraction for url: https://api.pushshift.io/reddit/search/submission/?subreddit=explainlikeimfive&after=2021-10-01&before=2021-11-01&sort_type=num_comments&size=100&sort=desc


  0%|          | 0/98 [00:00<?, ?it/s]

KeyboardInterrupt: 

# New Analysis

In [3]:
import pandas as pd
import json

# Summarize the extracted dataset: number of dialogues, total number of turns
# and the distribution of turns per dialogue.
# An earlier, now disabled, variant of this cell additionally re-queried the
# two 2020 top-100 listings (by num_comments and by score) to count how many
# dialogues came from each of them.
with open('eli5_explanation_dataset_improved.json', 'r') as checkpoint_file:
    dataset = json.load(checkpoint_file)

print("Dataset size: " + str(len(dataset)) + " explanation dialogues")

# One entry per dialogue: its number of turns.
turn_num_list = [len(dialogue) for dialogue in dataset]
turn_num = sum(turn_num_list)
turn_num_series = pd.Series(turn_num_list)

print(str(turn_num) + " turns are present in the dataset in total")
print("")
print("Here are some statistics for the turns:")
print(turn_num_series.describe())

Dataset size: 527 explanation dialogues
3864 turns are present in the dataset in total

Here are some statistics for the turns:
count    527.000000
mean       7.332068
std        8.955200
min        4.000000
25%        4.000000
50%        4.000000
75%        6.000000
max       83.000000
dtype: float64


# Extraction of an Explanation Dialogue

### Good Examples

In [2]:
# Reload the extracted dialogues and show dialogue 2 as a table of turns.
with open('eli5_explanation_dataset_improved.json', 'r') as checkpoint_file:
    dataset = json.load(checkpoint_file)

extractExplanation(dialogueId=2, data=dataset[2])

Unnamed: 0,dialogue_id,post_unique_id,turn_id,turn_text,turn_score
0,2,surj25,1,"{'author': 'explainee', 'text': 'ELI5: What is the purpose of prison bail? If somebody should or shouldn’t be jailed, why make it contingent on an amount of money that they can buy themselves out with?'}",1
1,2,hxbhhgn,2,"{'author': 'explainer', 'text': 'Bail is for when a person has not been tried or convicted yet. It's basically collateral to ensure that the person shows up for trial. If they fail to appear, the bail money is forfeit. If the court decides that a person is likely to flee or just not show up, they will deny bail and hold them until trial.'}",9869
2,2,hxbinac,3,"{'author': 'explainee', 'text': 'So is the penalty for not showing back up too light that a lot of people just don’t otherwise?'}",1
3,2,hxblzvr,4,"{'author': 'explainer', 'text': 'The bail amount is set based on the person's resources. They want it high enough that they can't afford to lose it. They might also make them surrender their passport, depending on the person and the crime. If they dont show up for trial, the bail money is lost. Also, there is now a warrant out for their arrest that will follow them pretty much forever until the police finally catch up with them. Now, they will be held until their new trial date, which might be a while. This puts them in the same situation as they were in the first place, only now they lost all that bail money. On top of that, it now looks really bad for their trial because the prosecutor can use their flight from trail against them so a conviction is more likely. It's set up so that there is basically no upside in the long term for the accused to flee while on bail.'}",1


In [3]:
# Reload the extracted dialogues and show dialogue 47 as a table of turns.
with open('eli5_explanation_dataset_improved.json', 'r') as checkpoint_file:
    dataset = json.load(checkpoint_file)

extractExplanation(dialogueId=47, data=dataset[47])

Unnamed: 0,dialogue_id,post_unique_id,turn_id,turn_text,turn_score
0,47,evoqm2,1,"{'author': 'explainee', 'text': 'ELI5:Why is the ""no true Scotsman"" considered a fallacy?'}",1
1,47,ffx3mxc,2,"{'author': 'explainer', 'text': 'How a person internally measures something isn't really material to the fallacy at hand. The fallacy comes from the assertion being amended to exclude the specific counter example out of hand instead of arguing the example or elaborating on the definition. In the (intentionally basic) wikipedia example, Person A is excluding Uncle Angus from objectively being a Scotsman based on his own subjective criteria, which he has changed during the debate.'}",1
2,47,ffx62fj,3,"{'author': 'explainee', 'text': 'But amending the assertion isn't fallacious in this case. It's just that the counter-argument has made you realize your assertion wasn't accurate enough, so now you are revising it to better represent how you think.'}",1
3,47,ffx6l4p,4,"{'author': 'explainer', 'text': 'The assertion isn't being amended; it's the same assertion. You're just adding a qualifier because you cannot dismiss Uncle Angus otherwise. The only assertion in the argument was ""Scotsman do not add sugar to porridge"" which is logically consistent (even though it's wrong). ""Uncle Angus is both a Scotsman and someone who adds sugar to porridge"" is both logical and proof that the assertion is incorrect. ""I do not agree that Uncle Angus is Scottish because of my initial assertion"" is illogical because the only argument against Uncle A requires that the original assertion be true.'}",1
4,47,ffx7ad0,5,"{'author': 'explainee', 'text': 'But it's not a fallacy because the two persons are talking about two different things here. Uncle Angus is talking about his Scottish nationality. He is talking about the official definition. The other person is talking about their own conception of what a Scotsman should be. That's what they were talking about from the start, they just weren't clear in communicating it. So now they are adding a ""true"" to make it clear.'}",1
5,47,ffx7its,6,"{'author': 'explainer', 'text': 'Then this isn't a debate; it's an argument over head-canon. If neither party can agree on what the terms of a debate even mean then no logical arguments can commence.'}",1
6,47,ffx7trx,7,"{'author': 'explainee', 'text': 'Hence my question: why are those types of situation considered a fallacy, when in reality they are just a case of someone having a subjective definition, which they then have to explain more clearly when other people don't get what they are talking about?'}",1
7,47,ffx8s6t,8,"{'author': 'explainer', 'text': 'It's not a fallacy if you are arguing something subjective because if it is subjective then it is illogical. No True Scotsman can only apply if the assertion is something that can be disproven because NTS is applied when the assertion is changed to exclude the offending proof. Likewise, if you clarify your position it is also not No True Scotsman. However; if you argue something to objectively be the case... You know what, let's change the example slightly to be less ambiguous: ""No person born in Edinburgh likes eggs"" ""I was born in Edinburgh and I like eggs"" ""No TRUE person from Edinburgh likes eggs"" Can you see where logic has broken down here?'}",1
8,47,ffxb09k,9,"{'author': 'explainee', 'text': 'But the thing is, people who get accused of ""no true Scotsman"" are never trying to argue something objective. They are always going by some personal conception they have. With your example, you are trying to think of something objective. It's hard to argue against where a person was born. But I doubt the person in this example would be denying that, technically, some people born in Edinburgh don't like eggs. Just like the Scotsman isn't denying that technically Scottish nationality is a thing. But they may have their own conception of what ""being born in Edinburgh"" means, which would involve both being born in Edinburgh (technically speaking) and not liking eggs.'}",1
9,47,ffxbm91,10,"{'author': 'explainer', 'text': 'Then every time you have seen an accusation of no true scotsman it has been used incorrectly. The person in my example was arguing exactly what they said because they were literally a hypothetical person who was arguing something easy to disprove. This is literally just a textbook example. It seems to me that you are primarily concerned with what the accusee in each example \*meant\* but that's not what debate is about, to the point of apologism for literal straw men. A debate is entirely logical. ""People who I consider to be Scottish do not add sugar to their porridge"" is not something anyone would ever argue.'}",1


### Bad Example

In [4]:
# Reload the extracted dialogues and show dialogue 6 as a table of turns.
with open('eli5_explanation_dataset_improved.json', 'r') as checkpoint_file:
    dataset = json.load(checkpoint_file)

extractExplanation(dialogueId=6, data=dataset[6])

Unnamed: 0,dialogue_id,post_unique_id,turn_id,turn_text,turn_score
0,6,nkzui6,1,"{'author': 'explainee', 'text': 'ELI5: Can someone please tell me why the US gradeschool (even up through highschool) usesthe ""A, B, C,D and F"" system to grade? Why did they exclude the E?'}",1
1,6,gzgyixx,2,"{'author': 'explainer', 'text': 'Hi Everyone, This post is getting popular and that is wonderful. For those of you joining us from r/all, and otherwise, I would like to ask you to read rules before participating. In particular rule 3: top level comments must be explanations. This means that when reply directly to the post it has to be an objective explanation, it **cannot** just be an anecdote of what your school system was growing up (or is currently). If you would like to just share how your school did it then you are welcome to leave that as a reply to this comment. Please let me know if you have any questions'}",188
2,6,gzhf6ap,3,"{'author': 'explainee', 'text': 'Yea... My bad. I didn't expect this to become this popular... Hope I posted in the proper sub lol.'}",4
3,6,gzhf8uf,4,"{'author': 'explainer', 'text': 'You’re good. We are pretty strict on comments though'}",7
4,6,gzhfzyr,5,"{'author': 'explainee', 'text': 'Ok, cool. And yes, I know! It is great here! Your team does a great job, and aren't a-holes. Thank you.'}",6


# Save the dataset as CSV for export and better readability

In [2]:
# Reload the extracted dialogues and export them as a flat table.
with open('eli5_explanation_dataset_improved.json', 'r') as checkpoint_file:
    dataset = json.load(checkpoint_file)

saveDatasetAsCsv(dataset)
# Alternative export formats, currently disabled:
# saveDatasetAsParsedJson(dataset)
# saveDatasetAsParsedGroupedJson(dataset)