# Import deps

The requests library is what we will use to make our HTTP requests

In [19]:
import requests #The requests library for HTTP requests in Python

# Define the analysis function

TODO: explain this function

In [49]:
def analyze_words(words):
    analysis_string = words.split(' ')
    word_dict = {}
    for word in analysis_string:
        cleaned_word = word.replace('.', '').replace("'", '').replace('\n', '').replace(',', '').replace("’", '').lower()
        if cleaned_word not in word_dict:
            word_dict[cleaned_word] = 1
        else:
            word_dict[cleaned_word] += 1

    return word_dict

# Make the reddit api call

A number of global variables will be used

`comment_count` will keep track of how many comments are parsed

`comment_array` will be used to append comments to as they are parsed from the response

`more_comment_ids` will be used to append IDs for additional comments that will need to be fetched

In [None]:
# globals 
comment_count = 0
comment_array = []
more_comment_ids = []

A number of functions will help us work with the response data and parse the comments out.

A typical response from the API request will look as such:

```
{
    "kind": "Listing",
    "data": {
        "children": [
            "kind": "t1",
            "data": {
                "body": "",
                "replies: ""
            }
        ]
    }
}
```

A "[Listing](https://www.reddit.com/dev/api/#listings)" of type `t1` indicates that the data belongs to a comment. Within the comment data, we'll have the text of the comment, along with any possible replies which are located on the `replies` property. Replies are structured the same way as comments, so you can think of them as recursive children. Within every reply to a comment, we may see another reply to that reply comment. So in order to analyze all comments within a thread, we'll have to recursively sift through all comments and replies. 

If having to follow each individual comment tree recursively to its end wasn't tricky enough, there's another issue we have to worry about. Since comment threads can become quite long, not every comment is always displayed on the initial thread load. When this happens, reddit shows "load more replies" buttons within threads. So how do we get these as well? To handle these instances, the API will deliver a child with a `kind` property value of `more`. The array of `children` within the `more` object will contain a list of thread IDs which can be used to fetch additional comments. So in addition to parsing all comment trees recursively, we will also have to collect any additional comment thread IDs and then do the same thing for those. 

Hopefully you are still with me at this point. Talking about all of this without writing any code can be confusing, so let's try to break it down with some functions that will help us achieve this goal.

The first function we'll write is `parse_children_for_comments`. It will take an array of `children` objects that are sent back in the response data and will pull out the comment text which is found in the `body` property. For each child in the array argument, we will check its `kind`. If the `kind` is `more`, we will loop through and add each id to the global array we created, `more_comment_ids`. We will eventually come back to this array of ids and parse through it.

Next, if the `kind` is `t1`, that means we have a comment and we want to read its text. In order to do that, we simply get the text with `child['data']['body']` and append it to our global `comment_array` variable.


In [50]:
# make this an input
thread_url = 'https://www.reddit.com/r/FIFA/comments/koa79q/the_rfifa_daily_discussion_thread_january_01_2021.json'

# create options for removing common words, downcasing strings, stipping special chars

# define helper functions
def create_thread_url(thread_id):
    return thread_url.replace('.json', f'/{thread_id}.json')

def parse_children_for_comments(children):
    global comment_count
    global comment_array
    for child in children:
        if child['kind'] == "more":
            children = child['data']['children']
            for id in children:
                more_comment_ids.append(id)
        if child['kind'] == "t1":
            comment_count += 1
            comment_array.append(child['data']['body'])
            get_replies(child['data'])
                
def get_replies(comment):
    global comment_count
    if comment['replies'] != "":
        children = comment['replies']['data']['children']
        parse_children_for_comments(children)

# make network call
req_data = requests.get(thread_url, headers = {'User-agent': 'ollz'})

if req_data.status_code != 200:
    print('request failed')
    print(req_data.json())
    
if req_data.status_code == 200:
    json_data = req_data.json()
    for item in json_data:
        children = item['data']['children']
        parse_children_for_comments(children)
                
print(f'{comment_count} comments analyzed')
print(f'{len(more_comment_ids)} more comment thread ids')

# deal with extra comment ids
for id in more_comment_ids[:2]:
    print(create_thread_url(id))
    req_data = requests.get(create_thread_url(id), headers = {'User-agent': 'ollz'})
    if req_data.status_code != 200:
        print('request failed')
        print(req_data.json())
        
    if req_data.status_code == 200:
        json_data = req_data.json()
        for item in json_data:
            children = item['data']['children']
            parse_children_for_comments(children)
    
comment_string = ' '.join(comment_array)
results = analyze_words(comment_string)

sorted(results.items(), key=lambda x: x[1], reverse=True)

574 comments analyzed
311 more comment thread ids
https://www.reddit.com/r/FIFA/comments/koa79q/the_rfifa_daily_discussion_thread_january_01_2021/ghrh10l.json
https://www.reddit.com/r/FIFA/comments/koa79q/the_rfifa_daily_discussion_thread_january_01_2021/ghr9n12.json


[('the', 435),
 ('i', 397),
 ('to', 345),
 ('and', 344),
 ('a', 258),
 ('for', 243),
 ('you', 222),
 ('in', 165),
 ('get', 130),
 ('but', 127),
 ('have', 126),
 ('of', 125),
 ('is', 124),
 ('just', 122),
 ('it', 117),
 ('do', 112),
 ('on', 111),
 ('or', 109),
 ('if', 106),
 ('that', 103),
 ('with', 99),
 ('my', 94),
 ('game', 89),
 ('be', 84),
 ('this', 83),
 ('will', 81),
 ('as', 78),
 ('can', 75),
 ('so', 74),
 ('im', 71),
 ('icon', 69),
 ('play', 68),
 ('me', 67),
 ('swaps', 63),
 ('not', 63),
 ('at', 56),
 ('any', 56),
 ('are', 56),
 ('squad', 55),
 ('players', 54),
 ('pack', 54),
 ('some', 53),
 ('dont', 52),
 ('2', 50),
 ('from', 50),
 ('should', 48),
 ('then', 48),
 ('one', 44),
 ('when', 44),
 ('would', 44),
 ('use', 44),
 ('them', 43),
 ('ive', 43),
 ('its', 43),
 ('all', 43),
 ('they', 43),
 ('team', 41),
 ('him', 41),
 ('up', 40),
 ('what', 39),
 ('like', 39),
 ('want', 39),
 ('much', 39),
 ('kante', 39),
 ('got', 38),
 ('people', 38),
 ('out', 38),
 ('games', 37),
 ('how', 