# Data Collection using PRAW

## 1. Environment Setup

### *1.1. Python Environment Setup*

In [12]:
# Installing the latest version of PRAW
!pip install --upgrade https://github.com/praw-dev/praw/archive/master.zip

Collecting https://github.com/praw-dev/praw/archive/master.zip
  Downloading https://github.com/praw-dev/praw/archive/master.zip
[2K     [32m|[0m [32m25.1 MB[0m [31m6.2 MB/s[0m [33m0:00:03[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting prawcore<3,>=2.1 (from praw==7.7.2.dev0)
  Using cached prawcore-2.4.0-py3-none-any.whl (17 kB)
Collecting update_checker>=0.18 (from praw==7.7.2.dev0)
  Using cached update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Building wheels for collected packages: praw
  Building wheel for praw (pyproject.toml) ... [?25l[?25hdone
  Created wheel for praw: filename=praw-7.7.2.dev0-py3-none-any.whl size=189380 sha256=8ffb473cc92dbfa2a86a56ec2e35986604fcac108d5ff5634954983d5932441f
  Stored in directory: /tmp/pip-ephem-wheel-cache-yfxujbqt/wheels/58/e2/b8/ac335

In [16]:
# Import packages
import praw
import pandas as pd
from datetime import datetime
import time
import warnings
from google.colab import files
from google.colab import drive
from prawcore.exceptions import NotFound
import json

### *1.2. Reddit API Authentication Key Setup*

In [None]:
# Mount Google Drive
drive.mount('/content/drive')

# Define the path to the authentication key file in your Google Drive
auth_file_path = '/content/drive/My Drive/2_Lund/2_SIMZ_Thesis/3.2_reddit_api_auth_key.json'

# Read authentication key from the file
with open(auth_file_path, 'r') as auth_file:
    auth_data = json.load(auth_file)

# Authentication using the read credentials
reddit = praw.Reddit(client_id=auth_data['client_id'],
                     client_secret=auth_data['client_secret'],
                     user_agent=auth_data['user_agent'])

### *1.3. Defining the Scope of the Data Pull*

In [5]:
# Defining the Subreddit
subreddit = reddit.subreddit("EatingDisorders")

In [3]:
# Defining the Usernames

# Download the list from Google Drive
!gdown https://drive.google.com/uc?id=1-3aS3jytXw-kQ9fZsULc6erCvcpKMDya

# Load data in from files
file_path = "2.1_usernames.txt"

# Read the text file into a list
with open(file_path, "r") as file:
    list = [line.strip() for line in file.readlines()]

# Transforming the list into the right format
cleaned_list = [name.strip("'") for name in list]

# Create the final list with ' ' around each username and join them with newline character
final_list = "['" + "',\n '".join(cleaned_list) + "']"

# Final list
usernames = final_list

Downloading...
From: https://drive.google.com/uc?id=1-3aS3jytXw-kQ9fZsULc6erCvcpKMDya
To: /content/2.1_usernames.txt
  0% 0.00/2.89k [00:00<?, ?B/s]100% 2.89k/2.89k [00:00<00:00, 10.7MB/s]


In [5]:
# Defining the Benchmark Submission IDs

# Download the list from Google Drive
!gdown https://drive.google.com/uc?id=1-5LHEB35Y5UtBqYdgW4KzZ4eHEFTev4S

# Load data in from files
file_path = "2.2_submissions.txt"

# Read the text file into a list
with open(file_path, "r") as file:
    list = [line.strip() for line in file.readlines()]

# Transforming the list into the right format
cleaned_list = [name.strip("'") for name in list]

# Create the final list with ' ' around each username and join them with newline character
final_list = "['" + "',\n '".join(cleaned_list) + "']"

# Final list
submission_ids = final_list

Downloading...
From: https://drive.google.com/uc?id=1-5LHEB35Y5UtBqYdgW4KzZ4eHEFTev4S
To: /content/2.2_submissions.txt
  0% 0.00/1.11k [00:00<?, ?B/s]100% 1.11k/1.11k [00:00<00:00, 5.37MB/s]


In [7]:
# Defining the Benchmark Comment Links

# Download the list from Google Drive
!gdown https://drive.google.com/uc?id=1-4YvRHbkKt6Z7rkcgk4FPX-YgbUaL_V6

# Load data in from files
file_path = "2.3_comments.txt"

# Read the text file into a list
with open(file_path, "r") as file:
    list = [line.strip() for line in file.readlines()]

# Transforming the list into the right format
cleaned_list = [name.strip("'") for name in list]

# Create the final list with ' ' around each username and join them with newline character
final_list = "['" + "',\n '".join(cleaned_list) + "']"

# Final list
comment_links = final_list

Downloading...
From: https://drive.google.com/uc?id=1-4YvRHbkKt6Z7rkcgk4FPX-YgbUaL_V6
To: /content/2.3_comments.txt
  0% 0.00/4.63k [00:00<?, ?B/s]100% 4.63k/4.63k [00:00<00:00, 19.0MB/s]


## 2. Selected users' Submissions

### *2.1. Pulling the Submissions*

In [5]:
# Creating dictionary for the submissions
submissions_dict = {"id" : [],
             "author" : [],
             "created": [],
             "title" : [],
             "body" : [],
             "score" : [],
             "comms_num": [],
             "url" : []
             }

In [6]:
# Iterate through each username in the list
for username in usernames:

    # Get the user's submissions in the subreddit
    submissions = list(subreddit.search(f'author:{username}', sort='new', syntax='cloudsearch'))

    # Pulling the submissions into a dictionary
    for submission in submissions:
        submissions_dict["author"].append(submission.author)
        submissions_dict["title"].append(submission.title)
        submissions_dict["score"].append(submission.score)
        submissions_dict["id"].append(submission.id)
        submissions_dict["url"].append(submission.url)
        submissions_dict["comms_num"].append(submission.num_comments)
        submissions_dict["created"].append(datetime.utcfromtimestamp(submission.created))
        submissions_dict["body"].append(submission.selftext)

    # Introduce a sleep to avoid API rate limits
    time.sleep(2)

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

In [7]:
# Transforming the dictionary into a data frame
submissions_data = pd.DataFrame(submissions_dict)
submissions_data

Unnamed: 0,id,author,created,title,body,score,comms_num,url
0,195b53a,HumbertHum,2024-01-13 00:54:08,I deleted the food tracker app I downloaded un...,"27F History of bulemia and food restrictions, ...",1,0,https://www.reddit.com/r/EatingDisorders/comme...
1,19agfy0,Golden-Lovers,2024-01-19 10:52:32,Can we stop accusing people of having food rul...,A lot of people don’t actually help others on ...,1,0,https://www.reddit.com/r/EatingDisorders/comme...
2,19084n7,DrumNBeans,2024-01-06 19:39:03,Help…,I am in need of advice. \n\nI am in my fourth ...,11,12,https://www.reddit.com/r/EatingDisorders/comme...
3,191ztwy,mindfullymoving,2024-01-08 23:40:05,What Stops You From Next Steps?,I've been on this sub for a while. I've gotte...,0,2,https://www.reddit.com/r/EatingDisorders/comme...
4,18yp4it,mindfullymoving,2024-01-04 21:55:20,After 17 years of EDs and diets... this is the...,I just wanted to give a heartfelt message to a...,50,15,https://www.reddit.com/r/EatingDisorders/comme...
...,...,...,...,...,...,...,...,...
164,195kgm8,Individual-Owl-9429,2024-01-13 09:41:50,28F Struggling mom,I had my first baby in July 2020 and my love f...,1,0,https://www.reddit.com/r/EatingDisorders/comme...
165,190ogww,ismircappuccino,2024-01-07 09:28:05,idk,i have an ed for 4 years now. i tried so many ...,4,0,https://www.reddit.com/r/EatingDisorders/comme...
166,18zs11x,ClueSmooth6913,2024-01-06 05:02:12,how do i get therapy,hi im 13 and for the past few months i’ve star...,16,18,https://www.reddit.com/r/EatingDisorders/comme...
167,18y8y21,Throwaway196747219,2024-01-04 09:16:50,How do I break it to a potential bf,I’ll try keep this short but ig there’s a lot....,1,0,https://www.reddit.com/r/EatingDisorders/comme...


### *2.2. Pulling the Comments belonging to the Submissions above*

In [8]:
# Creating dictionary for the comments
comments_dict = {
    "comment_id": [],
    "comment_submission_id": [],
    "comment_parent_id": [],
    "comment_author": [],
    "comment_body": [],
    "comment_score": [],
    "comment_replies": [],
    "comment_link": [],
    "comment_created": []
}

In [9]:
# Helper function to recursively count replies
def count_replies(comment):
    count = 0
    for reply in comment.replies:
        count += 1 + count_replies(reply)
    return count

In [10]:
# Assuming you have a DataFrame called submission_data with a column 'id'
for submission_id in submissions_data['id']:
    try:
        submission = reddit.submission(id=submission_id)

        # Iterate until there are no more "MoreComments" objects
        while True:
            try:
                submission.comments.replace_more(limit=None)
                break  # Break out of the loop if successful
            except Exception as e:
                print(f"Error: {e}")
                # Add a delay or handle the error as needed, then retry

        # Iterate through all comments
        for comment in submission.comments.list():
            comments_dict["comment_id"].append(comment.id)
            comments_dict["comment_submission_id"].append(comment.link_id)
            comments_dict["comment_parent_id"].append(comment.parent_id)
            comments_dict["comment_author"].append(comment.author.name if comment.author else None)
            comments_dict["comment_body"].append(comment.body)
            comments_dict["comment_score"].append(comment.score)
            comments_dict["comment_replies"].append(count_replies(comment))
            comments_dict["comment_link"].append(f"https://www.reddit.com{comment.permalink}")
            comments_dict["comment_created"].append(datetime.utcfromtimestamp(comment.created_utc))

            # Introduce a sleep to avoid API rate limits
            time.sleep(2)

    except Exception as e:
        print(f"Error processing submission with ID {submission_id}: {e}")
        continue

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

In [11]:
# Transforming the dictionary into a data frame
comments_data = pd.DataFrame(comments_dict)
comments_data

Unnamed: 0,comment_id,comment_submission_id,comment_parent_id,comment_author,comment_body,comment_score,comment_replies,comment_link,comment_created
0,kgoompu,t3_19084n7,t3_19084n7,HeyitsmeBeae,I’m just commenting that I experienced the sam...,6,2,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 04:00:29
1,kgpeywu,t3_19084n7,t3_19084n7,Pleasant_Contest_350,Hi OP! Im sorry you’re feeling so overwhelmed ...,4,7,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 08:03:43
2,kgrkepc,t3_19084n7,t3_19084n7,funkydyke,Work with a dietitian and a therapist. Take it...,1,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 18:41:11
3,kgutv60,t3_19084n7,t1_kgoompu,musingsofamdc,A doctor gave you weight loss meds during ED r...,1,1,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-08 05:52:42
4,kgpfdd1,t3_19084n7,t1_kgpeywu,Pleasant_Contest_350,The other thing I remember from what I learned...,3,4,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 08:08:21
...,...,...,...,...,...,...,...,...,...
801,kgyfrbh,t3_18zs11x,t1_kgojwkh,Gear_Familiar,"Awesome! I’m glad that book helped, I’ll look ...",1,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-08 21:37:19
802,kgqs5d3,t3_18zs11x,t1_kgqlmiv,harleydarling1793,"You’re so welcome! Highly recommend her, she’s...",2,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 15:59:34
803,kgojhkn,t3_18zs11x,t1_kgm9efs,ClueSmooth6913,yeah but next school year i will be going to p...,0,1,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 03:24:05
804,kgqlbcr,t3_18zs11x,t1_kgqflsl,ClueSmooth6913,thank u for the resource!,2,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 15:14:58


## 3. Selected users' comments

### *3.1. Pulling the Comments*

In [12]:
# Creating dictionary for the comments
comments_dict = {
    "comment_id": [],
    "comment_submission_id": [],
    "comment_parent_id": [],
    "comment_author": [],
    "comment_body": [],
    "comment_score": [],
    "comment_replies": [],
    "comment_link": [],
    "comment_created": []
}

In [13]:
# Helper function to recursively count replies
def count_replies(comment):
    count = 0
    for reply in comment.replies:
        count += 1 + count_replies(reply)
    return count

In [14]:
# Iterate through each username in the list
for username in usernames:
    try:
        # Get the user's comments in the subreddit
        user = reddit.redditor(username)
        # Attempt to access the 'id' attribute to check if the user exists
        user_id = user.id
        comments = user.comments.new(limit=None)

        # Iterate through the user's comments
        for comment in comments:
            if comment.subreddit.display_name.lower() == subreddit:
                comments_dict["comment_id"].append(comment.id)
                comments_dict["comment_submission_id"].append(comment.link_id)
                comments_dict["comment_parent_id"].append(comment.parent_id)
                comments_dict["comment_author"].append(comment.author.name if comment.author else None)
                comments_dict["comment_body"].append(comment.body)
                comments_dict["comment_score"].append(comment.score)
                comments_dict["comment_replies"].append(count_replies(comment))
                comments_dict["comment_link"].append(f"https://www.reddit.com{comment.permalink}")
                comments_dict["comment_created"].append(datetime.utcfromtimestamp(comment.created_utc))

        # Introduce a sleep to avoid API rate limits
        time.sleep(2)

    # Print non-existing, suspended, or deleted usernames
    except NotFound:
        print(f"User '{username}' not found. Skipping...")
        continue

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

User 'alsegodfan' not found. Skipping...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

User 'ovelettersxoxo' not found. Skipping...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

User 'SubjectivelyAlone`' not found. Skipping...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

User 'opsided_Wrangler_55' not found. Skipping...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

In [15]:
# Transforming the dictionary into a data frame
comments_data2 = pd.DataFrame(comments_dict)
comments_data2

Unnamed: 0,comment_id,comment_submission_id,comment_parent_id,comment_author,comment_body,comment_score,comment_replies,comment_link,comment_created
0,kj6r2ba,t3_19cn90t,t1_kj6g7fo,Clumsycatlover,I wish I could :( but it’s not safe for my mom...,1,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-23 12:46:50
1,kj5arwi,t3_19cn90t,t1_kj57mq5,Clumsycatlover,I know. I don’t really know haha? She just won...,1,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-23 03:43:32
2,kj56zdr,t3_19cn90t,t1_kj56qo4,Clumsycatlover,Im 13. My therapist doesn’t think my ED was ba...,1,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-23 03:17:20
3,kj55b78,t3_19cn90t,t1_kj544v4,Clumsycatlover,Thanks. It’s difficult being in recovery when ...,1,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-23 03:06:03
4,kj532z5,t3_19cn90t,t1_kj4y81e,Clumsycatlover,I suppose what I neglected to include is that ...,1,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-23 02:51:09
...,...,...,...,...,...,...,...,...,...
1791,khrf65f,t3_195lox8,t3_195lox8,goosechickens,I have to set a timer for everything to distra...,3,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-14 02:42:40
1792,kgqlmiv,t3_18zs11x,t1_kgp663m,ClueSmooth6913,thx for that! i haven’t ever heard of that bef...,2,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 15:17:02
1793,kgqlbcr,t3_18zs11x,t1_kgqflsl,ClueSmooth6913,thank u for the resource!,2,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 15:14:58
1794,kgojwkh,t3_18zs11x,t1_kglwz4n,ClueSmooth6913,thank u for the book suggestion! i’ve just fin...,2,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 03:27:03


### *3.2. Pulling the Submissions belonging to the Comments above*

In [16]:
# Extract submission IDs from the comment_submission_id column
comments_data2['submission_id'] = comments_data2['comment_submission_id'].str.replace('t3_', '')

# Remove duplicates and keep the unique submission IDs
unique_submission_ids = comments_data2['submission_id'].unique()

In [17]:
# Creating dictionary for the posts
submissions_dict = {"id" : [],
             "author" : [],
             "created": [],
             "title" : [],
             "body" : [],
             "score" : [],
             "comms_num": [],
             "url" : []
             }

In [18]:
# Iterate through unique submission IDs
for submission_id in unique_submission_ids:

    # Get the submission by its ID
    submission = reddit.submission(id=submission_id)

    # Append submission data to the dictionary
    submissions_dict["author"].append(submission.author)
    submissions_dict["title"].append(submission.title)
    submissions_dict["score"].append(submission.score)
    submissions_dict["id"].append(submission.id)
    submissions_dict["url"].append(submission.url)
    submissions_dict["comms_num"].append(submission.num_comments)
    submissions_dict["created"].append(datetime.utcfromtimestamp(submission.created))
    submissions_dict["body"].append(submission.selftext)

    # Introduce a sleep to avoid API rate limits
    time.sleep(2)

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

In [19]:
# Transforming the dictionary into a data frame
submissions_data2 = pd.DataFrame(submissions_dict)
submissions_data2

Unnamed: 0,id,author,created,title,body,score,comms_num,url
0,19cn90t,Clumsycatlover,2024-01-22 04:30:43,Triggered by my aunt,"Basically, for context disordered eating runs ...",2,11,https://www.reddit.com/r/EatingDisorders/comme...
1,192xbj3,ReturnOwn1757,2024-01-10 02:35:19,How to stop counting calories in college?,I'm fulyl into my recovery and intuitively eat...,12,7,https://www.reddit.com/r/EatingDisorders/comme...
2,192txfv,lumpy_space_queenie,2024-01-10 00:00:35,Recovery Burps…,"For recovering bulimics (or anyone, it may not...",4,5,https://www.reddit.com/r/EatingDisorders/comme...
3,192sv2n,Ransacked_Tiger186,2024-01-09 23:15:43,Fixing damaged metabolism?,I cut my caloric intake in half to get back to...,6,4,https://www.reddit.com/r/EatingDisorders/comme...
4,192z6g2,,2024-01-10 04:09:29,New Med/Weight Gain,[deleted],4,1,
...,...,...,...,...,...,...,...,...
731,19f4p5r,Allen892,2024-01-25 08:26:59,"I want to gain weight, hlp plz?",I struggle with my mental health and as a resu...,8,5,https://www.reddit.com/r/EatingDisorders/comme...
732,1943sj7,,2024-01-11 15:05:21,When does support become enabling?,[deleted],3,2,
733,194dlby,marmeladeinthesky,2024-01-11 21:51:43,How do I lose weight without losing myself?,"I am 7 years into recovery, after restricting ...",2,2,https://www.reddit.com/r/EatingDisorders/comme...
734,194uth5,Bubbly-Philosophy-52,2024-01-12 13:16:34,Completely lost my appetite,So I've struggled with eating disorders for ye...,2,2,https://www.reddit.com/r/EatingDisorders/comme...


### *3.3. Pulling the Comments belonging to the Submissions above*

In [20]:
# Creating dictionary for the comments
comments_dict = {
    "comment_id": [],
    "comment_submission_id": [],
    "comment_parent_id": [],
    "comment_author": [],
    "comment_body": [],
    "comment_score": [],
    "comment_replies": [],
    "comment_link": [],
    "comment_created": []
}

In [21]:
# Assuming you have a DataFrame called submission_data with a column 'id'
for submission_id in submissions_data2['id']:
    try:
        submission = reddit.submission(id=submission_id)

        # Iterate until there are no more "MoreComments" objects
        while True:
            try:
                submission.comments.replace_more(limit=None)
                break  # Break out of the loop if successful
            except Exception as e:
                print(f"Error: {e}")
                # Add a delay or handle the error as needed, then retry

        # Iterate through all comments
        for comment in submission.comments.list():
            comments_dict["comment_id"].append(comment.id)
            comments_dict["comment_submission_id"].append(comment.link_id)
            comments_dict["comment_parent_id"].append(comment.parent_id)
            comments_dict["comment_author"].append(comment.author.name if comment.author else None)
            comments_dict["comment_body"].append(comment.body)
            comments_dict["comment_score"].append(comment.score)
            comments_dict["comment_replies"].append(count_replies(comment))
            comments_dict["comment_link"].append(f"https://www.reddit.com{comment.permalink}")
            comments_dict["comment_created"].append(datetime.utcfromtimestamp(comment.created_utc))

            # Introduce a sleep to avoid API rate limits
            time.sleep(2)

    except Exception as e:
        print(f"Error processing submission with ID {submission_id}: {e}")
        continue

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

In [22]:
# Transforming the dictionary into a data frame
comments_data3 = pd.DataFrame(comments_dict)
comments_data3

Unnamed: 0,comment_id,comment_submission_id,comment_parent_id,comment_author,comment_body,comment_score,comment_replies,comment_link,comment_created
0,kj4y81e,t3_19cn90t,t3_19cn90t,,My aunt purposely triggered me then went on to...,2,10,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-23 02:19:29
1,kj532z5,t3_19cn90t,t1_kj4y81e,Clumsycatlover,I suppose what I neglected to include is that ...,1,9,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-23 02:51:09
2,kj544v4,t3_19cn90t,t1_kj532z5,,I’m sorry 😢 but she’s not your responsibility....,1,8,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-23 02:58:08
3,kj55b78,t3_19cn90t,t1_kj544v4,Clumsycatlover,Thanks. It’s difficult being in recovery when ...,1,7,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-23 03:06:03
4,kj56qo4,t3_19cn90t,t1_kj55b78,,"🙁 honestly I wish. You might not know it, but ...",1,6,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-23 03:15:39
...,...,...,...,...,...,...,...,...,...
7483,khsh8q8,t3_195lox8,t3_195lox8,mysupersalami,I had to resign from a job I absolutely loved ...,3,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-14 08:19:13
7484,khrf65f,t3_195lox8,t3_195lox8,goosechickens,I have to set a timer for everything to distra...,3,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-14 02:42:40
7485,khqtr32,t3_195lox8,t3_195lox8,Lonely-Iron-1038,lol. barely 🥲,2,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-14 00:23:50
7486,khy1fzq,t3_195lox8,t3_195lox8,Sea-Fix-3520,I got chronic fatigue syndrome and had to go o...,2,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-15 09:19:25


## 4. Additional Data Pull

### *4.1. Benchmark Submissions*

In [23]:
# Creating dictionary for the submissions
submissions_dict = {
    "id": [],
    "author": [],
    "created": [],
    "title": [],
    "body": [],
    "score": [],
    "comms_num": [],
    "url": []
}

In [24]:
# Iterate through each submission ID in the list
for submission_id in submission_ids:

    # Get the submission by ID
    submission = reddit.submission(id=submission_id)

    # Collect data for the current submission
    submissions_dict["id"].append(submission.id)
    submissions_dict["author"].append(submission.author.name if submission.author else None)
    submissions_dict["title"].append(submission.title)
    submissions_dict["score"].append(submission.score)
    submissions_dict["url"].append(submission.url)
    submissions_dict["comms_num"].append(submission.num_comments)
    submissions_dict["created"].append(datetime.utcfromtimestamp(submission.created_utc))
    submissions_dict["body"].append(submission.selftext)

    # Introduce a sleep to avoid API rate limits
    time.sleep(2)

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

In [25]:
# Transforming the dictionary into a data frame
submissions_data3 = pd.DataFrame(submissions_dict)

### *4.2. Benchmark Submissions' Comments*

In [26]:
# Creating dictionary for the comments
comments_dict = {
    "comment_id": [],
    "comment_submission_id": [],
    "comment_parent_id": [],
    "comment_author": [],
    "comment_body": [],
    "comment_score": [],
    "comment_replies": [],
    "comment_link": [],
    "comment_created": []
}

In [27]:
# Iterate through each submission ID in the list
for submission_id in submission_ids:
    try:
        # Get the submission by ID
        submission = reddit.submission(id=submission_id)

        # Get all comments for the current submission
        submission.comments.replace_more(limit=None)
        comments = submission.comments.list()

        # Iterate through the comments
        for comment in comments:
            comments_dict["comment_id"].append(comment.id)
            comments_dict["comment_submission_id"].append(comment.link_id)
            comments_dict["comment_parent_id"].append(comment.parent_id)
            comments_dict["comment_author"].append(comment.author.name if comment.author else None)
            comments_dict["comment_body"].append(comment.body)
            comments_dict["comment_score"].append(comment.score)
            comments_dict["comment_replies"].append(count_replies(comment))
            comments_dict["comment_link"].append(f"https://www.reddit.com{comment.permalink}")
            comments_dict["comment_created"].append(datetime.utcfromtimestamp(comment.created_utc))

        # Introduce a sleep to avoid API rate limits
        time.sleep(2)

    # Print message if the submission is not found
    except NotFound:
        print(f"Submission not found with ID '{submission_id}'. Skipping...")
        continue

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

In [28]:
# Transforming the dictionary into a data frame
comments_data4 = pd.DataFrame(comments_dict)
comments_data4

Unnamed: 0,comment_id,comment_submission_id,comment_parent_id,comment_author,comment_body,comment_score,comment_replies,comment_link,comment_created
0,c8munu7,t3_19cn90,t3_19cn90,shavinghobbit,Blueberries and whipped cream\n\nSugar free je...,12,3,https://www.reddit.com/r/keto/comments/19cn90/...,2013-02-27 22:28:29
1,c8mzmfy,t3_19cn90,t3_19cn90,googleyeye,My go to is [Blue Diamond Oven Roasted Dark Ch...,11,2,https://www.reddit.com/r/keto/comments/19cn90/...,2013-02-28 02:34:20
2,c8n37xr,t3_19cn90,t3_19cn90,,this is a fantastic cheesecake cupcake recipe\...,5,2,https://www.reddit.com/r/keto/comments/19cn90/...,2013-02-28 05:34:39
3,c8nc3u1,t3_19cn90,t3_19cn90,calabim,Aaaaaand saved.,4,0,https://www.reddit.com/r/keto/comments/19cn90/...,2013-02-28 18:05:07
4,c8mx7ep,t3_19cn90,t3_19cn90,spofoman,Lindt 90% Cocoa Dark Chocolate \n\nNutiva Coco...,3,0,https://www.reddit.com/r/keto/comments/19cn90/...,2013-02-28 00:33:19
...,...,...,...,...,...,...,...,...,...
618,kgyfrbh,t3_18zs11x,t1_kgojwkh,Gear_Familiar,"Awesome! I’m glad that book helped, I’ll look ...",1,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-08 21:37:19
619,kgqs5d3,t3_18zs11x,t1_kgqlmiv,harleydarling1793,"You’re so welcome! Highly recommend her, she’s...",2,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 15:59:34
620,kgojhkn,t3_18zs11x,t1_kgm9efs,ClueSmooth6913,yeah but next school year i will be going to p...,0,1,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 03:24:05
621,kgqlbcr,t3_18zs11x,t1_kgqflsl,ClueSmooth6913,thank u for the resource!,2,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 15:14:58


### *4.3. Benchmark Comments*

In [29]:
# Creating dictionary for the comments
comments_dict = {
    "comment_id": [],
    "comment_submission_id": [],
    "comment_parent_id": [],
    "comment_author": [],
    "comment_body": [],
    "comment_score": [],
    "comment_replies": [],
    "comment_link": [],
    "comment_created": []
}

In [30]:
# Iterate through each comment link in the list
for comment_link in comment_links:
    try:
        # Get the comment by URL
        comment = reddit.comment(url=comment_link)

        # Collect data for the current comment
        comments_dict["comment_id"].append(comment.id)
        comments_dict["comment_submission_id"].append(comment.link_id)
        comments_dict["comment_parent_id"].append(comment.parent_id)
        comments_dict["comment_author"].append(comment.author.name if comment.author else None)
        comments_dict["comment_body"].append(comment.body)
        comments_dict["comment_score"].append(comment.score)
        comments_dict["comment_replies"].append(count_replies(comment))
        comments_dict["comment_link"].append(comment_link)
        comments_dict["comment_created"].append(datetime.utcfromtimestamp(comment.created_utc))

        # Introduce a sleep to avoid API rate limits
        time.sleep(2)

    # Print message if the comment is not found
    except NotFound:
        print(f"Comment not found at link '{comment_link}'. Skipping...")
        continue

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

In [31]:
# Transforming the dictionary into a data frame
comments_data5 = pd.DataFrame(comments_dict)
comments_data5

Unnamed: 0,comment_id,comment_submission_id,comment_parent_id,comment_author,comment_body,comment_score,comment_replies,comment_link,comment_created
0,jehxfi1,t3_1286dek,t3_1286dek,cleanhouz,Three treatment centers and two years later I ...,16,0,https://www.reddit.com/r/EatingDisorders/comme...,2023-04-01 03:49:23
1,j67ynxg,t3_10n5kb0,t3_10n5kb0,Germmira,I've had anorexia and periods of experiencing ...,2,0,https://www.reddit.com/r/EatingDisorders/comme...,2023-01-28 11:00:25
2,dmfg6rf,t3_6xdfqw,t3_6xdfqw,littlesoubrette,I've been in recovery for two years and would ...,3,0,https://www.reddit.com/r/EatingDisorders/comme...,2017-09-01 15:48:58
3,kgkvxxn,t3_18z9k90,t3_18z9k90,mindfullymoving,I entered quasi recovery 10 years ago.\n\nI at...,3,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-06 13:20:52
4,kgmbtt4,t3_18z9k90,t3_18z9k90,Gear_Familiar,"Oh dear, recovery and healing are not a recipe...",2,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-06 19:04:09
5,kfacbqw,t3_18sdtgv,t3_18sdtgv,manders556,Hello (25 f) struggled with anorexia purge sub...,3,0,https://www.reddit.com/r/EatingDisorders/comme...,2023-12-28 16:55:23
6,kdpepgb,t3_18jlnt7,t3_18jlnt7,SomaticSpacePrincess,I didnt come to terms with the fact i had an E...,3,0,https://www.reddit.com/r/EatingDisorders/comme...,2023-12-17 02:53:58
7,kfblbe3,t3_18qz4ui,t3_18qz4ui,manders556,"Hello, (25 f fully recovered now) \n\nthis giv...",1,0,https://www.reddit.com/r/EatingDisorders/comme...,2023-12-28 21:21:26
8,kdu0sdw,t3_18jtpqd,t3_18jtpqd,fineboifranz,I've had trouble eating/ED since early stages ...,1,0,https://www.reddit.com/r/EatingDisorders/comme...,2023-12-18 01:31:37
9,kbn6448,t3_1881mi8,t3_1881mi8,gaykoalas,"I was in the same boat, recovered in my 20s af...",1,0,https://www.reddit.com/r/EatingDisorders/comme...,2023-12-02 04:09:59


### *4.4. Benchmark Comments' Submissions*

In [32]:
# Extract submission IDs from the comment_submission_id column
comments_data4['submission_id'] = comments_data4['comment_submission_id'].str.replace('t3_', '')

# Remove duplicates and keep the unique submission IDs
unique_submission_ids = comments_data4['submission_id'].unique()

In [33]:
# Creating dictionary for the posts
submissions_dict = {"id" : [],
             "author" : [],
             "created": [],
             "title" : [],
             "body" : [],
             "score" : [],
             "comms_num": [],
             "url" : []
             }

In [34]:
# Iterate through unique submission IDs
for submission_id in unique_submission_ids:

    # Get the submission by its ID
    submission = reddit.submission(id=submission_id)

    # Append submission data to the dictionary
    submissions_dict["author"].append(submission.author)
    submissions_dict["title"].append(submission.title)
    submissions_dict["score"].append(submission.score)
    submissions_dict["id"].append(submission.id)
    submissions_dict["url"].append(submission.url)
    submissions_dict["comms_num"].append(submission.num_comments)
    submissions_dict["created"].append(datetime.utcfromtimestamp(submission.created))
    submissions_dict["body"].append(submission.selftext)

    # Introduce a sleep to avoid API rate limits
    time.sleep(2)

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

In [35]:
# Transforming the dictionary into a data frame
submissions_data4 = pd.DataFrame(submissions_dict)
submissions_data4

Unnamed: 0,id,author,created,title,body,score,comms_num,url
0,19cn90,snowandbaggypants,2013-02-27 21:09:37,"Ketoers with a sweet tooth, what's your favori...",I have an insatiable sweet tooth. Luckily I've...,25,49,https://www.reddit.com/r/keto/comments/19cn90/...
1,19084n7,DrumNBeans,2024-01-06 19:39:03,Help…,I am in need of advice. \n\nI am in my fourth ...,11,12,https://www.reddit.com/r/EatingDisorders/comme...
2,18yp4it,mindfullymoving,2024-01-04 21:55:20,After 17 years of EDs and diets... this is the...,I just wanted to give a heartfelt message to a...,49,15,https://www.reddit.com/r/EatingDisorders/comme...
3,1775ux4,BoringButterscotch39,2023-10-13 18:53:02,Sharing some things that helped ME during reco...,"\n** only read if seeking recovery - also, not...",10,1,https://www.reddit.com/r/EatingDisorders/comme...
4,15q8qt2,AbbeyGirl4876,2023-08-13 20:04:59,I’m winning this thing called Ed on my own,"Hi, guys. I have been up and down in my recove...",12,7,https://www.reddit.com/r/EatingDisorders/comme...
...,...,...,...,...,...,...,...,...
65,198qec4,ElectricalBenefit666,2024-01-17 06:52:09,how to start recovery by myself though unmotiv...,backstory and i’m just going to be completely ...,8,4,https://www.reddit.com/r/EatingDisorders/comme...
66,197km9z,Acrobatic_Bed_431,2024-01-15 21:32:38,How can I stop emotional eating?,"So in the past year, I’ve had one of the harde...",9,8,https://www.reddit.com/r/EatingDisorders/comme...
67,1974pf3,No-Sport-8950,2024-01-15 09:23:27,Starter foods,My family’s a strict vegan and have pushed veg...,7,3,https://www.reddit.com/r/EatingDisorders/comme...
68,190ogww,ismircappuccino,2024-01-07 09:28:05,idk,i have an ed for 4 years now. i tried so many ...,3,0,https://www.reddit.com/r/EatingDisorders/comme...


### *4.5. Benchmark Comments' Submissions' Comments*

In [36]:
# Creating dictionary for the comments
comments_dict = {
    "comment_id": [],
    "comment_submission_id": [],
    "comment_parent_id": [],
    "comment_author": [],
    "comment_body": [],
    "comment_score": [],
    "comment_replies": [],
    "comment_link": [],
    "comment_created": []
}

In [37]:
# Iterate through each submission ID in the DataFrame
for submission_id in submissions_data4['id']:
    try:
        # Get the submission by ID
        submission = reddit.submission(id=submission_id)

        # Get all comments for the current submission
        submission.comments.replace_more(limit=None)
        comments = submission.comments.list()

        # Iterate through the comments
        for comment in comments:
            comments_dict["comment_id"].append(comment.id)
            comments_dict["comment_submission_id"].append(comment.link_id)
            comments_dict["comment_parent_id"].append(comment.parent_id)
            comments_dict["comment_author"].append(comment.author.name if comment.author else None)
            comments_dict["comment_body"].append(comment.body)
            comments_dict["comment_score"].append(comment.score)
            comments_dict["comment_replies"].append(count_replies(comment))
            comments_dict["comment_link"].append(f"https://www.reddit.com{comment.permalink}")
            comments_dict["comment_created"].append(datetime.utcfromtimestamp(comment.created_utc))

        # Introduce a sleep to avoid API rate limits
        time.sleep(2)

    # Print message if the submission is not found
    except NotFound:
        print(f"Submission not found with ID '{submission_id}'. Skipping...")
        continue

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

In [38]:
# Transforming the dictionary into a data frame
comments_data6 = pd.DataFrame(comments_dict)
comments_data6

Unnamed: 0,comment_id,comment_submission_id,comment_parent_id,comment_author,comment_body,comment_score,comment_replies,comment_link,comment_created
0,c8munu7,t3_19cn90,t3_19cn90,shavinghobbit,Blueberries and whipped cream\n\nSugar free je...,13,3,https://www.reddit.com/r/keto/comments/19cn90/...,2013-02-27 22:28:29
1,c8mzmfy,t3_19cn90,t3_19cn90,googleyeye,My go to is [Blue Diamond Oven Roasted Dark Ch...,11,2,https://www.reddit.com/r/keto/comments/19cn90/...,2013-02-28 02:34:20
2,c8n37xr,t3_19cn90,t3_19cn90,,this is a fantastic cheesecake cupcake recipe\...,5,2,https://www.reddit.com/r/keto/comments/19cn90/...,2013-02-28 05:34:39
3,c8nc3u1,t3_19cn90,t3_19cn90,calabim,Aaaaaand saved.,5,0,https://www.reddit.com/r/keto/comments/19cn90/...,2013-02-28 18:05:07
4,c8mx7ep,t3_19cn90,t3_19cn90,spofoman,Lindt 90% Cocoa Dark Chocolate \n\nNutiva Coco...,3,0,https://www.reddit.com/r/keto/comments/19cn90/...,2013-02-28 00:33:19
...,...,...,...,...,...,...,...,...,...
618,kgyfrbh,t3_18zs11x,t1_kgojwkh,Gear_Familiar,"Awesome! I’m glad that book helped, I’ll look ...",1,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-08 21:37:19
619,kgqs5d3,t3_18zs11x,t1_kgqlmiv,harleydarling1793,"You’re so welcome! Highly recommend her, she’s...",2,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 15:59:34
620,kgojhkn,t3_18zs11x,t1_kgm9efs,ClueSmooth6913,yeah but next school year i will be going to p...,0,1,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 03:24:05
621,kgqlbcr,t3_18zs11x,t1_kgqflsl,ClueSmooth6913,thank u for the resource!,2,0,https://www.reddit.com/r/EatingDisorders/comme...,2024-01-07 15:14:58


## 5. Merging and Saving the Data

### *5.1. Submissions Data*

In [39]:
# Merge the two submissions data frames
merged_submissions = pd.concat([submissions_data, submissions_data2, submissions_data3, submissions_data4], ignore_index=True)

# Deduplicate based on all columns
merged_submissions_final = merged_submissions.drop_duplicates()
merged_submissions_final

# Double check
row_count_submissions_data = submissions_data.shape[0]
print("Number of rows in submissions_data:", row_count_submissions_data)

row_count_submissions_data2 = submissions_data2.shape[0]
print("Number of rows in submissions_data2:", row_count_submissions_data2)

row_count_submissions_data3 = submissions_data3.shape[0]
print("Number of rows in submissions_data3:", row_count_submissions_data3)

row_count_submissions_data4 = submissions_data4.shape[0]
print("Number of rows in submissions_data4:", row_count_submissions_data4)

row_count_merged_submissions = merged_submissions.shape[0]
print("Number of rows in merged_submissions:", row_count_merged_submissions)

row_count_merged_submissions_final = merged_submissions_final.shape[0]
print("Number of rows in merged_submissions_final:", row_count_merged_submissions_final)

Number of rows in submissions_data: 169
Number of rows in submissions_data2: 736
Number of rows in submissions_data3: 103
Number of rows in submissions_data4: 70
Number of rows in merged_submissions: 1078
Number of rows in merged_submissions_final: 999


In [None]:
# Save dataset to the  Drive
merged_submissions_final.to_excel('/content/drive/My Drive/2_Lund/2_SIMZ_Thesis/3.3_submissions_data.xlsx', index=False)
print("Excel file saved to:", excel_file_path)

### *5.2. Comments Data*

In [41]:
comments_data['submission_id'] = comments_data['comment_submission_id'].str.replace('t3_', '')
comments_data2['submission_id'] = comments_data2['comment_submission_id'].str.replace('t3_', '')
comments_data3['submission_id'] = comments_data3['comment_submission_id'].str.replace('t3_', '')
comments_data4['submission_id'] = comments_data4['comment_submission_id'].str.replace('t3_', '')
comments_data5['submission_id'] = comments_data5['comment_submission_id'].str.replace('t3_', '')
comments_data6['submission_id'] = comments_data6['comment_submission_id'].str.replace('t3_', '')

In [42]:
# Merge the two comments data frames
merged_comments = pd.concat([comments_data, comments_data2, comments_data3, comments_data4, comments_data5, comments_data6], ignore_index=True)

# Deduplicate based on all columns
merged_comments_final = merged_comments.drop_duplicates()
merged_comments_final

# Double check
row_count_comments_data = comments_data.shape[0]
print("Number of rows in comments_data:", row_count_comments_data)

row_count_comments_data2 = comments_data2.shape[0]
print("Number of rows in comments_data2:", row_count_comments_data2)

row_count_comments_data3 = comments_data3.shape[0]
print("Number of rows in comments_data3:", row_count_comments_data3)

row_count_comments_data4 = comments_data4.shape[0]
print("Number of rows in comments_data4:", row_count_comments_data4)

row_count_comments_data5 = comments_data5.shape[0]
print("Number of rows in comments_data5:", row_count_comments_data5)

row_count_comments_data6 = comments_data6.shape[0]
print("Number of rows in comments_data6:", row_count_comments_data6)

row_count_merged_comments = merged_comments.shape[0]
print("Number of rows in merged_comments:", row_count_merged_comments)

row_count_merged_comments_final = merged_comments_final.shape[0]
print("Number of rows in merged_comments_final:", row_count_merged_comments_final)

Number of rows in comments_data: 806
Number of rows in comments_data2: 1796
Number of rows in comments_data3: 7488
Number of rows in comments_data4: 623
Number of rows in comments_data5: 44
Number of rows in comments_data6: 623
Number of rows in merged_comments: 11380
Number of rows in merged_comments_final: 8432


In [None]:
# Save dataset to the  Drive
merged_comments_final.to_excel('/content/drive/My Drive/2_Lund/2_SIMZ_Thesis/3.4_comments_data.xlsx', index=False)
print("Excel file saved to:", excel_file_path)