## Importing Libraries

In [None]:
import time
import os
from prawcore.exceptions import ServerError
import praw
import csv

## CSV File Containing 100 Subreddits

In [9]:
# Authenticate against the Reddit API. Credentials are taken from the
# environment when set, so real secrets never have to be saved in the
# notebook; the masked placeholders remain as the fallback values.
reddit = praw.Reddit(
    client_id=os.environ.get('REDDIT_CLIENT_ID', '*****'),
    client_secret=os.environ.get('REDDIT_CLIENT_SECRET', '*****'),
    user_agent=os.environ.get('REDDIT_USER_AGENT', '*****'),
    username=os.environ.get('REDDIT_USERNAME', '*****'),
    password=os.environ.get('REDDIT_PASSWORD', '*****')
)

# Fetch the "popular" subreddit listing. Note this is Reddit's popularity
# listing, not a subscriber ranking, which is why it is re-sorted below.
popular_subreddits = reddit.subreddits.popular(limit=None)

# Rank by subscriber count (largest first) and keep the top 100.
subreddits = sorted(popular_subreddits, key=lambda x: x.subscribers, reverse=True)[:100]

# One CSV row per subreddit: display name and subscriber count.
data = [[subreddit.display_name, subreddit.subscribers] for subreddit in subreddits]

# Write the ranking to disk. The encoding is pinned to UTF-8 because the
# collector's retrieve_subreddit_names() reads this file back as UTF-8.
with open('subreddits100.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Name', 'Population'])
    writer.writerows(data)

## Dataset Collection

In [8]:
class RedditDataCollector:
    """Collect top-post data for a list of subreddits and save it as CSV files.

    The pipeline reads subreddit names from a CSV file, fetches the top posts
    of the past year for each one, and writes one ``<name>_data.csv`` file per
    subreddit into an output directory.
    """

    def __init__(self, client_id, client_secret, user_agent, username, password):
        """Create the PRAW client (``self.reddit``) used by the other methods."""
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent,
            username=username,
            password=password
        )

    def retrieve_subreddit_names(self, csv_file_path):
        """Return the first-column value of every data row in a CSV file.

        The first row is treated as a header and skipped. Blank rows are
        ignored, and an empty file yields an empty list.

        Args:
            csv_file_path: Path of a UTF-8 CSV file whose first column holds
                subreddit names.

        Returns:
            list[str]: Subreddit names in file order.
        """
        with open(csv_file_path, 'r', newline='', encoding='utf-8') as csv_file:
            csv_reader = csv.reader(csv_file)
            # Default of None keeps an empty file from raising StopIteration.
            next(csv_reader, None)
            # csv.reader yields [] for blank lines; skip them instead of
            # failing on row[0].
            return [row[0] for row in csv_reader if row]

    def create_csv_writer(self, csv_file_name):
        """Open ``csv_file_name`` for writing and emit the header row.

        Args:
            csv_file_name: Path of the CSV file to create (overwritten if it
                already exists).

        Returns:
            tuple: ``(csv_file, csv_writer)``. The caller owns ``csv_file``
            and is responsible for closing it.
        """
        csv_file = open(csv_file_name, 'w', newline='', encoding='utf-8')
        try:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(['Subreddit', 'Population', 'Post Title', 'User ID', 'Users Who Commented', 'Upvotes', 'Date', 'Time'])
        except Exception:
            # Do not leak the handle if the header cannot be written.
            csv_file.close()
            raise
        return csv_file, csv_writer

    def retrieve_top_posts(self, subreddit_name, posts_per_subreddit, max_retries, retry_delay):
        """Fetch a subreddit's subscriber count and its top posts of the year.

        The request is attempted up to ``max_retries`` times; only
        ``ServerError`` triggers a retry, any other exception propagates.

        Args:
            subreddit_name: Name of the subreddit to query.
            posts_per_subreddit: Maximum number of posts to request.
            max_retries: Maximum number of attempts.
            retry_delay: Seconds to sleep between attempts.

        Returns:
            tuple: ``(population, top_posts)``. On success ``population`` is
            the value of ``subreddit.subscribers`` and ``top_posts`` is a list
            of submissions. If every attempt fails (or ``max_retries`` is 0)
            the result is ``(None, [])``.
        """
        subreddit = self.reddit.subreddit(subreddit_name)
        # Defaults returned when no attempt succeeds; previously top_posts was
        # left unbound in that case and the return raised UnboundLocalError.
        population = None
        top_posts = []
        retries = 0

        while retries < max_retries:
            try:
                population = subreddit.subscribers
                # The listing is lazy: materialise it here so a ServerError
                # raised while fetching is handled by this retry loop rather
                # than surfacing later while the caller iterates.
                top_posts = list(subreddit.top(time_filter='year', limit=posts_per_subreddit))
                break

            except ServerError as e:
                # A half-finished attempt must not look like a success.
                population = None
                top_posts = []
                print(f"An error occurred while retrieving top posts for subreddit {subreddit_name}: {e}")
                retries += 1
                if retries < max_retries:
                    print(f"Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                else:
                    print(f"Max retries exceeded. Skipping subreddit {subreddit_name}.")

        return population, top_posts

    def collect_data(self, subreddit_name, csv_writer, population, top_posts, time_delay):
        """Write one CSV row per post, then pause for ``time_delay`` seconds.

        Args:
            subreddit_name: Value written to the 'Subreddit' column.
            csv_writer: ``csv.writer`` that receives the rows.
            population: Value written to the 'Population' column.
            top_posts: Iterable of submissions to record.
            time_delay: Seconds to sleep once all posts have been written.
        """
        for post in top_posts:
            # Deleted accounts have no author object.
            user_id = post.author.name if post.author else None

            # Names of commenters whose account still exists; unexpanded
            # "more comments" placeholders are skipped.
            commenters = [
                comment.author.name
                for comment in post.comments.list()
                if not isinstance(comment, praw.models.MoreComments) and comment.author
            ]

            # The creation timestamp is rendered in this machine's local time zone.
            created = time.localtime(post.created_utc)
            post_date = time.strftime('%Y-%m-%d', created)
            post_time = time.strftime('%H:%M:%S', created)

            csv_writer.writerow([subreddit_name, population, post.title, user_id, ','.join(commenters), post.score, post_date, post_time])

        # The pause happens once per subreddit (after all of its posts), not per post.
        time.sleep(time_delay)

    def run_pipeline(self, csv_file_path, posts_per_subreddit, rate_limit, max_retries, retry_delay, output_directory):
        """Collect data for every subreddit listed in ``csv_file_path``.

        Args:
            csv_file_path: CSV file whose first column lists subreddit names.
            posts_per_subreddit: Maximum number of posts to request per subreddit.
            rate_limit: Divisor used to derive the pause between subreddits
                (``60 / rate_limit`` seconds); must be non-zero.
            max_retries: Maximum fetch attempts per subreddit.
            retry_delay: Seconds to sleep between fetch attempts.
            output_directory: Directory that receives one
                ``<subreddit>_data.csv`` file per subreddit; created if missing.
        """
        top_subreddits_names = self.retrieve_subreddit_names(csv_file_path)
        time_delay = 60 / rate_limit

        os.makedirs(output_directory, exist_ok=True)

        for subreddit_name in top_subreddits_names:
            csv_file_name = os.path.join(output_directory, f'{subreddit_name}_data.csv')
            csv_file, csv_writer = self.create_csv_writer(csv_file_name)

            # The with-block guarantees the file is closed even when fetching
            # or writing raises part-way through.
            with csv_file:
                population, top_posts = self.retrieve_top_posts(subreddit_name, posts_per_subreddit, max_retries, retry_delay)

                # population stays None when the subreddit was skipped.
                if population is not None:
                    self.collect_data(subreddit_name, csv_writer, population, top_posts, time_delay)

            print(f"Data collected and saved to {csv_file_name}.")

# Reddit API credentials. Values are taken from the environment when set, so
# real secrets never have to be saved in the notebook; the masked placeholders
# remain as the fallback values.
client_id = os.environ.get('REDDIT_CLIENT_ID', '*****')
client_secret = os.environ.get('REDDIT_CLIENT_SECRET', '*****')
user_agent = os.environ.get('REDDIT_USER_AGENT', '*****')
username = os.environ.get('REDDIT_USERNAME', '*****')
password = os.environ.get('REDDIT_PASSWORD', '*****')

# Other settings
csv_file_path = 'subreddits100.csv'   # input: subreddit names in the first column
posts_per_subreddit = 1000            # maximum number of top posts requested per subreddit
rate_limit = 60                       # the pipeline pauses 60 / rate_limit seconds after each subreddit
max_retries = 3                       # fetch attempts per subreddit before it is skipped
retry_delay = 5                       # seconds to wait between fetch attempts
output_directory = 'csv_files'        # one <subreddit>_data.csv file is written here per subreddit

# Create an instance of the RedditDataCollector class
collector = RedditDataCollector(client_id, client_secret, user_agent, username, password)

# Run the pipeline
collector.run_pipeline(csv_file_path, posts_per_subreddit, rate_limit, max_retries, retry_delay, output_directory)

Data collected and saved to csv_files\pcmasterrace_data.csv.
Data collected and saved to csv_files\videos_data.csv.
