# Scraped Political Reddit Posts
- r/hasan_piker
- r/destin
- r/… (TODO: name the remaining subreddit)

### Install dependencies

In [None]:
%pip install tqdm

## Define models

In [1]:
class Post:
    """The fields kept from one raw post dict, plus the post's comments.

    Every attribute is read from the raw dict with ``dict.get``; a key that
    is absent falls back to the default listed in ``_FIELDS``.
    """

    # (attribute name, key in the raw dict, default used when the key is absent)
    _FIELDS = (
        ("subreddit", "subreddit", ""),
        ("subreddit_id", "subreddit_id", ""),
        ("title", "title", ""),
        ("selftext", "selftext", ""),
        ("author", "author", ""),
        ("author_flair", "author_flair_text", ""),
        ("score", "score", 0),
        ("upvote_ratio", "upvote_ratio", 0.0),
        ("num_comments", "num_comments", 0),
        ("created_utc", "created_utc", 0),
        ("link_flair", "link_flair_text", ""),
        ("url", "url", ""),
        ("total_awards", "total_awards_received", 0),
        ("controversiality", "controversiality", 0),
        ("num_reports", "num_reports", 0),
    )

    def __init__(self, data):
        """Copy the fields listed in ``_FIELDS`` out of ``data``; start with no comments."""
        for attr_name, raw_key, fallback in self._FIELDS:
            setattr(self, attr_name, data.get(raw_key, fallback))
        self.comments = []

    def add_comment(self, comment_data):
        """Wrap ``comment_data`` in a ``Comment`` and append it to ``self.comments``."""
        self.comments.append(Comment(comment_data))

    def to_dict(self):
        """Return the post as a plain dict, with the comments serialized under "comments"."""
        serialized = {attr_name: getattr(self, attr_name) for attr_name, _, _ in self._FIELDS}
        serialized["comments"] = [entry.to_dict() for entry in self.comments]
        return serialized

class Comment:
    """The fields kept from one raw comment dict."""

    def __init__(self, data):
        """Read each kept field from ``data``, using a default when the key is absent."""
        read = data.get
        self.author = read("author", "")
        self.author_flair = read("author_flair_text", "")
        self.body = read("body", "")
        self.score = read("score", 0)
        self.depth = read("depth", 0)
        self.controversiality = read("controversiality", 0)

    def to_dict(self):
        """Return the comment's fields as a plain dict."""
        return dict(
            author=self.author,
            author_flair=self.author_flair,
            body=self.body,
            score=self.score,
            depth=self.depth,
            controversiality=self.controversiality,
        )

## Define functions to parse and clean the raw JSON file (~4 GB of data)

In [None]:
import json
from itertools import islice

from tqdm import tqdm

In [2]:
def parse_and_clean_json(input_file, output_file, chunk_size=1000):
    """Clean a raw dump and write the result to ``output_file``.

    The input is first parsed as one JSON document. An array is processed
    element by element, and a single object is treated as a one-record input
    (this is what a line-delimited file with exactly one line looks like).
    If the file is not one valid JSON document, it is re-read from the start
    and processed one line at a time.

    Args:
        input_file: Path of the raw file, read as UTF-8.
        output_file: Path handed to the processing helpers for the cleaned output.
        chunk_size: Number of records per progress step, passed through to
            the processing helpers.

    Raises:
        ValueError: If the file is a single JSON document that is neither an
            array nor an object.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        try:
            # Attempt to load the entire content as a single JSON document.
            json_data = json.load(f)
        except json.JSONDecodeError:
            # Not one document (e.g. several records, one per line): rewind
            # and fall back to line-delimited processing.
            f.seek(0)
            process_line_by_line(f, output_file, chunk_size)
            return

        # Processing happens outside the try block so that only a failure to
        # parse the input can trigger the line-by-line fallback.
        if isinstance(json_data, dict):
            # A lone object is a one-record input, not an error.
            json_data = [json_data]
        if not isinstance(json_data, list):
            raise ValueError("Expected a JSON array or line-delimited JSON")
        process_json_array(json_data, output_file, chunk_size)

def process_json_array(json_data, output_file, chunk_size):
    """Clean every record of ``json_data`` and write one JSON document per line.

    Args:
        json_data: List of raw records, each passed to ``clean_post_data``.
        output_file: Path of the file to write (UTF-8); existing content is overwritten.
        chunk_size: Number of records covered by each progress-bar step.
    """
    with open(output_file, 'w', encoding='utf-8') as sink:
        chunk_starts = range(0, len(json_data), chunk_size)
        # The progress bar advances once per chunk, not once per record.
        for start in tqdm(chunk_starts, desc="Processing JSON array"):
            for record in json_data[start:start + chunk_size]:
                json.dump(clean_post_data(record), sink)
                sink.write('\n')

def process_line_by_line(f, output_file, chunk_size):
    """Clean a line-delimited JSON stream and write one cleaned record per line.

    Lines are pulled from ``f`` ``chunk_size`` at a time, so only one chunk is
    held in memory rather than the whole input. Blank lines are ignored;
    lines that are not valid JSON are reported and skipped.

    Args:
        f: Open text file (or any iterable of lines) positioned at the first
            line to process.
        output_file: Path of the file to write (UTF-8); existing content is overwritten.
        chunk_size: Number of lines per progress-bar step; must be at least 1.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # iter() makes sure successive islice calls continue where the previous
    # chunk stopped, even if the caller passes a list instead of a file.
    lines = iter(f)

    with open(output_file, 'w', encoding='utf-8') as out, \
            tqdm(desc="Processing JSON lines", unit="chunk") as progress:
        while True:
            chunk = list(islice(lines, chunk_size))
            if not chunk:
                break
            for line in chunk:
                stripped = line.strip()
                if not stripped:
                    # A blank line (e.g. a trailing newline) is not a record.
                    continue
                try:
                    json_data = json.loads(stripped)
                except json.JSONDecodeError:
                    print(f"Skipping invalid JSON line: {line}")
                    continue
                json.dump(clean_post_data(json_data), out)
                out.write('\n')
            # The total number of chunks is unknown when streaming, so the
            # bar only counts the chunks completed so far.
            progress.update(1)

def clean_post_data(json_data):
    """Reduce one raw record to the cleaned dict produced by ``Post.to_dict``.

    Args:
        json_data: Raw record dict. Post fields are read from its "data" entry
            and comments from its "comments" entry, a mapping whose values are
            the raw comment dicts (the keys are ignored). Either entry may be
            missing or null, in which case it is treated as empty.

    Returns:
        dict: The cleaned post, including its cleaned comments.
    """
    # `or {}` also covers an explicit null, which `.get(key, {})` would pass
    # through as None and crash on below.
    post_data = json_data.get("data") or {}
    comments_data = json_data.get("comments") or {}

    post = Post(post_data)
    for comment_data in comments_data.values():
        post.add_comment(comment_data)

    return post.to_dict()

## Clean the data and save to an output file

In [3]:
# Usage: clean the 11-13-2024 dump, reporting progress every 500 records.
input_file = './json/input/posts-11-13-2024.json'
output_file = './json/output/cleaned-posts-11-13-2024.json'
parse_and_clean_json(
    input_file=input_file,
    output_file=output_file,
    chunk_size=500,
)

NameError: name 'json' is not defined