# Scraped Political Reddit Posts
- r/hasan_piker
- r/Destiny
- r/

### Install dependencies

In [8]:
%pip install pandas
%pip install tqdm

Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.1.3-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl (11.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.4/11.4 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading numpy-2.1.3-cp312-cp312-macosx_14_0_arm64.whl (5.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading pytz-2024.2-py2.py3-none-any.whl (508 kB)
Downloading tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, nump

## Define models

In [2]:
class Post:
    """A Reddit submission reduced to the fields kept in the cleaned dataset."""

    # One row per tracked field:
    # (attribute name / output key, key in the raw payload, fallback if the key is absent).
    # The row order is also the key order produced by to_dict().
    _FIELD_SPECS = (
        ("subreddit", "subreddit", ""),
        ("subreddit_id", "subreddit_id", ""),
        ("title", "title", ""),
        ("selftext", "selftext", ""),
        ("author", "author", ""),
        ("author_flair", "author_flair_text", ""),
        ("score", "score", 0),
        ("upvote_ratio", "upvote_ratio", 0.0),
        ("num_comments", "num_comments", 0),
        ("created_utc", "created_utc", 0),
        ("link_flair", "link_flair_text", ""),
        ("url", "url", ""),
        ("total_awards", "total_awards_received", 0),
        ("controversiality", "controversiality", 0),
        ("num_reports", "num_reports", 0),
    )

    def __init__(self, data):
        """Copy the tracked fields out of ``data`` and start with no comments.

        ``data`` must support ``get(key, default)``. The fallback is only used
        when a key is missing; a key that is present with a ``None`` value is
        stored as ``None``.
        """
        for attr_name, raw_key, fallback in self._FIELD_SPECS:
            setattr(self, attr_name, data.get(raw_key, fallback))
        self.comments = []

    def add_comment(self, comment_data):
        """Wrap ``comment_data`` in a ``Comment`` and append it to this post."""
        new_comment = Comment(comment_data)
        self.comments.append(new_comment)

    def to_dict(self):
        """Return the post as a plain dict, with comments serialized under ``"comments"``."""
        record = {
            attr_name: getattr(self, attr_name)
            for attr_name, _raw_key, _fallback in self._FIELD_SPECS
        }
        record["comments"] = [entry.to_dict() for entry in self.comments]
        return record

class Comment:
    """A single Reddit comment reduced to the fields kept in the cleaned dataset."""

    # (attribute name / output key, key in the raw payload, fallback if the key is absent).
    # The row order is also the key order produced by to_dict().
    _FIELD_SPECS = (
        ("author", "author", ""),
        ("author_flair", "author_flair_text", ""),
        ("body", "body", ""),
        ("score", "score", 0),
        ("depth", "depth", 0),
        ("controversiality", "controversiality", 0),
    )

    def __init__(self, data):
        """Copy the tracked fields out of ``data``.

        ``data`` must support ``get(key, default)``. The fallback is only used
        when a key is missing; a key present with a ``None`` value stays ``None``.
        """
        for attr_name, raw_key, fallback in self._FIELD_SPECS:
            setattr(self, attr_name, data.get(raw_key, fallback))

    def to_dict(self):
        """Return the comment as a plain dict keyed by attribute name."""
        return {
            attr_name: getattr(self, attr_name)
            for attr_name, _raw_key, _fallback in self._FIELD_SPECS
        }

## Define functions to parse and clean the raw JSON file (~4 GB of data)

In [3]:
import json
from tqdm import tqdm

In [4]:
def parse_and_clean_json(input_file, output_file, chunk_size=1000):
    """Clean a raw Reddit dump and write it to ``output_file`` as line-delimited JSON.

    The input is first parsed as a single JSON document:

    * a JSON array is treated as a list of post records;
    * a single JSON object is treated as one post record (this is what a
      line-delimited file containing exactly one record looks like);
    * if the file is not one valid JSON document, it is re-read from the
      start as line-delimited JSON, one record per line.

    Args:
        input_file: Path of the raw JSON / line-delimited JSON file (read as UTF-8).
        output_file: Path the cleaned records are written to (overwritten).
        chunk_size: Forwarded to the processing helper, which uses it for
            progress reporting.

    Raises:
        ValueError: If the file is a valid JSON document that is neither an
            array nor an object (e.g. a bare number or string).
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        try:
            # NOTE: this loads the whole document into memory at once.
            json_data = json.load(f)
        except json.JSONDecodeError:
            # Not a single JSON document: rewind and treat it as JSON lines.
            f.seek(0)
            process_line_by_line(f, output_file, chunk_size)
            return

        if isinstance(json_data, dict):
            # A lone object is a one-record dataset, not an error.
            json_data = [json_data]
        if not isinstance(json_data, list):
            raise ValueError("Expected a JSON array or line-delimited JSON")
        process_json_array(json_data, output_file, chunk_size)

def process_json_array(json_data, output_file, chunk_size):
    """Clean every record in ``json_data`` and write one JSON object per line.

    Records are walked in slices of ``chunk_size`` so the progress bar
    advances once per slice rather than once per record. ``output_file`` is
    opened for writing as UTF-8 and overwritten.
    """
    with open(output_file, 'w', encoding='utf-8') as sink:
        slice_starts = range(0, len(json_data), chunk_size)
        for start in tqdm(slice_starts, desc="Processing JSON array"):
            stop = start + chunk_size
            for record in json_data[start:stop]:
                json.dump(clean_post_data(record), sink)
                sink.write('\n')

def process_line_by_line(f, output_file, chunk_size):
    """Clean line-delimited JSON read from ``f`` and write it to ``output_file``.

    Lines are streamed from the file handle one at a time, so the whole input
    is never held in memory. Each non-blank line is parsed as one JSON record,
    cleaned with ``clean_post_data`` and written as one JSON object per line.

    Args:
        f: Open text file handle, positioned where reading should start.
        output_file: Path the cleaned records are written to (UTF-8, overwritten).
        chunk_size: Number of lines between progress-bar refreshes.

    Blank lines are skipped silently; lines that are not valid JSON are
    reported on stdout and skipped.
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        # The total is unknown without reading the file twice, so the bar
        # shows a running line count instead of a percentage.
        for line in tqdm(f, desc="Processing JSON lines", unit="line", miniters=chunk_size):
            stripped = line.strip()
            if not stripped:
                # Blank separators / trailing newline are not malformed records.
                continue
            try:
                json_data = json.loads(stripped)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON line: {line}")
                continue
            json.dump(clean_post_data(json_data), out)
            out.write('\n')
def clean_post_data(json_data):
    """Convert one raw scraped record into a cleaned post dict.

    Args:
        json_data: Mapping with the post payload under ``"data"`` and a
            mapping of comment payloads under ``"comments"`` (the keys of
            that mapping are ignored; only the values are used).

    Returns:
        The dict produced by ``Post.to_dict()``, including the cleaned comments.

    A ``"data"`` or ``"comments"`` entry that is missing *or* null is treated
    as empty, so a record with ``"comments": null`` yields a post with no
    comments instead of raising ``AttributeError``.
    """
    # `.get(key, {})` would still return None for an explicit null value.
    post_data = json_data.get("data") or {}
    comments_data = json_data.get("comments") or {}

    post = Post(post_data)
    for comment_data in comments_data.values():
        post.add_comment(comment_data)

    return post.to_dict()

## Clean the data and save to an output file

In [5]:
# Usage
import os

INPUT_FILE = './json/input/posts-11-13-2024.json'
OUTPUT_FILE = './json/output/cleaned-posts-11-13-2024.json'

# Opening a file for writing does not create missing parent folders, so make
# sure the output directory exists before the (slow) cleaning pass starts.
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

# Reads INPUT_FILE, cleans every post, and writes one JSON record per line
# to OUTPUT_FILE.
parse_and_clean_json(INPUT_FILE, OUTPUT_FILE, chunk_size=500)

Processing JSON array: 100%|██████████| 105/105 [00:15<00:00,  6.77it/s]


## Load dataset into dataframe
then show it

In [13]:
import pandas as pd

# The cleaned file holds one JSON record per line, hence lines=True.
df = pd.read_json(path_or_buf=OUTPUT_FILE, lines=True)
# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,subreddit,subreddit_id,title,selftext,author,author_flair,score,upvote_ratio,num_comments,created_utc,link_flair,url,total_awards,controversiality,num_reports,comments
0,Destiny,t5_2qnvz,New Vegan,,TuningsGaming,,121,0.95,2,1720304607,Shitpost,https://i.redd.it/s420ibwt4zad1.jpeg,0,0,,"[{'author': 'ImOnYew', 'author_flair': None, '..."
1,Destiny,t5_2qnvz,Not enough people talking about Trump's mental...,"Second post sorry, anyways Vegan Gains comment...",Silly_Ad_2115,,22,0.7,11,1720301638,Politics,https://www.reddit.com/r/Destiny/comments/1dx0...,0,0,,"[{'author': 'joecool42069', 'author_flair': No..."
2,Destiny,t5_2qnvz,This is what a winning map looks like for Joe ...,,Superninja19,,6,0.87,21,1720304167,Discussion,https://i.redd.it/djafvwre3zad1.png,0,0,,"[{'author': 'Bteatesthighlander1', 'author_fla..."
3,Destiny,t5_2qnvz,Hear Me Out: Biden and Harris Should Switch Roles,"Vice presidents have unlimited terms, and this...",Prestigious-Copy-126,,0,0.31,6,1720304689,Shitpost,https://www.reddit.com/r/Destiny/comments/1dx1...,0,0,,"[{'author': 'Fit_Meringue_7313', 'author_flair..."
4,Destiny,t5_2qnvz,Robert Kennedy Jr's Troubled Marriages Detaile...,[deleted],[deleted],,0,0.5,1,1720303539,Politics,,0,0,,"[{'author': '[deleted]', 'author_flair': None,..."


## Only keep posts from the subreddit we're fine-tuning a model for
### Change this filter to whatever you want to fine-tune on, e.g.
`df = df[df['subreddit'] == 'Hasan_Piker']`
#### or
`df = df[df['upvote_ratio'] <= 0.5]`
#### or
`df = df[df['link_flair'] == 'Politics']`

In [14]:
# Keep only the rows whose subreddit is exactly 'Hasan_Piker'; the original
# row labels are preserved, so the index is no longer contiguous.
df = df.loc[df['subreddit'].eq('Hasan_Piker')]
# Preview the first five remaining rows as the cell output.
df.head(n=5)

Unnamed: 0,subreddit,subreddit_id,title,selftext,author,author_flair,score,upvote_ratio,num_comments,created_utc,link_flair,url,total_awards,controversiality,num_reports,comments
49,Hasan_Piker,t5_qs1n3,“You can’t separate peace from freedom because...,,astronautducks,,240,0.96,28,1720298381,Twitter,https://i.redd.it/yad9pfebmyad1.jpeg,0,0,,"[{'author': 'throwaway-not-this-', 'author_fla..."
50,Hasan_Piker,t5_qs1n3,Why is no one talking about Reform's AI Genera...,,AssumedPersona,,7,0.89,2,1720290622,,/r/ukpolitics/comments/1dwoz1b/why_is_no_one_t...,0,0,,"[{'author': 'AssumedPersona', 'author_flair': ..."
52,Hasan_Piker,t5_qs1n3,Obama joining in for the 4th of July celebrations,,Jam_Marbera,,84,0.91,5,1720304655,,https://v.redd.it/v8lluoltvqad1,0,0,,"[{'author': 'cutmesomeflax', 'author_flair': N..."
54,Hasan_Piker,t5_qs1n3,Why is Hasan in this conversion therapy docume...,(Pray Away on Netflix),lilsam123,,282,0.92,19,1720297763,,https://i.redd.it/6uzcbf5hkyad1.jpeg,0,0,,"[{'author': 'toeknee88125', 'author_flair': 'P..."
56,Hasan_Piker,t5_qs1n3,Low effort meme ( also am canadian ),,doomdom123,,364,0.97,9,1720280769,,https://i.redd.it/grzcad6y5xad1.jpeg,0,0,,"[{'author': 'Lazy_Average_4187', 'author_flair..."
