In [1]:
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
# Function to make the API request
def fetch_reddit_data(subreddit, min_score, after, size, sort_type, headers, timeout=30):
    """Query the PullPush submission-search endpoint for one subreddit.

    Args:
        subreddit: Subreddit name, sent as the ``subreddit`` query parameter.
        min_score: Accepted for interface compatibility but NOT sent to the
            API. NOTE(review): callers pass it expecting a score filter --
            confirm the intended filter syntax before wiring it in.
        after: Value for the ``after`` query parameter.
        size: Value for the ``size`` query parameter.
        sort_type: Value for the ``sort_type`` query parameter.
        headers: HTTP headers passed to ``requests.get``.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking the notebook indefinitely.

    Returns:
        The decoded JSON response, or ``None`` when the request fails or the
        body cannot be decoded as JSON (the error is printed).
    """
    url = "https://api.pullpush.io/reddit/search/submission/"
    # Let requests build the query string so values are URL-encoded.
    params = {
        "subreddit": subreddit,
        "size": size,
        "after": after,
        "sort_type": sort_type,
    }
    try:
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()  # Ensure we got a valid response
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decode failures on requests versions where
        # the decode error is not a RequestException subclass.
        print(f"Request failed: {e}")
        return None

In [5]:
# Function to extract the main link from Reddit posts
def extract_main_link_from_reddit(post_url, headers, timeout=30):
    """Fetch a Reddit post page and return its ``content-href`` attribute.

    The page is parsed with BeautifulSoup and the first ``<shreddit-post>``
    element is looked up; its ``content-href`` attribute is the value returned.

    Args:
        post_url: Full URL of the Reddit post page to download.
        headers: HTTP headers passed to ``requests.get``.
        timeout: Seconds passed to ``requests.get``. This function is called
            once per submission, so a single stalled connection would
            otherwise block the whole run.

    Returns:
        The ``content-href`` attribute value, or ``None`` when the request
        fails, no ``<shreddit-post>`` element is present, or the element has
        no such attribute. Failures are printed, not raised.
    """
    try:
        response = requests.get(post_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to extract link from {post_url}: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    shreddit_post = soup.find('shreddit-post')
    if not shreddit_post:
        print(f"No shreddit-post element found in the HTML content for {post_url}")
        return None
    return shreddit_post.get('content-href')

In [6]:
# Function to process the fetched Reddit data
def process_reddit_data(data, category, subreddit, headers):
    """Turn a search response into a list of row dicts for link posts.

    For every entry under ``data['data']`` whose ``is_self`` flag is falsy,
    the post page is fetched via ``extract_main_link_from_reddit`` and, when
    an external link is found, a row is recorded.

    Args:
        data: Mapping with a ``'data'`` key holding submission dicts; a
            missing or ``None`` value is treated as no submissions.
        category: Label copied into every row's ``'category'`` field.
        subreddit: Label copied into every row's ``'subreddit'`` field.
        headers: HTTP headers forwarded to ``extract_main_link_from_reddit``.

    Returns:
        A list of dicts with keys ``external_link``, ``title``, ``category``,
        ``subreddit``, ``score`` and ``upvote_ratio``.
    """
    posts = []
    for submission in data.get('data') or []:
        # Entries without an explicit is_self flag are treated as self posts
        # and skipped.
        if submission.get('is_self', True):
            continue

        permalink = submission.get('permalink')
        if not permalink:
            # Without a permalink the URL would be "https://www.reddit.comNone";
            # skip instead of issuing a request that cannot succeed.
            continue

        post_url = f"https://www.reddit.com{permalink}"
        external_link = extract_main_link_from_reddit(post_url, headers)
        if not external_link:
            continue

        # Only the link and submission metadata are recorded; the external
        # page itself is not downloaded here.
        row = {
            'external_link': external_link,
            'title': submission.get('title', ''),
            'category': category,
            'subreddit': subreddit,
            'score': submission.get('score', 0),
            'upvote_ratio': submission.get('upvote_ratio', 0.0)
        }
        print(f"{submission.get('title', '')}: retrived")
        posts.append(row)
    return posts

In [7]:
# Main configuration: which subreddit to pull, how many submissions to request,
# and the label stored in each row's 'category' column.
subreddit = "theonion"
# NOTE(review): min_score is passed to fetch_reddit_data below, but that
# function does not include it in the request, so it currently has no effect.
min_score = 1
size = 100
sort_type = "score"
category = "satire"
# Browser-like User-Agent sent with every HTTP request made by this notebook.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Date range configuration: epoch seconds for 2023-01-01 12:00:00, sent as the
# `after` query parameter.
# NOTE(review): the datetime is naive, so .timestamp() interprets it in the
# machine's local timezone; the value differs between machines -- confirm
# whether UTC was intended.
after = int(datetime(2023, 1, 1, 12, 0, 0).timestamp())

# Fetch and process data. fetch_reddit_data returns None on request failure,
# in which case nothing below runs and no file is written.
data = fetch_reddit_data(subreddit, min_score, after, size, sort_type, headers)
if data:
    # One HTTP request is made per link post here, so this step can be slow.
    processed_data = process_reddit_data(data, category, subreddit, headers)
    
    # Create DataFrame and save to CSV
    df = pd.DataFrame(processed_data)
    # Prepend a 1-based 'index' column as the first column of the frame.
    df.insert(0, 'index', range(1, len(df) + 1))
    # The "2023" in the file name is hard-coded, not derived from `after`.
    df.to_csv(f'./satire_dataset_{subreddit}_2023.csv', index=False)
    print(f"Data saved to 'satire_dataset_{subreddit}_2023.csv'")

‘The Onion’ Stands With Israel Because It Seems Like You Get In Less Trouble For That: retrived
Report: Trump Defiantly Pumped Fist For 20 Minutes After Assassination Attempt Searching For Camera: retrived
Iranian President Stoned To Death With Mountain: retrived
‘No Way To Prevent This,’ Says Only Nation Where This Regularly Happens: retrived
Sources Allege Tim Walz’s ‘Aw, Shucks’ Persona Merely Facade Concealing True ‘Gee Whiz’ Tendencies: retrived
Onion are funny but they ain't playing: retrived
Dems Alarmed By Joe Biden’s Poor Performance As Debate Viewer: retrived


KeyboardInterrupt: 