In [None]:
import praw
import os
from dotenv import load_dotenv
import json
from datetime import datetime

class RedditComment:
    def __init__(self, comment):
        self.id = comment.id
        self.author = str(comment.author)
        self.body = comment.body
        self.score = comment.score
        self.created_utc = comment.created_utc

    def to_dict(self):
        return {
            "id": self.id,
            "author": self.author,
            "body": self.body,
            "score": self.score,
            "created_utc": self.created_utc
        }

class RedditPost:
    def __init__(self, post):
        self.title = post.title
        self.score = post.score
        self.url = post.url
        self.created_utc = post.created_utc
        self.author = str(post.author)
        self.num_comments = post.num_comments
        self.is_self = post.is_self
        self.selftext = post.selftext if post.is_self else ""
        self.subreddit = post.subreddit.display_name
        self.comments = self.get_comments(post)

    def get_comments(self, post, limit=10):
        post.comments.replace_more(limit=0)  # Remove MoreComments objects
        return [RedditComment(comment) for comment in post.comments.list()[:limit]]

    def to_dict(self):
        return {
            "title": self.title,
            "score": self.score,
            "url": self.url,
            "created_utc": self.created_utc,
            "author": self.author,
            "num_comments": self.num_comments,
            "is_self": self.is_self,
            "selftext": self.selftext,
            "subreddit": self.subreddit,
            "comments": [comment.to_dict() for comment in self.comments]
        }

def initialize_reddit():
    load_dotenv()
    return praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT')
    )

def get_subreddit_posts(reddit, subreddit_name, limit=10, category='hot'):
    subreddit = reddit.subreddit(subreddit_name)
    if category == 'hot':
        return subreddit.hot(limit=limit)
    elif category == 'new':
        return subreddit.new(limit=limit)
    elif category == 'top':
        return subreddit.top(limit=limit)
    else:
        raise ValueError("Invalid category. Choose 'hot', 'new', or 'top'.")

def save_posts(posts, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump([post.to_dict() for post in posts], f, ensure_ascii=False, indent=4)

def main():
    reddit = initialize_reddit()
    subreddit_name = 'PioneerDJ'
    limit = 20
    category = 'hot'

    posts = get_subreddit_posts(reddit, subreddit_name, limit, category)
    reddit_posts = [RedditPost(post) for post in posts]

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{subreddit_name}_{category}_{timestamp}.json"
        # Construct the full file path
    file_path = os.path.join('..', 'data', subreddit_name, filename)
    

    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    save_posts(reddit_posts, file_path)

    print(f"Saved {len(reddit_posts)} posts with comments to {filename}")

    # Print a sample of the data
    for post in reddit_posts[:3]:
        print(f"Title: {post.title}")
        print(f"Score: {post.score}")
        print(f"URL: {post.url}")
        print(f"Content: {post.selftext[:200]}..." if post.is_self else "This is a link post.")
        print(f"Number of comments: {len(post.comments)}")
        if post.comments:
            print(f"First comment: {post.comments[0].body[:100]}...")
        print("---")

if __name__ == "__main__":
    main()

In [None]:
import praw
import os
from dotenv import load_dotenv
import json
from datetime import datetime, timedelta
import time

# ... (RedditComment and RedditPost classes remain the same)

def initialize_reddit():
    load_dotenv()
    return praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT')
    )

def get_subreddit_posts_last_5_years(reddit, subreddit_name):
    subreddit = reddit.subreddit(subreddit_name)
    end_date = datetime.utcnow()
    start_date = end_date - timedelta(days=1*365)  # 5 years ago
    
    posts = []
    for post in subreddit.new(limit=None):
        post_date = datetime.utcfromtimestamp(post.created_utc)
        if post_date < start_date:
            break  # Stop if we've gone past 5 years
        posts.append(RedditPost(post))
        
        # Reddit API has a rate limit, so we need to be careful
        time.sleep(1.1)  # Sleep for 100ms between requests
        
        if len(posts) % 100 == 0:
            print(f"Fetched {len(posts)} posts...")
    
    return posts

def save_posts(posts, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump([post.to_dict() for post in posts], f, ensure_ascii=False, indent=4)

def main():
    reddit = initialize_reddit()
    subreddit_name = 'PioneerDJ'

    print(f"Fetching posts from r/{subreddit_name} for the last 5 years...")
    reddit_posts = get_subreddit_posts_last_5_years(reddit, subreddit_name)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{subreddit_name}_last_5_years_{timestamp}.json"

    # Construct the full file path
    file_path = os.path.join('..', 'data', subreddit_name, filename)
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    save_posts(reddit_posts, file_path)

    print(f"Saved {len(reddit_posts)} posts with comments to {file_path}")

    # Print a sample of the data
    for post in reddit_posts[:3]:
        print(f"Title: {post.title}")
        print(f"Score: {post.score}")
        print(f"URL: {post.url}")
        print(f"Date: {datetime.utcfromtimestamp(post.created_utc)}")
        print(f"Content: {post.selftext[:200]}..." if post.is_self else "This is a link post.")
        print(f"Number of comments: {len(post.comments)}")
        if post.comments:
            print(f"First comment: {post.comments[0].body[:100]}...")
        print("---")

if __name__ == "__main__":
    main()

In [None]:
import praw
import os
from dotenv import load_dotenv
import json
from datetime import datetime, timedelta
import time
from tqdm import tqdm

# ... (RedditComment and RedditPost classes remain the same)

def initialize_reddit():
    load_dotenv()
    return praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT')
    )

def get_subreddit_posts_last_5_years(reddit, subreddit_name,n_years=1):
    subreddit = reddit.subreddit(subreddit_name)
    end_date = datetime.utcnow()
    start_date = end_date - timedelta(days=n_years*365)  # 5 years ago
    
    posts = []
    pbar = tqdm(desc="Fetching posts", unit="post")
    
    for post in subreddit.new(limit=None):
        post_date = datetime.utcfromtimestamp(post.created_utc)
        if post_date < start_date:
            break  # Stop if we've gone past 5 years
        posts.append(RedditPost(post))
        pbar.update(1)
        
        # Reddit API has a rate limit, so we need to be careful
        time.sleep(0.1)  # Sleep for 100ms between requests
    
    pbar.close()
    return posts

def save_posts(posts, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump([post.to_dict() for post in posts], f, ensure_ascii=False, indent=4)

def main():
    reddit = initialize_reddit()
    subreddit_name = 'PioneerDJ'
    n_years = 1

    print(f"Fetching posts from r/{subreddit_name} for the last {n_years} years...")
    reddit_posts = get_subreddit_posts_last_5_years(reddit, subreddit_name)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{subreddit_name}_last_5_years_{timestamp}.json"

    # Construct the full file path
    file_path = os.path.join('..', 'data', subreddit_name, filename)
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    print(f"Saving {len(reddit_posts)} posts to file...")
    save_posts(reddit_posts, file_path)

    print(f"Saved {len(reddit_posts)} posts with comments to {file_path}")

    # Print a sample of the data
    print("\nSample of fetched data:")
    for post in reddit_posts[:3]:
        print(f"Title: {post.title}")
        print(f"Score: {post.score}")
        print(f"URL: {post.url}")
        print(f"Date: {datetime.utcfromtimestamp(post.created_utc)}")
        print(f"Content: {post.selftext[:200]}..." if post.is_self else "This is a link post.")
        print(f"Number of comments: {len(post.comments)}")
        if post.comments:
            print(f"First comment: {post.comments[0].body[:100]}...")
        print("---")

if __name__ == "__main__":
    main()

In [None]:
import praw
import os
from dotenv import load_dotenv
import json
from datetime import datetime, timedelta
import time
from tqdm import tqdm

# ... (RedditComment and RedditPost classes remain the same)

def initialize_reddit():
    load_dotenv()
    return praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT')
    )

def count_posts(reddit, subreddit_name, start_date):
    subreddit = reddit.subreddit(subreddit_name)
    count = 0
    for post in subreddit.new(limit=None):
        post_date = datetime.utcfromtimestamp(post.created_utc)
        if post_date < start_date:
            break
        count += 1
    return count

def get_subreddit_posts_last_n_years(reddit, subreddit_name, n_years=.01):
    subreddit = reddit.subreddit(subreddit_name)
    end_date = datetime.utcnow()
    start_date = end_date - timedelta(days=n_years*365)
    
    # First, count the posts
    print("Counting posts...")
    total_posts = count_posts(reddit, subreddit_name, start_date)
    print(f"Found {total_posts} posts in the last {n_years} years.")
    
    posts = []
    pbar = tqdm(total=total_posts, desc="Fetching posts", unit="post")
    
    for post in subreddit.new(limit=None):
        post_date = datetime.utcfromtimestamp(post.created_utc)
        if post_date < start_date:
            break
        posts.append(RedditPost(post))
        pbar.update(1)
        
        # Reddit API has a rate limit, so we need to be careful
        time.sleep(0.1)  # Sleep for 100ms between requests
    
    pbar.close()
    return posts

def save_posts(posts, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump([post.to_dict() for post in posts], f, ensure_ascii=False, indent=4)

def main():
    reddit = initialize_reddit()
    subreddit_name = 'PioneerDJ'
    n_years = .01

    print(f"Fetching posts from r/{subreddit_name} for the last {n_years} years...")
    reddit_posts = get_subreddit_posts_last_n_years(reddit, subreddit_name, n_years)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{subreddit_name}_last_{n_years}_years_{timestamp}.json"

    # Construct the full file path
    file_path = os.path.join('..', 'data', subreddit_name, filename)
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    print(f"Saving {len(reddit_posts)} posts to file...")
    save_posts(reddit_posts, file_path)

    print(f"Saved {len(reddit_posts)} posts with comments to {file_path}")

    # Print a sample of the data
    print("\nSample of fetched data:")
    for post in reddit_posts[:3]:
        print(f"Title: {post.title}")
        print(f"Score: {post.score}")
        print(f"URL: {post.url}")
        print(f"Date: {datetime.utcfromtimestamp(post.created_utc)}")
        print(f"Content: {post.selftext[:200]}..." if post.is_self else "This is a link post.")
        print(f"Number of comments: {len(post.comments)}")
        if post.comments:
            print(f"First comment: {post.comments[0].body[:100]}...")
        print("---")

if __name__ == "__main__":
    main()

In [None]:
import praw
import os
from dotenv import load_dotenv
import json
from datetime import datetime, timedelta
import time
from tqdm import tqdm

# ... (RedditComment and RedditPost classes remain the same)

def initialize_reddit():
    load_dotenv()
    return praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT')
    )

def count_posts(reddit, subreddit_name, start_date, end_date):
    subreddit = reddit.subreddit(subreddit_name)
    count = 0
    for post in subreddit.new(limit=None):
        post_date = datetime(post.created_utc)
        if post_date < start_date:
            break
        if post_date <= end_date:
            count += 1
    return count

def get_subreddit_posts_for_period(reddit, subreddit_name, start_date, end_date):
    subreddit = reddit.subreddit(subreddit_name)
    
    # First, count the posts
    print(f"Counting posts from {start_date.date()} to {end_date.date()}...")
    total_posts = count_posts(reddit, subreddit_name, start_date, end_date)
    print(f"Found {total_posts} posts in this period.")
    
    posts = []
    pbar = tqdm(total=total_posts, desc="Fetching posts", unit="post")
    
    for post in subreddit.new(limit=None):
        post_date = datetime(post.created_utc)
        if post_date < start_date:
            break
        if post_date <= end_date:
            posts.append(RedditPost(post))
            pbar.update(1)
        
        # Reddit API has a rate limit, so we need to be careful
        time.sleep(0.1)  # Sleep for 100ms between requests
    
    pbar.close()
    return posts

def save_posts(posts, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump([post.to_dict() for post in posts], f, ensure_ascii=False, indent=4)

def main():
    reddit = initialize_reddit()
    subreddit_name = 'PioneerDJ'
    n_years = 1.5  # Change this to the desired number of years

    end_date = datetime()
    start_date = end_date - timedelta(days=n_years*365)

    # Split into year intervals or less
    current_start = start_date
    while current_start < end_date:
        current_end = min(current_start + timedelta(days=365), end_date)
        
        print(f"Fetching posts from r/{subreddit_name} for the period: {current_start.date()} to {current_end.date()}")
        reddit_posts = get_subreddit_posts_for_period(reddit, subreddit_name, current_start, current_end)

        # Create filename with date range
        filename = f"{subreddit_name}_{current_start.date()}_{current_end.date()}.json"

        # Construct the full file path
        file_path = os.path.join('..', 'data', subreddit_name, filename)
        
        # Create directory if it doesn't exist
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        print(f"Saving {len(reddit_posts)} posts to file...")
        save_posts(reddit_posts, file_path)

        print(f"Saved {len(reddit_posts)} posts with comments to {file_path}")

        # Print a sample of the data
        print("\nSample of fetched data:")
        for post in reddit_posts[:3]:
            print(f"Title: {post.title}")
            print(f"Score: {post.score}")
            print(f"URL: {post.url}")
            print(f"Date: {datetime(post.created_utc)}")
            print(f"Content: {post.selftext[:200]}..." if post.is_self else "This is a link post.")
            print(f"Number of comments: {len(post.comments)}")
            if post.comments:
                print(f"First comment: {post.comments[0].body[:100]}...")
            print("---")

        # Move to the next period
        current_start = current_end + timedelta(seconds=1)

if __name__ == "__main__":
    main()

In [None]:
import praw
import os
from dotenv import load_dotenv
import json
from datetime import datetime, timedelta
import time
from tqdm import tqdm

# ... (RedditComment and RedditPost classes remain the same)

def initialize_reddit():
    load_dotenv()
    return praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT')
    )

def count_posts(reddit, subreddit_name, start_date):
    subreddit = reddit.subreddit(subreddit_name)
    count = 0
    for post in subreddit.new(limit=None):
        post_date = datetime.utcfromtimestamp(post.created_utc)
        if post_date < start_date:
            break
        count += 1
    return count

def get_subreddit_posts_for_period(reddit, subreddit_name, start_date):
    subreddit = reddit.subreddit(subreddit_name)
    
    # First, count the posts
    print(f"Counting posts from {start_date.date()} to now...")
    total_posts = count_posts(reddit, subreddit_name, start_date)
    print(f"Found {total_posts} posts in this period.")
    
    posts = []
    pbar = tqdm(total=total_posts, desc="Fetching posts", unit="post")
    
    for post in subreddit.new(limit=None):
        post_date = datetime.utcfromtimestamp(post.created_utc)
        if post_date < start_date:
            break
        posts.append(RedditPost(post))
        pbar.update(1)
        
        # Reddit API has a rate limit, so we need to be careful
        time.sleep(0.15)  # Sleep for 150ms between requests
    
    pbar.close()
    return posts

def get_subreddit_posts_for_period(reddit, subreddit_name, start_date, max_posts=100000):
    subreddit = reddit.subreddit(subreddit_name)
    
    posts = []
    pbar = tqdm(total=max_posts, desc="Fetching posts", unit="post")
    
    for post in subreddit.new(limit=None):
        post_date = datetime.utcfromtimestamp(post.created_utc)
        if post_date < start_date or len(posts) >= max_posts:
            break
        posts.append(RedditPost(post))
        pbar.update(1)
        
        time.sleep(0.1)  # Sleep for 100ms between requests
    
    pbar.close()
    print(f"Retrieved {len(posts)} posts. Oldest post date: {datetime.utcfromtimestamp(posts[-1].created_utc) if posts else 'N/A'}")
    return posts

def save_posts(posts, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump([post.to_dict() for post in posts], f, ensure_ascii=False, indent=4)

def main():
    reddit = initialize_reddit()
    subreddit_name = 'PioneerDJ'
    n_years = 5  # Change this to the desired number of years

    end_date = datetime.utcnow()
    start_date = end_date - timedelta(days=n_years*365)
    
    print(f"Fetching posts from r/{subreddit_name} for the period: {start_date.date()} to {end_date.date()}")
    reddit_posts = get_subreddit_posts_for_period(reddit, subreddit_name, start_date)

    # Create filename with date range
    filename = f"{subreddit_name}_{start_date.date()}_{end_date.date()}.json"

    # Construct the full file path
    file_path = os.path.join('..', 'data', subreddit_name, filename)
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    print(f"Saving {len(reddit_posts)} posts to file...")
    save_posts(reddit_posts, file_path)

    print(f"Saved {len(reddit_posts)} posts with comments to {file_path}")

if __name__ == "__main__":
    main()

In [None]:
import praw
import os
from dotenv import load_dotenv
import json
from datetime import datetime, timedelta
import time
from tqdm import tqdm
import requests

# ... (RedditComment and RedditPost classes remain the same)

def initialize_reddit():
    load_dotenv()
    return praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT')
    )

def get_pushshift_posts(subreddit_name, start_date, end_date, limit=1000):
    url = "https://api.pushshift.io/reddit/search/submission"
    start_timestamp = int(start_date.timestamp())
    end_timestamp = int(end_date.timestamp())
    
    params = {
        "subreddit": subreddit_name,
        "after": start_timestamp,
        "before": end_timestamp,
        "size": limit,
        "sort": "desc",
        "sort_type": "created_utc"
    }
    
    response = requests.get(url, params=params)
    data = response.json()
    return data['data']

def get_subreddit_posts_for_period(reddit, subreddit_name, start_date, end_date, max_posts=100000):
    posts = []
    pbar = tqdm(total=max_posts, desc="Fetching posts", unit="post")
    
    current_date = end_date
    while len(posts) < max_posts and current_date > start_date:
        pushshift_posts = get_pushshift_posts(subreddit_name, start_date, current_date, limit=1000)
        if not pushshift_posts:
            break
        
        for post_data in pushshift_posts:
            post_date = datetime.utcfromtimestamp(post_data['created_utc'])
            if post_date < start_date or len(posts) >= max_posts:
                break
            
            # Fetch full post data from Reddit API
            full_post = reddit.submission(id=post_data['id'])
            posts.append(RedditPost(full_post))
            pbar.update(1)
            
            time.sleep(0.1)  # Sleep for 100ms between requests
        
        current_date = datetime.utcfromtimestamp(pushshift_posts[-1]['created_utc'])
    
    pbar.close()
    print(f"Retrieved {len(posts)} posts. Oldest post date: {datetime.utcfromtimestamp(posts[-1].created_utc) if posts else 'N/A'}")
    return posts

def save_posts(posts, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump([post.to_dict() for post in posts], f, ensure_ascii=False, indent=4)

def main():
    reddit = initialize_reddit()
    subreddit_name = 'PioneerDJ'
    n_years = 5  # Change this to the desired number of years

    end_date = datetime.utcnow()
    start_date = end_date - timedelta(days=n_years*365)
    
    print(f"Fetching posts from r/{subreddit_name} for the period: {start_date.date()} to {end_date.date()}")
    reddit_posts = get_subreddit_posts_for_period(reddit, subreddit_name, start_date, end_date)

    # Create filename with date range
    filename = f"{subreddit_name}_{start_date.date()}_{end_date.date()}.json"

    # Construct the full file path
    file_path = os.path.join('..', 'data', subreddit_name, filename)
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    print(f"Saving {len(reddit_posts)} posts to file...")
    save_posts(reddit_posts, file_path)

    print(f"Saved {len(reddit_posts)} posts with comments to {file_path}")

    # Print a sample of the data
    print("\nSample of fetched data:")
    for post in reddit_posts[:3]:
        print(f"Title: {post.title}")
        print(f"Score: {post.score}")
        print(f"URL: {post.url}")
        print(f"Date: {datetime.utcfromtimestamp(post.created_utc)}")
        print(f"Content: {post.selftext[:200]}..." if post.is_self else "This is a link post.")
        print(f"Number of comments: {len(post.comments)}")
        if post.comments:
            print(f"First comment: {post.comments[0].body[:100]}...")
        print("---")

if __name__ == "__main__":
    main()

In [None]:
import praw
import os
from dotenv import load_dotenv
import json
from datetime import datetime, timedelta
import time
from tqdm import tqdm
import requests

# ... (RedditComment and RedditPost classes remain the same)

def initialize_reddit():
    load_dotenv()
    return praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT')
    )

def get_pushshift_posts(subreddit_name, start_date, end_date, limit=1000):
    url = "https://api.pushshift.io/reddit/search/submission"
    start_timestamp = int(start_date.timestamp())
    end_timestamp = int(end_date.timestamp())
    
    params = {
        "subreddit": subreddit_name,
        "after": start_timestamp,
        "before": end_timestamp,
        "size": limit,
        "sort": "desc",
        "sort_type": "created_utc"
    }
    
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
        return data.get('data', [])
    except requests.RequestException as e:
        print(f"Error fetching data from Pushshift: {e}")
        return []

def get_subreddit_posts_for_period(reddit, subreddit_name, start_date, end_date, max_posts=100000):
    posts = []
    pbar = tqdm(total=max_posts, desc="Fetching posts", unit="post")
    
    current_date = end_date
    while len(posts) < max_posts and current_date > start_date:
        pushshift_posts = get_pushshift_posts(subreddit_name, start_date, current_date, limit=1000)
        if not pushshift_posts:
            print("No more posts found or error occurred.")
            break
        
        for post_data in pushshift_posts:
            post_date = datetime.utcfromtimestamp(post_data['created_utc'])
            if post_date < start_date or len(posts) >= max_posts:
                break
            
            try:
                # Fetch full post data from Reddit API
                full_post = reddit.submission(id=post_data['id'])
                posts.append(RedditPost(full_post))
                pbar.update(1)
            except praw.exceptions.PRAWException as e:
                print(f"Error fetching post {post_data['id']}: {e}")
            
            time.sleep(0.1)  # Sleep for 100ms between requests
        
        current_date = datetime.utcfromtimestamp(pushshift_posts[-1]['created_utc'])
    
    pbar.close()
    print(f"Retrieved {len(posts)} posts. Oldest post date: {datetime.utcfromtimestamp(posts[-1].created_utc) if posts else 'N/A'}")
    return posts

def save_posts(posts, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump([post.to_dict() for post in posts], f, ensure_ascii=False, indent=4)

def main():
    reddit = initialize_reddit()
    subreddit_name = 'PioneerDJ'
    n_years = 5  # Change this to the desired number of years

    end_date = datetime.utcnow()
    start_date = end_date - timedelta(days=n_years*365)
    
    print(f"Fetching posts from r/{subreddit_name} for the period: {start_date.date()} to {end_date.date()}")
    reddit_posts = get_subreddit_posts_for_period(reddit, subreddit_name, start_date, end_date)

    # Create filename with date range
    filename = f"{subreddit_name}_{start_date.date()}_{end_date.date()}.json"

    # Construct the full file path
    file_path = os.path.join('..', 'data', subreddit_name, filename)
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    print(f"Saving {len(reddit_posts)} posts to file...")
    save_posts(reddit_posts, file_path)

    print(f"Saved {len(reddit_posts)} posts with comments to {file_path}")

    # Print a sample of the data
    print("\nSample of fetched data:")
    for post in reddit_posts[:3]:
        print(f"Title: {post.title}")
        print(f"Score: {post.score}")
        print(f"URL: {post.url}")
        print(f"Date: {datetime.utcfromtimestamp(post.created_utc)}")
        print(f"Content: {post.selftext[:200]}..." if post.is_self else "This is a link post.")
        print(f"Number of comments: {len(post.comments)}")
        if post.comments:
            print(f"First comment: {post.comments[0].body[:100]}...")
        print("---")

if __name__ == "__main__":
    main()

In [None]:
import praw
import os
from dotenv import load_dotenv
import json
from datetime import datetime, timedelta
import time
from tqdm import tqdm

# ... (RedditComment and RedditPost classes remain the same)

def initialize_reddit():
    load_dotenv()
    return praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT')
    )

def get_subreddit_posts_for_period(reddit, subreddit_name, start_date, end_date, max_posts=100000):
    subreddit = reddit.subreddit(subreddit_name)
    
    posts = []
    pbar = tqdm(total=max_posts, desc="Fetching posts", unit="post")
    
    # Convert dates to timestamps for Reddit's API
    start_timestamp = int(start_date.timestamp())
    end_timestamp = int(end_date.timestamp())
    
    # Use empty string to search for all posts
    for post in subreddit.search('', sort='new', syntax='lucene', 
                                 time_filter='all', limit=None):
        post_date = datetime.utcfromtimestamp(post.created_utc)
        if post_date < start_date:
            break
        if start_date <= post_date <= end_date:
            posts.append(RedditPost(post))
            pbar.update(1)
            if len(posts) >= max_posts:
                break
        
        time.sleep(0.1)  # Sleep for 100ms between requests
    
    pbar.close()
    print(f"Retrieved {len(posts)} posts. Oldest post date: {datetime.utcfromtimestamp(posts[-1].created_utc) if posts else 'N/A'}")
    return posts

def save_posts(posts, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump([post.to_dict() for post in posts], f, ensure_ascii=False, indent=4)

def main():
    reddit = initialize_reddit()
    subreddit_name = 'PioneerDJ'
    n_years = 5  # Change this to the desired number of years

    end_date = datetime.utcnow()
    start_date = end_date - timedelta(days=n_years*365)
    
    print(f"Fetching posts from r/{subreddit_name} for the period: {start_date.date()} to {end_date.date()}")
    reddit_posts = get_subreddit_posts_for_period(reddit, subreddit_name, start_date, end_date)

    # Create filename with date range
    filename = f"{subreddit_name}_{start_date.date()}_{end_date.date()}.json"

    # Construct the full file path
    file_path = os.path.join('..', 'data', subreddit_name, filename)
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    print(f"Saving {len(reddit_posts)} posts to file...")
    save_posts(reddit_posts, file_path)

    print(f"Saved {len(reddit_posts)} posts with comments to {file_path}")

    # Print a sample of the data
    print("\nSample of fetched data:")
    for post in reddit_posts[:3]:
        print(f"Title: {post.title}")
        print(f"Score: {post.score}")
        print(f"URL: {post.url}")
        print(f"Date: {datetime.utcfromtimestamp(post.created_utc)}")
        print(f"Content: {post.selftext[:200]}..." if post.is_self else "This is a link post.")
        print(f"Number of comments: {len(post.comments)}")
        if post.comments:
            print(f"First comment: {post.comments[0].body[:100]}...")
        print("---")

if __name__ == "__main__":
    main()

In [None]:
import praw
import os
from dotenv import load_dotenv
import json
from datetime import datetime, timedelta
import time
from tqdm import tqdm

# ... (RedditComment and RedditPost classes remain the same)

def initialize_reddit():
    load_dotenv()
    return praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT')
    )

def get_subreddit_posts_for_period(reddit, subreddit_name, start_date, end_date, max_posts=100000):
    subreddit = reddit.subreddit(subreddit_name)
    
    posts = []
    pbar = tqdm(total=max_posts, desc="Fetching posts", unit="post")
    
    for post in subreddit.top('all', limit=None):
        post_date = datetime.utcfromtimestamp(post.created_utc)
        if start_date <= post_date <= end_date:
            posts.append(RedditPost(post))
            pbar.update(1)
            if len(posts) >= max_posts:
                break
        
        time.sleep(0.1)  # Sleep for 100ms between requests
    
    pbar.close()
    print(f"Retrieved {len(posts)} posts. Oldest post date: {datetime.utcfromtimestamp(posts[-1].created_utc) if posts else 'N/A'}")
    return posts

def save_posts(posts, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump([post.to_dict() for post in posts], f, ensure_ascii=False, indent=4)

def main():
    reddit = initialize_reddit()
    subreddit_name = 'PioneerDJ'
    n_years = 5  # Change this to the desired number of years

    end_date = datetime.utcnow()
    start_date = end_date - timedelta(days=n_years*365)
    
    print(f"Fetching posts from r/{subreddit_name} for the period: {start_date.date()} to {end_date.date()}")
    reddit_posts = get_subreddit_posts_for_period(reddit, subreddit_name, start_date, end_date)

    # Create filename with date range
    filename = f"{subreddit_name}_{start_date.date()}_{end_date.date()}.json"

    # Construct the full file path
    file_path = os.path.join('..', 'data', subreddit_name, filename)
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    print(f"Saving {len(reddit_posts)} posts to file...")
    save_posts(reddit_posts, file_path)

    print(f"Saved {len(reddit_posts)} posts with comments to {file_path}")

    # Print a sample of the data
    print("\nSample of fetched data:")
    for post in reddit_posts[:3]:
        print(f"Title: {post.title}")
        print(f"Score: {post.score}")
        print(f"URL: {post.url}")
        print(f"Date: {datetime.utcfromtimestamp(post.created_utc)}")
        print(f"Content: {post.selftext[:200]}..." if post.is_self else "This is a link post.")
        print(f"Number of comments: {len(post.comments)}")
        if post.comments:
            print(f"First comment: {post.comments[0].body[:100]}...")
        print("---")

if __name__ == "__main__":
    main()

In [None]:
import praw
import os
from dotenv import load_dotenv
import json
from datetime import datetime
import time
from tqdm import tqdm

# ... (RedditComment and RedditPost classes remain the same)

def initialize_reddit():
    load_dotenv()
    return praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT')
    )

def get_posts_from_urls(reddit, urls):
    posts = []
    pbar = tqdm(total=len(urls), desc="Fetching posts", unit="post")
    
    for url in urls:
        try:
            submission = reddit.submission(url=url)
            posts.append(RedditPost(submission))
            pbar.update(1)
            time.sleep(0.1)  # Sleep for 100ms between requests
        except praw.exceptions.PRAWException as e:
            print(f"Error fetching post {url}: {e}")
    
    pbar.close()
    print(f"Retrieved {len(posts)} posts.")
    return posts

def save_posts(posts, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump([post.to_dict() for post in posts], f, ensure_ascii=False, indent=4)

def main():
    reddit = initialize_reddit()
    
    # List of URLs to fetch
    urls = ["https://www.reddit.com/r/PioneerDJ/comments/1ekyq9f/how_to_load_music_from_pc_to_cdj_using_an_xdjxz/",
        # Add more URLs as needed
    ]
    
    print(f"Fetching {len(urls)} posts from Reddit")
    reddit_posts = get_posts_from_urls(reddit, urls)

    # Create filename with current timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"Reddit_posts_{timestamp}.json"

    # Construct the full file path
    file_path = os.path.join('..', 'data', 'Reddit_posts', filename)
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    print(f"Saving {len(reddit_posts)} posts to file...")
    save_posts(reddit_posts, file_path)

    print(f"Saved {len(reddit_posts)} posts with comments to {file_path}")

    # Print a sample of the data
    print("\nSample of fetched data:")
    for post in reddit_posts[:3]:
        print(f"Title: {post.title}")
        print(f"Score: {post.score}")
        print(f"URL: {post.url}")
        print(f"Date: {datetime.utcfromtimestamp(post.created_utc)}")
        print(f"Content: {post.selftext[:200]}..." if post.is_self else "This is a link post.")
        print(f"Number of comments: {len(post.comments)}")
        if post.comments:
            print(f"First comment: {post.comments[0].body[:100]}...")
        print("---")

if __name__ == "__main__":
    main()

In [None]:
import praw
import os
from dotenv import load_dotenv
from datetime import datetime, timedelta
import time
from tqdm import tqdm

def initialize_reddit():
    load_dotenv()
    return praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT')
    )

def get_all_post_urls(reddit, subreddit_name, limit=None):
    subreddit = reddit.subreddit(subreddit_name)
    urls = []
    
    # We'll use 'new' to get the most recent posts first
    for post in tqdm(subreddit.new(limit=limit), desc="Fetching posts", unit="post"):
        url = f"https://www.reddit.com{post.permalink}"
        urls.append(url)
        time.sleep(0.1)  # Sleep for 100ms between requests to respect rate limits
    
    return urls

def save_urls_to_file(urls, filename):
    with open(filename, 'w') as f:
        for url in urls:
            f.write(f"{url}\n")

def main():
    reddit = initialize_reddit()
    subreddit_name = 'PioneerDJ'
    
    print(f"Fetching post URLs from r/{subreddit_name}")
    urls = get_all_post_urls(reddit, subreddit_name)

    # Create filename with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{subreddit_name}_post_urls_{timestamp}.txt"

    # Construct the full file path
    file_path = os.path.join('..', 'data', subreddit_name, filename)
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    print(f"Saving {len(urls)} URLs to file...")
    save_urls_to_file(urls, file_path)

    print(f"Saved {len(urls)} URLs to {file_path}")

    # Print a sample of the URLs
    print("\nSample of fetched URLs:")
    for url in urls[:5]:
        print(url)

if __name__ == "__main__":
    main()

In [7]:
import praw
import os
from dotenv import load_dotenv
import json
from datetime import datetime
import time
from tqdm import tqdm
import argparse

# ... (RedditComment and RedditPost classes remain the same)

def initialize_reddit():
    load_dotenv()
    return praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT')
    )

def get_posts_from_urls(reddit, urls):
    posts = []
    pbar = tqdm(total=len(urls), desc="Fetching posts", unit="post")
    
    for url in urls:
        try:
            submission = reddit.submission(url=url)
            posts.append(RedditPost(submission))
            pbar.update(1)
            time.sleep(0.1)  # Sleep for 100ms between requests
        except praw.exceptions.PRAWException as e:
            print(f"Error fetching post {url}: {e}")
        except Exception as e:
            print(f"Unexpected error fetching post {url}: {e}")
    
    pbar.close()
    print(f"Retrieved {len(posts)} posts.")
    return posts

def save_posts(posts, filename):
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump([post.to_dict() for post in posts], f, ensure_ascii=False, indent=4)
        print(f"Saved {len(posts)} posts with comments to {filename}")
    except IOError as e:
        print(f"Error saving posts to file: {e}")

def load_urls_from_file(filename):
    try:
        with open(filename, 'r') as f:
            return [line.strip() for line in f if line.strip()]
    except IOError as e:
        print(f"Error loading URLs from file: {e}")
        return []

def main(input_file):
    reddit = initialize_reddit()
    
    urls = load_urls_from_file(input_file)
    
    if not urls:
        print("No URLs found. Please make sure the file exists and contains URLs.")
        return
    
    print(f"Fetching {len(urls)} posts from Reddit")
    reddit_posts = get_posts_from_urls(reddit, urls)

    # Create filename with current timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"Reddit_posts_{timestamp}.json"

    # Construct the full file path
    file_path = os.path.join(os.path.dirname(input_file), filename)
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    save_posts(reddit_posts, file_path)

    # Print a sample of the data
    print("\nSample of fetched data:")
    for post in reddit_posts[:3]:
        print(f"Title: {post.title}")
        print(f"Score: {post.score}")
        print(f"URL: {post.url}")
        print(f"Date: {datetime.utcfromtimestamp(post.created_utc)}")
        print(f"Content: {post.selftext[:200]}..." if post.is_self else "This is a link post.")
        print(f"Number of comments: {len(post.comments)}")
        if post.comments:
            print(f"First comment: {post.comments[0].body[:100]}...")
        print("---")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Fetch Reddit posts from URLs in a file.")
    parser.add_argument("input_file", help="Path to the file containing Reddit post URLs")
    args = parser.parse_args()
    
    main(args.input_file)

usage: ipykernel_launcher.py [-h] input_file
ipykernel_launcher.py: error: the following arguments are required: input_file


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
