In [2]:
import os
import re
import yaml
import json
import ast
import praw
from pydantic import BaseModel

In [None]:
# Read the API credentials from the local JSON secrets file.
api_file_path = 'api_keys.json'

with open(api_file_path, 'r') as file:
    api_data = json.load(file)

# Build the PRAW client from the "praw" section of the credentials file.
# check_for_async=False is forwarded to PRAW unchanged (per the PRAW docs it
# switches off the async-environment check).
_praw_credentials = api_data['praw']
reddit = praw.Reddit(
    client_id=_praw_credentials['client_id'],
    client_secret=_praw_credentials['client_secret'],
    user_agent=_praw_credentials['user_agent'],
    username=_praw_credentials['username'],
    check_for_async=False,
)

In [None]:
# Load the run configuration and expose its keyword lists as module-level names.
cfg_file_path = 'casaai_config.yaml'

with open(cfg_file_path, 'r') as yaml_file:
    # yaml.safe_load returns None for an empty document; fall back to an empty
    # mapping so the cfg.get(...) lookups below do not raise AttributeError.
    cfg = yaml.safe_load(yaml_file) or {}

# `cfg.get(key) or []` covers both a missing key and a key that is present but
# left blank in the YAML (which loads as None), so code that iterates these
# lists always receives an iterable.
gemini_generated_keywords = cfg.get('gemini_generated_keywords') or []
user_provided_keywords = cfg.get('user_provided_keywords') or []
broad_keywords = cfg.get('broad_keywords') or []

In [None]:
def search_subreddits(keyword):
    """Return the display names of subreddits matched by a name search.

    Args:
        keyword: Text passed to ``reddit.subreddits.search_by_name`` with
            ``exact=False``.

    Returns:
        list[str]: ``display_name`` of every subreddit yielded by the search,
        in the order the API returned them.
    """
    matches = reddit.subreddits.search_by_name(keyword, exact=False)
    return [match.display_name for match in matches]

In [None]:
def _extract_post_image_urls(post):
    """Return the image URLs attached to a submission.

    A direct link ending in .jpg/.jpeg/.png wins; otherwise the submission's
    ``media_metadata`` mapping (if any) is scanned for image entries.
    """
    url = getattr(post, 'url', None)
    if url and url.endswith(('.jpg', '.jpeg', '.png')):
        return [url]

    # The attribute can exist but hold None (presumably for removed
    # galleries -- TODO confirm); iterating None would raise TypeError.
    media_metadata = getattr(post, 'media_metadata', None)
    if not media_metadata:
        return []

    image_urls = []
    for media_item in media_metadata.values():
        # 'm' holds the media type string; 's' -> 'u' holds the source URL.
        if 'm' in media_item and 'image' in media_item['m']:
            source_url = media_item.get('s', {}).get('u', None)
            if source_url:
                image_urls.append(source_url)
    return image_urls


def _build_comment_record(comment):
    """Convert one comment object into the plain dict stored under 'comments'."""
    record = {
        "comment_id": comment.id,
        # parent_id is a prefixed fullname such as "t3_abc"; keep the bare id.
        "parent_id": comment.parent_id.split('_')[1],
        "text": comment.body,
        "author": str(comment.author),
        "score": comment.score,
        "created_utc": comment.created_utc,
        "image_url": ""
    }

    # Pull the first inline image out of the rendered HTML, if there is one.
    marker = 'img src="'
    body_html = getattr(comment, 'body_html', None) or ''
    if marker in body_html:
        start_index = body_html.find(marker) + len(marker)
        end_index = body_html.find('"', start_index)
        record['image_url'] = body_html[start_index:end_index]

    return record


def search_posts(subreddit_name, keyword, limit=10):
    """Search one subreddit for a keyword and collect posts with their comments.

    Args:
        subreddit_name: Name passed to ``reddit.subreddit``.
        keyword: Query string passed to ``subreddit.search``.
        limit: Maximum number of posts requested from the search
            (defaults to 10, the previously hard-coded value).

    Returns:
        tuple[list[dict], list[str]]: ``(posts, post_ids)``. Each post dict
        holds the post's metadata, an ``image_urls`` list and a ``comments``
        list of comment dicts. Note that the post's own id is stored under
        the key ``"comment_id"``; the key is kept as-is for compatibility
        with existing consumers. ``post_ids`` lists the same post ids in the
        same order.
    """
    posts = []
    post_ids = []
    subreddit = reddit.subreddit(subreddit_name)
    for post in subreddit.search(keyword, limit=limit):
        post_data = {
            'title': post.title,
            "comment_id": post.id,
            'url': post.url,
            'score': post.score,
            'num_comments': post.num_comments,
            'Post_views': post.view_count,
            'upvote_ratio': post.upvote_ratio,
            'author': str(post.author),
            'created_utc': post.created_utc,
            'image_urls': _extract_post_image_urls(post),
            'comments': []
        }

        # replace_more(limit=0) is called before flattening the comment tree
        # with .list(), so only already-loaded comments are iterated.
        post.comments.replace_more(limit=0)
        post_data['comments'] = [
            _build_comment_record(comment) for comment in post.comments.list()
        ]

        posts.append(post_data)
        post_ids.append(post.id)

    return posts, post_ids

In [None]:
def fetch_reddit(subreddit_keywords=None, post_keywords=None):
    """Discover subreddits by keyword, then search each one for posts.

    Args:
        subreddit_keywords: Keywords passed to ``search_subreddits`` to find
            subreddits. Defaults to the module-level ``broad_keywords``.
        post_keywords: Keywords searched inside every discovered subreddit
            via ``search_posts``. Defaults to the module-level
            ``gemini_generated_keywords``.

    Returns:
        tuple[dict, list]: ``(reddit_posts, reddit_post_ids)`` where
        ``reddit_posts[subreddit_name][keyword]`` is the post list returned by
        ``search_posts`` and ``reddit_post_ids`` concatenates the post ids of
        every search (an id repeats if a post matches several keywords).
    """
    if subreddit_keywords is None:
        subreddit_keywords = broad_keywords
    if post_keywords is None:
        post_keywords = gemini_generated_keywords

    # Dedupe with dict.fromkeys rather than set(): it keeps first-seen order,
    # so the result layout is reproducible from run to run instead of varying
    # with string-hash randomisation.
    all_subreddits = list(dict.fromkeys(
        subreddit_name
        for keyword in subreddit_keywords
        for subreddit_name in search_subreddits(keyword)
    ))

    reddit_posts = {}
    reddit_post_ids = []

    for subreddit_name in all_subreddits:
        posts_by_keyword = {}
        reddit_posts[subreddit_name] = posts_by_keyword
        for keyword in post_keywords:
            posts, post_ids = search_posts(subreddit_name, keyword)
            posts_by_keyword[keyword] = posts
            reddit_post_ids.extend(post_ids)

    return reddit_posts, reddit_post_ids