# OpenAI: Consumer-Based Sentiment Analysis

-- To Be Written --

## Data Collection

In [5]:
import os
import asyncio
import pandas as pd
import json
import datetime
from asyncpraw import Reddit
from asyncpraw.models import MoreComments


# Reddit API credentials are read from environment variables when this cell
# runs; a missing or empty value aborts immediately with a ValueError.
# NOTE(review): 'USER' is also commonly set by the shell to the login name --
# confirm it really holds the Reddit username in this environment.
def _require_env(name):
    """Return the value of environment variable *name*; raise ValueError if unset or empty."""
    value = os.getenv(name)
    if not value:
        raise ValueError(f"{name} environment variable not set")
    return value


USERNAME = _require_env('USER')
PASSWORD = _require_env('PASSWORD')
CLIENT_ID = _require_env('CLIENT_ID')
CLIENT_SECRET = _require_env('CLIENT_SECRET')


async def create_reddit_instance():
    """
    Builds the Reddit client from the module-level credentials
    (CLIENT_ID, CLIENT_SECRET, USERNAME, PASSWORD) and returns it.
    """
    credentials = {
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "user_agent": "my user agent",
        "username": USERNAME,
        "password": PASSWORD,
    }
    client = Reddit(**credentials)

    # Intended to make the client wait out rate limits instead of failing.
    # NOTE(review): confirm that the requestor actually honors a
    # `rate_limit_sleep` attribute; this line only assigns it.
    client.requestor.rate_limit_sleep = True
    return client


def load_csv_data(file_path):
    """
    Reads the CSV file and groups the queries by post_id.

    Args:
        file_path: Path to a CSV file containing "post_id" and "query" columns.

    Returns:
        A dictionary mapping post_id -> query string. The string holds the
        post's distinct queries joined by ";" in the order they first appear
        in the file, so the result is identical from run to run.
    """
    df = pd.read_csv(file_path)

    # Missing queries become empty strings so str.join never sees a float NaN.
    df["query"] = df["query"].fillna("").astype(str)

    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
    # make the joined order vary between runs (string hashing is randomized).
    grouped = (
        df.groupby("post_id")["query"]
        .apply(lambda queries: ";".join(dict.fromkeys(queries)))
        .reset_index()
    )

    return dict(zip(grouped["post_id"], grouped["query"]))

async def fetch_post_and_comments(reddit, post_id):
    """
    Fetches the submission and up to 10 of its first-level comments.

    Args:
        reddit: The Reddit client used to look up the submission.
        post_id: ID of the submission to fetch.

    Returns:
        A list of dictionaries, one per comment, each carrying the post
        details plus the comment details. If the post has no usable comments,
        a single row with the comment fields set to None is returned. If
        anything fails, the error is printed and a row with only post_id
        filled in is appended, so the post is never dropped from the output.
    """
    rows = []
    try:
        # Awaiting reddit.submission() already fetches the submission, so no
        # extra load() call is made; that would issue a second request per
        # post and make rate limiting (HTTP 429) more likely.
        submission = await reddit.submission(id=post_id)

        # Expand "load more" placeholders, then keep the first 10 top-level
        # entries and drop any placeholders that are still left among them.
        await submission.comments.replace_more(limit=10)
        top_comments = [
            comment
            for comment in submission.comments[:10]
            if not isinstance(comment, MoreComments)
        ]

        # created_utc is a UTC epoch timestamp; format it in UTC so the value
        # does not depend on the time zone of the machine running the cell.
        created = datetime.datetime.fromtimestamp(
            submission.created_utc, tz=datetime.timezone.utc
        )

        submission_details = {
            "post_id": submission.id,
            "subreddit": submission.subreddit.display_name,
            "post_title": submission.title,
            "post_body": submission.selftext,
            "number_of_comments": submission.num_comments,
            "readable_datetime": created.strftime("%Y-%m-%d %H:%M:%S"),
            # author is falsy for deleted accounts
            "post_author": submission.author.name if submission.author else None,
        }

        if top_comments:
            for comment in top_comments:
                row = submission_details.copy()
                row.update({
                    "comment_id": comment.id,
                    "comment_body": comment.body,
                    "number_of_upvotes": comment.score,
                    "comment_author": comment.author.name if comment.author else None,
                })
                rows.append(row)
        else:
            # No real comments (none at all, or only placeholders): keep the
            # post in the dataset with empty comment fields.
            row = submission_details.copy()
            row.update({
                "comment_id": None,
                "comment_body": None,
                "number_of_upvotes": None,
                "comment_author": None,
            })
            rows.append(row)

    except Exception as e:
        # Best-effort collection: report the failure and emit a placeholder
        # row so the caller still sees one entry for this post_id.
        print(f"Error fetching data for post_id {post_id}: {e}")
        rows.append({
            "post_id": post_id,
            "subreddit": None,
            "post_title": None,
            "post_body": None,
            "number_of_comments": None,
            "readable_datetime": None,
            "post_author": None,
            "comment_id": None,
            "comment_body": None,
            "number_of_upvotes": None,
            "comment_author": None,
        })

    return rows


def write_csv(data, file_name):
    """
    Saves a list of row dictionaries as a CSV file (without the index column)
    and prints a confirmation message.
    """
    pd.DataFrame(data).to_csv(file_name, index=False)
    print(f"CSV file saved as {file_name}")

def write_json(data, file_name):
    """
    Saves a list of dictionaries as an indented JSON file and prints a
    confirmation message. Values json cannot encode are written via str().
    """
    with open(file_name, "w", encoding="utf-8") as out:
        serialized = json.dumps(data, indent=4, default=str)
        out.write(serialized)
    print(f"JSON file saved as {file_name}")

async def main(input_csv="combined_data.csv", batch_size=10, pause_seconds=60):
    """
    Runs the full collection: loads post IDs, fetches each post with its
    comments in batches, and writes the combined rows to CSV and JSON.

    Args:
        input_csv: CSV file with "post_id" and "query" columns.
        batch_size: Number of posts fetched concurrently per batch.
        pause_seconds: Seconds to wait between consecutive batches.

    The Reddit client is always closed when the loop ends, including when an
    error propagates. The output files are written only after every batch
    has completed.
    """
    reddit = await create_reddit_instance()
    output_data = []
    try:
        post_query_mapping = load_csv_data(input_csv)
        post_ids = list(post_query_mapping)
        total_batches = (len(post_ids) + batch_size - 1) // batch_size

        starts = range(0, len(post_ids), batch_size)
        for batch_number, start in enumerate(starts, start=1):
            batch_ids = post_ids[start:start + batch_size]
            tasks = [fetch_post_and_comments(reddit, post_id) for post_id in batch_ids]
            results = await asyncio.gather(*tasks)

            # Tag every fetched row with the query string of its post.
            for post_id, rows in zip(batch_ids, results):
                query = post_query_mapping[post_id]
                for row in rows:
                    row["query"] = query
                    output_data.append(row)

            if batch_number < total_batches:
                print(f"Processed batch {batch_number} / {total_batches}. Sleeping for {pause_seconds} seconds...")
                await asyncio.sleep(pause_seconds)
            else:
                # Nothing left to fetch, so there is no reason to wait.
                print(f"Processed batch {batch_number} / {total_batches}.")
    finally:
        # Release the client's underlying HTTP session.
        await reddit.close()

    write_csv(output_data, "new_combined_dataset.csv")
    write_json(output_data, "new_combined_dataset.json")


In [None]:
import nest_asyncio
# Applied before the await below; presumably needed so asyncio code can run
# inside the notebook's already-running event loop -- confirm against the
# nest_asyncio documentation.
nest_asyncio.apply()

async def run_async():
    """Thin wrapper that awaits main()."""
    await main()

# Top-level await: valid in an IPython/Jupyter cell, not in a plain .py module.
await run_async()

  df = pd.read_csv(file_path)


Error fetching data for post_id 101o6zx: received 429 HTTP response
Error fetching data for post_id 101ms83: received 429 HTTP response
Error fetching data for post_id 101p00n: received 429 HTTP response
Error fetching data for post_id 100ye6s: received 429 HTTP response
Error fetching data for post_id 101chgd: received 429 HTTP response
Error fetching data for post_id  the next era is the niche content producer who produces high quality video content: received 429 HTTP response
Error fetching data for post_id 100ayoe: received 429 HTTP response
Error fetching data for post_id 101melg: received 429 HTTP response
Error fetching data for post_id 1007cpq: received 429 HTTP response
Error fetching data for post_id 1002dom: received 429 HTTP response
Processed batch 1 / 958. Sleeping for 60 seconds...
Error fetching data for post_id 102l28b: received 429 HTTP response
Error fetching data for post_id 102ci8x: received 429 HTTP response
Error fetching data for post_id 102jcse: received 429 HT

## Data Preprocessing

In [None]:
# Todo

## TF-IDF Vectorization

In [None]:
# Todo

## Embedding

In [None]:
# Todo

## Labeling

In [None]:
# Todo

## Pipeline (Part - C)