In [35]:
# Import libraries for data manipulation
import pandas as pd
import numpy as np
import random


In [36]:
# Read the raw Reddit Title CSV (semicolon-separated) into `df`.
# The python parsing engine is used, and any row it cannot parse is dropped
# instead of aborting the load.
df = pd.read_csv(
    "../data/raw/Reddit_Title.csv",
    sep=";",
    engine="python",
    on_bad_lines="skip",
    quoting=1,  # csv.QUOTE_ALL
)

In [44]:
# Shrink the dataset to a random 100-row subset; the fixed seed (42) makes
# the same rows come back every time this is run on the same input frame.
df = df.sample(random_state=42, n=100)

In [38]:
# Add synthetic columns for the recommendation system.
#
# Two things are deliberate here:
#   * every generated array is sized from the frame itself rather than a
#     hard-coded 100, so this cell keeps working if the sample size changes;
#   * all random draws come from one locally seeded generator, so the
#     synthetic columns are identical on every Restart & Run All.
n_rows = len(df)
rng = np.random.default_rng(42)

# User IDs (1 to 50, simulating multiple posts per user).
# The upper bound of `integers` is exclusive, hence 51.
df['user_id'] = rng.integers(1, 51, size=n_rows)

# Map label (0=stress-negative, 1=stress-positive) to mood.
# Any label outside {0, 1} is not in the mapping and becomes NaN.
df['mood'] = df['label'].map({1: 'anxious', 0: 'calm'})

# Map label to stress_level (same NaN behaviour for unexpected labels)
df['stress_level'] = df['label'].map({1: 'high', 0: 'low'})

# Define possible stress-relief activities
activities = ['meditation', 'music', 'journaling', 'breathing', 'walking']

# Assign one activity per row, drawn uniformly at random
df['activity'] = rng.choice(activities, size=n_rows)

# Assign feedback (1=liked, 0=not liked, 70% chance of liking)
df['feedback'] = rng.choice([0, 1], size=n_rows, p=[0.3, 0.7])

# Preview the processed data
print(df[['user_id', 'mood', 'stress_level', 'activity', 'feedback']].head())

      user_id     mood stress_level    activity  feedback
4456        2     calm          low       music         0
4626        9     calm          low  meditation         1
2802       44     calm          low  meditation         1
230        30  anxious         high     walking         0
3872        3     calm          low       music         0


In [39]:
# Write only the recommendation-system columns to the processed CSV,
# leaving the DataFrame index out of the file.
df.to_csv(
    '../data/processed/stress_relief_data.csv',
    columns=['user_id', 'mood', 'stress_level', 'activity', 'feedback'],
    index=False,
)