In [1]:
# Import libraries for data manipulation
import random
from pathlib import Path

import numpy as np
import pandas as pd


In [2]:
# Load the Reddit Title dataset
# Parser settings for the raw export: semicolon-separated fields, quoting
# mode 1, malformed rows dropped instead of raising, pure-Python parser.
read_csv_options = {
    "delimiter": ";",
    "quoting": 1,            # 1 = QUOTE_ALL
    "on_bad_lines": "skip",  # skip problematic lines
    "engine": "python",
}
df = pd.read_csv("../data/raw/Reddit_Title.csv", **read_csv_options)

In [3]:
# Add synthetic columns for recommendation system
# Guarantee a 'label' column before anything is derived from it: reuse
# 'stress_label' when the raw file provides one, otherwise fall back to a
# keyword heuristic over the post titles.
if 'label' not in df.columns and 'stress_label' in df.columns:
    df = df.rename(columns={'stress_label': 'label'})
elif 'label' not in df.columns:
    def classify_label(title):
        """Return 1 if the title contains a stress-related keyword, else 0.

        Matching is a case-insensitive substring test. Non-string titles
        (for example missing values) are labelled 0.
        """
        if not isinstance(title, str):
            return 0
        lowered = title.lower()
        for fragment in ('stress', 'anx', 'panic', 'overwhelm', 'depress'):
            if fragment in lowered:
                return 1
        return 0

    # When there is no 'title' column either, an all-empty Series is used,
    # which makes every label 0.
    df['label'] = df.get('title', pd.Series([''] * len(df))).apply(classify_label)

num_rows = len(df)

# Use one seeded NumPy Generator for every random draw in this cell so that
# Restart & Run All reproduces the same synthetic dataset. Previously the draws
# were unseeded and split between `random` and `np.random`, so every run
# produced different user ids, activities and feedback.
SEED = 42
rng = np.random.default_rng(SEED)

# Create synthetic user ids (simulate multiple posts per user).
# The upper bound is exclusive, so ids fall in [1, num_rows // 2]; max(2, ...)
# keeps the range non-empty when the frame has fewer than two rows.
df['user_id'] = rng.integers(1, max(2, num_rows // 2 + 1), size=num_rows)

# Map label (0=stress-negative, 1=stress-positive) to mood and stress level.
# Any label value other than 0 or 1 is not in the mapping and becomes NaN.
df['mood'] = df['label'].map({1: 'anxious', 0: 'calm'})
df['stress_level'] = df['label'].map({1: 'high', 0: 'low'})

# Define possible stress-relief activities and assign one uniformly at random per row
activities = ['meditation', 'music', 'journaling', 'breathing', 'walking']
df['activity'] = rng.choice(activities, size=num_rows)

# Synthetic feedback: 1=liked, 0=not liked (70% chance of liking)
df['feedback'] = rng.choice([0, 1], size=num_rows, p=[0.3, 0.7])

# Preview the processed data
print(df[['user_id', 'mood', 'stress_level', 'activity', 'feedback']].head())

   user_id     mood stress_level    activity  feedback
0      433     calm          low  journaling         0
1     1140  anxious         high  journaling         1
2      934  anxious         high  meditation         1
3      681  anxious         high  meditation         1
4     1623     calm          low       music         1


In [4]:
# Save the processed dataset
# The destination and the exported columns are defined once so the write and
# the confirmation message cannot drift apart.
OUTPUT_PATH = '../data/processed/stress_relief_data.csv'
OUTPUT_COLUMNS = ['user_id', 'mood', 'stress_level', 'activity', 'feedback']

# to_csv does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)

df[OUTPUT_COLUMNS].to_csv(OUTPUT_PATH, index=False)
# Confirm file saved
print(f'Saved to {OUTPUT_PATH}')

Saved to ../data/processed/stress_relief_data.csv
