## Import and Setup

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Input CSV and output folder, both relative to the notebook's working directory.
RAW_DATA_PATH = '../data/raw/rsd_15k.csv'
PROCESSED_DIR = '../data/processed'

# Create the output folder up front; exist_ok makes re-running this cell harmless.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Record the library versions this run used.
print(
    f"Pandas version: {pd.__version__}",
    f"Numpy version: {np.__version__}",
    sep="\n",
)

Pandas version: 2.3.3
Numpy version: 1.26.4


In [5]:
# Load the raw CSV at RAW_DATA_PATH into `df`.
# A failed read is logged and then re-raised: swallowing it would let the cell
# continue to `df.head()`, which fails with an unrelated NameError on a fresh
# kernel, or silently shows a stale `df` left over from an earlier run.
try:
    df = pd.read_csv(
        RAW_DATA_PATH,
        quotechar='"',
        delimiter=',',
    )
except Exception as e:
    print(f"read error: {e}")
    raise
else:
    # Only reached when the read succeeded.
    print(f"count: {len(df)}")

# check data is correct
df.head()

count: 14613


Unnamed: 0,users,text,sentiment,time
0,1,No one understands how much I desperately want...,Ideation,1648483701
1,2,Today I never wanted to live to see 25. That m...,Behavior,1651130449
2,3,Suicidal thoughts at / because of school For s...,Ideation,1662712545
3,4,I feel like the pain will never end Everyday f...,Ideation,1638628371
4,4,Is there even a point to living if you're not ...,Indicator,1639749228


## Text Cleaning & Time Conversion

In [8]:
# 1. Text Cleaning
# Replace every line break (\r\n, a bare \r, or \n) with one space so each post is a
# single line for tokenization. A bare \r becomes a space instead of being deleted,
# so the words on either side of it are not glued together.
# NOTE(review): astype(str) turns a missing text value into the literal string 'nan';
# confirm the raw file has no empty posts, or drop them before this step.
df['text'] = df['text'].astype(str).str.replace(r'\r\n?|\n', ' ', regex=True)

# 2. Timestamp Conversion (for approach 4)
# from unix time (seconds) to readable datetime; the result is timezone-naive
df['timestamp_dt'] = pd.to_datetime(df['time'], unit='s')

# 3. Label Encoding
# follow our proposal and ref paper: Indicator < Ideation < Behavior < Attempt
label_map = {
    'Indicator': 0,
    'Ideation': 1,
    'Behavior': 2,
    'Attempt': 3
}

# map() leaves NaN for any sentiment value that is not a key of label_map
df['label_ordinal'] = df['sentiment'].map(label_map)

# remove data entries whose labels cannot be converted (if any)
unmapped = df['label_ordinal'].isna()
n_missing = int(unmapped.sum())
if n_missing > 0:
    print(f"remove {n_missing} data entries")
    # .copy() yields an independent frame, so the column assignment below
    # never writes into a slice of the unfiltered frame
    df = df.loc[~unmapped].copy()

# with the NaN rows gone, the float column can be cast to int
df['label_ordinal'] = df['label_ordinal'].astype(int)

# show results
df[['users', 'sentiment', 'label_ordinal', 'timestamp_dt']].head()

Unnamed: 0,users,sentiment,label_ordinal,timestamp_dt
0,1,Ideation,1,2022-03-28 16:08:21
1,2,Behavior,2,2022-04-28 07:20:49
2,3,Ideation,1,2022-09-09 08:35:45
3,4,Ideation,1,2021-12-04 14:32:51
4,4,Indicator,0,2021-12-17 13:53:48


## Data Splitting

In [10]:
# Collect the distinct user ids; the split below is made per user, not per post,
# so all posts of one user end up in the same set.
unique_users = df['users'].unique()
n_users = len(unique_users)
print(f"Total Unique Users: {n_users}")

# ===== BEGIN: Gemini-generated block =====

# Step 1: hold out 10% of the users as the test set; the other 90% go on to step 2.
train_val_users, test_users = train_test_split(unique_users, test_size=0.1, random_state=42)

# Step 2: take the validation users out of that 90%.
# 0.1111 is about 1/9, and 1/9 of 90% is roughly 10% of all users.
train_users, val_users = train_test_split(train_val_users, test_size=0.1111, random_state=42)

print(f"Split results (number of users):")
print(f"Train Users: {len(train_users)} ({(len(train_users)/n_users)*100:.1f}%)")
print(f"Val Users:   {len(val_users)} ({(len(val_users)/n_users)*100:.1f}%)")
print(f"Test Users:  {len(test_users)} ({(len(test_users)/n_users)*100:.1f}%)")

# Pick each set's posts by user membership and renumber the rows from 0.
train_df, val_df, test_df = (
    df.loc[df['users'].isin(split_users)].reset_index(drop=True)
    for split_users in (train_users, val_users, test_users)
)

# ===== END: Gemini-generated block =====

print("-" * 30)
print(f"Final dataset size (number of posts):")
print(f"Train Set: {len(train_df)} posts")
print(f"Val Set:   {len(val_df)} posts")
print(f"Test Set:  {len(test_df)} posts")

Total Unique Users: 1265
Split results (number of users):
Train Users: 1011 (79.9%)
Val Users:   127 (10.0%)
Test Users:  127 (10.0%)
------------------------------
Final dataset size (number of posts):
Train Set: 11972 posts
Val Set:   1605 posts
Test Set:  1036 posts


## Store Data

In [12]:
# ===== BEGIN: Gemini-generated block =====

# Sanity Check: no user may appear in more than one split (every overlap should be 0).
# The check runs BEFORE anything is written and covers all three pairs, so a leaky
# split stops the notebook instead of being saved to disk with only a printed warning.
split_user_sets = {
    'Train': set(train_df['users']),
    'Val': set(val_df['users']),
    'Test': set(test_df['users']),
}
for first_split, second_split in (('Train', 'Val'), ('Val', 'Test'), ('Train', 'Test')):
    overlap_users = split_user_sets[first_split] & split_user_sets[second_split]
    if overlap_users:
        raise ValueError(
            f"❌ Critical warning: Found {len(overlap_users)} overlapping users "
            f"between the {first_split} Set and {second_split} Set! "
            "Please check the data splitting logic."
        )
    print(f"✅ Check passed: No overlapping users between the {first_split} Set and {second_split} Set (No data leakage).")

# ===== END: Gemini-generated block =====

# Save in pickle format (keeps datetime and int data types for faster loading).
train_path = os.path.join(PROCESSED_DIR, 'train.pkl')
val_path = os.path.join(PROCESSED_DIR, 'val.pkl')
test_path = os.path.join(PROCESSED_DIR, 'test.pkl')

train_df.to_pickle(train_path)
val_df.to_pickle(val_path)
test_df.to_pickle(test_path)

✅ Check passed: No overlapping users between the Train Set and Test Set (No data leakage).
