In [2]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import GroupKFold
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', 0)  # Optional: prevents line wrapping

In [3]:

# Load Data <must be of the format: 
# claim_id, claim, claim_created_utc, 
# positive_id, positive_body, positive_created_utc, 
# negative_id, negative_body, negative_created_utc
input_file = "/cronus_data/araghavan/persuasion/data/pos_random_neg_20250303_dump_v001.jsonl"


In [4]:
df = pd.read_json(input_file, lines=True)
df.sort_values(by=['claim_created_utc', 'claim_id'], ascending=True, inplace=True)


In [8]:

# Step 1: Time-based cutoff split
cutoff = int(datetime.datetime(2024, 2, 1).timestamp())
heldout_timeout_test = df[df['claim_created_utc'] >= cutoff].copy(deep=True)
timein_data = df[df['claim_created_utc'] < cutoff].copy(deep=True)

print(f"Raw DataFrame Shape: {df.shape}")
print(f"Held Out Timeout Test Shape: {heldout_timeout_test.shape}")
print(f"Time-in Data Shape: {timein_data.shape}")


Raw DataFrame Shape: (76075, 9)
Held Out Timeout Test Shape: (8155, 9)
Time-in Data Shape: (67920, 9)


In [9]:

# Step 2: Random 20% of unique claim_ids as held-out in-time test (group-safe)
unique_claims = timein_data['claim_id'].unique()
np.random.seed(42)  # For reproducibility
heldout_claims = np.random.choice(
    unique_claims, 
    size=int(0.2 * len(unique_claims)), 
    replace=False
)

heldout_in_time_test = timein_data[timein_data['claim_id'].isin(heldout_claims)].copy(deep=True)
train_val_df = timein_data[~timein_data['claim_id'].isin(heldout_claims)].copy(deep=True)

print(f"Held-out In-Time Test Shape (20% groups): {heldout_in_time_test.shape}")
print(f"Train-Val Data Shape (80% groups): {train_val_df.shape}")


Held-out In-Time Test Shape (20% groups): (13560, 9)
Train-Val Data Shape (80% groups): (54360, 9)


In [10]:

# Step 3: Sanity check for no group overlap
assert set(heldout_in_time_test['claim_id']).isdisjoint(set(train_val_df['claim_id'])), "Overlap detected!"

# Step 4: GroupKFold CV split on remaining train_val_df
gkf = GroupKFold(n_splits=5)
train_val_df['folds_col'] = -1  # Initialize fold column

X = train_val_df.drop(columns=['claim_id'])  # Or keep if needed downstream

for fold_idx, (train_idx, val_idx) in enumerate(gkf.split(X=X, y=None, groups=train_val_df['claim_id'])):
    train_val_df.loc[train_val_df.iloc[val_idx].index, 'folds_col'] = fold_idx
    print(f"Inner Fold {fold_idx}: Train size={len(train_idx)}, Val size={len(val_idx)}")


Inner Fold 0: Train size=43488, Val size=10872
Inner Fold 1: Train size=43488, Val size=10872
Inner Fold 2: Train size=43488, Val size=10872
Inner Fold 3: Train size=43488, Val size=10872
Inner Fold 4: Train size=43488, Val size=10872


In [11]:

# Step 5 (Optional): Summary stats
print("\n✅ Final Train/Val folding complete. Fold sizes:")
print(train_val_df['folds_col'].value_counts())

def melt_claims(df, folds_col_exist=False):
    base_cols = ['claim_id', 'claim_body', 'claim_created_utc']
    pos_cols = base_cols + ['positive_id', 'positive_body', 'positive_created_utc']
    neg_cols = base_cols + ['negative_id', 'negative_body', 'negative_created_utc']
    if folds_col_exist:
        pos_cols.append('folds_col')
        neg_cols.append('folds_col')
    pos = df[pos_cols].copy(deep=True)
    pos.rename(columns={
        'positive_id': 'message_id',
        'positive_body': 'message',
        'positive_created_utc': 'message_created_utc',
        'claim_body':'claim',
    }, inplace=True)
    pos['persuaded'] = 1
    neg = df[neg_cols].copy(deep=True)
    neg.rename(columns={
        'negative_id': 'message_id',
        'negative_body': 'message',
        'negative_created_utc': 'message_created_utc',
        'claim_body':'claim',
    }, inplace=True)
    neg['persuaded'] = 0
    melted = pd.concat([pos, neg], ignore_index=True)
    return melted


train_val_melted = melt_claims(train_val_df, folds_col_exist=True)
heldout_in_time_melted = melt_claims(heldout_in_time_test)
heldout_timeout_melted = melt_claims(heldout_timeout_test)




✅ Final Train/Val folding complete. Fold sizes:
folds_col
0    10872
4    10872
3    10872
2    10872
1    10872
Name: count, dtype: int64


In [12]:
train_val_melted[train_val_melted['claim_id'] == "1afxgbv"]

Unnamed: 0,claim_id,claim,claim_created_utc,message_id,message,message_created_utc,folds_col,persuaded
54359,1afxgbv,The Earh could be turned into a utopian commun...,1706747109,kod5d6s,"Ending your post with ""fight me"" doesn't reall...",1706747536,0,1
108719,1afxgbv,The Earh could be turned into a utopian commun...,1706747109,kodcfpi,There's going to be a lot of overlap between t...,1706750267,0,0


In [13]:
train_val_melted.drop_duplicates(subset=['message_id']).shape

(108720, 8)

In [14]:
train_val_melted.shape

(108720, 8)

In [15]:

# (Optional) Step 6: Save outputs if needed
train_val_melted.to_json("../../data/pos_random_neg_20250303_dump_v001_to_percvs_v002_train_grpstrat_80pct.jsonl", orient='records', lines=True)
heldout_in_time_melted.to_json("../../data/pos_random_neg_20250303_dump_v001_to_percvs_v002_testintime_grpstrat_20pct.jsonl", orient='records', lines=True)
heldout_timeout_melted.to_json("../../data/pos_random_neg_20250303_dump_v001_to_percvs_v002_testouttime_grpstrat_20pct.jsonl", orient='records', lines=True)


In [16]:
heldout_timeout_melted['claim'].str.split().str.len().mean()

np.float64(357.62857142857143)

In [17]:
heldout_timeout_melted[heldout_timeout_melted['message'].str.split().str.len()<50]

Unnamed: 0,claim_id,claim,claim_created_utc,message_id,message,message_created_utc,persuaded


In [18]:
train_val_melted['claim'].str.split().str.len().mean()

np.float64(364.7174025018396)

In [19]:
train_val_melted['message'].str.split().str.len().mean()

np.float64(175.48437270051508)

In [20]:
heldout_in_time_melted['claim'].str.split().str.len().mean()

np.float64(365.207005899705)

In [21]:
heldout_in_time_melted['message'].str.split().str.len().mean()

np.float64(175.37400442477878)