In [None]:
# ============================================================
# Step 1 ‚Äî Install tools (gdown) and import libraries
# ============================================================

!pip install -q gdown

import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm

print("‚úÖ Libraries imported")
print("pandas:", pd.__version__)
print("numpy:", np.__version__)


‚úÖ Libraries imported
pandas: 2.2.2
numpy: 2.0.2


In [None]:
# ============================================================
# Step 2 — Configuration: Google Drive ZIP IDs & column names
# ============================================================

# Google Drive file IDs of the ZIP archives to download.
# The ID is the path segment between "/file/d/" and "/view" in a share link.
# Replace these with your own IDs when using different data.
ZIP_IDS = [
    "1irbew-MG2RngIPia0uvk6bU1fZDLM25W",  # https://drive.google.com/file/d/1irbew-MG2RngIPia0uvk6bU1fZDLM25W/view?usp=sharing
    "1nLrQVxY56bL0Ynbi0wE6_NwOPj6bRG5c",  # https://drive.google.com/file/d/1nLrQVxY56bL0Ynbi0wE6_NwOPj6bRG5c/view?usp=sharing
]

# Folder where we will unpack all CSV files
DATA_ROOT = "/content/evac_raw_runs"
os.makedirs(DATA_ROOT, exist_ok=True)

# Column names in EACH CSV file (adjust if your headers differ)
RUN_COL      = "run"       # not read from the CSVs; created later from the file index
AGENT_COL    = "agentid"
TIME_COL     = "evacuation_time"  # time-step column (previously "step")
X_COL        = "xcor"
Y_COL        = "ycor"
EMOTION_COL  = "emotion"   # contains strings like "0[calm 0.46]"

# Sliding window parameters
PAST_STEPS   = 8    # number of observed steps fed to the model
FUTURE_STEPS = 12   # number of future (x, y) steps to predict

# Output NPZ path
# NOTE(review): a previously recorded run of this notebook printed an output
# name ending in "_emotionVAL.npz" — confirm which file name downstream code expects.
OUT_NPZ_PATH = "/content/evac_P8F12_P5features_100runs.npz"

print("‚úÖ Config set")
print("ZIP_IDS:", ZIP_IDS)
print("DATA_ROOT:", DATA_ROOT)
print("Output:", OUT_NPZ_PATH)


‚úÖ Config set
ZIP_IDS: ['1irbew-MG2RngIPia0uvk6bU1fZDLM25W', '1nLrQVxY56bL0Ynbi0wE6_NwOPj6bRG5c']
DATA_ROOT: /content/evac_raw_runs
Output: /content/evac_P8F12_P5features_100runs_emotionVAL.npz


In [None]:
# ============================================================
# Step 3 — Download & unzip 2 ZIP files (each with 50 CSVs)
# ============================================================

import zipfile

import gdown

zip_paths = []

for i, fid in enumerate(ZIP_IDS):
    zip_path = f"/content/evac_runs_{i+1}.zip"
    zip_paths.append(zip_path)
    print(f"‚¨áÔ∏è Downloading ZIP {i+1} from Google Drive ID: {fid}")
    downloaded = gdown.download(id=fid, output=zip_path, quiet=False)

    # Fail loudly if nothing usable was written: a missing file or a non-ZIP
    # payload (e.g. an HTML error page) would otherwise only surface later
    # as "No CSV files found".
    if downloaded is None or not zipfile.is_zipfile(zip_path):
        raise RuntimeError(
            f"Download of Google Drive ID {fid!r} did not produce a valid ZIP at {zip_path}. "
            "Check the file ID and its sharing permissions."
        )

    # Unzip into separate subfolders.
    # zipfile overwrites existing files silently, so re-running this cell never
    # blocks on an interactive "replace?" prompt the way `unzip` without -o can.
    out_dir = os.path.join(DATA_ROOT, f"zip_{i+1}")
    os.makedirs(out_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(out_dir)
    print(f"‚úÖ Unzipped ZIP {i+1} into {out_dir}")

print("All ZIPs processed.")

‚¨áÔ∏è Downloading ZIP 1 from Google Drive ID: 1irbew-MG2RngIPia0uvk6bU1fZDLM25W


Downloading...
From (original): https://drive.google.com/uc?id=1irbew-MG2RngIPia0uvk6bU1fZDLM25W
From (redirected): https://drive.google.com/uc?id=1irbew-MG2RngIPia0uvk6bU1fZDLM25W&confirm=t&uuid=039c2cce-5052-46d5-a2c4-ec546e1e86dd
To: /content/evac_runs_1.zip
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 44.7M/44.7M [00:03<00:00, 13.3MB/s]


‚úÖ Unzipped ZIP 1 into /content/evac_raw_runs/zip_1
‚¨áÔ∏è Downloading ZIP 2 from Google Drive ID: 1nLrQVxY56bL0Ynbi0wE6_NwOPj6bRG5c


Downloading...
From (original): https://drive.google.com/uc?id=1nLrQVxY56bL0Ynbi0wE6_NwOPj6bRG5c
From (redirected): https://drive.google.com/uc?id=1nLrQVxY56bL0Ynbi0wE6_NwOPj6bRG5c&confirm=t&uuid=5c237ec5-0d61-48d5-a4de-27c31cf3bed6
To: /content/evac_runs_2.zip
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 44.7M/44.7M [00:03<00:00, 14.0MB/s]


‚úÖ Unzipped ZIP 2 into /content/evac_raw_runs/zip_2
All ZIPs processed.


In [None]:
# ============================================================
# Step 4 (fixed) — Load and concatenate all CSVs, assign run IDs
# ============================================================

import glob
import os

# Collect every .csv anywhere below DATA_ROOT, in lexicographic path order.
# That order defines the run IDs, so "(10)" sorts before "(2)".
csv_pattern = os.path.join(DATA_ROOT, "**", "*.csv")
all_csv_paths = sorted(glob.glob(csv_pattern, recursive=True))

print(f"Found {len(all_csv_paths)} CSV files.")
for preview_path in all_csv_paths[:10]:
    print("  ", preview_path)

if not all_csv_paths:
    raise RuntimeError("No CSV files found. Check folder structure or file extensions (maybe .CSV?).")

# One frame per file, each tagged with its position in the sorted list (0..N-1).
dfs = [
    pd.read_csv(csv_path).assign(**{RUN_COL: run_idx})
    for run_idx, csv_path in enumerate(all_csv_paths)
]

df = pd.concat(dfs, ignore_index=True)
print("‚úÖ Combined dataframe shape:", df.shape)
display(df.head())
print("\nüìå Columns:", df.columns.tolist())

Found 100 CSV files.
   /content/evac_raw_runs/zip_1/evacuation_data/evacuation_data (1).csv
   /content/evac_raw_runs/zip_1/evacuation_data/evacuation_data (10).csv
   /content/evac_raw_runs/zip_1/evacuation_data/evacuation_data (11).csv
   /content/evac_raw_runs/zip_1/evacuation_data/evacuation_data (12).csv
   /content/evac_raw_runs/zip_1/evacuation_data/evacuation_data (13).csv
   /content/evac_raw_runs/zip_1/evacuation_data/evacuation_data (14).csv
   /content/evac_raw_runs/zip_1/evacuation_data/evacuation_data (15).csv
   /content/evac_raw_runs/zip_1/evacuation_data/evacuation_data (16).csv
   /content/evac_raw_runs/zip_1/evacuation_data/evacuation_data (17).csv
   /content/evac_raw_runs/zip_1/evacuation_data/evacuation_data (18).csv
‚úÖ Combined dataframe shape: (6063397, 7)


Unnamed: 0,agentid,xcor,ycor,speed,emotion,evacuation_time,run
0,31416,6,59,1,0[calm 0.569863308714048],1,0
1,31446,88,20,1,0[calm 0.5692018454353232],1,0
2,31161,50,79,1,0[calm 0.5267792252953608],1,0
3,31218,117,56,1,0[calm 0.5064889130697154],1,0
4,31299,34,48,1,0[calm 0.619967059129151],1,0



üìå Columns: ['agentid', 'xcor', 'ycor', 'speed', 'emotion', 'evacuation_time', 'run']


In [None]:
# ============================================================
# Step 5 — Parse EMOTION_COL to extract emotion_val (float)
# ============================================================

def parse_emotion_val(s):
    """
    Return the trailing number of an emotion string as a float.

    Examples:
      '0[calm 0.464198865]'  -> 0.464198865
      '0[alarm 18.70831108]' -> 18.70831108

    Missing values, non-string inputs and strings whose last
    whitespace-separated token is not a number all yield NaN.
    """
    if pd.isna(s):
        return np.nan
    try:
        # Keep what follows the first "[" (or the whole string if there is none).
        _, bracket, tail = s.partition("[")
        payload = tail if bracket else s
        # Drop closing brackets, then take the last whitespace-separated token.
        tokens = payload.replace("]", "").split()
        return float(tokens[-1])
    except Exception:
        # Best-effort parser: any malformed value is reported as NaN.
        return np.nan

# Parse every emotion string element-wise into a float column.
emotion_strings = df[EMOTION_COL]
df["emotion_val"] = emotion_strings.map(parse_emotion_val)

print("‚úÖ emotion_val parsed. Sample:")
# Show raw and parsed values side by side to eyeball the parsing.
display(df.loc[:, [EMOTION_COL, "emotion_val"]].head(10))

‚úÖ emotion_val parsed. Sample:


Unnamed: 0,emotion,emotion_val
0,0[calm 0.569863308714048],0.569863
1,0[calm 0.5692018454353232],0.569202
2,0[calm 0.5267792252953608],0.526779
3,0[calm 0.5064889130697154],0.506489
4,0[calm 0.619967059129151],0.619967
5,0[calm 0.5431628227922762],0.543163
6,0[calm 0.412143833094967],0.412144
7,0[calm 0.5073583890525479],0.507358
8,0[calm 0.48003930242884574],0.480039
9,0[calm 0.5266064576163437],0.526606


In [None]:
# ============================================================
# Step 6 — Sort by (run, agent, step) and compute vx, vy
# ============================================================

# Sort rows globally so each (run, agent) trajectory is contiguous and
# ordered by time; the per-group difference below relies on this order.
df = df.sort_values([RUN_COL, AGENT_COL, TIME_COL]).reset_index(drop=True)

group_cols = [RUN_COL, AGENT_COL]

# Velocity = difference between consecutive rows of the same trajectory.
# A single vectorized groupby-diff replaces a Python loop over every
# (run, agent) group. The first row of each trajectory has no predecessor,
# so its NaN difference is replaced by 0.0.
# NOTE(review): this is a per-row displacement, not divided by the time
# difference — it equals a velocity only if consecutive rows are one step apart.
step_deltas = df.groupby(group_cols, sort=False)[[X_COL, Y_COL]].diff().fillna(0.0)

df["vx"] = step_deltas[X_COL].astype("float64")
df["vy"] = step_deltas[Y_COL].astype("float64")

print("‚úÖ vx, vy computed.")
display(df[[RUN_COL, AGENT_COL, TIME_COL, X_COL, Y_COL, "vx", "vy", "emotion_val"]].head(15))

Computing vx, vy: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 60000/60000 [47:19<00:00, 21.13it/s]


‚úÖ vx, vy computed.


Unnamed: 0,run,agentid,evacuation_time,xcor,ycor,vx,vy,emotion_val
0,0,30988,1,18,18,0.0,0.0,0.556507
1,0,30988,2,17,18,-1.0,0.0,1.110235
2,0,30988,3,16,18,-1.0,0.0,1.661938
3,0,30988,4,15,18,-1.0,0.0,2.217914
4,0,30988,5,14,18,-1.0,0.0,2.764877
5,0,30988,6,13,18,-1.0,0.0,3.309624
6,0,30988,7,13,19,0.0,1.0,3.860895
7,0,30988,8,12,19,-1.0,0.0,4.409835
8,0,30988,9,11,19,-1.0,0.0,4.954068
9,0,30988,10,11,20,0.0,1.0,5.49217


In [None]:
# ============================================================
# Step 7 — Keep required columns; drop rows with NaN emotion_val
# ============================================================

needed_cols = [RUN_COL, AGENT_COL, TIME_COL, X_COL, Y_COL, "vx", "vy", "emotion_val"]

# Row count before filtering, used only for the report below.
before = len(df)

# Select the modelling columns, discard rows whose emotion value could not
# be parsed, and renumber the index from 0.
df_small = (
    df.loc[:, needed_cols]
    .dropna(subset=["emotion_val"])
    .reset_index(drop=True)
)
after = len(df_small)

print(f"‚úÖ df_small shape: {df_small.shape} (dropped {before - after} rows with NaN emotion_val)")
display(df_small.head())


‚úÖ df_small shape: (6063397, 8) (dropped 0 rows with NaN emotion_val)


Unnamed: 0,run,agentid,evacuation_time,xcor,ycor,vx,vy,emotion_val
0,0,30988,1,18,18,0.0,0.0,0.556507
1,0,30988,2,17,18,-1.0,0.0,1.110235
2,0,30988,3,16,18,-1.0,0.0,1.661938
3,0,30988,4,15,18,-1.0,0.0,2.217914
4,0,30988,5,14,18,-1.0,0.0,2.764877


In [None]:
# ============================================================
# Step 8 — Build sliding windows per (run, agent)
#          X: past PAST_STEPS of 5 features
#          Y: next FUTURE_STEPS of (x,y) flattened
# ============================================================

# Per-step input features, in the order they appear along the last axis of X.
FEATURE_COLS = [X_COL, Y_COL, "vx", "vy", "emotion_val"]

def build_windows_for_agent(agent_df, past_steps, future_steps):
    """
    Slice one (run, agent) trajectory into overlapping training windows.

    agent_df is expected to be sorted by TIME_COL already; rows are used
    in the order given. Windows advance one row at a time.

    Returns a pair of equally long lists (both empty when the trajectory
    has fewer than past_steps + future_steps rows):
      X_list: float32 arrays of shape (past_steps, len(FEATURE_COLS))
      Y_list: float32 arrays of shape (future_steps * 2,), holding the
              following (x, y) positions flattened as x0, y0, x1, y1, ...
    """
    window_len = past_steps + future_steps

    features  = agent_df[FEATURE_COLS].to_numpy(dtype=np.float32)
    positions = agent_df[[X_COL, Y_COL]].to_numpy(dtype=np.float32)

    # Number of valid start rows; range() is empty when this is <= 0.
    starts = range(len(agent_df) - window_len + 1)

    X_list = [features[s : s + past_steps] for s in starts]
    Y_list = [
        positions[s + past_steps : s + window_len].reshape(-1)
        for s in starts
    ]
    return X_list, Y_list


def build_dataset(df_small, past_steps, future_steps):
    """
    Build the full window dataset from all (run, agent) trajectories.

    Parameters:
      df_small:     frame containing RUN_COL, AGENT_COL, TIME_COL and FEATURE_COLS.
      past_steps:   number of observed steps per window.
      future_steps: number of predicted steps per window.

    Returns:
      X: float32 array of shape (N, past_steps, len(FEATURE_COLS))
      Y: float32 array of shape (N, future_steps * 2)
    Windows are ordered by (run, agent), then by start row. When no trajectory
    is long enough, correctly shaped arrays with N == 0 are returned.
    """
    ordered = df_small.sort_values([RUN_COL, AGENT_COL, TIME_COL])
    grouped = ordered.groupby([RUN_COL, AGENT_COL])

    # One stacked array per trajectory instead of one Python list entry per
    # window: far fewer objects to hold before the final concatenation.
    x_chunks, y_chunks = [], []

    for _, agent_df in tqdm(grouped, desc="Building windows"):
        X_list, Y_list = build_windows_for_agent(agent_df, past_steps, future_steps)
        if X_list:  # trajectories shorter than one window contribute nothing
            x_chunks.append(np.stack(X_list))
            y_chunks.append(np.stack(Y_list))

    if not x_chunks:
        # Keep the documented rank even when there is nothing to stack.
        X = np.empty((0, past_steps, len(FEATURE_COLS)), dtype=np.float32)
        Y = np.empty((0, future_steps * 2), dtype=np.float32)
        return X, Y

    X = np.concatenate(x_chunks).astype(np.float32, copy=False)  # (N, P, 5)
    Y = np.concatenate(y_chunks).astype(np.float32, copy=False)  # (N, F*2)
    return X, Y

# Build every window for every (run, agent) trajectory in df_small.
X_all, Y_all = build_dataset(
    df_small,
    past_steps=PAST_STEPS,
    future_steps=FUTURE_STEPS,
)

print("‚úÖ Dataset built.")
# Expected layout: X is (N, PAST_STEPS, 5), Y is (N, FUTURE_STEPS * 2).
print("X_all shape:", X_all.shape)
print("Y_all shape:", Y_all.shape)


Building windows: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 60000/60000 [00:59<00:00, 1001.13it/s]

‚úÖ Dataset built.
X_all shape: (4924366, 8, 5)
Y_all shape: (4924366, 24)





In [None]:
# ============================================================
# Step 9 — Train/Test split (80/20 by run, no leakage between splits)
# ============================================================
#
# Consecutive sliding windows of one trajectory overlap in all but one row.
# Splitting individual windows at random would put near-duplicates of the
# training windows into the test set, so whole simulation runs are held out.

from sklearn.model_selection import GroupShuffleSplit

# Rebuild, for every window, the run it came from. Groups are visited in
# sorted (run, agent) order — the same order in which build_dataset emits
# windows — and each trajectory of length L yields max(L - (P + F) + 1, 0) windows.
window_len = PAST_STEPS + FUTURE_STEPS
traj_sizes = df_small.groupby([RUN_COL, AGENT_COL], sort=True).size()
windows_per_traj = (traj_sizes - window_len + 1).clip(lower=0).to_numpy()

window_runs = np.repeat(
    traj_sizes.index.get_level_values(RUN_COL).to_numpy(), windows_per_traj
)
assert len(window_runs) == len(X_all), (
    "Window/run bookkeeping is out of sync with X_all; re-run Step 8 before splitting."
)

# With a single run there is nothing to hold out at run level; fall back to
# holding out whole trajectories, which still prevents overlapping windows
# from straddling the split.
if np.unique(window_runs).size >= 2:
    split_groups = window_runs
else:
    split_groups = np.repeat(np.arange(len(traj_sizes)), windows_per_traj)

splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X_all, groups=split_groups))

# Shuffle inside each split so samples are not stored in trajectory order.
rng = np.random.default_rng(42)
rng.shuffle(train_idx)
rng.shuffle(test_idx)

X_train, X_test = X_all[train_idx], X_all[test_idx]
Y_train, Y_test = Y_all[train_idx], Y_all[test_idx]

print("‚úÖ Train/Test split done.")
print("X_train:", X_train.shape, "Y_train:", Y_train.shape)
print("X_test:",  X_test.shape,  "Y_test:",  Y_test.shape)


‚úÖ Train/Test split done.
X_train: (3939492, 8, 5) Y_train: (3939492, 24)
X_test: (984874, 8, 5) Y_test: (984874, 24)


In [None]:
# ============================================================
# Step 10 — Save cleaned dataset to .npz
# ============================================================

# Take the feature names from FEATURE_COLS (the list actually used to build X)
# so the stored labels cannot drift from the configured column names.
feature_cols = np.array(FEATURE_COLS)

# Store the window sizes next to the arrays so the file is self-describing.
np.savez_compressed(
    OUT_NPZ_PATH,
    X_train = X_train,
    Y_train = Y_train,
    X_test  = X_test,
    Y_test  = Y_test,
    feature_cols = feature_cols,
    past_steps   = PAST_STEPS,
    future_steps = FUTURE_STEPS
)

print("‚úÖ Saved to:", OUT_NPZ_PATH)


‚úÖ Saved to: /content/evac_P8F12_P5features_100runs_emotionVAL.npz


In [None]:
# ============================================================
# Step 11 — Sanity check: reload and inspect shapes
# ============================================================

# Re-open the archive that was just written; members are read on access.
data = np.load(OUT_NPZ_PATH)

print("Keys:", list(data.keys()))

# Metadata entries: print their stored values.
for meta_key in ("feature_cols", "past_steps", "future_steps"):
    print(f"{meta_key}:", data[meta_key])

# Array entries: print only their shapes.
for array_key in ("X_train", "Y_train", "X_test", "Y_test"):
    print(f"{array_key} shape:", data[array_key].shape)


Keys: ['X_train', 'Y_train', 'X_test', 'Y_test', 'feature_cols', 'past_steps', 'future_steps']
feature_cols: ['xcor' 'ycor' 'vx' 'vy' 'emotion_val']
past_steps: 8
future_steps: 12
X_train shape: (3939492, 8, 5)
Y_train shape: (3939492, 24)
X_test shape: (984874, 8, 5)
Y_test shape: (984874, 24)
