# Splitting update1_fulltrain.pkl into training, test, and validation sets
The challenge here was to ensure splitting was done at the recording (file) level rather than the clip level, and to manually assign audio to the validation dataset so that recordings of the same bird did not end up in both the training and validation datasets.\
Also had to slim the train and val df to just one "virail" column and not a column for each XC species.

In [None]:
# the cnn module provides classes for training/predicting with various types of CNNs
from opensoundscape import CNN

#other utilities and packages
import torch
import pandas as pd
from pathlib import Path
import numpy as np
import pandas as pd
import random
import subprocess
from glob import glob
import sklearn
from sklearn.model_selection import train_test_split

# Set up plotting: import pyplot and configure the notebook's figure defaults
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize']=[15,5] # default figure size [15, 5] for large visuals
# IPython magic (notebook-only, not plain Python): sets the inline figure format to 'retina'
%config InlineBackend.figure_format = 'retina'

In [None]:
# Load the full labelled table that the rest of the notebook splits into train/val.
# NOTE(review): the notebook title mentions update1_fulltrain.pkl, but this reads
# update2_fulltrain_with_field.pkl -- confirm which file is intended.
# NOTE: unpickling can execute arbitrary code; only load pickle files you created or trust.
df = pd.read_pickle("/home/brg226/projects/vira_beg/training_data/update2_fulltrain_with_field.pkl") 
# Display the first 20 rows as the cell output
df.head(20)

In [None]:
# Display the last rows of the loaded table as the cell output
df.tail()

In [None]:
# Number of rows in the loaded table
len(df)

In [None]:
# Summarise the table layout: column labels, column count, index name and shape
column_labels = df.columns.tolist()
layout_report = [
    "Column names:",
    str(column_labels),
    f"\nNumber of columns: {len(column_labels)}",
    f"Index name: {df.index.name}",
    f"DataFrame shape: {df.shape}",
]
print("\n".join(layout_report))

In [None]:
# Investigate the DataFrame structure more deeply: container types, index levels,
# column levels, pandas' own info() summary, and a sample with the index flattened.
print("=== DETAILED DATAFRAME INVESTIGATION ===")
print(f"DataFrame type: {type(df)}")
print(f"Index type: {type(df.index)}")
print(f"Columns type: {type(df.columns)}")

print("\n=== INDEX INFORMATION ===")
print(f"Index names: {df.index.names}")
print(f"Index nlevels: {df.index.nlevels}")
# Only an index exposing `.levels` (e.g. a MultiIndex) gets the per-level preview.
if hasattr(df.index, 'levels'):
    print(f"Index levels: {len(df.index.levels)} levels")
    for i, level in enumerate(df.index.levels):
        # Show at most the first five values of long levels, followed by "..."
        preview = f"{level[:5].tolist()}..." if len(level) > 5 else f"{level.tolist()}"
        print(f"  Level {i}: {preview}")

print("\n=== COLUMNS INFORMATION ===")
print(f"Column names: {df.columns.names}")
print(f"Column nlevels: {df.columns.nlevels}")
print(f"Actual columns: {df.columns.tolist()}")

print("\n=== DATAFRAME STRUCTURE ===")
print(f"Shape: {df.shape}")
print("\nDataFrame info:")
df.info()

print("\n=== SAMPLE DATA ===")
print("First few rows with reset_index():")
try:
    flattened_preview = df.reset_index().head()
except Exception as exc:
    # Best-effort diagnostic: fall back to the plain head, but report why the
    # flattening failed (e.g. pandas raises ValueError when an index level name
    # collides with an existing column). `Exception` rather than a bare `except`
    # so Ctrl-C / kernel interrupts are not swallowed.
    print(f"reset_index() failed: {exc!r}")
    print("Regular head():")
    print(df.head())
else:
    print(flattened_preview)

In [None]:
# Confirm that 'file' is available as a regular column before using it for splitting
has_file_column = 'file' in df.columns
if not has_file_column:
    print("'file' column not found in columns")
else:
    file_column = df['file']
    print(f"Success! 'file' column found with {file_column.nunique()} unique files")
    print(f"Sample file values: {file_column.head(3).tolist()}")

From Sam \
Workflow might look like:
select a test set if you have a good one: eg, representative of the field data;\
combine all remaining positives and negatives into one big train & val set\
split into training and validation with file-level splitting\
resample training set for even class representation\
Does that make sense?\
In your case there is no good test set, you’ll just have a train and validation set

In [None]:
# Manual file splitting - specify validation files explicitly + 20% of NON-virail files
audio_path = "/home/brg226/projects/vira_beg/training_data/annotated_positive_audio/audio_viratrain"
manual_val_files = [f"{audio_path}/104566671.wav", f"{audio_path}/252407901.wav", f"{audio_path}/357857961.wav"]

# Get all unique files
unique_files = df['file'].unique()

# Check which manual files exist in the table. A set gives O(1) membership tests
# instead of scanning the whole array once per manual file.
known_files = set(unique_files)
manual_files_found = [f for f in manual_val_files if f in known_files]
manual_files_missing = [f for f in manual_val_files if f not in known_files]

print(f"Manual validation files found: {len(manual_files_found)}")
for found_file in manual_files_found:
    print(f"✓ {found_file}")

# A manual file that does not match any value in df['file'] (wrong path, typo,
# different path format) would otherwise be dropped from validation without any
# notice, so report it explicitly.
if manual_files_missing:
    print(f"⚠ Manual validation files NOT found in df['file']: {len(manual_files_missing)}")
    for missing_file in manual_files_missing:
        print(f"✗ {missing_file}")


In [None]:

# Get files that come from virail column (positive examples)
virail_files = df[df['virail'] == 1]['file'].unique()
print(f"\nTotal files from virail column: {len(virail_files)}")

# Remaining virail files (not in manual validation) go to training
manual_val_set = set(manual_files_found)
remaining_virail_files = [f for f in virail_files if f not in manual_val_set]
print(f"Remaining virail files (going to training): {len(remaining_virail_files)}")

# Get non-virail files (negative examples).
# A single file can contain both virail and non-virail clips. Such a file is
# already assigned above (manual validation or remaining-virail training), so it
# must NOT also enter the random 80/20 split -- otherwise the same recording
# could end up in both train_files and val_files. The boolean mask keeps the
# original order, so the seeded split is unchanged when no file is excluded.
non_virail_candidates = df[df['virail'] != 1]['file'].unique()
already_assigned = set(virail_files) | manual_val_set
is_unassigned = np.array(
    [f not in already_assigned for f in non_virail_candidates], dtype=bool
)
non_virail_files = non_virail_candidates[is_unassigned]
n_excluded = len(non_virail_candidates) - len(non_virail_files)
if n_excluded:
    print(f"Excluded {n_excluded} files with non-virail clips that are already assigned "
          f"(they also contain virail clips or are manual validation files)")
print(f"Non-virail files available: {len(non_virail_files)}")

# Split non-virail files 80/20. train_test_split needs at least two files to
# produce a non-empty training side, so zero or one file goes to training.
if len(non_virail_files) > 1:
    train_files_non_virail, val_files_non_virail = train_test_split(
        non_virail_files, test_size=0.2, random_state=42
    )
else:
    train_files_non_virail, val_files_non_virail = list(non_virail_files), []

# Combine for final sets
val_files = list(manual_files_found) + list(val_files_non_virail)
train_files = list(remaining_virail_files) + list(train_files_non_virail)

# Guard the notebook's core requirement: no recording in both sets.
files_in_both = set(train_files) & set(val_files)
if files_in_both:
    raise ValueError(
        f"{len(files_in_both)} files were assigned to both training and validation"
    )

print(f"\nFinal split result:")
print(f"Validation files: {len(val_files)} files ({len(manual_files_found)} manual virail + {len(val_files_non_virail)} non-virail)")
print(f"Training files: {len(train_files)} files ({len(remaining_virail_files)} remaining virail + {len(train_files_non_virail)} non-virail)")
print(f"Total files: {len(val_files) + len(train_files)}")

print(f"\nValidation files breakdown:")
print(f"  Manual virail files: {len(manual_files_found)}")
print(f"  Auto-selected non-virail files: {len(val_files_non_virail)}")
print(f"\nTraining files breakdown:")
print(f"  Remaining virail files: {len(remaining_virail_files)}")
print(f"  Auto-selected non-virail files: {len(train_files_non_virail)}")

In [None]:
# Create separate training and validation DataFrames based on file splits.
# .copy() makes both frames independent of df, so later column assignments do
# not trigger pandas' SettingWithCopyWarning or depend on view/copy semantics.
train_df = df[df['file'].isin(train_files)].copy()
val_df = df[df['file'].isin(val_files)].copy()

print(f"Training set: {len(train_df)} clips from {len(train_files)} files")
print(f"Validation set: {len(val_df)} clips from {len(val_files)} files")
print(f"Total clips: {len(train_df) + len(val_df)}")

# File-level splitting only prevents leakage if no file is in both lists, so
# report any overlap loudly instead of letting it pass unnoticed.
shared_files = set(train_files) & set(val_files)
if shared_files:
    print(f"⚠ WARNING: {len(shared_files)} files appear in BOTH the training and validation file lists")

# Check class distribution in each set
if 'virail' in df.columns:
    print(f"\nTraining set class distribution:")
    print(train_df['virail'].value_counts())
    print(f"\nValidation set class distribution:")
    print(val_df['virail'].value_counts())

In [None]:
# Create slimmed training set - keep every virail row and randomly select at
# most MAX_TRAIN_NON_VIRAIL non-virail rows (seeded, so the sample is repeatable).
MAX_TRAIN_NON_VIRAIL = 5000
print("Creating slimmed training set...")

# Separate virail and non-virail rows in training set
train_virail = train_df[train_df['virail'] == 1]
train_non_virail = train_df[train_df['virail'] != 1]

print(f"Original training set:")
print(f"  Virail rows: {len(train_virail)}")
print(f"  Non-virail rows: {len(train_non_virail)}")

# Randomly sample the non-virail rows down to the cap
if len(train_non_virail) > MAX_TRAIN_NON_VIRAIL:
    train_non_virail_slim = train_non_virail.sample(n=MAX_TRAIN_NON_VIRAIL, random_state=42)
    print(f"\nSlimmed non-virail rows: {len(train_non_virail_slim)} (sampled from {len(train_non_virail)})")
else:
    train_non_virail_slim = train_non_virail
    print(f"\nUsing all {len(train_non_virail_slim)} non-virail rows (less than {MAX_TRAIN_NON_VIRAIL:,} available)")

# Combine virail rows with slimmed non-virail rows.
# ignore_index=True replaces the original row index with a fresh 0..n-1 index.
train_df_slim = pd.concat([train_virail, train_non_virail_slim], ignore_index=True)

print(f"\nSlimmed training set:")
print(f"  Total rows: {len(train_df_slim)}")
print(f"  Virail rows: {len(train_df_slim[train_df_slim['virail'] == 1])}")
print(f"  Non-virail rows: {len(train_df_slim[train_df_slim['virail'] != 1])}")

print(f"\nClass distribution in slimmed training set:")
print(train_df_slim['virail'].value_counts())

# Report how many 1s from each column are in the slim training set.
# `== 1` already matches 1.0 (they compare equal), so one comparison suffices.
print(f"\nDetailed breakdown of 1s and 1.0s in slimmed training set:")
for col in train_df_slim.columns:
    try:
        ones_count = (train_df_slim[col] == 1).sum()
    except Exception:
        # Best-effort report: a column that cannot be compared with 1 is noted
        # and skipped. `Exception` (not a bare except) keeps Ctrl-C working.
        print(f"  {col}: Cannot check for 1s/1.0s (non-numeric column)")
    else:
        print(f"  {col}: {ones_count} rows with value 1 or 1.0")

# Update train_df to use the slimmed version
train_df = train_df_slim

In [None]:
# Display the first rows of the current training frame as the cell output
train_df.head()

In [None]:
# Report the dtype of every column, then inspect the block of columns at
# positions 3-34, which this notebook treats as the target/label columns.
print("=== DATA TYPE ANALYSIS ===")
print("Column data types in training set:")
for position, (column_name, column_dtype) in enumerate(zip(train_df.columns, train_df.dtypes)):
    print(f"  {position}: {column_name} -> {column_dtype}")

print("\n=== TARGET COLUMNS (3-34) ANALYSIS ===")
target_cols = train_df.columns[3:35]  # slice end is exclusive, so this is positions 3..34
print(f"Target columns: {target_cols.tolist()}")

# Collect each target column's dtype and see whether they all agree
dtype_by_column = {col: train_df[col].dtype for col in target_cols}
target_dtypes = list(dtype_by_column.values())
unique_dtypes = set(target_dtypes)
print(f"\nUnique data types in target columns: {unique_dtypes}")

if len(unique_dtypes) != 1:
    print("⚠ Target columns have different data types:")
    for col, col_dtype in dtype_by_column.items():
        print(f"  {col}: {col_dtype}")
else:
    print(f"✓ All target columns have the same data type: {next(iter(unique_dtypes))}")

# Peek at the distinct values of the first five target columns (first 10 values each)
print("\n=== SAMPLE TARGET COLUMN VALUES ===")
for col in target_cols[:5]:
    distinct_values = train_df[col].unique()
    print(f"{col}: {distinct_values[:10]}...")

In [None]:
# Convert the target columns (positions 3-34) to int64 in both train and validation frames
def _cast_columns_to_int64(frame, columns):
    """Cast each listed column that exists in ``frame`` to int64 in place, printing the dtype change."""
    for name in columns:
        if name not in frame.columns:
            continue
        dtype_before = frame[name].dtype
        frame[name] = frame[name].astype('int64')
        print(f"  {name}: {dtype_before} -> {frame[name].dtype}")


print("=== CONVERTING TARGET COLUMNS TO INT64 ===")
target_cols = train_df.columns[3:35]  # positions 3..34 (slice end is exclusive)

print("Converting target columns to int64...")
_cast_columns_to_int64(train_df, target_cols)

# Same conversion for the validation frame; work on an explicit copy so the
# column assignment does not raise SettingWithCopyWarning.
print("\nConverting validation set target columns to int64...")
val_df = val_df.copy()
_cast_columns_to_int64(val_df, target_cols)

# Verify that the training target columns now share a single int64 dtype
print("\n=== VERIFICATION ===")
target_dtypes = [train_df[col].dtype for col in target_cols]
unique_dtypes = set(target_dtypes)
print(f"Unique data types in target columns after conversion: {unique_dtypes}")

all_int64 = len(unique_dtypes) == 1 and 'int64' in str(next(iter(unique_dtypes)))
if all_int64:
    print("✓ All target columns successfully converted to int64")
else:
    print("⚠ Some columns may not have converted properly")

In [None]:
# Show the training frame's shape and per-column dtypes after the conversion
print(
    "=== TRAINING DATAFRAME WITH UPDATED DATA TYPES ===",
    f"Training DataFrame shape: {train_df.shape}",
    "Training DataFrame dtypes:",
    train_df.dtypes,
    sep="\n",
)


In [None]:
# Slim the validation set: keep every virail row and cap the non-virail rows
# at 1,000 via a seeded random sample.
print("=== SLIMMING VALIDATION SET ===")
val_virail = val_df.loc[val_df['virail'] == 1]
val_non_virail = val_df.loc[val_df['virail'] != 1]

print("Original validation set:")
print(f"  Virail rows: {len(val_virail)}")
print(f"  Non-virail rows: {len(val_non_virail)}")

# Only sample when there are more than 1,000 non-virail rows; otherwise keep them all
if len(val_non_virail) <= 1000:
    val_non_virail_slim = val_non_virail
    print(f"\nUsing all {len(val_non_virail_slim)} non-virail rows (less than 1,000 available)")
else:
    val_non_virail_slim = val_non_virail.sample(n=1000, random_state=42)
    print(f"\nSlimmed validation non-virail rows: {len(val_non_virail_slim)} (sampled from {len(val_non_virail)})")

# Stack positives on top of the (possibly sampled) negatives with a fresh 0..n-1 index
val_df_slim = pd.concat([val_virail, val_non_virail_slim], ignore_index=True)

slim_positive_rows = val_df_slim[val_df_slim['virail'] == 1]
slim_negative_rows = val_df_slim[val_df_slim['virail'] != 1]
print("\nSlimmed validation set:")
print(f"  Total rows: {len(val_df_slim)}")
print(f"  Virail rows: {len(slim_positive_rows)}")
print(f"  Non-virail rows: {len(slim_negative_rows)}")

print("\nClass distribution in slimmed validation set:")
print(val_df_slim['virail'].value_counts())

# From here on val_df refers to the slimmed validation set
val_df = val_df_slim

# Create new DataFrames with only the essential columns (clip location + the
# single 'virail' label). .copy() makes them independent of train_df / val_df.
essential_cols = ['file', 'start_time', 'end_time', 'virail']
train_df_essential = train_df[essential_cols].copy()
val_df_essential = val_df[essential_cols].copy()

# Make the label dtype identical in both sets. Coercing only the validation set
# would leave the two saved frames disagreeing whenever the training label is
# not already int64, so both are brought to int64 here.
print("\n=== FIXING VALIDATION SET DATA TYPES ===")
print("Validation set data types before fix:")
print(val_df_essential.dtypes)

# Convert virail to int64 in validation set if needed
if val_df_essential['virail'].dtype != 'int64':
    val_df_essential['virail'] = val_df_essential['virail'].astype('int64')
    print("✓ Converted virail column to int64")

# Same coercion for the training set so the two label columns always match
if train_df_essential['virail'].dtype != 'int64':
    train_df_essential['virail'] = train_df_essential['virail'].astype('int64')
    print("✓ Converted training virail column to int64")

print("\nValidation set data types after fix:")
print(val_df_essential.dtypes)

print("\n=== ESSENTIAL COLUMNS DATAFRAMES ===")
print(f"Training set essential columns: {essential_cols}")
print(f"Training DataFrame shape: {train_df_essential.shape}")
print(f"Training DataFrame dtypes:")
print(train_df_essential.dtypes)

print(f"\nTraining DataFrame head:")
print(train_df_essential.head())

print(f"\nValidation DataFrame head:")
print(val_df_essential.head())

print(f"\nClass distribution in essential training set:")
print(train_df_essential['virail'].value_counts())

print(f"\nClass distribution in essential validation set:")
print(val_df_essential['virail'].value_counts())

In [None]:
# Persist the full and essential train/validation frames as date-tagged pickles
# and print a summary of what was written.
import os
from datetime import datetime


def _pickle_and_report(frame, directory, filename, label):
    """Pickle ``frame`` to directory/filename, print the confirmation line, and return the path."""
    destination = os.path.join(directory, filename)
    frame.to_pickle(destination)
    print(f"Saved {label} DataFrame to: {destination}")
    return destination


# Make sure the output directory exists
output_dir = "/home/brg226/projects/vira_beg/training_data/second_pass_nov25"
os.makedirs(output_dir, exist_ok=True)

# Fixed date tag embedded in every filename (hard-coded, not read from the clock)
date_str = "nov25_2025"

# Full frames: every column
print("=== SAVING FULL DATAFRAMES ===")
train_full_output_path = _pickle_and_report(
    train_df, output_dir, f"train_df_full_{date_str}.pkl", "full training"
)
print(f"Full training set shape: {train_df.shape}")
print(f"Full training set columns: {len(train_df.columns)}")

val_full_output_path = _pickle_and_report(
    val_df, output_dir, f"val_df_full_{date_str}.pkl", "full validation"
)
print(f"Full validation set shape: {val_df.shape}")
print(f"Full validation set columns: {len(val_df.columns)}")

# Essential frames: the four essential columns only
print("\n=== SAVING ESSENTIAL DATAFRAMES ===")
train_output_path = _pickle_and_report(
    train_df_essential, output_dir, f"train_df_essential_{date_str}.pkl", "essential training"
)
print(f"Essential training set shape: {train_df_essential.shape}")

val_output_path = _pickle_and_report(
    val_df_essential, output_dir, f"val_df_essential_{date_str}.pkl", "essential validation"
)
print(f"Essential validation set shape: {val_df_essential.shape}")

# Recap of the written files
print("\n=== SUMMARY ===")
print(f"All DataFrames saved to: {output_dir}")
for heading, variant in (("Full", "full"), ("Essential", "essential")):
    print(f"\n{heading} datasets:")
    for split_name in ("train", "val"):
        print(f"  - {split_name}_df_{variant}_{date_str}.pkl")

# Clip counts per class in the essential frames
print("\nFinal dataset summary:")
for split_title, essential_frame in (
    ("Training", train_df_essential),
    ("Validation", val_df_essential),
):
    virail_labels = essential_frame['virail']
    print(f"{split_title} set: {len(essential_frame)} clips")
    print(f"  - Virail=1: {(virail_labels == 1).sum()}")
    print(f"  - Virail=0: {(virail_labels == 0).sum()}")

print(f"\nEssential columns: {train_df_essential.columns.tolist()}")
print(f"Essential data types: {train_df_essential.dtypes.to_dict()}")
print(f"Full dataset has {len(train_df.columns)} total columns")