In [2]:
pip install pytorch-lightning transformers wandb scikit-learn pandas numpy torch h5py --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import warnings
from typing import List, Tuple

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer

def _write_chunks(df: pd.DataFrame, chunk_size: int, output_dir: str, prefix: str) -> List[str]:
    """Write `df` as consecutive CSV files of at most `chunk_size` rows and return their paths in order."""
    paths = []
    for chunk_idx, start in enumerate(range(0, len(df), chunk_size)):
        path = f'{output_dir}/{prefix}_chunk_{chunk_idx}.csv'
        df.iloc[start:start + chunk_size].to_csv(path, index=False)
        paths.append(path)
    return paths


def split_and_save_data(train_df: pd.DataFrame, 
                       val_df: pd.DataFrame,
                       train_chunk_size: int = 500000,
                       val_chunk_size: int = 100000,
                       output_dir: str = 'data_chunks') -> Tuple[List[str], List[str]]:
    """
    Shuffle the training and validation frames and save each as a series of CSV chunks.

    Args:
        train_df: Full training data.
        val_df: Full validation data.
        train_chunk_size: Maximum number of rows per training chunk (must be > 0).
        val_chunk_size: Maximum number of rows per validation chunk (must be > 0).
        output_dir: Directory the chunk files are written to; created if missing.

    Returns:
        ``(train_chunks, val_chunks)``: the chunk file paths, in write order, named
        ``{output_dir}/train_chunk_{k}.csv`` and ``{output_dir}/val_chunk_{k}.csv``.
        An empty frame produces an empty list.

    Raises:
        ValueError: If either chunk size is not positive. A negative size would
            otherwise silently produce no chunks at all.
    """
    # Validate before touching the filesystem so bad arguments leave no side effects.
    for arg_name, size in (('train_chunk_size', train_chunk_size),
                           ('val_chunk_size', val_chunk_size)):
        if size <= 0:
            raise ValueError(f'{arg_name} must be a positive integer, got {size!r}')

    os.makedirs(output_dir, exist_ok=True)

    # Fixed seed keeps the chunk contents reproducible across runs.
    train_df = shuffle(train_df, random_state=42)
    val_df = shuffle(val_df, random_state=42)

    train_chunks = _write_chunks(train_df, train_chunk_size, output_dir, 'train')
    val_chunks = _write_chunks(val_df, val_chunk_size, output_dir, 'val')

    return train_chunks, val_chunks

def train_model_incrementally(train_chunks: List[str],
                            val_chunks: List[str],
                            model_dir: str = 'models',
                            batch_size: int = 32,
                            epochs_per_chunk: int = 10,
                            num_workers: int = 4):
    """
    Train one model sequentially on paired (train, validation) CSV chunks.

    The model and the label encoder are created from the first training chunk
    and then reused for every following chunk, so training continues from the
    weights reached on the previous chunk.

    Args:
        train_chunks: Paths of the training chunk CSV files.
        val_chunks: Paths of the validation chunk CSV files; paired with
            ``train_chunks`` by position.
        model_dir: Directory for per-chunk checkpoints and saved state dicts.
        batch_size: Batch size for both dataloaders.
        epochs_per_chunk: Maximum number of epochs to train on each chunk.
        num_workers: Worker processes for both dataloaders.

    Returns:
        ``(model, label_encoder)`` after the last chunk. Both are ``None`` when
        no chunk pair was available.
    """
    # zip() below stops at the shorter list; make the dropped chunks visible
    # instead of silently training/validating on less data than intended.
    if len(train_chunks) != len(val_chunks):
        warnings.warn(
            f"Got {len(train_chunks)} training chunks but {len(val_chunks)} validation "
            f"chunks; only the first {min(len(train_chunks), len(val_chunks))} pairs will be used."
        )
    num_pairs = min(len(train_chunks), len(val_chunks))

    os.makedirs(model_dir, exist_ok=True)
    tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5')
    
    model = None
    label_encoder = None
    
    for chunk_idx, (train_chunk, val_chunk) in enumerate(zip(train_chunks, val_chunks)):
        print(f"\nTraining on chunk {chunk_idx + 1}/{num_pairs}")
        
        # Load data chunks
        train_df = pd.read_csv(train_chunk)
        val_df = pd.read_csv(val_chunk)
        
        # On the first chunk label_encoder is None, so the dataset fits a new
        # encoder; afterwards the same encoder is passed back in and reused.
        train_dataset = AccidentDataset(train_df, tokenizer, label_encoder=label_encoder)
        label_encoder = train_dataset.label_encoder

        # Validation always shares the training encoder (including on the first
        # chunk) so label ids mean the same thing in both datasets.
        val_embeddings = train_dataset.get_text_embeddings(val_df['Description'].tolist())
        val_dataset = AccidentDataset(val_df, tokenizer,
                                      text_embeddings=val_embeddings,
                                      label_encoder=label_encoder)

        if model is None:  # First chunk: size the model from the data
            model = AccidentPredictor(
                tabular_dim=train_dataset.tabular_features.shape[1],
                embedding_dim=train_dataset.text_embeddings.shape[1],
                num_classes=len(train_dataset.label_encoder.classes_)
            )
        
        # Create dataloaders
        train_loader = DataLoader(train_dataset, 
                                batch_size=batch_size, 
                                shuffle=True, 
                                num_workers=num_workers,
                                pin_memory=True)
        val_loader = DataLoader(val_dataset, 
                              batch_size=batch_size, 
                              shuffle=False, 
                              num_workers=num_workers,
                              pin_memory=True)
        
        # Initialize wandb logger with unique name for each chunk
        wandb_logger = WandbLogger(project='accident-severity-prediction',
                                 name=f'hybrid-model-chunk-{chunk_idx}')
        
        # A fresh trainer per chunk resets the epoch counter and early-stopping state.
        trainer = pl.Trainer(
            max_epochs=epochs_per_chunk,
            logger=wandb_logger,
            callbacks=[
                ModelCheckpoint(
                    dirpath=f'{model_dir}/chunk_{chunk_idx}',
                    filename='accident-predictor-{epoch:02d}-{val_loss:.2f}',
                    # Without `monitor`, save_top_k=1 keeps only the last epoch
                    # and `mode` is ignored; monitor the same metric as EarlyStopping.
                    monitor='val_loss',
                    save_top_k=1,
                    mode='min'
                ),
                EarlyStopping(monitor='val_loss', patience=3, mode='min'),
                LitProgressBar()
            ],
            accelerator='auto',
            devices=1,
            log_every_n_steps=10
        )
        
        # Train model
        trainer.fit(model, train_loader, val_loader)

        # Close this chunk's wandb run; otherwise the next WandbLogger reuses
        # the still-active run and the per-chunk run name has no effect.
        wandb_logger.experiment.finish()
        
        # Save model after each chunk
        torch.save({
            'model_state_dict': model.state_dict(),
            'label_encoder': label_encoder
        }, f'{model_dir}/model_after_chunk_{chunk_idx}.pt')
        
        # Clear memory
        del train_dataset, val_dataset, train_loader, val_loader
        torch.cuda.empty_cache()
    
    return model, label_encoder

# Modified AccidentDataset class
class AccidentDataset(Dataset):
    """
    Dataset holding tabular features, text embeddings and (outside test mode) encoded labels.

    NOTE(review): `process_tabular_features` and `get_text_embeddings` are called
    below but are not defined in this cell, and neither are `__len__` /
    `__getitem__`. They are presumably defined in an earlier version of this
    class; confirm they are merged in before running on a fresh kernel.

    Args:
        df: Source rows. Must contain a 'Description' column when
            `text_embeddings` is not supplied, and a 'Severity' column unless
            `is_test` is True.
        tokenizer: Tokenizer stored on the instance for the embedding step.
        text_embeddings: Precomputed embeddings; when given, they are stored
            as-is and no embeddings are computed.
        is_test: When True, no labels are read or encoded.
        label_encoder: Already-fitted encoder to reuse. When None (and not in
            test mode) a new `LabelEncoder` is fitted on `df['Severity']`.
    """

    def __init__(self, df: pd.DataFrame, tokenizer, text_embeddings=None, 
                 is_test: bool = False, label_encoder=None):
        self.df = df
        self.tokenizer = tokenizer
        self.is_test = is_test

        # Always define the label attributes so that test-mode instances expose
        # `label_encoder` / `labels` instead of raising AttributeError on access.
        self.label_encoder = label_encoder
        self.labels = None
        
        # Process tabular features
        print("Processing tabular features...")
        self.process_tabular_features()
        
        # Store or compute text embeddings
        if text_embeddings is not None:
            self.text_embeddings = text_embeddings
        else:
            print("Computing text embeddings...")
            self.text_embeddings = self.get_text_embeddings(df['Description'].tolist())
        
        if not is_test:
            if label_encoder is None:
                # No encoder supplied: fit a fresh one on this frame's labels.
                self.label_encoder = LabelEncoder()
                self.labels = self.label_encoder.fit_transform(df['Severity'])
            else:
                # Reuse the supplied encoder so label ids stay consistent across datasets.
                self.label_encoder = label_encoder
                self.labels = self.label_encoder.transform(df['Severity'])


# Pipeline configuration: the single place to tune inputs, chunking and training.
TRAIN_CSV = 'train.csv'      # shape noted by the author: (5796296, 34)
VAL_CSV = 'val.csv'          # shape noted by the author: (1159259, 34)
CHUNK_DIR = 'data_chunks'    # where the CSV chunks are written
MODEL_DIR = 'models'         # shared by per-chunk checkpoints and the final model
TRAIN_CHUNK_SIZE = 500000
VAL_CHUNK_SIZE = 100000
BATCH_SIZE = 32
EPOCHS_PER_CHUNK = 10

# Load your full data
train_df = pd.read_csv(TRAIN_CSV)
val_df = pd.read_csv(VAL_CSV)

# Split data into chunks and save
train_chunks, val_chunks = split_and_save_data(
    train_df, 
    val_df,
    train_chunk_size=TRAIN_CHUNK_SIZE,
    val_chunk_size=VAL_CHUNK_SIZE,
    output_dir=CHUNK_DIR
)

# Train model incrementally
model, label_encoder = train_model_incrementally(
    train_chunks,
    val_chunks,
    model_dir=MODEL_DIR,
    batch_size=BATCH_SIZE,
    epochs_per_chunk=EPOCHS_PER_CHUNK
)

# Save final model next to the per-chunk artifacts; the path is derived from
# MODEL_DIR so it cannot drift from the directory the training step created.
torch.save({
    'model_state_dict': model.state_dict(),
    'label_encoder': label_encoder
}, f'{MODEL_DIR}/final_model.pt')


In [3]:
import torch
from transformers import AutoTokenizer, AutoModel

# Data locations: every input/output path below is derived from DATA_DIR, so
# moving the dataset only requires changing this one line.
DATA_DIR = '/teamspace/studios/this_studio/Assignment-TechstaX/data'
TRAIN_DATA_CSV = f'{DATA_DIR}/imputed_dataset.csv'
VAL_DATA_CSV = f'{DATA_DIR}/val_data.csv'
TRAIN_EMBEDDINGS_H5 = f'{DATA_DIR}/train_embeddings.h5'
VAL_EMBEDDINGS_H5 = f'{DATA_DIR}/val_embeddings.h5'

# Adjust both based on your GPU memory
EMBED_CHUNK_SIZE = 10000
EMBED_BATCH_SIZE = 1000

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5')

# Process embeddings for train and validation sets (train first, then validation)
for csv_path, h5_path in (
    (TRAIN_DATA_CSV, TRAIN_EMBEDDINGS_H5),
    (VAL_DATA_CSV, VAL_EMBEDDINGS_H5),
):
    process_text_embeddings(
        csv_path,
        h5_path,
        tokenizer,
        chunk_size=EMBED_CHUNK_SIZE,
        batch_size=EMBED_BATCH_SIZE
    )

# Train model
model = train_model(TRAIN_DATA_CSV, VAL_DATA_CSV,
                    TRAIN_EMBEDDINGS_H5, VAL_EMBEDDINGS_H5)


Calculating total rows...
Creating HDF5 file with shape (5800000, 384)


Processing text embeddings: 0it [00:00, ?it/s]

KeyboardInterrupt: 