In [2]:
pip install pytorch-lightning transformers wandb scikit-learn pandas numpy torch h5py --quiet

Note: you may need to restart the kernel to use updated packages.


In [11]:
import pandas as pd
import numpy as np
import os
from sklearn.utils import shuffle
from typing import List, Tuple
import torch
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl
import pandas as pd
import numpy as np
import torch
from torch import nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, TQDMProgressBar
from pytorch_lightning.loggers import WandbLogger
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.utils.data import Dataset, DataLoader
import wandb
from sklearn.metrics import classification_report, accuracy_score, f1_score
from typing import Dict, Tuple
from tqdm.auto import tqdm

class LitProgressBar(TQDMProgressBar):
    def init_validation_tqdm(self):
        bar = super().init_validation_tqdm()
        bar.set_description('Validation')
        return bar
    
    def init_train_tqdm(self):
        bar = super().init_train_tqdm()
        bar.set_description('Training')
        return bar
    
    
class AccidentPredictor(pl.LightningModule):
    def __init__(self, 
                 tabular_dim: int,
                 embedding_dim: int,
                 num_classes: int,
                 learning_rate: float = 1e-3):
        super().__init__()
        self.save_hyperparameters()
        
        # Network architecture
        self.tabular_network = nn.Sequential(
            nn.Linear(tabular_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        self.text_network = nn.Sequential(
            nn.Linear(embedding_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        self.classifier = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, num_classes)
        )
        
        self.criterion = nn.CrossEntropyLoss()
        
    def forward(self, tabular_features, text_embedding):
        tabular_features = self.tabular_network(tabular_features)
        text_features = self.text_network(text_embedding)
        combined = torch.cat([tabular_features, text_features], dim=1)
        return self.classifier(combined)
    
    def training_step(self, batch, batch_idx):
        tabular_features, text_embedding, labels = batch
        logits = self(tabular_features, text_embedding)
        loss = self.criterion(logits, labels)
        
        preds = torch.argmax(logits, dim=1)
        acc = accuracy_score(labels.cpu(), preds.cpu())
        f1 = f1_score(labels.cpu(), preds.cpu(), average='weighted')
        
        self.log('train_loss', loss, prog_bar=True)
        self.log('train_acc', acc, prog_bar=True)
        self.log('train_f1', f1, prog_bar=True)
        
        return loss
    
    def validation_step(self, batch, batch_idx):
        tabular_features, text_embedding, labels = batch
        logits = self(tabular_features, text_embedding)
        loss = self.criterion(logits, labels)
        
        preds = torch.argmax(logits, dim=1)
        acc = accuracy_score(labels.cpu(), preds.cpu())
        f1 = f1_score(labels.cpu(), preds.cpu(), average='weighted')
        
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        self.log('val_f1', f1, prog_bar=True)
        
        return loss
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), 
                                   lr=self.hparams.learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.1, patience=5, verbose=True
        )
        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler,
            'monitor': 'val_loss'
        }
        
def split_and_save_data(train_df: pd.DataFrame, 
                       val_df: pd.DataFrame,
                       train_chunk_size: int = 500000,
                       val_chunk_size: int = 100000,
                       output_dir: str = 'data_chunks') -> Tuple[List[str], List[str]]:
    """
    Split and save training and validation data into smaller chunks
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Shuffle the dataframes
    train_df = shuffle(train_df, random_state=42)
    val_df = shuffle(val_df, random_state=42)
    
    # Split and save training data
    train_chunks = []
    for i in range(0, len(train_df), train_chunk_size):
        chunk = train_df.iloc[i:i + train_chunk_size]
        filename = f'{output_dir}/train_chunk_{i//train_chunk_size}.csv'
        chunk.to_csv(filename, index=False)
        train_chunks.append(filename)
    
    # Split and save validation data
    val_chunks = []
    for i in range(0, len(val_df), val_chunk_size):
        chunk = val_df.iloc[i:i + val_chunk_size]
        filename = f'{output_dir}/val_chunk_{i//val_chunk_size}.csv'
        chunk.to_csv(filename, index=False)
        val_chunks.append(filename)
    
    return train_chunks, val_chunks

def train_model_incrementally(train_chunks: List[str],
                            val_chunks: List[str],
                            model_dir: str = 'models',
                            batch_size: int = 32,
                            epochs_per_chunk: int = 10):
    """
    Train the model incrementally on data chunks
    """
    os.makedirs(model_dir, exist_ok=True)
    tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5')
    
    model = None
    label_encoder = None
    
    for chunk_idx, (train_chunk, val_chunk) in enumerate(zip(train_chunks, val_chunks)):
        print(f"\nTraining on chunk {chunk_idx + 1}/{len(train_chunks)}")
        
        # Load data chunks
        train_df = pd.read_csv(train_chunk)
        val_df = pd.read_csv(val_chunk)
        
        # Create datasets
        if model is None:  # First chunk
            train_dataset = AccidentDataset(train_df, tokenizer)
            val_dataset = AccidentDataset(val_df, tokenizer,
                                        text_embeddings=train_dataset.get_text_embeddings(val_df['Description'].tolist()))
            
            # Initialize model
            model = AccidentPredictor(
                tabular_dim=train_dataset.tabular_features.shape[1],
                embedding_dim=train_dataset.text_embeddings.shape[1],
                num_classes=len(train_dataset.label_encoder.classes_)
            )
            label_encoder = train_dataset.label_encoder
        else:  # Subsequent chunks
            train_dataset = AccidentDataset(train_df, tokenizer, label_encoder=label_encoder)
            val_dataset = AccidentDataset(val_df, tokenizer,
                                        text_embeddings=train_dataset.get_text_embeddings(val_df['Description'].tolist()),
                                        label_encoder=label_encoder)
        
        # Create dataloaders
        train_loader = DataLoader(train_dataset, 
                                batch_size=batch_size, 
                                shuffle=True, 
                                num_workers=4,
                                pin_memory=True)
        val_loader = DataLoader(val_dataset, 
                              batch_size=batch_size, 
                              shuffle=False, 
                              num_workers=4,
                              pin_memory=True)
        
        # Initialize wandb logger with unique name for each chunk
        wandb_logger = WandbLogger(project='accident-severity-prediction',
                                 name=f'hybrid-model-chunk-{chunk_idx}')
        
        # Initialize trainer
        trainer = pl.Trainer(
            max_epochs=epochs_per_chunk,
            logger=wandb_logger,
            callbacks=[
                ModelCheckpoint(
                    dirpath=f'{model_dir}/chunk_{chunk_idx}',
                    filename='accident-predictor-{epoch:02d}-{val_loss:.2f}',
                    save_top_k=1,
                    mode='min'
                ),
                EarlyStopping(monitor='val_loss', patience=5, mode='min'),
                LitProgressBar()
            ],
            accelerator='auto',
            devices=1,
            log_every_n_steps=10
        )
        
        # Train model
        trainer.fit(model, train_loader, val_loader)
        
        # Save model after each chunk
        torch.save({
            'model_state_dict': model.state_dict(),
            'label_encoder': label_encoder
        }, f'{model_dir}/model_after_chunk_{chunk_idx}.pt')
        
        # Clear memory
        del train_dataset, val_dataset, train_loader, val_loader
        torch.cuda.empty_cache()
    
    return model, label_encoder

# Modified AccidentDataset class
class AccidentDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer, text_embeddings=None, 
                 is_test: bool = False, label_encoder=None):
        self.df = df
        self.tokenizer = tokenizer
        self.is_test = is_test
        
        # Process tabular features
        print("Processing tabular features...")
        self.process_tabular_features()
        
        # Store or compute text embeddings
        if text_embeddings is not None:
            self.text_embeddings = text_embeddings
        else:
            print("Computing text embeddings...")
            self.text_embeddings = self.get_text_embeddings(df['Description'].tolist())
        
        if not is_test:
            if label_encoder is None:
                self.label_encoder = LabelEncoder()
                self.labels = self.label_encoder.fit_transform(df['Severity'])
            else:
                self.label_encoder = label_encoder
                self.labels = self.label_encoder.transform(df['Severity'])
    
    def process_tabular_features(self):
        # Select all columns except 'Severity' and 'Description'
        tabular_cols = [col for col in self.df.columns 
                       if col not in ['Severity', 'Description']]
        
        # Convert boolean columns to int
        bool_cols = self.df[tabular_cols].select_dtypes(include=['bool']).columns
        for col in bool_cols:
            self.df[col] = self.df[col].astype(int)
        
        # Convert categorical columns to numeric using label encoding
        cat_cols = self.df[tabular_cols].select_dtypes(include=['object']).columns
        self.label_encoders = {}
        for col in tqdm(cat_cols, desc="Encoding categorical columns"):
            self.label_encoders[col] = LabelEncoder()
            self.df[col] = self.label_encoders[col].fit_transform(self.df[col])
        
        # Scale numeric features
        self.scaler = StandardScaler()
        self.tabular_features = self.scaler.fit_transform(self.df[tabular_cols])
    
    @staticmethod
    def get_batch_embeddings(texts: list, tokenizer, model, device='cuda', batch_size=32) -> np.ndarray:
        embeddings = []
        
        for i in tqdm(range(0, len(texts), batch_size), desc="Computing embeddings"):
            batch_texts = texts[i:i + batch_size]
            encoded_input = tokenizer(batch_texts, 
                                    padding=True, 
                                    truncation=True, 
                                    max_length=512,
                                    return_tensors='pt')
            
            # Move input to device
            encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
            
            with torch.no_grad():
                model_output = model(**encoded_input)
                batch_embeddings = model_output[0][:, 0]  # CLS token embedding
                batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)
                embeddings.append(batch_embeddings.cpu().numpy())
        
        return np.vstack(embeddings)
    
    def get_text_embeddings(self, texts: list) -> np.ndarray:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5')
        model = model.to(device)
        model.eval()
        
        return self.get_batch_embeddings(texts, self.tokenizer, model, device)
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        tabular_features = torch.FloatTensor(self.tabular_features[idx])
        text_embedding = torch.FloatTensor(self.text_embeddings[idx])
        
        if self.is_test:
            return tabular_features, text_embedding
        
        label = torch.LongTensor([self.labels[idx]])[0]
        return tabular_features, text_embedding, label



In [12]:

# Load your full data
# train_df = pd.read_csv('/teamspace/studios/this_studio/Assignment-TechstaX/data/imputed_dataset.csv')  # (5796296, 34)
# val_df = pd.read_csv('/teamspace/studios/this_studio/Assignment-TechstaX/data/val_data.csv')      # (1159259, 34)

# # Split data into chunks and save
# train_chunks, val_chunks = split_and_save_data(
#     train_df, 
#     val_df,
#     train_chunk_size=500000,
#     val_chunk_size=100000,
#     output_dir='/teamspace/studios/this_studio/Assignment-TechstaX/data/data_chunks'
# )

train_chunks = ['/teamspace/studios/this_studio/Assignment-TechstaX/data/data_chunks/train_chunk_0.csv']
val_chunks = ['/teamspace/studios/this_studio/Assignment-TechstaX/data/data_chunks/val_chunk_0.csv']
# Train model incrementally
model, label_encoder = train_model_incrementally(
    train_chunks,
    val_chunks,
    model_dir='/teamspace/studios/this_studio/Assignment-TechstaX/models',
    batch_size=32,
    epochs_per_chunk=10
)

# Save final model
torch.save({
    'model_state_dict': model.state_dict(),
    'label_encoder': label_encoder
}, '/teamspace/studios/this_studio/Assignment-TechstaX/models/final_model.pt')



Training on chunk 1/1


Processing tabular features...


Encoding categorical columns:   0%|          | 0/7 [00:00<?, ?it/s]

Computing text embeddings...


Computing embeddings:   0%|          | 0/15625 [00:00<?, ?it/s]

Computing embeddings:   0%|          | 0/3125 [00:00<?, ?it/s]

Processing tabular features...


Encoding categorical columns:   0%|          | 0/7 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: Currently logged in as: [33manindyamitra2018[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type             | Params | Mode 
-------------------------------------------------------------
0 | tabular_network | Sequential       | 12.5 K | train
1 | text_network    | Sequential       | 57.5 K | train
2 | classifier      | Sequential       | 8.5 K  | train
3 | criterion       | CrossEntropyLoss | 0      | train
-------------------------------------------------------------
78.5 K    Trainable params
0         Non-trainable params
78.5 K    Total params
0.314     Total estimated model params size (MB)
20        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Training: |          | 0/? [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling p

Validation: |          | 0/? [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation: |          | 0/? [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation: |          | 0/? [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation: |          | 0/? [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation: |          | 0/? [00:00<?, ?it/s]

In [None]:
import os

os.listdir("/teamspace/studios/this_studio/Assignment-TechstaX/data/data_chunks")