In [1]:
import os
import torch
import logging
from config import Config  # Configurations (learning rate, batch size, etc.)
from training.train import train_model  # Core training function
from data_processing.data_loading import prepare_data  # DataLoader setup
from models.model import FusionNet  # The TCCT-Net model combining two streams
from utils.logger import setup_logging  # Logger setup
from utils.seed import set_seed  # Setting random seeds for reproducibility
from sklearn.model_selection import train_test_split

# Instantiate the project's Config object.
# Later cells read settings from it as attributes (e.g. config.random_seed,
# config.logs_dir, config.checkpoint_dir) and pass the whole object to
# prepare_data / train_model / FusionNet.
config = Config()



In [2]:
# Select the compute device: use CUDA only when it is actually available and
# fall back to the CPU otherwise. Constructing torch.device('cuda') never fails
# by itself, so hard-coding it only surfaces as an error later, when the first
# model or tensor is moved to the device on a CPU-only machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cuda


In [3]:
# import pandas as pd

# # Load the label file (Replace this path with the actual path to your Excel file)
# label_file_path = 'data/train_engagement_labels.xlsx'  # Update with the correct path
# label_df = pd.read_excel(label_file_path)

# # Check the unique values in the 'label' column
# unique_labels = label_df['label'].unique()
# print("Unique labels in the dataset:", unique_labels)

# # Create a label mapping based on the unique labels
# label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
# print("Label mapping:", label_mapping)

# # Apply the label mapping to convert textual labels to numerical values
# label_df['label'] = label_df['label'].map(label_mapping)

# # Now the 'label' column contains numeric values based on the mapping
# print(label_df.head())


In [4]:
# from torch.utils.data import DataLoader
# from data_processing import load_csv_data, preprocess_data, augment_dataset, EngagementDataset
# from data_processing.data_loading import prepare_data

# # Custom collate function for padding sequences
# def collate_fn(batch):
#     data, labels = zip(*batch)
#     sequence_lengths = [seq.shape[0] for seq in data]
#     data_padded = torch.nn.utils.rnn.pad_sequence(data, batch_first=True)
#     labels = torch.stack(labels)
#     return data_padded, labels, sequence_lengths

# def prepare_data(config):
#     # Load file paths and labels from CSVs
#     file_paths, labels, feature_columns = load_csv_data(
#         folder_path=config.data_dir,
#         label_file=config.label_file,
#         label_column=config.label_column,
#         exclude_columns=config.exclude_columns
#     )

#     # Preprocess data (note: labels are not passed to preprocess_data in this call)
#     features_list, scaler, feature_names = preprocess_data(
#         file_paths, exclude_columns=config.exclude_columns, 
#         missing_value_strategy=config.missing_value_strategy
#     )

#     # Split data into train and validation sets
#     X_train, X_val, y_train, y_val = train_test_split(
#         features_list, labels, test_size=config.test_size, 
#         stratify=labels, random_state=config.random_state
#     )

#     # Augment training data
#     X_train_augmented, y_train_augmented = augment_dataset(
#         X_train, y_train, segment_length=config.segment_length, 
#         num_augmented_samples=config.num_augmented_samples
#     )

#     # Create PyTorch Dataset and DataLoader objects
#     train_dataset = EngagementDataset(X_train_augmented, y_train_augmented, feature_names, mode='train')
#     val_dataset = EngagementDataset(X_val, y_val, feature_names, mode='val')

#     train_loader = DataLoader(
#         train_dataset, batch_size=config.batch_size, shuffle=True, 
#         num_workers=config.num_workers, collate_fn=collate_fn, pin_memory=True
#     )
#     val_loader = DataLoader(
#         val_dataset, batch_size=config.batch_size, shuffle=False, 
#         num_workers=config.num_workers, collate_fn=collate_fn, pin_memory=True
#     )

#     return train_loader, val_loader, train_dataset, val_dataset



# if __name__ == "__main__":

#     train_loader, val_loader, train_dataset, val_dataset = prepare_data(config)
#     print(f"Training data: {len(train_loader.dataset)} samples")
#     print(f"Validation data: {len(val_loader.dataset)} samples")


In [5]:
import torch
import logging
import os
from torch.utils.data import DataLoader
from tqdm import tqdm  # Progress bar
from data_processing import load_csv_data, preprocess_data, augment_dataset, EngagementDataset
from sklearn.model_selection import train_test_split
from training.train import train_model


# End-to-end run: set up logging, seed the RNGs, build the data loaders and train.
# Inside a notebook kernel __name__ is "__main__", so this guard is always taken
# here; it is kept so the cell behaves the same if exported to a script.
if __name__ == "__main__":

    # Initialize logger
    logger = setup_logging()

    # Seed before any data preparation or training happens. Previously the only
    # set_seed call in the notebook lived in a later cell, i.e. it ran after this
    # cell had already prepared the data and trained, so this run was unseeded.
    set_seed(config.random_seed)

    # Prepare data. The four returned objects are kept at notebook scope because
    # later cells reuse train_loader / val_loader.
    train_loader, val_loader, train_dataset, val_dataset = prepare_data(config, logger)
    logger.info(f"Training data: {len(train_loader.dataset)} samples")
    logger.info(f"Validation data: {len(val_loader.dataset)} samples")

    # Start training
    train_model(train_loader, val_loader, config, logger)


Collecting File Paths and Labels: 100%|██████████| 15966/15966 [00:10<00:00, 1543.26it/s]


In [None]:
# Full pipeline in a single cell: configure, seed, load data, iterate the
# training batches once as a sanity check, then train.
#
# This cell was pasted from the body of a function and kept its indentation,
# which is an IndentationError at cell top level; it is dedented here.

# Instantiate the Config object
config = Config()

# NOTE(review): the original called update_config_from_args(config, args) here.
# Neither `update_config_from_args` nor `args` is defined or imported anywhere
# in this notebook, so the call could only raise NameError. Override settings
# by assigning attributes on `config` directly instead.

# Set up logging
logger = setup_logging()

# Log device usage. The original called check_device(config), which is likewise
# not defined or imported in this notebook; report the `device` chosen in the
# device-selection cell instead.
logger.info(f"Using device: {device}")

logger.info("Preparing data with the following configuration:")
for key, value in vars(config).items():
    logger.info(f"{key}: {value}")

# Set seed for reproducibility (before data preparation and training)
set_seed(config.random_seed)

# Prepare data. The logger is passed to match the call in the cell that was
# actually executed earlier in this notebook.
train_loader, val_loader, train_dataset, val_dataset = prepare_data(config, logger)

logger.info(f"Training data: {len(train_loader.dataset)} samples")
logger.info(f"Validation data: {len(val_loader.dataset)} samples")

# Display progress in loading batches. This walks the entire training loader
# once purely to log shapes; each batch is expected to unpack into
# (data, labels, seq_lengths).
for batch_idx, (data, labels, seq_lengths) in tqdm(enumerate(train_loader), total=len(train_loader), desc="Loading Training Data"):
    logger.info(f"Batch {batch_idx + 1}: Data shape: {data.shape}, Labels shape: {labels.shape}")

# Run training, logging any failure with its traceback.
try:
    logger.info("Starting training process...")
    # NOTE(review): argument list aligned with the earlier executed cell
    # (train_loader, val_loader, config, logger) - confirm against train_model.
    train_model(train_loader, val_loader, config, logger)
    logger.info("Training process completed successfully.")

except Exception as e:
    logger.error(f"An error occurred during training: {str(e)}", exc_info=True)
    # Re-raise rather than sys.exit(1): `sys` was never imported in this
    # notebook, and re-raising stops a Run All with the original traceback.
    raise


In [None]:
# Initialize the FusionNet (TCCT-Net) model and move it to the selected device.
# Seed first: the notebook's dedicated seeding cell comes after this one, so
# without this call any random initialisation done while constructing the model
# would differ between runs.
set_seed(config.random_seed)
model = FusionNet(config).to(device)


In [None]:
# Setup logging to a file.
# Create the log directory up front so that opening the log file cannot fail on
# a fresh checkout, in case setup_logging does not create it itself.
os.makedirs(config.logs_dir, exist_ok=True)
log_file = os.path.join(config.logs_dir, 'training.log')
# NOTE(review): setup_logging is called with a path here but without arguments
# in an earlier cell - confirm that it accepts an optional log-file argument.
setup_logging(log_file)
# This message goes through the root logger of the `logging` module.
logging.info("Starting training for TCCT-Net model.")


In [None]:
# Set random seed for reproducibility, using the seed stored on the config.
# NOTE(review): in notebook order this cell runs after the model-construction
# cell, so it can only affect randomness consumed from this point onwards.
set_seed(config.random_seed)


In [None]:
# Train the model using the training function, passing the model, both data
# loaders, the config and the target device by keyword.
# NOTE(review): an earlier executed cell calls
# train_model(train_loader, val_loader, config, logger) positionally, with no
# model or device argument. Confirm which argument list the current
# train_model in training/train.py actually accepts before running this cell.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    config=config,
    device=device
)


In [None]:
# Optional: Integrate TensorBoard for monitoring.
# Loads the TensorBoard IPython extension and opens it on the relative
# directory `logs/`.
# NOTE(review): the path is hard-coded, while the logging cell writes under
# config.logs_dir - confirm that both point at the same directory.
%load_ext tensorboard
%tensorboard --logdir=logs/


In [None]:
from checkpoint import save_checkpoint

# Example of saving a checkpoint during training.
# NOTE(review): `optimizer` and `epoch` are not assigned in any cell visible in
# this notebook, so as written this call raises NameError on a fresh kernel.
# It presumably belongs inside the training loop - confirm before running.
save_checkpoint(model, optimizer, epoch, config.checkpoint_dir)


In [None]:
from evaluator import evaluate_model

# Evaluate the model on the validation set.
# Uses the `model`, `val_loader`, `config` and `device` objects created in
# earlier cells; the return value of evaluate_model (if any) is not captured.
evaluate_model(model, val_loader, config, device)
