Authors : Jinsu Kim, JunHo Park

ⓒ 2022 CCNets, Inc. All Rights Reserved.

https://ccnets.org

In [1]:
import sys
path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.

In [2]:
# https://www.kaggle.com/datasets/ruslankl/mice-protein-expression

import os
import pandas as pd

# Define the base directory and CSV file name
base_dir = path_append + "../data/mice_protein_expression/"  # Update this to the directory where your data folder is located
csv_file = "Data_Cortex_Nuclear.csv"  # Update this to your CSV file name if different

# Full path to the CSV file
full_path = os.path.join(base_dir, csv_file)

# Load the dataset
try:
    df = pd.read_csv(full_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print("Failed to load data. File not found at:", full_path)

# No need for image_size here unless it is used later in your code


Data loaded successfully!


TrainLoader / DataLoader

In [3]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Preprocess dataset
df = df.drop("MouseID", axis=1)
label_cols = ["Genotype", "Treatment", "Behavior", "class"]
for col in label_cols:
    df[col] = LabelEncoder().fit_transform(df[col].values)

In [4]:
# Impute missing values
imputer = IterativeImputer(max_iter=10, random_state=0)  # max_iter was num_features; adjust as appropriate
df[:] = imputer.fit_transform(df)



In [5]:
# Scale features
from sklearn.model_selection import train_test_split

feature_cols = df.columns[df.columns != 'class']
df[feature_cols] = StandardScaler().fit_transform(df[feature_cols])

# Determine number of features and classes
num_features = len(feature_cols)
num_classes = len(df['class'].unique())

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df[feature_cols], df['class'], test_size=0.2, random_state=1)
num_features, num_classes

(80, 8)

In [6]:
# Custom dataset class
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.x = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        y_one_hot = torch.nn.functional.one_hot(self.y[index], num_classes=num_classes)
        return self.x[index], y_one_hot


In [7]:
# Create Dataset instances
train_dataset = CustomDataset(X_train.values, y_train.values)
test_dataset = CustomDataset(X_test.values, y_test.values)


In [8]:
train_dataset.x.shape, train_dataset.y.shape, test_dataset.x.shape, test_dataset.y.shape

(torch.Size([864, 80]),
 torch.Size([864]),
 torch.Size([216, 80]),
 torch.Size([216]))

In [9]:
# Example usage
for features, labels in train_dataset:
    print(features, labels)
    break

tensor([ 0.8567,  0.8996,  0.5220,  0.2666,  0.3052,  0.1107, -0.1130,  1.1694,
        -0.1509,  0.2293,  0.8203,  0.1649,  0.4288,  0.2231, -0.3929, -0.5361,
        -0.6118,  0.9344,  1.0031, -0.1890,  0.9307, -0.1968, -0.0757, -0.0926,
         0.3065,  1.3649,  0.3003,  0.0717,  0.7352,  0.1070,  0.1126,  0.4942,
        -0.8181, -0.3349, -0.6556, -0.3706, -0.7622, -0.0408, -0.2814,  0.3589,
        -0.4635, -0.9334,  0.1160, -0.1771, -0.1465,  0.0760,  0.5365,  0.7723,
         0.4190,  1.1157, -0.2373, -0.3129, -0.1937, -0.0316,  0.5051,  0.3011,
        -0.1636, -0.1046, -0.4791, -0.2392, -0.3477, -0.3318, -0.3252, -0.1085,
        -1.0837, -0.3432,  1.1123,  1.0492, -0.5837, -1.4140, -0.0316, -1.4158,
         0.1160, -0.8185, -1.3089, -1.0408,  0.6297, -0.9459, -0.9459, -1.0282]) tensor([1, 0, 0, 0, 0, 0, 0, 0])


In [10]:
from tools.setting.ml_params import MLParameters
from tools.setting.data_config import DataConfig

data_config = DataConfig(dataset_name = 'mice_protein_expression', task_type='multi_class_classification', obs_shape=[num_features], label_size=num_classes)
#  Set training configuration from the AlgorithmConfig class, returning them as a Namespace object.
ml_params = MLParameters(ccnet_network = 'gpt', encoder_network = 'none')

In [11]:
from trainer_hub import TrainerHub

# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

# Initialize the TrainerHub class with the training configuration, data configuration, device, and use_print and use_wandb flags
trainer_hub = TrainerHub(ml_params, data_config, device, use_print=True, use_wandb=False)

In [12]:
trainer_hub.train(train_dataset, test_dataset)

Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

[3/100][11/13][Time 3.26]
Unified LR across all optimizers: 0.0001995308238189185
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.1355	Gen: 0.5654	Rec: 0.5560	E: 0.1448	R: 0.1261	P: 0.9860
--------------------Test Metrics------------------------
accuracy: 0.9688
precision: 0.9665
recall: 0.9808
f1_score: 0.9716

[7/100][9/13][Time 3.05]
Unified LR across all optimizers: 0.00019907191565870155
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0099	Gen: 0.3267	Rec: 0.3263	E: 0.0103	R: 0.0095	P: 0.6432
--------------------Test Metrics------------------------
accuracy: 0.9844
precision: 0.9896
recall: 0.9750
f1_score: 0.9807

[11/100][7/13][Time 3.05]
Unified LR across all optimizers: 0.00019861406295796434
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0028	Gen: 0.2672	Rec: 0.2671	E: 0.0029	R: 0.0027	P: 0.5315
--------------------Test Metrics------------------------
accuracy: 1.0000
precision: 1.0000
