In [3]:
"""Train the model"""
from comet_ml import Experiment


import argparse
import logging
import os
import math
import glob

import numpy as np
import torch
import torch.optim as optim
from torch.autograd import Variable
from tqdm import tqdm

import features.utils.utils as utils

from features.bin.val import evaluate
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from torchvision import models
from torchvision import transforms
import torchvision
import features.utils.metrics_code as metrics_code

import importlib




ModuleNotFoundError: No module named 'utils'

In [4]:
import random
import os
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import json
import pandas as pd
import torchvision.transforms as transforms


 

class dataset(Dataset):
    """
    Data loader for NYU FFPE data. Reads in

    file_path: path to file containing the list of tuples
    split: Train, Val, or Test
    tranformer: transform
    classes: list of classes to use, or "all"
    """
    def __init__(self, file_path, split, classes):


        self.transformer = transforms.Compose([
            transforms.Scale(299),
            transforms.ToTensor(),
            transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225])])

        self.file_path = file_path
        self.split = split

        with open(self.file_path, 'r') as filehandle:
            metadata_list = json.load(filehandle)

        metadata_df = pd.DataFrame(metadata_list, columns =['Path', 'label', 'slide', 'split'])
        metadata_split = metadata_df[metadata_df['split'] == self.split]

        if classes != "all":
            metadata_split = metadata_split[metadata_split['label'].isin(classes)]
            metadata_split.label = pd.Categorical(metadata_split.label)
            metadata_split['label_num'] = metadata_split.label.cat.codes
        else:
            metadata_split.label = pd.Categorical(metadata_split.label)
            metadata_split['label_num'] = metadata_split.label.cat.codes

        self.image_paths = metadata_split['Path']
        self.labels = metadata_split['label_num']

    def __len__(self):
        # return size of dataset
        return len(self.image_paths)

    def __getitem__(self, idx):

        image = Image.open(self.image_paths.iloc[idx])  # PIL image
        label = self.labels.iloc[idx]
        image = self.transformer(image)

        return image, label


In [9]:
metadata_file = "/home/ay1392/anna_beegfs/pytorch_projects/features_proj/features/project_specific/metadata_split.txt"
train_dataset = dataset(file_path = metadata_file, split = "Train",
                             classes = ["acinar", "normal"])
train_loader = DataLoader(dataset = train_dataset, batch_size = 2,
                            shuffle = True, num_workers = 8)

for i in enumerate(train_loader):
    print(i)

(0, [tensor([[[[ 1.3927,  1.4269,  1.6838,  ...,  0.8447,  0.5193,  0.1083],
          [ 1.2043,  1.4612,  1.7352,  ...,  0.0398, -0.5424, -1.0562],
          [ 1.1872,  1.3413,  1.5125,  ..., -0.7822, -1.3302, -1.5357],
          ...,
          [ 0.0912,  0.7077,  1.9064,  ...,  1.4440,  1.5639,  1.3927],
          [ 0.1768,  1.3070,  1.9235,  ...,  0.8618,  0.7762,  0.4679],
          [ 0.5878,  1.7009,  1.9235,  ...,  0.1939, -0.3712, -0.7650]],

         [[ 0.2402,  0.2927,  0.5378,  ..., -0.2500, -0.3550, -0.6702],
          [ 0.1702,  0.4328,  0.7304,  ..., -0.5651, -0.8627, -1.3004],
          [ 0.1702,  0.3452,  0.5203,  ..., -1.1604, -1.4580, -1.6331],
          ...,
          [-0.9503, -0.3375,  0.9230,  ...,  0.0476,  0.2227,  0.1352],
          [-0.7052,  0.4328,  1.0455,  ..., -0.3901, -0.2325, -0.3550],
          [-0.1625,  0.9405,  1.0980,  ..., -0.7577, -0.9503, -1.0903]],

         [[ 1.3502,  1.3677,  1.5594,  ...,  1.0365,  1.0365,  0.7751],
          [ 1.2631,  1.45

(3, [tensor([[[[ 1.9578,  1.9407,  1.9235,  ...,  1.8893,  1.9064,  1.9064],
          [ 1.9749,  1.9578,  1.9235,  ...,  1.9235,  1.9578,  1.9407],
          [ 1.9578,  1.9407,  1.9064,  ...,  1.9407,  1.9749,  2.0092],
          ...,
          [ 1.9578,  1.9920,  1.9578,  ...,  0.4851,  0.3138,  0.5536],
          [ 1.9235,  1.9578,  1.9064,  ...,  0.5364,  0.5707,  0.9474],
          [ 1.9064,  1.9407,  1.8722,  ...,  0.9817,  0.9988,  0.4337]],

         [[ 1.7808,  1.7808,  1.7808,  ...,  1.8859,  1.8859,  1.8683],
          [ 1.8333,  1.8158,  1.7808,  ...,  1.8508,  1.8683,  1.8333],
          [ 1.8333,  1.8333,  1.7983,  ...,  1.7983,  1.8158,  1.8158],
          ...,
          [ 1.9559,  1.9909,  1.9559,  ..., -1.0378, -1.1078, -0.8452],
          [ 1.9209,  1.9559,  1.9209,  ..., -0.9853, -0.9678, -0.8627],
          [ 1.9034,  1.9734,  1.9209,  ..., -0.7402, -0.7227, -0.9153]],

         [[ 2.2914,  2.2740,  2.2566,  ...,  2.2566,  2.2914,  2.3263],
          [ 2.3088,  2.27

(6, [tensor([[[[ 1.9749,  1.9578,  1.9749,  ...,  2.0434,  2.0434,  2.0092],
          [ 1.9920,  1.9749,  1.9920,  ...,  2.0263,  2.0263,  2.0263],
          [ 1.9749,  1.9749,  1.9920,  ...,  1.9920,  1.9920,  2.0092],
          ...,
          [ 2.0434,  2.0434,  2.0434,  ...,  2.0263,  2.0092,  2.0092],
          [ 2.0263,  2.0263,  2.0434,  ...,  2.0092,  1.9578,  1.9920],
          [ 2.0434,  2.0263,  2.0092,  ...,  2.0434,  1.9920,  2.0092]],

         [[ 2.1660,  2.1485,  2.1660,  ...,  2.1835,  2.1835,  2.1485],
          [ 2.1835,  2.1660,  2.1660,  ...,  2.1660,  2.1660,  2.1660],
          [ 2.1660,  2.1660,  2.1660,  ...,  2.1310,  2.1310,  2.1485],
          ...,
          [ 2.2185,  2.2185,  2.2185,  ...,  2.1660,  2.1660,  2.1485],
          [ 2.2010,  2.2010,  2.2185,  ...,  2.1835,  2.1485,  2.1660],
          [ 2.2185,  2.2010,  2.1835,  ...,  2.2185,  2.1835,  2.2185]],

         [[ 2.4134,  2.3960,  2.4134,  ...,  2.4483,  2.4483,  2.4134],
          [ 2.4308,  2.41

(9, [tensor([[[[ 2.0605,  2.0605,  2.0605,  ...,  1.2214,  1.8208,  1.9578],
          [ 2.0605,  2.0605,  2.0605,  ...,  1.0159,  1.5639,  1.8379],
          [ 2.0605,  2.0605,  2.0605,  ...,  1.7352,  1.8893,  2.0434],
          ...,
          [ 0.0227,  0.3481,  0.3652,  ...,  2.0605,  2.0605,  2.0605],
          [ 0.4337,  0.1083,  0.5022,  ...,  2.0605,  2.0605,  2.0605],
          [ 0.2453,  0.0912,  1.6838,  ...,  2.0605,  2.0605,  2.0605]],

         [[ 2.2360,  2.2360,  2.2360,  ...,  0.8179,  1.5532,  2.1310],
          [ 2.2360,  2.2360,  2.2360,  ...,  0.6779,  1.3957,  2.0959],
          [ 2.2360,  2.2360,  2.2360,  ...,  1.2381,  1.5532,  2.0784],
          ...,
          [-0.5301, -0.1625, -0.0749,  ...,  2.2360,  2.2360,  2.2360],
          [-0.1099, -0.3725,  0.2052,  ...,  2.2360,  2.2360,  2.2360],
          [-0.3025, -0.3375,  1.5357,  ...,  2.2360,  2.2360,  2.2360]],

         [[ 2.4483,  2.4483,  2.4483,  ...,  1.5594,  2.0823,  2.3960],
          [ 2.4483,  2.44

Process Process-3:
Process Process-1:
Process Process-2:
Process Process-7:
Process Process-8:
Process Process-5:
Process Process-6:
Process Process-4:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self

(10, [tensor([[[[ 1.9235,  1.4269,  1.8893,  ...,  1.7694,  1.8208,  1.8722],
          [ 1.4098,  0.5536,  1.5639,  ...,  1.8722,  1.8550,  1.9064],
          [ 0.2282, -0.0629,  0.3481,  ...,  1.9749,  1.9578,  1.9578],
          ...,
          [-1.0048, -0.4911,  0.9132,  ...,  1.8893,  1.8550,  1.8893],
          [-1.1075, -0.6623,  0.7419,  ...,  1.9064,  1.8722,  1.9064],
          [-1.0562, -0.3883,  0.8789,  ...,  1.8722,  1.8893,  1.8722]],

         [[ 1.0280,  0.8354,  1.3256,  ...,  1.8859,  1.9734,  2.0259],
          [ 0.5028, -0.1625,  0.9055,  ...,  1.9384,  1.9734,  2.0084],
          [-0.6176, -0.8803, -0.4251,  ...,  2.0084,  2.0084,  1.9909],
          ...,
          [-1.3529, -1.1253, -0.3375,  ...,  2.0259,  1.9909,  2.0259],
          [-1.4230, -1.3004, -0.5301,  ...,  2.0434,  2.0084,  2.0434],
          [-1.3354, -0.9853, -0.3725,  ...,  2.0084,  2.0259,  2.0084]],

         [[ 1.8383,  1.5420,  2.0474,  ...,  2.1868,  2.1694,  2.1868],
          [ 1.5071,  0.7

  File "/share/apps/python3/3.6.3/intel/lib/python3.6/multiprocessing/connection.py", line 414, in _poll
    r = wait([self], timeout)
Traceback (most recent call last):
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/multiprocessing/connection.py", line 911, in wait
    ready = selector.select(timeout)
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/multiprocessing/connection.py", line 911, in wait
    ready = selector.select(timeout)
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/multiprocessing/connection.py", line 911, in wait
    ready = selector.select(timeout)
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/multiprocessing/connection.py", line 911, in wait
    ready = selector.select(timeout)
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/multiprocessing/connection.py", line 911, in wait
    ready = selector.select(timeout)
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/multiprocessing/connection.py", line 911, in wait
    ready = selector.

KeyboardInterrupt: 

  File "/share/apps/python3/3.6.3/intel/lib/python3.6/selectors.py", line 376, in select
    fd_event_list = self._poll.poll(timeout)
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/selectors.py", line 376, in select
    fd_event_list = self._poll.poll(timeout)
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/selectors.py", line 376, in select
    fd_event_list = self._poll.poll(timeout)
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/selectors.py", line 376, in select
    fd_event_list = self._poll.poll(timeout)
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/selectors.py", line 376, in select
    fd_event_list = self._poll.poll(timeout)
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/selectors.py", line 376, in select
    fd_event_list = self._poll.poll(timeout)
  File "/share/apps/python3/3.6.3/intel/lib/python3.6/selectors.py", line 

In [15]:
USE_CUDA = 1

#make an arg
params = utils.Params("/home/ay1392/anna_beegfs/pytorch_projects/features_proj/features/params.json")

model_dir = params.model_dir
metadata_file = params.metadata_file
batch_size = 4
num_epochs = params.num_epochs
learning_rate = params.learning_rate
decay = params.decay
save_summary_steps = params.save_summary_steps
optimizer = params.optimizer
loss_func = params.loss_func
model = params.model
classes = params.classes
num_classes = len(classes)


In [10]:
print(model)


test_net


In [16]:
# Set the random seed for reproducible experiments
torch.manual_seed(230)

experiment = Experiment(api_key="ysBHHTJLIBAhlklc4sdBd0vlp",
                        project_name="general", workspace="ayeaton")

# Set the logger
print(os.path.join(model_dir, 'train.log'))
utils.set_logger(os.path.join(model_dir, 'train.log'))


# Create the input data pipeline
logging.info("Loading the datasets...")

# get data

train_dataset = dataset(file_path = "/home/ay1392/anna_beegfs/pytorch_projects/listfile_test.txt", split = "Train", 
                         transformer = transformer, classes = classes)

train_loader = DataLoader(dataset = train_dataset, batch_size = batch_size,
                                 shuffle = True, num_workers = 8)

val_dataset = dataset(file_path = "/home/ay1392/anna_beegfs/pytorch_projects/listfile_test.txt", split = "Val",
                       transformer = transformer, classes = classes)

val_loader = DataLoader(dataset = val_dataset, batch_size = batch_size,
                                 shuffle = True, num_workers = 8)


logging.info("- done.")

if model != "Inception":
    net = importlib.import_module("features.models.{}".format(model))
    model = net.Net()
else: 
    model = models.inception_v3(pretrained=False) 
    model.fc = nn.Linear(2048, num_classes)
    model.AuxLogits.fc = nn.Linear(768, 1)

logging.info("Model -- {}".format(repr(model)))


model.cuda()

# fetch loss function and metrics
metrics_save = metrics_code.metrics_save

hyper_params = {"learning_rate": learning_rate, "steps": 1000, "batch_size": batch_size}
experiment.log_multiple_params(hyper_params)


# Train the model
logging.info("Starting training for {} epoch(s)".format(num_epochs))
print(model_dir)
train_and_evaluate(model, train_loader, val_loader,  metrics_save, model_dir, num_epochs, 
                   loss_func, optimizer)

COMET INFO: ----------------------------
COMET INFO: Comet.ml Experiment Summary:
COMET INFO:   Data:
COMET INFO:     url: https://www.comet.ml/ayeaton/general/1f4dfe3452e5453188d7bb3894325f17
COMET INFO:   Metrics:
COMET INFO:         sys.gpu.0.free_memory: 15911223296
COMET INFO:     sys.gpu.0.gpu_utilization: 0
COMET INFO:        sys.gpu.0.total_memory: 16914055168
COMET INFO:         sys.gpu.0.used_memory: 1002831872
COMET INFO: ----------------------------
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/ayeaton/general/528ae726329d47b395f7b2b0f7c2dfdd

Loading the datasets...
- done.
Model -- Net(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10

/beegfs/ay1392/Embryo_models/models/C3D_dartmouth/train.log
/beegfs/ay1392/Embryo_models/models/C3D_dartmouth
torch.Size([4, 16, 71, 71])





RuntimeError: invalid argument 2: size '[-1 x 400]' is invalid for input with 322624 elements at /pytorch/aten/src/TH/THStorage.cpp:80

In [3]:

def train(model,train_loader, metrics_save, loss_func, optimizer):
    """Train the model on `num_steps` batches
    Args:
        model: (torch.nn.Module) the neural network
        train_loader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        loss_func: the loss function
        optimizer: the optimizer
        time_tile: 0 or 1, whether there are several small videos for one video or not
    """

    # set model to training mode
    model.train()

    # summary for current training loop and a running average object for loss
    summ = []
    loss_avg = utils.RunningAverage()

    # Use tqdm for progress bar
    with tqdm(total=len(train_loader)) as t:
        with experiment.train():
            for i, train_batch in enumerate(train_loader):


                inputs,labels = train_batch
                optimizer.zero_grad()
                outputs, aux = model(utils.tovar(inputs,requires_grad = False))
                parallelNet = torch.nn.DataParallel(model)
                outputs, aux = parallelNet(inputs)
                loss = loss_func(outputs, utils.tovar(labels, requires_grad = False))
                loss.backward()
                optimizer.step()

                # Evaluate summaries only once in a while
                if i % save_summary_steps == 0:
                    output_batch = outputs.data.cpu().numpy()
                    labels_batch = labels.cpu().numpy()

                    experiment.log_metric("acc", metrics_save["accuracy"](output_batch, labels_batch))
                    experiment.log_metric("AUC", metrics_save["AUC"](output_batch, labels_batch))
                    experiment.log_metric("loss", loss.data[0])

                   #compute all metrics on this batch
                    summary_batch = {"accuracy":metrics_save["accuracy"](output_batch, labels_batch),
                                     #AUC is on binary
                                     "AUC":metrics_save["AUC"](output_batch, labels_batch),
                                     "mean_fpr":metrics_save["fpr"](output_batch, labels_batch),
                                     "mean_tpr":metrics_save["tpr"](output_batch, labels_batch),
                                    "loss":loss.data[0]}
                    summary_batch['loss'] = loss.data[0]
                    summ.append(summary_batch)

                # update the average loss
                loss_avg.update(loss.data[0])

                t.set_postfix(loss='{:05.3f}'.format(loss_avg()))
                t.update()
                
    # compute mean of all metrics in summary
    metrics_mean = {metric:np.mean([x[metric] for x in summ]) for metric in summ[0]}
    metrics_string = " ; ".join("{}: {:05.3f}".format(k , v) for k, v in metrics_mean.items())
    logging.info("- Train metrics: " + metrics_string)

    
def train_and_evaluate(model, train_dataloader, val_dataloader,  metrics_save, model_dir, num_epochs, loss_func,
                       optimizer, restore_file=None):
    """Train the model and evaluate every epoch.
    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches validation data
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(args.model_dir, args.restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0
    best_val_auc = 0.0
    
    loss_func = eval(loss_func)
    optimizer = eval(optimizer)

    for epoch in range(num_epochs):

        logging.info("Epoch {}/{}".format(epoch + 1, num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, train_dataloader,metrics_save,loss_func, optimizer)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, val_dataloader, metrics_save, loss_func)            
        val_acc = val_metrics['accuracy']
        val_auc = val_metrics['AUC'] 
        
        is_best = val_acc>=best_val_acc and val_auc>=best_val_auc

        # If best_eval and auc, best_save_path
        if is_best:
            logging.info("- Found new best accuracy or auc")
            best_val_acc = val_acc
            best_val_auc = val_auc
                        
            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)
            
        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict' : optimizer.state_dict()},
                               is_best=is_best,
                               checkpoint=model_dir)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
        
