In [1]:
import os
import gc
import sys
import torch
import psutil
import pickle
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn import metrics
from collections import Counter
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torchvision import models, set_image_backend

import data_utils
import train_utils

%reload_ext autoreload
%autoreload 2

set_image_backend('accimage')

In [3]:
# define global vars
classification = 'WGD'
magnification = '10.0'
output_size = 1
device = torch.device('cuda', 0)

In [4]:
# get model file paths
if classification == 'WGD':
    if magnification == '10.0':
        #sa_file = '/n/tcga_models/resnet18_WGD_10x_sa.pkl'
        state_dict_file = '/n/tcga_models/resnet18_WGD_10x.pt'
    elif magnification == '5.0':
        #sa_file = '/n/tcga_models/resnet18_WGD_v04_sa.pkl'
        state_dict_file = '/n/tcga_models/resnet18_WGD_v04.pt'
elif classification == 'MSI':
    if magnification == '10.0':
        #sa_file = '/n/tcga_models/resnet18_MSI_singlelabel_10x_sa.pkl'
        state_dict_file = '/n/tcga_models/resnet18_MSI_singlelabel_10x.pt'
    elif magnification == '5.0':
        #sa_file = '/n/tcga_models/resnet18_MSI_singlelabel_v02_sa.pkl'
        state_dict_file = '/n/tcga_models/resnet18_MSI_singlelabel_v02.pt'

In [None]:
# load embedding network and freeze layers
resnet = models.resnet18(pretrained=False)
resnet.fc = nn.Linear(2048, output_shape, bias=True)
saved_state = torch.load(state_dict_file, map_location=lambda storage, loc: storage)
resnet.load_state_dict(saved_state)
resnet.fc = nn.Linear(2048, 2048, bias=False)
resnet.fc.weight.data=torch.eye(2048)
resnet.cuda(device=device)
for param in resnet.parameters():
    param.requires_grad = False

In [None]:
# initialize fully-connected final layer 
final_embed_layer = nn.Linear(2048, 2048)
final_embed_layer.cuda()

In [6]:
# get image file paths
root_dir = '/n/mounted-data-drive/'
batch_one = ['COAD', 'BRCA', 'UCEC']
batch_two_orig = ['BLCA', 'KIRC', 'READ', 'HNSC', 'LUSC', 'LIHC', 'LUAD', 'STAD']
if magnification == '10.0':
    batch_two = [b + '_10x' for b in batch_two_orig]
elif magnification == '5.0':
    batch_two = [b + '_5x' for b in batch_two_orig]

In [56]:
# get sample annotations
# NOTE: ONLY FOR WGD
wgd_path = 'ALL_WGD_TABLE.xlsx'
wgd_raw = pd.read_excel(wgd_path)
wgd_raw.head(3)

Unnamed: 0,Sample,Type,AneuploidyScore(AS),AS_del,AS_amp,Genome_doublings,Leuk,Purity,Stroma,Stroma_notLeukocyte,...,8,9,10,11,12,16,17,18,19,20
0,TCGA-18-3406-01,LUSC,8,8,0,0,0.337487,0.48,0.52,0.182513,...,,-1.0,,0.0,0.0,0.0,,0.0,0.0,0.0
1,TCGA-18-3407-01,LUSC,25,5,20,1,0.325946,0.29,0.71,0.384054,...,1.0,1.0,,,-1.0,1.0,,1.0,1.0,1.0
2,TCGA-18-3408-01,LUSC,5,3,2,0,0.144959,0.76,0.24,0.095041,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
batch_all = batch_one + batch_two_orig
wgd_filtered = wgd_raw.loc[wgd_raw['Type'].isin(batch_all)]
wgd_filtered.head(3)

Unnamed: 0,Sample,Type,AneuploidyScore(AS),AS_del,AS_amp,Genome_doublings,Leuk,Purity,Stroma,Stroma_notLeukocyte,...,8,9,10,11,12,16,17,18,19,20
0,TCGA-18-3406-01,LUSC,8,8,0,0,0.337487,0.48,0.52,0.182513,...,,-1.0,,0.0,0.0,0.0,,0.0,0.0,0.0
1,TCGA-18-3407-01,LUSC,25,5,20,1,0.325946,0.29,0.71,0.384054,...,1.0,1.0,,,-1.0,1.0,,1.0,1.0,1.0
2,TCGA-18-3408-01,LUSC,5,3,2,0,0.144959,0.76,0.24,0.095041,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
Counter(wgd_filtered['Genome_doublings'])

Counter({0: 3112, 1: 1913, 2: 299})

In [59]:
wgd_filtered.loc[wgd_filtered['Genome_doublings'].values == 2, 'Genome_doublings'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [60]:
Counter(wgd_filtered['Genome_doublings'])

Counter({0: 3112, 1: 2212})

In [62]:
wgd_filtered.set_index('Sample', inplace=True)
wgd_filtered.head(3)

Unnamed: 0_level_0,Type,AneuploidyScore(AS),AS_del,AS_amp,Genome_doublings,Leuk,Purity,Stroma,Stroma_notLeukocyte,Stroma_notLeukocyte_Floor,...,8,9,10,11,12,16,17,18,19,20
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-18-3406-01,LUSC,8,8,0,0,0.337487,0.48,0.52,0.182513,0.182513,...,,-1.0,,0.0,0.0,0.0,,0.0,0.0,0.0
TCGA-18-3407-01,LUSC,25,5,20,1,0.325946,0.29,0.71,0.384054,0.384054,...,1.0,1.0,,,-1.0,1.0,,1.0,1.0,1.0
TCGA-18-3408-01,LUSC,5,3,2,0,0.144959,0.76,0.24,0.095041,0.095041,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
sa_trains = []
sa_vals = []
batch_all = batch_one + batch_two

for cancer in batch_all:
    sa_train, sa_val = process_WGD_data(root_dir='/n/mounted-data-drive/', cancer_type=cancer, wgd_path=None, 
                                        wgd_raw = wgd_filtered)
    sa_trains.append(sa_train)
    sa_vals.append(sa_val)

In [69]:
# save sample annotations in a pickle
pickle_file = 'tcga_wgd_sa_all.pkl'
with open(pickle_file, 'wb') as f: 
        pickle.dump([batch_all, sa_trains, sa_vals], f)

In [None]:
# initialize Datasets
sample_annotations, root_dir, transform=None, loader=default_loader, magnification='5.0', batch_type='tile'

In [2]:
# define few-shot learning params
n_support = 5 # number of training examples in the support set
n_query = 20 # number of training examples in the query set
n_task = 4 # number of 'tasks' to sample from each cancer type