In [47]:
#read all the jsons and add them to Data list
import json
import os
import math
import numpy as np
import torch
from torch_geometric.data import Data, DataLoader, Dataset
from torch.utils.data import random_split

json_dir= r"C:\Users\Brian\Desktop\AIProject\roi_graph_jsons"
data_list = []
label_counts=[0,0,0,0]
for roi_json in os.listdir(json_dir):
    with open(os.path.join(json_dir, roi_json), 'r') as f:
        roi_data = json.loads(f.read())
        #Json has following fields:
        #  name - name of the image
        #  node_features - [num_nodes num_features] matrix of features for each node
        #  dist_edge_table - [2 num_edges] edges defined by distance metric
        #  labels - dict, which contains p1_label, p2_label, p3_label (roi_data["labels"]["p1_label"]
        
        #process the labels. see the function for more details
        label, conf = process_labels2(roi_data["labels"])
        if label == -1:
            #print("Bad sample, no label. Discarding.")
            continue
            
#         for lbl in roi_data["labels"]:
#             l = roi_data["labels"][lbl]
#             ind = 0 if l == "Normal Duct" else 1 if l=="Columnar" else 2 if l=="Flat Epithelial" else 3 if l=="ADH" else -1
#             label_counts[ind] += 1
            
        #normalize input variables in x
        
        #print(len(roi_data["node_features"][0]))
        data_obj = Data(x=torch.tensor(roi_data["node_features"], dtype=torch.float),
                        edge_table=torch.tensor(roi_data["dist_edge_table"], dtype=torch.long).type(torch.LongTensor), 
#With confidence score  #y=torch.tensor([label,conf], dtype=torch.long))
                        y=torch.tensor([label], dtype=torch.float))
        data_list.append(data_obj)
        

print("Num data samples: " + str(len(data_list))) #was 1759, but discarding bad samples, its now 1613
print("Total label distribution: " + str(label_counts))
print("Processing dataset...")
dataset = HistGraphDataset(data_list)
#dataset.process()
t = random_split(dataset, [math.floor(len(dataset)*(1/3)), math.ceil(len(dataset)*(2/3))])
test = t[0]
train = t[1]
test_loader = DataLoader(test, batch_size=1) #537
train_loader = DataLoader(train, batch_size=1) #1076
print("Done")
for i, d in enumerate(train_loader):
    print(d.x[0])
    break

Num data samples: 1613
Total label distribution: [0, 0, 0, 0]
Processing dataset...
Done
tensor([ 41.0000,   0.9364,   0.1752,  25.4127,   8.9171, 130.0000,   0.2528,
          7.2252, 109.0000,  45.1470,   0.3154])


In [48]:
#define model
import torch
import torch.nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_max_pool, global_mean_pool, global_sort_pool

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
#         self.conv1 = GCNConv(11, 11, normalize=True) #keep at 11 dimensions
#         self.conv2 = GCNConv(11, 11, normalize=True)  

        self.gcn_hd = 24
        self.conv1 = GCNConv(11, self.gcn_hd, normalize=True) #keep at 11 dimensions
        self.conv2 = GCNConv(self.gcn_hd, self.gcn_hd, normalize=True)  
        
        self.lin_hd = 12
        self.lin1 = torch.nn.Linear(self.gcn_hd,self.lin_hd)                   #FC layers for classification
        #self.lin2 = torch.nn.Linear(self.lin_hd, 1)        
        self.lin2 = torch.nn.Linear(self.lin_hd, 4)                   #down to 4 layers for 4 classes
        
        #add confidence scores
        
        

    def forward(self, data):
        x, edge_table, b = data.x, data.edge_table, data.batch
        #x is [num_nodes num_features(11)], edge_table is [2 num_edges]
        
        #first conv layer
        x = self.conv1(x, edge_table) 
        x = F.relu(x)
        #x = torch.sigmoid(x)
        #x = F.dropout(x, training=self.training)

        #second conv layer
        x = self.conv2(x, edge_table)
        x = F.relu(x)
        #x = torch.sigmoid(x)
        #x = F.dropout(x, training=self.training)
        #global pooling
        # since batch size is 1, need to pass as batch LongTensor of zeros with size num_nodes
        out = global_max_pool(x, b).type(torch.float)
        #out = global_mean_pool(x, b).type(torch.float)
        #print(x.shape)
        #print(b.shape)
        #out = global_sort_pool(x, b, 2).view(2,self.gcn_hd)
        #print("After pooling: " + str(out.shape))
        #print("Y: " + str(data.y))
        #print()
        
        #now through linear layers
        out = self.lin1(out)
        #out = torch.tanh(out)
        #out = F.softmax(out, dim=1)
        out = F.relu(out)
        
        out = self.lin2(out)
        #out = F.sigmoid(out)              #for binary
        out = F.softmax(out, dim=1)      #for 4 way
        
        return out

In [62]:
#define training / testing logic (for 1 epoch)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = 'cpu'
print(device)    
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

def train(loader):
    model.train()

    total_loss = 0
    for data in loader:
        #batch size is 1, need to assign all nodes to a batch for global pooling
        data.batch = torch.tensor(np.zeros([data.num_nodes])).type(torch.long)
        one_hot = torch.tensor(np.zeros((1,4)),dtype=torch.float)
        one_hot[0,int(data.y)] = 1
        data.one_hot = one_hot
        #data.weight = torch.tensor([0.25, 0.75])  #weigh high risk examples more
        
        data = data.to(device)
        
        optimizer.zero_grad()
        out = model(data)
        
        #print("Out: " + str(out.type()))
        #print("Y: " + str(data.y.type()))
    
        #loss = F.nll_loss(out, data.y)   #negative log liklihood loss, for 4 way
        #loss = F.binary_cross_entropy(out, data.y)   #for binary
        loss = F.binary_cross_entropy(out, data.one_hot)
        #print(loss)
        #print()
        
        loss.backward()
        total_loss += loss.item()
        optimizer.step()
    return total_loss / len(dataset)

#for binary
def test(loader):
    model.eval()

    correct = 0
    preds = [0,0]
    preds_list = []
    y_test = []
    for data in loader:
        data = data.to(device)
        with torch.no_grad():
            #print(model(data))
            #pred = model(data).max(dim=1)[1]
            pred = 1 if model(data) >= 0.5 else 0
            preds_list.append(pred)
            preds[pred] += 1
        correct += 1 if pred == data.y else 0
        y_test.append(int(data.y.item()))
    print("Predictons on test set: " + str(preds))
    return correct / len(loader.dataset), preds_list, y_test

def test_4way(loader):
    model.eval()

    correct = 0
    preds = [0,0,0,0]
    preds_list = []
    y_test = []
    for data in loader:
        data = data.to(device)
        with torch.no_grad():
            #print(model(data))
            pred = model(data).max(dim=1)[1]
            #pred = 1 if model(data) >= 0.5 else 0
            preds_list.append(pred)
            preds[pred] += 1
        correct += 1 if pred == data.y else 0
        y_test.append(int(data.y.item()))
    print("Predictons on test set: " + str(preds))
    return correct / len(loader.dataset), preds_list, y_test


cuda


In [63]:
#actually running the model
from sklearn.metrics import classification_report, confusion_matrix
optimizer.zero_grad()
for epoch in range(1, 50):
    loss = train(train_loader)
    #test_acc, y_pred, y_test = test(test_loader)
    test_acc, y_pred, y_test = test_4way(test_loader)
    print('Epoch {:03d}, Loss: {:.4f}, Test: {:.4f}, F1: {:.4f}'.format(
        epoch, loss, test_acc, 0))
    #scheduler.step()

Predictons on test set: [453, 0, 84, 0]
Epoch 001, Loss: 2.5509, Test: 0.4842, F1: 0.0000
Predictons on test set: [467, 0, 70, 0]
Epoch 002, Loss: 1.9144, Test: 0.4991, F1: 0.0000
Predictons on test set: [535, 0, 2, 0]
Epoch 003, Loss: 1.3255, Test: 0.5475, F1: 0.0000
Predictons on test set: [504, 28, 5, 0]
Epoch 004, Loss: 0.7465, Test: 0.5345, F1: 0.0000
Predictons on test set: [511, 20, 6, 0]
Epoch 005, Loss: 0.6237, Test: 0.5363, F1: 0.0000
Predictons on test set: [511, 16, 10, 0]
Epoch 006, Loss: 0.5694, Test: 0.5363, F1: 0.0000
Predictons on test set: [513, 10, 14, 0]
Epoch 007, Loss: 0.4910, Test: 0.5400, F1: 0.0000
Predictons on test set: [508, 4, 5, 20]
Epoch 008, Loss: 0.3786, Test: 0.5326, F1: 0.0000


KeyboardInterrupt: 

In [70]:
y_pred_vals = [int(y.item()) for y in y_pred]
print(classification_report(y_test,y_pred_vals))
print(confusion_matrix(y_test,y_pred_vals))

In [None]:
#return the label as a majority vote with confidence value. 
# if all 3 labels the same, return the label with confidence 1
# if 2 agree on a label, return that label with confidence 0.66
# if all 3 disagree, return the HIGHEST RISK LABEL with confidence 0.33
# if any of them put "dont know", subtract from confidence 
# if all 3 are don't know, throw out example
#
# additionally, convert labels to numericals, with the following order:
# 0: normal duct
# 1: columnar
# 2: flat epithelial
# 3: adh
#
# returns [label, confidence]
def process_labels(o_labels):
    l1 = o_labels["p1_label"]
    l2 = o_labels["p2_label"]
    l3 = o_labels["p3_label"]
    mf_label = max(set([l1, l2, l3]), key = [l1, l2, l3].count)
    ####all 3 agree######
    if [l1, l2, l3].count(mf_label) == 3:
        if mf_label == "Normal Duct":
            return 0, 1
        elif mf_label == "Columnar":
            return 1, 1
        elif mf_label == "Flat Epithelial":
            return 2, 1
        elif mf_label == "ADH":
            return 3, 1
        else:               #all "Don't know" / "other", throw out
            return -1, 0
        
    ####only 2 agree######
    elif [l1, l2, l3].count(mf_label) == 2:
        if mf_label == "Normal Duct":
            return 0, 0.66
        elif mf_label == "Columnar":
            return 1, 0.66
        elif mf_label == "Flat Epithelial":
            return 2, 0.66
        elif mf_label == "ADH":
            return 3, 0.66
        else:               #2 votes for don't know / other, use other label with confidence 0.33
            other_label = min(set([l1, l2, l3]), key = [l1, l2, l3].count)
            if other_label == "Normal Duct":
                return 0, 0.33
            elif other_label == "Columnar":
                return 1, 0.33
            elif other_label == "Flat Epithelial":
                return 2, 0.33
            elif other_label == "ADH":
                return 3, 0.33
            else:               #2 votes for "dont know"/"other", the other vote for the other
                return -1, 0
    
    ####they all disagree#####
    else:  
        #choose most severe diagnosis, use low confidence
        if "ADH" in [l1, l2, l3]:
            return 3, 0.33
        elif "Flat Epithelial" in [l1, l2, l3]:
            return 2, 0.33
        elif "Columnar" in [l1, l2, l3]:
            return 1, 0.33
        elif "Normal Duct" in [l1, l2, l3]:
            return 0, 0.33
        else:
            return -1, 0

In [44]:
#other option, just always take the most severe diagnosis, use count for confidence (hopefully get more high risk examples)
def process_labels2(o_labels):
    l1 = o_labels["p1_label"]
    l2 = o_labels["p2_label"]
    l3 = o_labels["p3_label"]
    l = [l1, l2, l3]
    #choose most severe diagnosis
    if "ADH" in l:
        if l.count("ADH") == 3:
            return torch.tensor([3, 1])
        elif l.count("ADH") == 2:
            return torch.tensor([3, 0.66])
        else:
            return torch.tensor([3, 0.33])

    elif "Flat Epithelial" in l:
        if l.count("Flat Epithelial") == 3:
            return torch.tensor([2, 1])
        elif l.count("Flat Epithelial") == 2:
            return torch.tensor([2, 0.66])
        else:
            return torch.tensor([2, 0.33])

    elif "Columnar" in l:
        if l.count("Columnar") == 3:
            return torch.tensor([1, 1])
        elif l.count("Columnar") == 2:
            return torch.tensor([1, 0.66])
        else:
            return torch.tensor([1, 0.33])

    elif "Normal Duct" in l:
        if l.count("Normal Duct") == 3:
            return torch.tensor([0, 1])
        elif l.count("Normal Duct") == 2:
            return torch.tensor([0, 0.66])
        else:
            return torch.tensor([0, 0.33])     

    else:
        return torch.tensor([-1, 0])

In [45]:
#same as process_labels2, but a binary problem, just high and low risk
def process_labels3(o_labels):
    l1 = o_labels["p1_label"]
    l2 = o_labels["p2_label"]
    l3 = o_labels["p3_label"]
    l = [l1, l2, l3]
    #choose most severe diagnosis
    if "ADH" in l:
        if l.count("ADH") == 3:
            return torch.tensor([1, 1])
        elif l.count("ADH") == 2:
            return torch.tensor([1, 0.66])
        else:
            return torch.tensor([1, 0.33])

    elif "Flat Epithelial" in l:
        if l.count("Flat Epithelial") == 3:
            return torch.tensor([1, 1])
        elif l.count("Flat Epithelial") == 2:
            return torch.tensor([1, 0.66])
        else:
            return torch.tensor([1, 0.33])

    elif "Columnar" in l:
        if l.count("Columnar") == 3:
            return torch.tensor([0, 1])
        elif l.count("Columnar") == 2:
            return torch.tensor([0, 0.66])
        else:
            return torch.tensor([0, 0.33])

    elif "Normal Duct" in l:
        if l.count("Normal Duct") == 3:
            return torch.tensor([0, 1])
        elif l.count("Normal Duct") == 2:
            return torch.tensor([0, 0.66])
        else:
            return torch.tensor([0, 0.33])     

    else:
        return torch.tensor([-1, 0])

In [46]:
#Dataset class
from torch_geometric.data import Data, DataLoader, Dataset
from sklearn.preprocessing import normalize
class HistGraphDataset(Dataset):
    def __init__(self, data_list):
        self.data_list = data_list
        self.num_classes = 2
        #self.num_features = len(data_list[0].x[0])

    def __len__(self):
        return len(self.data_list)
    
    #normalize input variables x in data_list across EACH SAMPLE
    #ie each sample is normalized relative to itself, might account for zoom issues
    def process(self):
        #normalize 
        for i, data in enumerate(self.data_list):
            #print("After: " + str(normalize(data.x,axis=0)[0]))
            #self.data_list[i].x = torch.tensor(normalize(data.x, axis=1), dtype=torch.float)
            self.data_list[i].x = torch.tensor(data.x / data.x.max(0, keepdim=True)[0], dtype=torch.float)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            print("IDX WAS TENSOR")
            idx = idx.tolist()
        sample = self.data_list[idx]     
        return sample

In [4]:
a = [0,0,0,0]
for data in train_loader:
    a[data.y[0].item()] = a[data.y[0].item()] + 1
print(a) #class distributions

NameError: name 'train_loader' is not defined