### Uploading files

In [None]:
# Mount Google Drive at /content/drive so the archives read by the next cells are reachable.
# Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip /content/drive/MyDrive/StatProject/train.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: LUSC/TCGA-33-A5GW-01Z-00-DX3.7A1C5169-96C6-4ECB-81E1-1050EFE84B60/TCGA-33-A5GW-01Z-00-DX3.7A1C5169-96C6-4ECB-81E1-1050EFE84B60_20x_60.jpg  
  inflating: LUSC/TCGA-33-A5GW-01Z-00-DX3.7A1C5169-96C6-4ECB-81E1-1050EFE84B60/TCGA-33-A5GW-01Z-00-DX3.7A1C5169-96C6-4ECB-81E1-1050EFE84B60_20x_61.jpg  
  inflating: LUSC/TCGA-33-A5GW-01Z-00-DX3.7A1C5169-96C6-4ECB-81E1-1050EFE84B60/TCGA-33-A5GW-01Z-00-DX3.7A1C5169-96C6-4ECB-81E1-1050EFE84B60_20x_62.jpg  
  inflating: LUSC/TCGA-33-A5GW-01Z-00-DX3.7A1C5169-96C6-4ECB-81E1-1050EFE84B60/TCGA-33-A5GW-01Z-00-DX3.7A1C5169-96C6-4ECB-81E1-1050EFE84B60_20x_63.jpg  
  inflating: LUSC/TCGA-33-A5GW-01Z-00-DX3.7A1C5169-96C6-4ECB-81E1-1050EFE84B60/TCGA-33-A5GW-01Z-00-DX3.7A1C5169-96C6-4ECB-81E1-1050EFE84B60_20x_64.jpg  
  inflating: LUSC/TCGA-33-A5GW-01Z-00-DX3.7A1C5169-96C6-4ECB-81E1-1050EFE84B60/TCGA-33-A5GW-01Z-00-DX3.7A1C5169-96C6-4ECB-81E1-1050EFE84B60_20x_65.jpg  
  inflating: LUSC

## Dataset Implementation

In [None]:

import math
import os, sys

import numpy as np
import PIL.Image as Image
import torch
import torchvision.transforms as transforms

class Th_generator:
    """Derives per-image and per-class thresholds from the patches' stored probabilities.

    Every method expects patch objects exposing `image_name`, `image_label`,
    `discriminative` and `probs`.
    """

    # Label name -> class index; the index is also the suffix of the "Ec_<i>" keys.
    TARGET_ENCODING = {"LUSC": 0, "MESO": 1, "LUAD": 2}

    def __init__(self, num_classes):
        self.num_classes = num_classes

    def Ec(self, patches):
        """Group the scores of the discriminative patches by class.

        Args:
            patches: iterable of patch objects.

        Returns:
            dict with one key "Ec_<class index>" per encoded class; each value is
            the list of np.mean(patch.probs) for the discriminative patches whose
            label maps to that class (empty list when there are none).

        Raises:
            KeyError: if a discriminative patch carries a label missing from
                TARGET_ENCODING.
        """
        Ec = {'Ec_' + str(i): [] for i in range(len(self.TARGET_ENCODING))}
        for patch in patches:
            # Plain truthiness: the previous `a == i & flag == True` parsed as a
            # chained comparison on `i & flag`, which only ever matched class 1.
            if patch.discriminative:
                # NOTE(review): if `probs` is a softmax vector its mean is constant;
                # the earlier alternative was
                # patch.probs[target_encoding[patch.image_label]] -- confirm intent.
                key = 'Ec_' + str(self.TARGET_ENCODING[patch.image_label])
                Ec[key].append(np.mean(patch.probs))
        return Ec

    def Si(self, patches):
        """Group the scores of the discriminative patches by image.

        Args:
            patches: iterable of patch objects.

        Returns:
            dict with one key "Si_<k>" per distinct image name, where k is the
            position of the name in np.unique(...) (i.e. sorted order); each value
            is the list of np.mean(patch.probs) of that image's discriminative
            patches.
        """
        img_names = np.unique([patch.image_name for patch in patches]).tolist()
        # One pass with a name -> position lookup instead of rescanning every
        # patch once per image.
        position = {name: idx for idx, name in enumerate(img_names)}
        Si = {'Si_' + str(idx): [] for idx in range(len(img_names))}
        for patch in patches:
            if patch.discriminative:
                Si['Si_' + str(position[patch.image_name])].append(np.mean(patch.probs))
        return Si

    def p_percentiale(self, p, set, set_name):
        """Return the p-th percentile of every list stored in `set`.

        Args:
            p: percentile in percent (e.g. 1 for the 1st percentile).
            set: dict with keys "<set_name>_0" ... "<set_name>_<len(set)-1>".
            set_name: key prefix, e.g. "Si" or "Ec".

        Returns:
            list with one entry per key, in index order: the average of the two
            order statistics around rank (p/100)*(n+1), or 0 for an empty list.
        """
        p_perc = []
        for idx in range(len(set)):
            # A percentile is only meaningful on ordered data.
            values = sorted(set[set_name + "_" + str(idx)])
            if not values:
                p_perc.append(0)
                continue
            rank = (p / 100) * (len(values) + 1)
            last = len(values) - 1
            # Convert the 1-based rank to 0-based indices and clamp both ends so a
            # very small or very large p never indexes outside the list.
            up = min(max(math.ceil(rank) - 1, 0), last)
            down = min(max(math.floor(rank) - 1, 0), last)
            p_perc.append((values[up] + values[down]) / 2)
        return p_perc

    def threshold(self, patches):
        """Compute the thresholds used to flag discriminative patches.

        Returns:
            (Hi, Ri): Hi holds the 1st percentile of each image's scores (ordered
            like the "Si_<k>" keys), Ri the 2nd percentile of each class's scores
            (ordered by class index).
        """
        Ec = self.Ec(patches)
        Si = self.Si(patches)
        Hi = self.p_percentiale(1, Si, "Si")
        Ri = self.p_percentiale(2, Ec, "Ec")
        return Hi, Ri


class WSImage:
    """Record tying one image's name and label to the list of its patches."""

    def __init__(self, image_name, label, pathes):
        # `pathes` (sic) is the caller-visible parameter name; it is stored as `patches`.
        self.image_name, self.label, self.patches = image_name, label, pathes

        
class Patch:
    """One image patch: where it comes from, its tensor and its current probabilities."""

    def __init__(self, image_name, image_label, patch_name, tensor, probs):
        # Identity of the parent image and of the patch itself.
        self.image_name, self.image_label, self.patch_name = image_name, image_label, patch_name
        # A freshly created patch is always flagged as discriminative.
        self.discriminative = True
        # Payload and the probabilities currently attached to it.
        self.tensor, self.probs = tensor, probs
    
     
class Dataset:
    """In-memory patch dataset read from a `<dir>/<label>/<image>/<patch file>` tree.

    Attributes set by the constructor:
        images:  one WSImage per image directory.
        patches: flat list of every Patch, in loading order.
        prob:    initial `probs` value handed to every Patch (0).
    """

    def __init__(self, dir):
        """Load and transform every patch found below `dir`.

        Args:
            dir: root directory whose sub-directories are the class labels; each
                label directory holds one sub-directory per image, which in turn
                holds the patch image files.
        """
        # transforms
        MEANS, STDS = torch.tensor([0.4968, 0.4968, 0.4968]), torch.tensor([0.2458, 0.2458, 0.2458])

        transform = transforms.Compose([transforms.ToTensor(),
                                        transforms.Normalize(MEANS, STDS),
                                        transforms.Resize(64)])
        self.images = []
        self.patches = []
        self.prob = 0
        for label in self._subdirectories(dir):
            print(f"\nLoading {label} data:")
            label_dir = os.path.join(dir, label)
            image_names = self._subdirectories(label_dir)
            for i, image_name in enumerate(image_names):
                image_dir = os.path.join(label_dir, image_name)
                patches = []
                # Sorted so the patch order does not depend on the file system.
                for patch_name in sorted(os.listdir(image_dir)):
                    # The context manager releases the file handle right away;
                    # convert("RGB") guarantees the 3 channels Normalize expects.
                    with Image.open(os.path.join(image_dir, patch_name)) as img:
                        patch_tensor = transform(img.convert("RGB"))
                    patches.append(Patch(image_name, label, patch_name, patch_tensor, self.prob))

                self.images.append(WSImage(image_name, label, patches))
                self.patches += patches
                # Reported after the image is loaded so the last line reads N/N.
                print(f'\rLoaded {i + 1}/{len(image_names)}', end='')

    @staticmethod
    def _subdirectories(path):
        """Return the sorted names of the directories directly under `path`.

        Plain files (archive leftovers, OS metadata, ...) are ignored so they are
        never mistaken for a class label or an image folder.
        """
        return sorted(name for name in os.listdir(path)
                      if os.path.isdir(os.path.join(path, name)))

    def reset_discriminative_patches(self):
        """Flag every patch as discriminative again and reset its `probs` to 0."""
        for patch in self.patches:
            patch.discriminative = True
            patch.probs = 0

In [None]:

!unzip /content/drive/MyDrive/StatProject/dev.zip


In [None]:
# Load every patch of both splits into memory: `data` is the training set,
# `test_data` the development set.
data = Dataset("train/")
test_data = Dataset("dev/")


Loading MESO data:
Loaded 8/9
Loading LUAD data:
Loaded 71/72
Loading LUSC data:
Loaded 155/156
Loading MESO data:
Loaded 4/5
Loading LUAD data:
Loaded 37/38
Loading LUSC data:
Loaded 40/41

## DLA

In [None]:
import random 
import numpy as np

def expectation_step(model, data, batch_size,Th_generator,img_names, nepochs):
  """E-step: score every patch with the model and refresh its discriminative flag.

  Args:
    model: network called on a stacked batch of patch tensors; it must return one
      row of class scores per patch.
    data: object exposing `patches`, a list of patch objects with `tensor`,
      `image_name`, `image_label`, `probs` and `discriminative`.
    batch_size: number of patches sent through the model at once.
    Th_generator: object whose `threshold(patches)` returns `(Hi, Ri)`, the
      per-image and per-class thresholds.
    img_names: sequence of image names; the position of a name is its index in Hi.
    nepochs: index of the current epoch (currently unused).

  Side effects:
    Overwrites `probs` (softmax row) and `discriminative` (plain bool) on every
    patch. Reads the module-level `device`.
  """
  print("\nExpectation step!")
  patches = data.patches
  n_patches = len(patches)
  target_encoding = {"LUSC": 0, "MESO": 1, "LUAD": 2}

  # Thresholds come from the probabilities stored on the patches *before* this
  # call overwrites them.
  # NOTE(review): confirm this one-step lag is intended.
  Hi, Ri = Th_generator.threshold(patches)

  model.eval()
  batch_probs = []
  # Inference only: no autograd graph is needed, which keeps memory flat.
  with torch.no_grad():
    for s in range(0, n_patches, batch_size):
      print(f"\rPatches finished: {s}/{n_patches}", end="")
      batch = patches[s: s + batch_size]
      batch_data = torch.stack([i.tensor for i in batch]).to(device)
      y = model(batch_data)
      # Explicit dim=1: normalise over the class axis of the (batch, class) scores.
      batch_probs.append(torch.nn.functional.softmax(y, dim=1).cpu().numpy())

  if not batch_probs:
    # Nothing to score; np.concatenate would raise on an empty list.
    return
  probs = np.concatenate(batch_probs)

  # Name -> index lookup built once (first occurrence wins, like list.index).
  image_index = {}
  for idx, name in enumerate(img_names):
    image_index.setdefault(name, idx)

  ## updating the probabilities of each patch
  for p, patch_probs in zip(patches, probs):
    c = target_encoding[p.image_label]
    p.probs = patch_probs
    # bool(...) stores a real Python bool so identity checks such as
    # `flag is True` keep working on the result of this NumPy comparison.
    p.discriminative = bool(patch_probs[c] > min(Hi[image_index[p.image_name]], Ri[c]))
      

def maximization_step(model, data, batch_size):
  """M-step: one training pass over the patches currently flagged discriminative.

  Args:
    model: network to train; called on augmented batches of stacked patch tensors.
    data: object exposing `patches`, a list of patch objects with `tensor`,
      `image_label` and `discriminative`.
    batch_size: number of patches per optimisation step.

  Side effects:
    Updates the model through the module-level `optimizer` and `loss`, reads the
    module-level `device`, and prints progress plus the accuracy and summed loss.
    Returns without training when no patch is flagged discriminative.
  """
  patches = data.patches
  print(f"\nMaximization step with {len(patches)} patches!")
  # Truthiness instead of `is True`: a flag produced by a NumPy comparison is an
  # np.bool_, which is never identical to the `True` singleton.
  patches = [i for i in patches if i.discriminative]
  n_patches = len(patches)
  if n_patches == 0:
    # Avoids the division by zero in the summary line below.
    print("\nNo discriminative patches to train on; skipping maximization step.")
    return
  random.shuffle(patches)

  target_encoding = {"LUSC": 0, "MESO": 1, "LUAD": 2}

  train_loss = 0
  correct = 0
  # NOTE(review): the transform is called once on the stacked batch, so presumably
  # every patch of a batch receives the same random flip/affine -- confirm.
  train_transform = transforms.Compose([transforms.RandomVerticalFlip(),
                                        transforms.RandomAffine((-15, 15))])

  model.train()
  for s in range(0, n_patches, batch_size):
    print(f"\rPatches finished: {s}/{n_patches}", end="")
    batch = patches[s: s + batch_size]
    batch_data = train_transform(torch.stack([i.tensor for i in batch])).to(device)
    target = torch.tensor([target_encoding[i.image_label] for i in batch]).to(device)
    y = model(batch_data)

    # Index of the highest score per patch, compared with the true class.
    pred = y.data.max(1, keepdim=True)[1]
    correct += pred.eq(target.data.view_as(pred)).sum().item()

    l = loss(y, target)
    train_loss += l.mean().item()

    optimizer.zero_grad()
    l.mean().backward()
    optimizer.step()
  print(f"\nEpoch matrics: accuracy = {correct}/{n_patches}[{correct/n_patches}]\tloss = {train_loss}")


def test(model, test_dataset, title="Test"):
  """Evaluate image-level accuracy by majority vote over each image's patches.

  Args:
    model: network called on the stacked patch tensors of one image; it must
      return one row of class scores per patch.
    test_dataset: object exposing `images`, each with `label` and `patches`
      (patch objects carrying a `tensor`).
    title: prefix of the printed overall-accuracy line.

  Returns:
    (y_true, y_pred): NumPy arrays of the true and predicted class index of every
    image, in `test_dataset.images` order.

  Reads the module-level `device`.
  """
  model.eval()
  target_encoding = {"LUSC": 0, "MESO": 1, "LUAD": 2}

  y_true = []
  y_pred = []

  # Evaluation only: skip building the autograd graph for every patch.
  with torch.no_grad():
    for image in test_dataset.images:
      tensors = torch.stack([i.tensor for i in image.patches]).to(device)
      pred = model(tensors).argmax(dim=1)
      # The image is assigned the class predicted for most of its patches.
      image_pred = torch.bincount(pred).argmax().item()

      y_true.append(target_encoding[image.label])
      y_pred.append(image_pred)

  y_true = np.array(y_true)
  y_pred = np.array(y_pred)
  overall = np.mean(y_true == y_pred) if y_true.size else float("nan")
  print("\n\n==========================================")
  print(title + " accuracy: ", overall)
  for c in sorted(target_encoding.values()):
    idx = np.where(y_true == c)[0]
    hits = int((y_pred[idx] == c).sum())
    # A class absent from the dataset reports nan instead of triggering NumPy's
    # empty-mean warning; the value is scaled so the "%" sign is accurate.
    pct = 100.0 * hits / idx.size if idx.size else float("nan")
    print(f"Accuracy class = {c}: {pct:.2f} %, {hits}/{idx.size}")
  print("==========================================")
  return y_true, y_pred



def run_em_algorithm(model, dataset, test_dataset,Th_generator,img_names, epochs=10, batch_size=20):
  """Alternate maximization and expectation steps, evaluating before every epoch.

  Args:
    model: network being trained.
    dataset: training dataset (exposes `patches` and `images`).
    test_dataset: held-out dataset evaluated alongside the training set.
    Th_generator: threshold generator forwarded to `expectation_step`.
    img_names: image-name sequence forwarded to `expectation_step`.
    epochs: number of M-step/E-step rounds.
    batch_size: batch size used by both steps.

  Returns:
    (test_result, train_result): the `(y_true, y_pred)` pairs returned by `test`
    on `test_dataset` and on `dataset` after the last epoch.
  """
  for epoch in range(epochs):
    test(model, test_dataset)
    test(model, dataset, title="Train")
    maximization_step(model, dataset, batch_size)
    expectation_step(model, dataset, batch_size,Th_generator, img_names, epoch)
  final_test = test(model, test_dataset)
  # Same "Train" title as inside the loop, so the last train report is not
  # printed under the default "Test" heading.
  final_train = test(model, dataset, title="Train")
  return final_test, final_train
    

In [None]:
import torchvision
import torch.nn as nn
import torch.optim as optim

# ResNet-34 backbone loaded with torchvision's pretrained weights.
model = torchvision.models.resnet34(pretrained=True)

print("CUDA Available: ", torch.cuda.is_available())
# Use the GPU whenever one is present instead of always pinning the run to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Swap the classifier head for a 3-class layer, sized from the backbone itself.
model.fc = torch.nn.Linear(model.fc.in_features, 3)
model = model.to(device)

LR = 0.0001
# Per-class loss weights, indexed by class id (0, 1, 2).
loss = nn.CrossEntropyLoss(weight=torch.tensor([0.2, 0.45, 0.35])).to(device)
optimizer = optim.Adam(lr=LR, params=model.parameters())

CUDA Available:  False


In [None]:
# Build the threshold generator and the list of unique image names of the training
# patches (np.unique returns them sorted), then run 20 EM rounds with batches of 50.
th_gen = Th_generator(3)
patches = data.patches
names = [i.image_name for i in patches]
img_names = np.unique(names)
run_em_algorithm(model, data, test_data,th_gen,list(img_names), batch_size=50, epochs=20)



Test accuracy:  0.05952380952380952
Accuracy class = 0: 0.0 %, 0/41
Accuracy class = 1: 1.0 %, 5/5
Accuracy class = 2: 0.0 %, 0/38


Train accuracy:  0.03375527426160337
Accuracy class = 0: 0.0 %, 0/156
Accuracy class = 1: 0.8888888888888888 %, 8/9
Accuracy class = 2: 0.0 %, 0/72

Maximization step with 22404 patches!
Patches finished: 14100/22404

KeyboardInterrupt: ignored

In [None]:
# Flag every training patch as discriminative again and reset its stored probabilities.
data.reset_discriminative_patches()

In [None]:
import pandas as pd
from sklearn import svm, datasets
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import numpy as np

# Image-level SVM baselines trained on per-image patch-prediction counts.

LABEL_ENCODING = {"LUSC": 0, "MESO": 1, "LUAD": 2}
COUNT_COLUMNS = ['LUSC_count', 'MESO_count', 'LUAD_count']


def split_features_labels(frame):
    """Return `(x, y)` NumPy arrays built from a patch-prediction DataFrame.

    Args:
        frame: DataFrame with the three `*_count` columns and a `true_label` column.

    Returns:
        x: float array with one row of (LUSC, MESO, LUAD) counts per image.
        y: int array of encoded labels; any label other than LUSC/MESO is encoded
           as 2.

    The frame itself is left untouched: encoding through `Series.map` avoids the
    element-wise assignment into a DataFrame column that raised
    SettingWithCopyWarning and overwrote the loaded labels.
    """
    x = np.array(frame[COUNT_COLUMNS], dtype=float)
    y = np.array(frame['true_label'].map(LABEL_ENCODING).fillna(2), dtype=int)
    return x, y


# NOTE(review): `test` rebinds the name of the `test(model, ...)` evaluation function
# defined earlier in this notebook; that function must be re-defined before it is
# called again after this cell has run. The names are kept for compatibility.
# NOTE(review): `valid` is read from the "test_..." file and `test` from the "dev_..."
# file -- confirm the split naming.
train = pd.read_csv('/content/drive/MyDrive/StatProject/train_images_patch_pred (1).csv')
valid = pd.read_csv('/content/drive/MyDrive/StatProject/test_images_patch_pred.csv')
test = pd.read_csv('/content/drive/MyDrive/StatProject/dev_images_patch_pred.csv')

x_train, y_train = split_features_labels(train)
x_valid, y_valid = split_features_labels(valid)
x_test, y_test = split_features_labels(test)

rbf = svm.SVC(kernel='rbf', gamma=0.5, C=0.1).fit(x_train, y_train)
poly = svm.SVC(kernel='poly', degree=3, C=1).fit(x_train, y_train)

poly_pred = poly.predict(x_test)
rbf_pred = rbf.predict(x_test)


poly_accuracy = accuracy_score(y_test, poly_pred)
poly_f1 = f1_score(y_test, poly_pred, average='weighted')
print('Accuracy (Polynomial Kernel): ', "%.2f" % (poly_accuracy*100))
print('F1 (Polynomial Kernel): ', "%.2f" % (poly_f1*100))

rbf_accuracy = accuracy_score(y_test, rbf_pred)
rbf_f1 = f1_score(y_test, rbf_pred, average='weighted')
print('Accuracy (RBF Kernel): ', "%.2f" % (rbf_accuracy*100))
print('F1 (RBF Kernel): ', "%.2f" % (rbf_f1*100))

Accuracy (Polynomial Kernel):  51.81
F1 (Polynomial Kernel):  37.77
Accuracy (RBF Kernel):  49.40
F1 (RBF Kernel):  32.67


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

In [None]:
# Scratch cell: load scikit-learn's iris dataset, keep its first two feature columns
# and print the target vector. Nothing earlier in the notebook reads `X` or `y`.
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
