In [1]:
import torch
from torchvision import models
import torchvision.transforms as T
from PIL import Image
import os
from torch.utils.data import DataLoader
from torch.utils import data
import matplotlib.pyplot as plt
import numpy as np
from torch import nn
from tqdm import tqdm
from torch.nn import functional as F
from torch.autograd import Variable
from torch import optim
#import models.cifar as models

In [None]:
%matplotlib inline

In [None]:
# Show the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
# Choose the GPU: only the device with index "1" is made visible to CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [None]:
# Root of the augmented training images (absolute, machine-specific path).
augment_root = '/home/userpython3/Templates/health/Xli/eye_data/augment_train/'
# Left empty here; NOTE(review): presumably placeholders to be filled in — confirm.
train_data_root = ''
test_data_root = ''

In [None]:
# Count the files in sub-folder '3/' of pick_root.
# NOTE(review): `pick_root` is not defined in any visible cell, so this raises
# NameError on a fresh kernel — TODO define it next to the other root paths.
imgs = os.listdir(pick_root + '3/')
len(imgs)

In [None]:
# Preview the second-to-last '_'-separated field of the first file's stem
# (the same token the Eye dataset converts to int to order its files).
imgs[0].split('.')[-2].split('_')[-2]

In [3]:
class Eye(data.Dataset):
    """Image dataset read from a directory tree.

    Two layouts are supported:

    * ``test=True``  -- ``root`` directly contains the image files; each item is
      ``(tensor, image_path)``.
    * ``test=False`` -- ``root`` contains one sub-directory per class whose name
      is the integer label (``root/<label>/<file>``); each item is
      ``(tensor, int_label)``.  Files are ordered by the integer found in the
      second-to-last ``_``-separated field of the file stem; the first 80% form
      the training split (``train=True``) and the rest the validation split
      (``train=False``).

    Args:
        root: directory to read from (see the layouts above).
        transforms: optional callable applied to each PIL image.  When ``None``
            the default pipeline is used: ``ToTensor`` followed by
            ``Normalize(mean=0.5, std=0.5)`` per channel, preceded by
            ``Resize(600)`` and ``CenterCrop(600)`` in test mode.
        train: choose the 80% split (True) or the 20% split (False); ignored
            when ``test`` is True.
        test: read a flat, unlabelled directory.
    """

    # Share of the sorted file list that goes to the training split.
    TRAIN_FRACTION = 0.8

    def __init__(self, root, transforms=None, train=True, test=False):
        self.test = test
        if self.test:
            imgs = [os.path.join(root, name) for name in os.listdir(root)]
        else:
            imgs = []
            for label_dir in os.listdir(root):
                class_dir = os.path.join(root, label_dir)
                imgs.extend(os.path.join(class_dir, name)
                            for name in os.listdir(class_dir))
            # Deterministic order so the train/validation split is reproducible.
            imgs = sorted(imgs, key=self._sort_key)

        split_at = int(self.TRAIN_FRACTION * len(imgs))
        if self.test:
            self.imgs = imgs
        elif train:
            self.imgs = imgs[:split_at]
        else:
            self.imgs = imgs[split_at:]

        if transforms is not None:
            # Honour a caller-supplied pipeline (previously accepted but ignored).
            self.transforms = transforms
        else:
            normalize = T.Normalize(mean=[0.5, 0.5, 0.5],
                                    std=[0.5, 0.5, 0.5])
            steps = [T.ToTensor(), normalize]
            if self.test:
                steps = [T.Resize(600), T.CenterCrop(600)] + steps
            self.transforms = T.Compose(steps)

    @staticmethod
    def _sort_key(path):
        """Integer taken from the second-to-last '_' field of the file stem."""
        return int(os.path.basename(path).split('.')[-2].split('_')[-2])

    def __getitem__(self, index):
        """Return one ``(image_tensor, label)`` pair.

        The label is the image path in test mode, otherwise the integer name of
        the image's parent directory.
        """
        img_path = self.imgs[index]
        if self.test:
            label = img_path
        else:
            label = int(os.path.basename(os.path.dirname(img_path)))
        with Image.open(img_path) as img:
            # Force three channels: the default Normalize is defined for RGB only.
            tensor = self.transforms(img.convert('RGB'))
        return tensor, label

    def __len__(self):
        """Number of images in the selected split."""
        return len(self.imgs)


In [None]:
# model = models.inception_v3(pretrained=True)
# model.fc = nn.Linear(8192,5)
# model.AuxLogits.fc = nn.Linear(768, 5)
#model.load_state_dict(t.load(opt.load_model_path))


In [None]:
# NOTE(review): load_url() is called without a URL; as written this presumably
# raises TypeError — TODO pass the checkpoint URL or remove this cell.
import torch.utils.model_zoo as model_zoo
model_zoo.load_url()

In [None]:

def get_prob(augment_root, model, batch_size):
    """Run `model` over every image in `augment_root` and collect class probabilities.

    The folder is read with ``Eye(..., test=True)``, i.e. as a flat directory
    whose items carry their own file path as the label.

    Args:
        augment_root: directory that directly contains the image files.
        model: network to evaluate; moved to the GPU and put in eval mode in place.
        batch_size: number of images per forward pass.

    Returns:
        A list with one ``(image_path, probabilities)`` tuple per image, where
        ``probabilities`` is the softmax over dim 1 of the model output as a
        plain Python list.
    """
    model.cuda()
    model.eval()
    test_data = Eye(augment_root, test=True, train=False)
    test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

    raw_probabilities = []
    # Inference only: without autograd no graph is kept alive between batches.
    with torch.no_grad():
        for batch, paths in tqdm(test_dataloader):
            score = model(batch.cuda())
            prob = F.softmax(score, dim=1).tolist()
            raw_probabilities.extend(zip(paths, prob))
    return raw_probabilities


In [None]:
# Collect per-image class probabilities for the demo folder, 5 images per batch.
# NOTE(review): `model` is only created in a later cell (the inception_v3 one),
# so this cell fails on a top-to-bottom run — TODO move model creation above it.
demo_root = '/data/health/Xli/original_5class/demo/'
raw_pro = get_prob(demo_root, model, 5)

In [None]:
#deal with raw_probabilities
def deal_probabilities(raw_probabilities):
    probabilities = {}
    pro_label_list = []
    for key in raw_probabilities:
        pro_label = key[0].split('/')[-1].split('.')[-2]
        if pro_label in pro_label_list:
            probabilities[key[0].split('/')[-2] + '/' + pro_label] += [key[1]]
        else:
            probabilities[key[0].split('/')[-2] + '/'+ pro_label] = [key[1]]
            pro_label_list.append(pro_label)
    return probabilities
        


In [None]:
# Group the raw (path, probabilities) pairs by "<parent_dir>/<stem>".
pro = deal_probabilities(raw_pro)

In [None]:
# Print every grouped-image key after the first nine.
for i, key in enumerate(pro):
    if i >= 9:
        print(key)

In [None]:
def compute_diversity(propabilities):
    """Compute the diversity score of one image.

    Diversity measures how inconsistent the predictions are among the patches
    of a candidate image: for every unordered pair of patches ``(i, j)`` and
    every class ``c`` it sums
    ``(p[j][c] - p[i][c]) * log((p[j][c] + 1e-5) / (p[i][c] + 1e-5))``.
    Higher values mean the patches disagree more.

    Args:
        propabilities: sequence of per-patch probability vectors (one row per
            patch, one column per class), all of equal length.

    Returns:
        The diversity as a Python float; ``0.0`` when fewer than two patches
        are given.
    """
    eps = 0.00001  # keeps the log finite when a probability is exactly 0
    probs = np.asarray(propabilities, dtype=float)
    n_patches = len(probs)
    if n_patches < 2:
        return 0.0

    diversity = 0.
    for i in range(n_patches):
        for j in range(i + 1, n_patches):
            # Sum over ALL classes for every pair.  The previous inner loop
            # reused `k` as its own loop variable, which shrank the class
            # count after each pair and silently dropped terms.
            diversity += np.sum(
                (probs[j] - probs[i]) * np.log((probs[j] + eps) / (probs[i] + eps))
            )
    return float(diversity)

    

def compute_entropy(propabilities):
    """Compute the entropy score of one image.

    Sums ``(p + 1e-5) * log(p + 1e-5)`` over every patch and every class, then
    scales the total by ``-1 / number_of_patches``.  A higher value means the
    image is harder to classify.

    Args:
        propabilities: sequence of per-patch probability vectors; the number of
            classes is taken from the first row.

    Returns:
        The entropy score of the image.
    """
    eps = 0.00001
    n_patches = len(propabilities)
    n_classes = len(propabilities[0])
    acc = 0.
    for row in propabilities:
        for col in range(n_classes):
            smoothed = row[col] + eps
            acc += smoothed * np.log(smoothed)
    return (-1 / n_patches) * acc

In [None]:
def pick_hard(lambda_div, lambda_ent, pic_result, augment_now_root, model, batch_size):
    """Score every image in `augment_now_root` by diversity and entropy.

    Args:
        lambda_div: weight of the diversity term.
        lambda_ent: weight of the entropy term.
        pic_result: dict that receives the scores; updated in place.
        augment_now_root: directory handed to ``get_prob``.
        model: network handed to ``get_prob``.
        batch_size: batch size handed to ``get_prob``.

    Returns:
        `pic_result`, with one ``(score, diversity, entropy)`` tuple stored per
        grouped-image key, where
        ``score = lambda_div * diversity + lambda_ent * entropy``.
    """
    grouped = deal_probabilities(get_prob(augment_now_root, model, batch_size))

    for image_key, patch_probs in grouped.items():
        diversity = compute_diversity(patch_probs)
        entropy = compute_entropy(patch_probs)
        score = lambda_div * diversity + lambda_ent * entropy
        pic_result[image_key] = (score, diversity, entropy)

    return pic_result

In [None]:
# Score every sub-folder of augment_root (equal weights, one image per batch)
# and accumulate the results in a single dict.
roots = os.listdir(augment_root)
pic_result = {}
for key in roots:
    augment_now_root = os.path.join(augment_root, key)
    pic_result = pick_hard(0.5, 0.5, pic_result, augment_now_root, model, 1)

# Unsorted scores.  The with-block closes the file; no explicit close() needed.
with open('fifth_result.txt', 'w') as f:
    f.write(str(pic_result))

# Same scores ordered from highest to lowest combined score.
result_tuple = sorted(pic_result.items(), key=lambda x: x[1][0], reverse=True)
with open('sorted_fifth_results.txt', 'w') as f:
    f.write(str(result_tuple))

In [None]:
# Reload the ranking from 'sorted_fifth_results.txt'.
# NOTE(review): eval() executes whatever the file contains; tolerable only for a
# file this notebook wrote itself — consider json or ast.literal_eval for
# anything that is shared.
with open('sorted_fifth_results.txt', 'r') as f:
    a = f.read()
    a = eval(a)
    f.close()  # redundant: the with-block already closes the file

In [None]:
# Peek at the first ten reloaded entries.
a[:10]

In [None]:
# Keep only the key (first field) of the first 1500 reloaded entries.
pick = a[:1500]
picked = [entry[0] for entry in pick]

In [None]:
# Order the selected keys by the integer in front of the first '/'.
picked.sort(key=lambda name: int(name.partition('/')[0]))

In [None]:
# Peek at the first ten selected keys.
picked[:10]

In [None]:
# Split the selected keys ("<class>/<name>") into one list of names per class 0-4.
picked_0, picked_1, picked_2, picked_3, picked_4 = [], [], [], [], []
buckets = {0: picked_0, 1: picked_1, 2: picked_2, 3: picked_3, 4: picked_4}
for key in picked:
    parts = key.split('/')
    bucket = buckets.get(int(parts[0]))
    # Keys whose class index is outside 0-4 are ignored.
    if bucket is not None:
        bucket.append(parts[1])

In [None]:
# Peek at the first ten names selected for class 4.
picked_4[:10]

In [None]:
# Move the selected class-4 images from the augmented pool into the picked set.
import shutil
in_root = '/home/userpython3/Templates/health/Xli/eye_data/augment_train/4/'
out_root = '/home/userpython3/Templates/health/Xli/eye_data/picked/fifth/4/'
# shutil.move only drops files *into* the destination when it is an existing
# directory, so make sure it exists first.
os.makedirs(out_root, exist_ok=True)
# Set lookup instead of scanning the list once per file.
picked_4_names = set(picked_4)
imgs = os.listdir(in_root)
for key in imgs:
    if key.split('.')[-2] in picked_4_names:
        shutil.move(os.path.join(in_root, key), out_root)

In [None]:
# Check how many images the destination folder contains.
imgs_0 = os.listdir(out_root)
len(imgs_0)

In [None]:
# Copy every file of one picked round into the next round's folder.
in_root_copy = '/home/userpython3/Templates/health/Xli/eye_data/picked/first/4/'
out_root_copy= '/home/userpython3/Templates/health/Xli/eye_data/picked/second/4/'
# shutil.copy needs an existing directory to place files under their own names.
os.makedirs(out_root_copy, exist_ok=True)
imgs = os.listdir(in_root_copy)
for img in imgs:
    shutil.copy(os.path.join(in_root_copy, img), out_root_copy)

In [7]:
def train(train_data_root, model, batch_size, lr, epoch_num):
    """Fine-tune `model` with SGD and validate after every epoch.

    Data comes from ``Eye(train_data_root)``: the training split is optimised
    on, the validation split is passed to ``val``.  After each epoch the
    weights are saved to ``check_points/fifth_solo_train_<epoch>epoch.pth``;
    once all epochs are done the per-epoch losses are written to
    ``train_fifth_solo_log.txt`` and ``val_fifth_solo_log.txt``.

    Args:
        train_data_root: directory with one sub-directory per integer class label.
        model: network to optimise; moved to the GPU in place.
        batch_size: training batch size (validation uses 10).
        lr: SGD learning rate.
        epoch_num: number of passes over the training split.

    Returns:
        None.
    """
    model.cuda()

    # data
    train_data = Eye(root=train_data_root, train=True)
    val_data = Eye(root=train_data_root, train=False)
    train_dataloader = DataLoader(train_data, batch_size, shuffle=True, num_workers=4)
    # Validation order does not matter, so it is kept deterministic.
    val_dataloader = DataLoader(val_data, batch_size=10, shuffle=False, num_workers=4)

    # optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=1e-8)

    # torch.save does not create missing directories; fail early rather than
    # after a full epoch of training.
    os.makedirs('check_points', exist_ok=True)

    train_loss_log = []
    val_loss_log = []
    for epoch in range(epoch_num):
        # val() switches the model to eval mode, so training mode has to be
        # re-enabled at the start of EVERY epoch, not just once.
        model.train()
        running_loss = 0.0
        n_batches = 0
        for ii, (data, label) in enumerate(tqdm(train_dataloader)):
            inputs = data.cuda()
            target = label.cuda()

            optimizer.zero_grad()
            score = model(inputs)
            loss = criterion(score, target)
            loss.backward()
            # In-place variant; the un-suffixed clip_grad_norm is deprecated.
            nn.utils.clip_grad_norm_(model.parameters(), 10, norm_type=2)
            optimizer.step()

            running_loss += loss.item()
            n_batches += 1
            if ii % 500 == 499:
                print(epoch, loss.item())

        # Log the mean loss of the epoch (the last batch alone is noisy and
        # is undefined when the loader is empty).
        train_loss_log.append(running_loss / n_batches if n_batches else float('nan'))

        name = str('check_points/fifth_solo_train_') + str(int(epoch)) + str('epoch.pth')
        torch.save(model.state_dict(), name)

        val_loss_log.append(val(val_dataloader, model))

    with open('train_fifth_solo_log.txt', 'w') as f:
        f.write(str(train_loss_log))
    with open('val_fifth_solo_log.txt', 'w') as f:
        f.write(str(val_loss_log))
        

In [4]:
def val(val_dataloader, model):
    """Compute the model's mean cross-entropy loss over the validation data.

    The loss is averaged over all samples of all batches (previously only the
    loss of the final batch was reported).  The model is evaluated in eval mode
    without gradient tracking, and the training/eval mode it had on entry is
    restored before returning.

    Args:
        val_dataloader: iterable of ``(data, label)`` batches.
        model: network to evaluate; expected to already live on the GPU.

    Returns:
        The mean loss as a Python float (``nan`` when the loader yields no
        samples).  The value is also printed.
    """
    was_training = model.training
    model.eval()
    criterion = torch.nn.CrossEntropyLoss()

    total_loss = 0.0
    n_samples = 0
    with torch.no_grad():
        for data, label in tqdm(val_dataloader):
            data = data.cuda()
            target = label.cuda()
            loss = criterion(model(data), target)
            # criterion returns the batch mean; weight it by the batch size so
            # a smaller final batch does not skew the overall average.
            batch_len = target.size(0)
            total_loss += loss.item() * batch_len
            n_samples += batch_len

    # Hand the model back in the mode the caller had it in.
    model.train(was_training)

    mean_loss = total_loss / n_samples if n_samples else float('nan')
    print(mean_loss)
    return mean_loss

In [4]:

# Build the network, replace its classifier head with a 5-way layer, then load
# the weights saved in the epoch-17 checkpoint file.
model = models.inception_v3(pretrained=True)
# NOTE(review): 8192 input features differ from the stock inception_v3 head;
# presumably this matches the pooled feature size for 600x600 inputs under the
# torchvision version in use — TODO confirm.
model.fc = nn.Linear(8192,5)
# Presumably disables the auxiliary-classifier output of the forward pass — confirm.
model.aux_logits = False

# NOTE(review): no map_location is given, so tensors are restored to the device
# they were saved from — confirm that device is available here.
model.load_state_dict(torch.load('check_points/fifth_solo_train_17epoch.pth'))
#model.AuxLogits.fc = nn.Linear(37632, 0)

In [None]:
# val_data = Eye(train_root, test=False,train=False)
# val_dataloader = DataLoader(val_data, batch_size=10,shuffle=False, num_workers=4)
# for ii, (img,label) in tqdm(enumerate(val_dataloader)):
    
#     if ii >400:
#         print(label)
#     if ii >410:
#         break

In [None]:
# Train on the picked images: batch size 20, learning rate 1e-4, 20 epochs.
train_root = '/home/userpython3/Templates/health/Xli/eye_data/picked/fifth/'
train(train_root, model, 20, 0.0001, 20)



1it [00:04,  4.45s/it][A
2it [00:07,  4.01s/it][A
3it [00:10,  3.70s/it][A
4it [00:13,  3.48s/it][A
5it [00:16,  3.34s/it][A
6it [00:19,  3.22s/it][A
7it [00:22,  3.15s/it][A
8it [00:25,  3.09s/it][A
9it [00:28,  3.06s/it][A
10it [00:31,  3.04s/it][A
11it [00:34,  3.03s/it][A
12it [00:37,  3.01s/it][A
13it [00:40,  3.01s/it][A
14it [00:43,  3.00s/it][A
15it [00:46,  3.00s/it][A
16it [00:49,  3.00s/it][A
17it [00:52,  3.00s/it][A
18it [00:55,  3.00s/it][A
19it [00:58,  3.01s/it][A
20it [01:01,  3.00s/it][A
21it [01:04,  3.01s/it][A
22it [01:07,  3.00s/it][A
23it [01:10,  3.01s/it][A
24it [01:13,  3.00s/it][A
25it [01:16,  3.00s/it][A
26it [01:19,  3.00s/it][A
27it [01:22,  3.00s/it][A
28it [01:25,  3.00s/it][A
29it [01:28,  3.01s/it][A
30it [01:31,  3.01s/it][A
31it [01:34,  3.01s/it][A
32it [01:37,  3.00s/it][A

In [None]:
# NOTE(review): `val_dataloader` is only created as a local inside train(); no
# visible cell defines it at top level, so this raises NameError on a fresh
# kernel — TODO build a validation DataLoader here first.
val(val_dataloader, model)

In [None]:
# Smoke test: push one dummy 600x600 image through the network.
# Done in eval mode under no_grad with a zero tensor: a train-mode forward pass
# on the uninitialised memory of torch.Tensor(...) would update the BatchNorm
# running statistics of the loaded weights with garbage.
model.cuda()
model.eval()
with torch.no_grad():
    a = torch.zeros(1, 3, 600, 600).cuda()
    s = model(a)

In [5]:
def test(test_root, model, batch_size):
    """Predict a class index for every image directly inside `test_root`.

    Args:
        test_root: flat directory of images (read with ``Eye(..., test=True)``).
        model: network to evaluate; moved to the GPU and put in eval mode in place.
        batch_size: number of images per forward pass.

    Returns:
        A list of predicted class indices (argmax over dim 1 of the model
        output), in dataset order.
    """
    test_data = Eye(test_root, test=True)
    test_dataloader = DataLoader(test_data, batch_size, shuffle=False, num_workers=4)
    model.cuda()
    model.eval()
    results = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for data, _ in tqdm(test_dataloader):
            score = model(data.cuda())
            results.extend(score.max(dim=1)[1].tolist())
    return results

In [None]:
# Predict every image of the validation sub-folder '4/', one image per batch.
# NOTE(review): the variable is named result_0 although the folder is '4/' — confirm.
test_root = '/home/userpython3/Templates/health/Xli/eye_data/validation/4/'
result_0 = test(test_root, model,1)

In [None]:
# Collect the predictions equal to each class 0-4 and print the five counts.
l0, l1, l2, l3, l4 = (
    [key for key in result_0 if key == level] for level in range(5)
)

print(len(l0),len(l1),len(l2),len(l3), len(l4))

In [None]:
# Total number of predictions.
len(result_0)

In [None]:
# NOTE(review): hand-typed numbers; presumably the per-class correct counts over
# 3500 validation images — TODO compute this from the prediction lists instead.
(2521+7+278+34+17)/3500.

In [6]:
def test_csv(test_root, model, batch_size):
    """Predict every image in `test_root` and pair each prediction with its image id.

    Args:
        test_root: flat directory of images (read with ``Eye(..., test=True)``).
        model: network to evaluate; moved to the GPU and put in eval mode in place.
        batch_size: number of images per forward pass.

    Returns:
        A list of ``(image_id, predicted_class)`` tuples in dataset order, where
        ``image_id`` is the file name without its extension.
    """
    test_data = Eye(test_root, test=True)
    test_dataloader = DataLoader(test_data, batch_size, shuffle=False, num_workers=4)
    model.cuda()
    model.eval()
    results = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for data, paths in tqdm(test_dataloader):
            score = model(data.cuda())
            predicted = score.max(dim=1)[1].tolist()
            # splitext/basename keep the whole stem even when the name contains
            # extra dots, and do not depend on '/' as the path separator.
            results.extend(
                (os.path.splitext(os.path.basename(path))[0], level)
                for path, level in zip(paths, predicted)
            )
    return results

In [8]:
# Count the files in the Kaggle test folder (absolute, machine-specific path).
kaggle_root = '/home/user/data/eye_fundus/original_5class/test/'
imgs = os.listdir(kaggle_root)
len(imgs)

53576

In [9]:
def write_csv(results, file_name):
    """Write `results` to `file_name` as CSV with the header row ``image,level``.

    Args:
        results: iterable of rows, each an ``(image, level)`` sequence.
        file_name: destination path; an existing file is overwritten.
    """
    import csv
    # newline='' leaves line endings to the csv writer; without it, platforms
    # that translate '\n' end up with an extra blank line after every row.
    with open(file_name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['image', 'level'])
        writer.writerows(results)

In [10]:
# Predict the whole Kaggle test folder (one image per batch) and save the
# (image, level) rows.  NOTE(review): the output name has no '.csv' extension — confirm.
results = test_csv(kaggle_root, model,1)
write_csv(results, 'kaggle_5_csv')

53576it [1:30:13, 11.45it/s]


In [None]:
# Write the same rows again under a second file name.
write_csv(results, 'kaggle_csv')

In [2]:
import h5py