In [1]:
import paddle
import paddle.nn.functional as F
import numpy as np
import pandas as pd
import os
from paddle import nn
from PIL import Image
from paddle.distributed import fleet, get_rank
import copy

IMAGE_SIZE = 224
BATCH_SIZE = 32
EPOCH_NUM = 5

print(paddle.__version__)

2.4.0


In [2]:
# 定义数据集
class MyDataset(paddle.io.Dataset):
    def __init__(self, img_dir='data/PALM-Training400/', csvfile=None, mode='train') -> None:
        super(MyDataset, self).__init__()
        self.imgpath = img_dir
        self.mode = mode
        if self.mode=='test':
            self.filedir = os.listdir(img_dir)
        else:
            self.csvfile = csvfile
        pass
    def __len__(self):
        if self.mode=='test':
            return len(self.filedir)
        else:
            return len(self.csvfile)
        pass
    def __getitem__(self, idx):
        if self.mode=='test':
            img = np.reshape((np.array(Image.open(self.imgpath+os.sep+self.filedir[idx]).resize((IMAGE_SIZE,IMAGE_SIZE))).astype('float32')),(3,IMAGE_SIZE,IMAGE_SIZE))/256.
            lab = self.filedir[idx]
        else:
            img = np.reshape((np.array(Image.open(self.imgpath+os.sep+self.csvfile['imgName'][idx]).resize((IMAGE_SIZE,IMAGE_SIZE))).astype('float32')),(3,IMAGE_SIZE,IMAGE_SIZE))/256.
            lab = np.array(self.csvfile['Label'][idx]).astype('float32')
        return img,lab
    pass

In [3]:
# 定义网络结构
def vgg_block(num_convs, in_channels, out_channels):
    net = [nn.Conv2D(in_channels=in_channels,out_channels=out_channels,kernel_size=3,padding=1),nn.ReLU()]
    for i in range(num_convs-1):
        net.append(nn.Conv2D(out_channels=out_channels,in_channels=out_channels,kernel_size=3,stride=1,padding=1))
        net.append(nn.ReLU())
    net.append(nn.MaxPool2D(kernel_size=2))
    return nn.Sequential(*net)

def vgg_stack(num_convs,channels):
    net = []
    for n,c in zip(num_convs,channels):
        in_c = c[0]
        out_c = c[1]
        net.append(vgg_block(n,in_c,out_c))
    return nn.Sequential(*net)

class VGG(paddle.nn.Layer):
    def __init__(self,vgg_net) -> None:
        super(VGG,self).__init__()
        self.conv = vgg_stack(vgg_net[0],vgg_net[1])
        self.line = nn.Sequential(
            nn.Linear(512*7*7,4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096,4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096,1)
        )
        
    def forward(self,x):
        x = self.conv(x)
        x = paddle.flatten(x, 1, -1)
        x = self.line(x)
        return x

In [19]:
def train_pm(model, optimizer, train_loader, valid_loader, epoches=1):
    # 开启0号GPU训练
    paddle.device.set_device('gpu:0')

    print('start training ... ')
    model.train()
    # 定义数据读取器，训练数据读取器和验证数据读取器
    # train_loader = paddle.io.DataLoader(mydataset,batch_size=BATCH_SIZE,shuffle=True,drop_last=True)
    # valid_loader = paddle.io.DataLoader(mydataset,batch_size=BATCH_SIZE,shuffle=True,drop_last=True)
    for epoch in range(epoches):
        for batch_id, data in enumerate(train_loader()):
            x_data, y_data = data
            img = paddle.to_tensor(x_data)
            label = paddle.reshape(paddle.to_tensor(y_data),(-1,1))
            # 运行模型前向计算，得到预测值
            logits = model(img)
            loss = F.binary_cross_entropy_with_logits(logits, label)
            avg_loss = paddle.mean(loss)

            if batch_id % 5 == 4:
                print("epoch: {}, batch_id: {}, loss is: {:.4f}".format(epoch, batch_id, float(avg_loss.numpy())))
            # 反向传播，更新权重，清除梯度
            avg_loss.backward()
            optimizer.step()
            optimizer.clear_grad()

        model.eval()
        accuracies = []
        losses = []
        for batch_id, data in enumerate(valid_loader()):
            x_data, y_data = data
            img = paddle.to_tensor(x_data)
            label = paddle.reshape(paddle.to_tensor(y_data),(-1,1))
            # 运行模型前向计算，得到预测值
            logits = model(img)
            # 二分类，sigmoid计算后的结果以0.5为阈值分两个类别
            # 计算sigmoid后的预测概率，进行loss计算
            pred = F.sigmoid(logits)
            loss = F.binary_cross_entropy_with_logits(logits, label)
            # 计算预测概率小于0.5的类别
            pred2 = pred * (-1.0) + 1.0
            # 得到两个类别的预测概率，并沿第一个维度级联
            pred = paddle.concat([pred2, pred], axis=1)
            acc = paddle.metric.accuracy(pred, paddle.cast(label, dtype='int64'))

            accuracies.append(acc.numpy())
            losses.append(loss.numpy())
        print("[validation] accuracy/loss: {:.4f}/{:.4f}".format(np.mean(accuracies), np.mean(losses)))
        model.train()

        paddle.save(model.state_dict(), 'palmp{}_{}.pdparams'.format(epoch,acc.numpy()))

In [5]:
# 创建模型
model =[VGG([[2,2,3,3,3], [[3,64],[64,128],[128,256],[256,512],[512,512]]]) for i in range(5)]
# opt = paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters())
opt = [paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9, parameters=model[i].parameters()) for i in range(5)]

W0204 21:58:48.843931 16076 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2
W0204 21:58:48.848599 16076 gpu_resources.cc:91] device: 0, cuDNN Version: 8.2.


In [20]:
df = pd.read_csv('Train/Classification.csv')
for i in range(5):
    valid_loader = paddle.io.DataLoader(
        MyDataset('常规赛：PALM病理性近视预测/Train/fundus_image',(df[i*160:i*160+160]).reset_index()),
        shuffle=True,
        drop_last=True,
        batch_size=BATCH_SIZE
    )
    train_loader = paddle.io.DataLoader(
        MyDataset('常规赛：PALM病理性近视预测/Train/fundus_image',pd.concat([df[:160*i],df[160*i+160:]]).reset_index()),
        shuffle=True,
        drop_last=True,
        batch_size=BATCH_SIZE
    )
    train_pm(model[i],opt[i],train_loader,valid_loader)

start training ... 
epoch: 0, batch_id: 4, loss is: 0.4881
epoch: 0, batch_id: 9, loss is: 0.5245
epoch: 0, batch_id: 14, loss is: 0.5482
epoch: 0, batch_id: 19, loss is: 0.2626
[validation] accuracy/loss: 0.9625/0.2490
start training ... 
epoch: 0, batch_id: 4, loss is: 0.7193
epoch: 0, batch_id: 9, loss is: 0.7356
epoch: 0, batch_id: 14, loss is: 0.5501
epoch: 0, batch_id: 19, loss is: 0.3735
[validation] accuracy/loss: 0.9000/0.4009
start training ... 
epoch: 0, batch_id: 4, loss is: 0.7218
epoch: 0, batch_id: 9, loss is: 0.7143
epoch: 0, batch_id: 4, loss is: 0.6873
epoch: 0, batch_id: 9, loss is: 0.7244
epoch: 0, batch_id: 14, loss is: 0.5817
epoch: 0, batch_id: 19, loss is: 0.5512
[validation] accuracy/loss: 0.8125/0.4629
start training ... 
epoch: 0, batch_id: 4, loss is: 0.8424
epoch: 0, batch_id: 9, loss is: 0.6207
epoch: 0, batch_id: 14, loss is: 0.5804
epoch: 0, batch_id: 19, loss is: 0.5365
[validation] accuracy/loss: 0.7563/0.5432


In [22]:
def predict(model, dataloader):
    model.eval()
    pred_list = np.array([])
    file_list = np.array([])
    for idx, data in enumerate(dataloader):
        print('\r{}/{}'.format(1+idx,len(dataloader)),end='')
        x_data, filename = data
        img = paddle.to_tensor(x_data)
        # 运行模型前向计算，得到预测值
        logits = model(img)
        # 二分类，sigmoid计算后的结果以0.5为阈值分两个类别
        # 计算sigmoid后的预测概率，进行loss计算
        pred = F.sigmoid(logits)
        pred_list = np.append(pred_list, pred.numpy().ravel())
        file_list = np.append(file_list, filename)
    return pred_list, file_list

In [None]:
validset = paddle.io.DataLoader(
    MyDataset('常规赛：PALM病理性近视预测/Train/fundus_image',df),
    shuffle=False,
    drop_last=False,
    batch_size=BATCH_SIZE
)
testdataset = MyDataset('PALM-Testing400-Images',None, 'test')
testdataloader = paddle.io.DataLoader(testdataset,shuffle=False,drop_last=False,batch_size=BATCH_SIZE)
for i in range(5):
    pred_list,file_list = predict(model[i], validset)
    pd.DataFrame(np.c_[file_list,pred_list],columns=['FileName','PM Risk']).to_csv('temp_{}.csv'.format(i),index=False)
    pred_list,file_list = predict(model[i], testdataloader)
    pd.DataFrame(np.c_[file_list,pred_list],columns=['FileName','PM Risk']).to_csv('pred_{}.csv'.format(i),index=False)