# Introduction

This is a simple fork of my previous kernel (https://www.kaggle.com/taindow/pytorch-efficientnet-b0), 

except here we make use of ResNeXt and "weakly supervised pre-training" as opposed to EfficientNet. 

See https://github.com/facebookresearch/WSL-Images for model information. 

Note: due to the number of parameters, a single sweep of the data (one epoch) takes approx. 4 hours.

# Sources

Windowing functions for pre-processed data taken from the following:

- https://www.kaggle.com/omission/eda-view-dicom-images-with-correct-windowing 

# Parameters

In [None]:

# Locations of the competition CSV files and of the 224-pixel PNG image data
dir_csv = '../input/rsna-intracranial-hemorrhage-detection'
dir_train_img = '../input/rsna-train-stage-1-images-png-224x/stage_1_train_png_224x'  # training set
dir_test_img = '../input/rsna-test-stage-1-images-png-224x/stage_1_test_png_224x'  # test set


In [None]:

# Parameters

n_classes = 6  # output width of the model's final layer (six label columns per image)
n_epochs = 2  # number of passes of the training loop
batch_size = 64  # batch size used by both the train and the test DataLoader


# Setup

Need to grab a couple of extra libraries

- Nvidia Apex for mixed precision training (https://github.com/NVIDIA/apex)

In [None]:
# Mixed precision training: reduces memory consumption and speeds up training
# Installing useful libraries

!git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
    

In [None]:
import apex
dir(apex)

In [None]:
# Libraries

# 混合权重精度训练
from apex import amp

import os
import cv2
import glob
import pydicom
import numpy as np
import pandas as pd
import torch
import torch.optim as optim

# albumnetations是用于图像增强的库
from albumentations import Compose, ShiftScaleRotate, Resize, CenterCrop, HorizontalFlip, RandomBrightnessContrast
from albumentations.pytorch import ToTensor

from torch.utils.data import Dataset
from tqdm import tqdm_notebook as tqdm
from matplotlib import pyplot as plt


from torchvision import transforms

In [None]:
%%time
# Functions

class IntracranialDataset(Dataset):
    """Dataset of PNG images listed in a CSV file, optionally with labels.

    Every row of the CSV names one image through its ``Image`` column; the
    file ``<path>/<Image>.png`` is read with OpenCV when the item is requested.

    Args:
        csv_file: Path of the CSV file, loaded with ``pd.read_csv``.
        path: Directory that holds the ``.png`` files.
        labels: Truthy for labelled (training) data; every item then also
            carries a ``'labels'`` tensor built from ``LABEL_COLUMNS``.
        transform: Optional callable invoked as ``transform(image=img)`` and
            expected to return a mapping whose ``'image'`` entry is the
            transformed image.
    """

    # Label columns, in the order in which they are returned for each item.
    LABEL_COLUMNS = ['epidural', 'intraparenchymal', 'intraventricular',
                     'subarachnoid', 'subdural', 'any']

    def __init__(self, csv_file, path, labels, transform=None):
        self.path = path
        self.data = pd.read_csv(csv_file)
        self.transform = transform
        self.labels = labels

    def __len__(self):
        """Return the number of rows (images) in the CSV."""
        return len(self.data)

    def _read_labels(self, idx):
        """Return the label values of row ``idx`` as a 1-D tensor."""
        row = self.data.loc[idx, self.LABEL_COLUMNS]
        # Go through a plain list: torch.tensor() is not handed the pandas
        # Series itself, and the values keep their stored type.
        return torch.tensor(row.tolist())

    def __getitem__(self, idx):
        """Load image ``idx`` and return it, with its labels when available.

        Returns:
            ``{'image': img, 'labels': tensor}`` when ``self.labels`` is
            truthy, otherwise ``{'image': img}``.

        Raises:
            FileNotFoundError: If OpenCV could not read the image file.
        """
        img_name = os.path.join(self.path, self.data.loc[idx, 'Image'] + '.png')
        img = cv2.imread(img_name)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the offending path rather than deep inside the
            # transform or the batch collation.
            raise FileNotFoundError('Could not read image: {}'.format(img_name))

        if self.transform is not None:
            img = self.transform(image=img)['image']

        if self.labels:
            return {'image': img, 'labels': self._read_labels(idx)}

        return {'image': img}
    
    


# CSV

In [None]:
%%time

train = pd.read_csv(os.path.join(dir_csv, 'stage_1_train.csv'))  # label file
test  = pd.read_csv(os.path.join(dir_csv, 'stage_1_sample_submission.csv'))  # submission template

In [None]:
train.head(10)

In [None]:
%%time

# Split train out into row per image and save a sample
# An ID such as ID_2669954a7_intraventricular is split into three columns: ID, 2669954a7, intraventricular

train[['ID', 'Image', 'Diagnosis']] = train['ID'].str.split('_', expand=True)
# expand=True makes str.split return a DataFrame (one column per piece)
# instead of a Series

In [None]:
train.head(10)

In [None]:
%%time

# Keep only the columns used below (the 'ID' column is dropped) and remove
# duplicated rows. drop_duplicates() is chained onto the selection and the
# result re-bound, instead of calling it with inplace=True on a slice of the
# original frame, which makes pandas emit a SettingWithCopyWarning.
train = train[['Image', 'Diagnosis', 'Label']].drop_duplicates()

In [None]:
train.head(10)

In [None]:
# Widen the NumPy / pandas display limits for the inspection cells below.
np.set_printoptions(threshold=2000)  # arrays are summarised only above 2000 elements
for option_name, option_value in (
    ('display.max_colwidth', 2000),
    ('max_colwidth', 2000),
    ('display.max_rows', None),
    ('display.max_columns', None),
):
    pd.set_option(option_name, option_value)

In [None]:
%%time

# Reshape from long to wide: one row per image, one column per diagnosis,
# with the 'Label' values as the cell contents.
train = train.pivot(index='Image', columns='Diagnosis', values='Label')
train = train.reset_index()
# 'Image' holds the image ID; prepend the 'ID_' prefix to it.
train['Image'] = 'ID_' + train['Image']

In [None]:
train.head(1000)

In [None]:
train.dtypes#7列，第一列是ID，其余６列是类别

In [None]:
train.head(10)

In [None]:
# Seed for the RandomState used when sampling rows below
undersample_seed=0
train["any"].value_counts()
# Number of rows for each value of the "any" column

In [None]:
# Count the rows whose "any" label is 1, then preview the first ten of them
num_ill_patients = train[train["any"]==1].shape[0]
# shape[0] is the number of rows, shape[1] the number of columns
num_ill_patients
train[train["any"]==1].head(10)

In [None]:
%%time
# Index labels of the rows whose "any" label is 0
healthy_patients = train[train["any"]==0].index.values
# Draw, without replacement and with a fixed seed, as many of those rows as
# there are rows with "any" == 1
healthy_patients_selection = np.random.RandomState(undersample_seed).choice(
    healthy_patients, size=num_ill_patients, replace=False
)
len(healthy_patients_selection)

In [None]:
%%time
# Index labels of the rows whose "any" label is 1
sick_patients = train[train["any"]==1].index.values
# Union of the sampled "any" == 0 rows and all "any" == 1 rows
selected_patients = list(set(healthy_patients_selection).union(set(sick_patients)))

len(selected_patients)/2

In [None]:
%%time
# Subset of train restricted to the selected index labels.
# NOTE(review): new_train is not referenced again in the visible cells; the
# CSV written for training is built from `train`. Confirm whether the
# undersampled frame was meant to be used instead.
new_train = train.loc[selected_patients].copy()
new_train["any"].value_counts()

In [None]:
%%time
png = glob.glob(os.path.join(dir_train_img, '*.png'))  # collect every matching path into a list

In [None]:
png

In [None]:
png

In [None]:
# Strip the directory and the ".png" extension from every path, leaving the
# bare file names. The loop variable has its own name instead of re-using
# `png` (which shadowed the list being iterated), and the extension is removed
# with os.path.splitext rather than a hard-coded [:-4] slice.
png = [os.path.splitext(os.path.basename(png_path))[0] for png_path in png]

In [None]:
png

In [None]:
%%time
# Some files didn't contain legitimate images, so we need to remove them


png = np.array(png)  # convert the list to an array

In [None]:
png

In [None]:
train['Image']
print("-")

In [None]:
train['Image'].isin(png)
#检查在路径中的.png的文件名是否都在stage_1_train.csv中
print("--")

In [None]:
%%time
# Keep only the rows whose image ID has a matching .png file name
train = train[train['Image'].isin(png)]
train.to_csv('train.csv', index=False)
# train.csv now holds the rows of train for which a png of the same name exists in the image directory

In [None]:
%%time
# Also prepare the test data

# Split every submission ID on '_' into three columns; expand=True makes
# str.split return a DataFrame instead of a Series.
test[['ID','Image','Diagnosis']] = test['ID'].str.split('_', expand=True)

test['Image'] = 'ID_' + test['Image']

# Keep one row per distinct (Image, Label) pair. drop_duplicates() is chained
# onto the column selection and the result re-bound, instead of calling it
# with inplace=True on a slice, which makes pandas emit a
# SettingWithCopyWarning.
test = test[['Image', 'Label']].drop_duplicates()

test.to_csv('test.csv', index=False)

# DataLoaders(加载图片数据的部分,注释完毕)

In [None]:
%%time
# Data loaders
# --------------------------- training set ---------------------------
# Compose chains the transforms into a single callable pipeline that the
# dataset applies to every image.
transform_train = Compose([
    CenterCrop(200, 200),  # keep the central 200x200 region
    #Resize(224, 224),
    HorizontalFlip(),  # horizontal flip, used as augmentation
    RandomBrightnessContrast(),  # brightness / contrast augmentation (library default settings)
    ShiftScaleRotate(),  # shift / scale / rotate augmentation (library default settings)
    ToTensor()
])

train_dataset = IntracranialDataset(csv_file='train.csv', path=dir_train_img, transform=transform_train, labels=True)
# shuffle=True: draw the training samples in a new random order every epoch
# instead of always walking train.csv in file order (the original used
# shuffle=False here).
data_loader_train = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

# ----------------------------- test set -----------------------------
# No augmentation for the test images; only the same centre crop.
transform_test= Compose([
    CenterCrop(200, 200),
    #Resize(224, 224),
    ToTensor()
])

test_dataset = IntracranialDataset(csv_file='test.csv', path=dir_test_img, transform=transform_test, labels=False)
# The test loader stays unshuffled so predictions keep the row order of test.csv.
data_loader_test = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [None]:
len(train_dataset)

In [None]:
%%time
# Plot train example

batch = next(iter(data_loader_train))
fig, axs = plt.subplots(1, 5, figsize=(15,5))

# Show the first five images of the batch. Taking index 0 along the first
# axis of each image gives the same 2-D array as transposing with (1, 2, 0)
# and then taking [:, :, 0].
for i in range(5):
    axs[i].imshow(batch['image'][i].numpy()[0], cmap=plt.cm.bone)

In [None]:
%%time
# Plot test example

batch = next(iter(data_loader_test))
fig, axs = plt.subplots(1, 5, figsize=(15,5))

# Show the first five images of the batch. Taking index 0 along the first
# axis of each image gives the same 2-D array as transposing with (1, 2, 0)
# and then taking [:, :, 0].
for i in range(5):
    axs[i].imshow(batch['image'][i].numpy()[0], cmap=plt.cm.bone)


# Model(绝大部分注释完毕)

In [1]:
%%time
# Model

device = torch.device("cuda:0")  # run on the first CUDA GPU

# torch.hub is the model hub: this loads the 'resnext101_32x8d_wsl' entry
# point from the facebookresearch/WSL-Images repository
model = torch.hub.load('facebookresearch/WSL-Images', 'resnext101_32x8d_wsl')
# Per the introduction, this is a ResNeXt with "weakly supervised pre-training"
# NOTE(review): the original note says replacing 32x8d with 16d, 32d or 64d
# selects other pre-trained variants; confirm the exact entry-point names
# against the WSL-Images repository

model.fc = torch.nn.Linear(2048, n_classes)
# Replace the final fully connected ("fc") layer by a linear layer with 2048
# input features and n_classes outputs
# Reference
# https://blog.csdn.net/MiniCnCoder/article/details/79920545

model.to(device)

criterion = torch.nn.BCEWithLogitsLoss() 
# NOTE(review): no weights are passed to the loss, so the [2,1,1,1,1,1] class
# weighting asked about in the original comment is not applied here
plist = [{'params': model.parameters(), 'lr': 2e-5}]


optimizer = optim.Adam(plist, lr=2e-5)
# Adam optimizer

model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
# amp is the Apex module imported for mixed precision training; the model and
# optimizer it returns (opt_level "O1") are the ones used from here on


NameError: name 'torch' is not defined

# Training(绝大部分注释完毕)

In [None]:
%%time
# Train

# In epoch EARLY_STOP_EPOCH the loop is cut short once the step index exceeds
# EARLY_STOP_STEP. Adjust the step threshold while watching the loss.
EARLY_STOP_EPOCH = 1
EARLY_STOP_STEP = 6000

for epoch in range(n_epochs):

    print('Epoch {}/{}'.format(epoch, n_epochs - 1))
    print('-' * 10)

    model.train()
    tr_loss = 0
    n_steps = 0  # batches actually processed in this epoch

    # The progress bar follows the batches of data_loader_train, so it shows
    # the progress through a single epoch.
    tk0 = tqdm(data_loader_train, desc="Iteration")

    for step, batch in enumerate(tk0):

        inputs = batch["image"]   # images
        labels = batch["labels"]  # image labels

        inputs = inputs.to(device, dtype=torch.float)
        labels = labels.to(device, dtype=torch.float)

        outputs = model(inputs)  # forward pass
        loss = criterion(outputs, labels)

        # Apex AMP: backward() is called on the scaled loss yielded by
        # amp.scale_loss rather than on the raw loss.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        tr_loss += loss.item()
        n_steps += 1

        optimizer.step()
        optimizer.zero_grad()

        if epoch == EARLY_STOP_EPOCH and step > EARLY_STOP_STEP:
            break

    # Average over the batches that were really processed. The original
    # divided by a hard-coded 6000 on early stop and then printed a second
    # figure divided by the full loader length, although the epoch had been
    # truncated. max(..., 1) guards against an empty loader.
    epoch_loss = tr_loss / max(n_steps, 1)
    print('Training Loss: {:.4f}'.format(epoch_loss))

# Inference(注释完毕)

In [None]:
# Inference

# Freeze the weights: no gradients are needed for prediction.
# (Original note: reportedly this also works without these two lines, see
# https://blog.csdn.net/york1996/article/details/83019978)
for param in model.parameters():
    param.requires_grad = False

# Switch the model to evaluation mode (model.train() is the training-mode
# counterpart; the two differ for layers such as BatchNorm and Dropout).
model.eval()

# One row per (test image, class) pair, filled batch by batch below.
test_pred = np.zeros((len(test_dataset) * n_classes, 1))
rows_per_batch = batch_size * n_classes

for i, x_batch in enumerate(tqdm(data_loader_test)):

    x_batch = x_batch["image"].to(device, dtype=torch.float)

    with torch.no_grad():
        pred = model(x_batch)

        # Apply the sigmoid to the model output, reshape it into a single
        # column and store it in this batch's rows of test_pred.
        batch_rows = torch.sigmoid(pred).detach().cpu().reshape((len(x_batch) * n_classes, 1))
        test_pred[(i * rows_per_batch):((i + 1) * rows_per_batch)] = batch_rows

# Submission（注释完毕）

In [None]:
# Submission

# Read the submission template, swap its 'Label' column for the predictions
# (joined side by side on the row index) and name the two resulting columns.
submission_template = pd.read_csv(os.path.join(dir_csv, 'stage_1_sample_submission.csv'))
template_ids = submission_template.drop(columns=['Label'])
predicted_labels = pd.DataFrame(test_pred)

submission = pd.concat([template_ids, predicted_labels], axis=1)
submission.columns = ['ID', 'Label']

submission.to_csv('submission.csv', index=False)
submission.head()

# Clean Up

Have to clean up since Kaggle limits the number of files that can be output from a kernel

In [None]:
!rm -rf /kaggle/working/apex
!rm test.csv
!rm train.csv