In [81]:
import torch
import torch.nn as nn
from vilt.modules import heads, objectives
import vilt.modules.vision_transformer as vit
import torch.nn.functional as F
import random
from typing import OrderedDict
import os
import pandas as pd
import numpy as np
from vilt.transforms import pixelbert_transform
from PIL import Image
from torchvision import transforms
from tqdm import tqdm
from torch.utils.data import DataLoader
import gc
import torch.optim as optim
from torch.optim import lr_scheduler
from collections import defaultdict
import wandb
import pretrainedmodels
from efficientnet_pytorch import EfficientNet
from sklearn.model_selection import StratifiedKFold, KFold, StratifiedGroupKFold



class config:
    debug = False
    exp_name = "vilt"
    seed = 101
    batch_size = 4096  # this is a desired batch size; pl trainer will accumulate gradients when per step batch is smaller.
    train_batch_size = 32
    valid_batch_size = 4
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # root_path = r'E:\\Download\\xiangguan' # 存放数据的根目录
    root_path = r'/home/junsheng/data/xiangguan' # 存放数据的根目录
    n_fold = 5

    # model_name = "sensorViLOnlyTransformerSS" #仅vilt图像
    # model_name = "sensorOnlyViLTransformerSS"  #仅vilt传感器
    model_name = "sensorViLTransformerSS"  #vilt图像+传感器
    # wandb 
    wandb_name = "vilt|水稻|290图像加传感器"
    code_file = "test1.py"

    # Image setting
    train_transform_keys = ["pixelbert"]
    val_transform_keys = ["pixelbert"]
    img_size = 384
    max_image_len = -1
    patch_size = 32
    draw_false_image = 1
    image_only = False

    # Sensor
    # senser_input_num = 11 # 翔冠的传感器参数
    senser_input_num = 19 # 天航的传感器参数
    
    # Text Setting
    vqav2_label_size = 3129
    max_text_len = 40
    tokenizer = "bert-base-uncased"
    vocab_size = 30522 # vocabulary词汇数量
    whole_word_masking = False
    mlm_prob = 0.15
    draw_false_text = 0

    # Transformer Setting
    vit = "vit_base_patch32_384"
    hidden_size = 768  # 嵌入向量大小
    num_heads = 12
    num_layers = 12
    mlp_ratio = 4
    drop_rate = 0.1

    # Optimizer Setting
    optim_type = "adamw"
    learning_rate = 1e-3 #0.0015#2e-3 #
    weight_decay = 1e-4 # 0.01 ->1e-4
    decay_power = 1
    max_epoch = 50
    # T_max = 8000/train_batch_size*max_epoch 
    T_max = 1000/train_batch_size*max_epoch 

    # Downstream Setting
    get_recall_metric = False


    # below params varies with the environment
    data_root = ""
    log_dir = "result"
    per_gpu_batchsize = 0  # you should define this manually with per_gpu_batch_size=#
    num_gpus = 1
    num_nodes = 1
    load_path = "weights/vilt_200k_mlm_itm.ckpt"
    # load_path = "save_model_dict.pt"
    num_workers = 1
    precision = 16

# config = vars(config)
# config = dict(config)
config

if config.debug:
    config.max_epoch = 5


model


In [82]:
resnet_model = pretrainedmodels.__dict__["resnet152"](
    num_classes=1000, pretrained='imagenet')
features = list([resnet_model.conv1, resnet_model.bn1, resnet_model.relu, resnet_model.maxpool, resnet_model.layer1, resnet_model.layer2, resnet_model.layer3,resnet_model.layer4])

conv = nn.Conv2d(2048, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
bn = nn.BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
relu = nn.ReLU(inplace=True)


resnet_features = nn.Sequential(*features,conv,bn,relu)

# DNNF1

dnnf1 图片加传感器

In [83]:
class DNNF1(torch.nn.Module):
    
 
    def __init__(self,sensor_nums):
        super(DNNF1,self).__init__()
        self.sensor_linear = torch.nn.Linear(sensor_nums,768)
        
        self.token_type_embeddings = nn.Embedding(2, config.hidden_size)
        self.token_type_embeddings.apply(objectives.init_weights)

        self.transformer = getattr(vit, config.vit)(
                pretrained=True, config=vars(config)
            )
       
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()


        self.pooler = heads.Pooler(config.hidden_size)


        # DNNF1结构
        self.linear1=torch.nn.Linear(768+768,64)
        self.relu=torch.nn.ReLU()
        self.linear2=torch.nn.Linear(64,128)
        self.relu2=torch.nn.ReLU()
        self.linear3=torch.nn.Linear(128,256)
        self.relu3=torch.nn.ReLU()
        self.linear4=torch.nn.Linear(256,512)
        self.relu4=torch.nn.ReLU()
        self.linear5=torch.nn.Linear(512,512)
        self.relu5=torch.nn.ReLU()
        self.linear6=torch.nn.Linear(512,1024)
        self.relu6=torch.nn.ReLU()
        self.linear7=torch.nn.Linear(1024,1)


    def forward(self,batch,
        mask_image=False,
        image_token_type_idx=1,
        image_embeds=None,
        image_masks=None,):
        sensor_input = batch['sensor'].to(config.device)
        sensor_feats = self.sensor_linear(sensor_input)

        if image_embeds is None and image_masks is None:
            img = batch["image"].to(config.device) # torch.Size([1, 3, 384, 384])

            (
                image_embeds,  # torch.Size([1, 217, 768])
                image_masks,  # torch.Size([1, 217])
                patch_index,
                image_labels,
            ) = self.transformer.visual_embed(
                img,
                max_image_len=config.max_image_len,
                mask_it=mask_image,
            )
        else:
            patch_index, image_labels = (
                None,
                None,
            )
        # 用embedding对数据输入预处理，降低维度
        image_embeds = image_embeds + self.token_type_embeddings(
            torch.full_like(image_masks, image_token_type_idx)
        )
        image_masks = image_masks.to(config.device)
        co_embeds = image_embeds
        co_masks = image_masks

        x = co_embeds.to(config.device)  # torch.Size([1, 145, 768])

        for i, blk in enumerate(self.transformer.blocks):
            blk = blk.to(config.device)
            x, _attn = blk(x, mask=co_masks)  # co_masks = torch.Size([1, 211])

        x = self.transformer.norm(x)  # torch.Size([1, 240, 768])
        picture_feats = self.pooler(x)  # torch.Size([1, 768])#图像的特征数据
        sensor_feats = sensor_feats.squeeze(dim=1) #torch.Size([1, 1, 768])->[1,768]

        x = torch.cat([picture_feats, sensor_feats], dim=1)

        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu2(x)
        x = self.linear3(x)
        x = self.relu3(x)
        x = self.linear4(x)
        x = self.relu4(x)
        x = self.linear5(x)
        x = self.relu5(x)
        x = self.linear6(x)
        x = self.relu6(x)
        x = self.linear7(x)
        return x

DNNF1 picture only


In [84]:
class DNNF1PictureOnly(torch.nn.Module):
    
 
    def __init__(self,sensor_nums):
        super(DNNF1PictureOnly,self).__init__()
        self.token_type_embeddings = nn.Embedding(2, config.hidden_size)
        self.token_type_embeddings.apply(objectives.init_weights)

        self.transformer = getattr(vit, config.vit)(
                pretrained=True, config=vars(config)
            )
       
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()


        self.pooler = heads.Pooler(config.hidden_size)


        # DNNF1结构
        self.linear1=torch.nn.Linear(768,64)
        self.relu=torch.nn.ReLU()
        self.linear2=torch.nn.Linear(64,128)
        self.relu2=torch.nn.ReLU()
        self.linear3=torch.nn.Linear(128,256)
        self.relu3=torch.nn.ReLU()
        self.linear4=torch.nn.Linear(256,512)
        self.relu4=torch.nn.ReLU()
        self.linear5=torch.nn.Linear(512,512)
        self.relu5=torch.nn.ReLU()
        self.linear6=torch.nn.Linear(512,1024)
        self.relu6=torch.nn.ReLU()
        self.linear7=torch.nn.Linear(1024,1)


    def forward(self,batch,
        mask_image=False,
        image_token_type_idx=1,
        image_embeds=None,
        image_masks=None,):

        if image_embeds is None and image_masks is None:
            img = batch["image"].to(config.device) # torch.Size([1, 3, 384, 384])

            (
                image_embeds,  # torch.Size([1, 217, 768])
                image_masks,  # torch.Size([1, 217])
                patch_index,
                image_labels,
            ) = self.transformer.visual_embed(
                img,
                max_image_len=config.max_image_len,
                mask_it=mask_image,
            )
        else:
            patch_index, image_labels = (
                None,
                None,
            )
        # 用embedding对数据输入预处理，降低维度
        image_embeds = image_embeds + self.token_type_embeddings(
            torch.full_like(image_masks, image_token_type_idx)
        )
        image_masks = image_masks.to(config.device)
        co_embeds = image_embeds
        co_masks = image_masks

        x = co_embeds.to(config.device)  # torch.Size([1, 145, 768])

        for i, blk in enumerate(self.transformer.blocks):
            blk = blk.to(config.device)
            x, _attn = blk(x, mask=co_masks)  # co_masks = torch.Size([1, 211])

        x = self.transformer.norm(x)  # torch.Size([1, 240, 768])
        picture_feats = self.pooler(x)  # torch.Size([1, 768])#图像的特征数据

        x = picture_feats

        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu2(x)
        x = self.linear3(x)
        x = self.relu3(x)
        x = self.linear4(x)
        x = self.relu4(x)
        x = self.linear5(x)
        x = self.relu5(x)
        x = self.linear6(x)
        x = self.relu6(x)
        x = self.linear7(x)
        return x

DNNF1 sensor only

In [85]:
class DNNF1SensorOnly(torch.nn.Module):
    
 
    def __init__(self,sensor_nums):
        super(DNNF1SensorOnly,self).__init__()
        self.sensor_linear = torch.nn.Linear(sensor_nums,768)

        # DNNF1结构
        self.linear1=torch.nn.Linear(768,64)
        self.relu=torch.nn.ReLU()
        self.linear2=torch.nn.Linear(64,128)
        self.relu2=torch.nn.ReLU()
        self.linear3=torch.nn.Linear(128,256)
        self.relu3=torch.nn.ReLU()
        self.linear4=torch.nn.Linear(256,512)
        self.relu4=torch.nn.ReLU()
        self.linear5=torch.nn.Linear(512,512)
        self.relu5=torch.nn.ReLU()
        self.linear6=torch.nn.Linear(512,1024)
        self.relu6=torch.nn.ReLU()
        self.linear7=torch.nn.Linear(1024,1)


    def forward(self,batch,
        mask_image=False,
        image_token_type_idx=1,
        image_embeds=None,
        image_masks=None,):
        sensor_input = batch['sensor'].to(config.device)
        sensor_feats = self.sensor_linear(sensor_input)

        sensor_feats = sensor_feats.squeeze(dim=1) #torch.Size([1, 1, 768])->[1,768]

        x = sensor_feats

        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu2(x)
        x = self.linear3(x)
        x = self.relu3(x)
        x = self.linear4(x)
        x = self.relu4(x)
        x = self.linear5(x)
        x = self.relu5(x)
        x = self.linear6(x)
        x = self.relu6(x)
        x = self.linear7(x)
        return x

# DNNF2

dnnf2 图片加传感器

In [86]:
class DNNF2(torch.nn.Module):
    
 
    def __init__(self,sensor_nums):
        super(DNNF2,self).__init__()
        self.resnet = resnet_features
        self.sensor_linear = torch.nn.Linear(sensor_nums,768)
        
        self.token_type_embeddings = nn.Embedding(2, config.hidden_size)
        self.token_type_embeddings.apply(objectives.init_weights)

        self.transformer = getattr(vit, config.vit)(
                pretrained=True, config=vars(config)
            )
       
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()


        self.pooler = heads.Pooler(config.hidden_size)


        # DNNF2结构
        self.linear1=torch.nn.Linear(64+64,512)
        self.relu=torch.nn.ReLU()
        self.linear2=torch.nn.Linear(512,1024)
        self.relu2=torch.nn.ReLU()
        self.linear3=torch.nn.Linear(1024,1024)
        self.relu3=torch.nn.ReLU()
        self.linear4=torch.nn.Linear(1024,1)

        self.sensor_linear1 = torch.nn.Linear(768,32)
        self.sensor_relu1=torch.nn.ReLU()
        self.sensor_linear2 = torch.nn.Linear(32,64)
        self.sensor_relu2=torch.nn.ReLU()

        self.picture_linear1 = torch.nn.Linear(768,32)
        self.picture_relu1 = torch.nn.ReLU()
        self.picture_linear2 = torch.nn.Linear(32,64)
        self.picture_relu2 = torch.nn.ReLU()


    def forward(self,batch,
        mask_image=False,
        image_token_type_idx=1,
        image_embeds=None,
        image_masks=None,):
        sensor_input = batch['sensor'].to(config.device)
        sensor_feats = self.sensor_linear(sensor_input)

        if image_embeds is None and image_masks is None:
            img = batch["image"].to(config.device) # torch.Size([1, 3, 384, 384])

            (
                image_embeds,  # torch.Size([1, 217, 768])
                image_masks,  # torch.Size([1, 217])
                patch_index,
                image_labels,
            ) = self.transformer.visual_embed(
                img,
                max_image_len=config.max_image_len,
                mask_it=mask_image,
            )
        else:
            patch_index, image_labels = (
                None,
                None,
            )
        # 用embedding对数据输入预处理，降低维度
        image_embeds = image_embeds + self.token_type_embeddings(
            torch.full_like(image_masks, image_token_type_idx)
        )
        image_masks = image_masks.to(config.device)
        co_embeds = image_embeds
        co_masks = image_masks

        x = co_embeds.to(config.device)  # torch.Size([1, 145, 768])

        for i, blk in enumerate(self.transformer.blocks):
            blk = blk.to(config.device)
            x, _attn = blk(x, mask=co_masks)  # co_masks = torch.Size([1, 211])

        x = self.transformer.norm(x)  # torch.Size([1, 240, 768])
        picture_feats = self.pooler(x)  # torch.Size([1, 768])#图像的特征数据
        sensor_feats = sensor_feats.squeeze(dim=1) #torch.Size([1, 1, 768])->[1,768]

        sensor_feats = self.sensor_linear1(sensor_feats)
        sensor_feats = self.sensor_relu1(sensor_feats)
        sensor_feats = self.sensor_linear2(sensor_feats)
        sensor_feats = self.sensor_relu2(sensor_feats)

        picture_feats = self.picture_linear1(picture_feats)
        picture_feats = self.picture_relu1(picture_feats)
        picture_feats = self.picture_linear2(picture_feats)
        picture_feats = self.picture_relu2(picture_feats)
        x = torch.cat([picture_feats, sensor_feats], dim=1)


        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu2(x)
        x = self.linear3(x)
        x = self.relu3(x)
        x = self.linear4(x)
        return x

DNNF2 picture only

In [87]:
class DNNF2PictureOnly(torch.nn.Module):
    
 
    def __init__(self,sensor_nums):
        super(DNNF2PictureOnly,self).__init__()
        self.sensor_linear = torch.nn.Linear(sensor_nums,768)
        
        self.token_type_embeddings = nn.Embedding(2, config.hidden_size)
        self.token_type_embeddings.apply(objectives.init_weights)

        self.transformer = getattr(vit, config.vit)(
                pretrained=True, config=vars(config)
            )
       
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()


        self.pooler = heads.Pooler(config.hidden_size)


        # DNNF2结构
        self.linear1=torch.nn.Linear(64,512)
        self.relu=torch.nn.ReLU()
        self.linear2=torch.nn.Linear(512,1024)
        self.relu2=torch.nn.ReLU()
        self.linear3=torch.nn.Linear(1024,1024)
        self.relu3=torch.nn.ReLU()
        self.linear4=torch.nn.Linear(1024,1)


        self.picture_linear1 = torch.nn.Linear(768,32)
        self.picture_relu1 = torch.nn.ReLU()
        self.picture_linear2 = torch.nn.Linear(32,64)
        self.picture_relu2 = torch.nn.ReLU()


    def forward(self,batch,
        mask_image=False,
        image_token_type_idx=1,
        image_embeds=None,
        image_masks=None,):

        if image_embeds is None and image_masks is None:
            img = batch["image"].to(config.device) # torch.Size([1, 3, 384, 384])

            (
                image_embeds,  # torch.Size([1, 217, 768])
                image_masks,  # torch.Size([1, 217])
                patch_index,
                image_labels,
            ) = self.transformer.visual_embed(
                img,
                max_image_len=config.max_image_len,
                mask_it=mask_image,
            )
        else:
            patch_index, image_labels = (
                None,
                None,
            )
        # 用embedding对数据输入预处理，降低维度
        image_embeds = image_embeds + self.token_type_embeddings(
            torch.full_like(image_masks, image_token_type_idx)
        )
        image_masks = image_masks.to(config.device)
        co_embeds = image_embeds
        co_masks = image_masks

        x = co_embeds.to(config.device)  # torch.Size([1, 145, 768])

        for i, blk in enumerate(self.transformer.blocks):
            blk = blk.to(config.device)
            x, _attn = blk(x, mask=co_masks)  # co_masks = torch.Size([1, 211])

        x = self.transformer.norm(x)  # torch.Size([1, 240, 768])
        picture_feats = self.pooler(x)  # torch.Size([1, 768])#图像的特征数据

        picture_feats = self.picture_linear1(picture_feats)
        picture_feats = self.picture_relu1(picture_feats)
        picture_feats = self.picture_linear2(picture_feats)
        picture_feats = self.picture_relu2(picture_feats)
        
        x = picture_feats

        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu2(x)
        x = self.linear3(x)
        x = self.relu3(x)
        x = self.linear4(x)
        return x

DNNF2 sensor only


In [88]:
class DNNF2SensorOnly(torch.nn.Module):
    
 
    def __init__(self,sensor_nums):
        super(DNNF2SensorOnly,self).__init__()
        self.sensor_linear = torch.nn.Linear(sensor_nums,768)


        # DNNF2结构
        self.linear1=torch.nn.Linear(64,512)
        self.relu=torch.nn.ReLU()
        self.linear2=torch.nn.Linear(512,1024)
        self.relu2=torch.nn.ReLU()
        self.linear3=torch.nn.Linear(1024,1024)
        self.relu3=torch.nn.ReLU()
        self.linear4=torch.nn.Linear(1024,1)

        self.sensor_linear1 = torch.nn.Linear(768,32)
        self.sensor_relu1=torch.nn.ReLU()
        self.sensor_linear2 = torch.nn.Linear(32,64)
        self.sensor_relu2=torch.nn.ReLU()



    def forward(self,batch,
        mask_image=False,
        image_token_type_idx=1,
        image_embeds=None,
        image_masks=None,):
        sensor_input = batch['sensor'].to(config.device)
        sensor_feats = self.sensor_linear(sensor_input)

        sensor_feats = sensor_feats.squeeze(dim=1) #torch.Size([1, 1, 768])->[1,768]

        sensor_feats = self.sensor_linear1(sensor_feats)
        sensor_feats = self.sensor_relu1(sensor_feats)
        sensor_feats = self.sensor_linear2(sensor_feats)
        sensor_feats = self.sensor_relu2(sensor_feats)


        
        x = sensor_feats


        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu2(x)
        x = self.linear3(x)
        x = self.relu3(x)
        x = self.linear4(x)
        return x

# test

In [89]:
# model = DNNF2(sensor_nums=config.senser_input_num)
# model = DNNF2SensorOnly(sensor_nums=config.senser_input_num)
# model = DNNF2PictureOnly(sensor_nums=config.senser_input_num)
# model = DNNF1(sensor_nums=config.senser_input_num)
model = DNNF1SensorOnly(sensor_nums=config.senser_input_num)

model.to(config.device)
sensor = torch.rand(config.senser_input_num)
# sensor = torch.ones(config.senser_input_num)
# print(sensor)
sensor =  torch.tensor(sensor).unsqueeze(0).unsqueeze(0) # torch.Size([1, 1, 3])
batch = {}
batch['sensor'] = sensor
batch['image'] = torch.randn((1,3,384,384))
model(batch)

  sensor =  torch.tensor(sensor).unsqueeze(0).unsqueeze(0) # torch.Size([1, 1, 3])


tensor([[0.0192]], device='cuda:0', grad_fn=<AddmmBackward0>)