# import

In [32]:
import torch
import torch.nn as nn
from vilt.modules import heads, objectives
import vilt.modules.vision_transformer as vit
import torch.nn.functional as F
import random
from typing import OrderedDict
import os
import pandas as pd
import numpy as np
from vilt.transforms import pixelbert_transform
from PIL import Image
from torchvision import transforms
from tqdm import tqdm
from torch.utils.data import DataLoader
import gc
import torch.optim as optim
from torch.optim import lr_scheduler
from collections import defaultdict
import wandb

from sklearn.model_selection import StratifiedKFold, KFold, StratifiedGroupKFold


# config

In [33]:


class config:
    debug = False
    exp_name = "vilt"
    seed = 101
    batch_size = 4096  # this is a desired batch size; pl trainer will accumulate gradients when per step batch is smaller.
    train_batch_size = 32
    valid_batch_size = 4
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    n_fold = 5

    model_name = "sensorViLOnlyTransformerSS"
    # wandb 
    wandb_name = "vilt|大豆|290仅图片"
    

    # Image setting
    train_transform_keys = ["pixelbert"]
    val_transform_keys = ["pixelbert"]
    img_size = 384
    max_image_len = -1
    patch_size = 32
    draw_false_image = 1
    image_only = False

    # Sensor
    # senser_input_num = 11 # 翔冠的传感器参数
    senser_input_num = 19 # 天航的传感器参数
    
    # Text Setting
    vqav2_label_size = 3129
    max_text_len = 40
    tokenizer = "bert-base-uncased"
    vocab_size = 30522 # vocabulary词汇数量
    whole_word_masking = False
    mlm_prob = 0.15
    draw_false_text = 0

    # Transformer Setting
    vit = "vit_base_patch32_384"
    hidden_size = 768  # 嵌入向量大小
    num_heads = 12
    num_layers = 12
    mlp_ratio = 4
    drop_rate = 0.1

    # Optimizer Setting
    optim_type = "adamw"
    learning_rate = 1e-3 #0.0015#2e-3 #
    weight_decay = 1e-4 # 0.01 ->1e-4
    decay_power = 1
    max_epoch = 50
    max_steps = 25000
    warmup_steps = 2500
    end_lr = 0
    lr_mult = 1  # multiply lr for downstream heads
    # T_max = 8000/train_batch_size*max_epoch 
    T_max = 1000/train_batch_size*max_epoch 

    # Downstream Setting
    get_recall_metric = False


    # below params varies with the environment
    data_root = ""
    log_dir = "result"
    per_gpu_batchsize = 0  # you should define this manually with per_gpu_batch_size=#
    num_gpus = 1
    num_nodes = 1
    load_path = "weights/vilt_200k_mlm_itm.ckpt"
    # load_path = "save_model_dict.pt"
    num_workers = 1
    precision = 16

# config = vars(config)
# config = dict(config)
config

if config.debug:
    config.max_epoch = 5

In [34]:
def setup_seed(seed):

    torch.manual_seed(seed)  # 为CPU设置随机种子
    np.random.seed(seed)  # Numpy module.
    random.seed(seed)  # Python random module.
    # torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed(seed)  # 为当前GPU设置随机种子
    torch.cuda.manual_seed_all(seed)  # 为所有GPU设置随机种子
    #os.environ['PYTHONHASHSEED'] = str(seed)
    
setup_seed(config.seed)

# wandb

In [35]:
os.environ["WANDB_MODE"] = 'dryrun' # 离线模式
try:
    # wandb.log(key="*******") # if debug
    wandb.login() # storage in ~/.netrc file
    anonymous = None
except:
    anonymous = "must"
    print('\nGet your W&B access token from here: https://wandb.ai/authorize\n')




# 数据

In [36]:
df_tianhang = pd.read_csv("/home/junsheng/ViLT/data/290-tianhang-soybean.csv")
df_tianhang['image_path'] = df_tianhang['pic_key'].map(lambda x:os.path.join('/home/junsheng/data/tianhang_soybean',x.split('/')[-1]))
df_tianhang['label'] = df_tianhang['LAI']
df_tianhang = df_tianhang.dropna()
df_tianhang = df_tianhang.reset_index()
print(df_tianhang.shape)
df_tianhang.to_csv("test.csv",index=False)
df_tianhang.head()

(2658, 28)


Unnamed: 0,index,pic_key,date_hour,date,co2,stemp,stemp2,stemp3,stemp4,stemp5,...,pm10,pm25,press,solar,temp,wind_d,wind_sp,LAI,image_path,label
0,32,/794/1655497027_1655496664_4.jpg,2022-06-18 04,2022/6/18,419.0,19.2,19.3,19.1,18.8,18.4,...,6.0,6.0,991.1,2.52,17.26,274.3,3.75,1.3175,/home/junsheng/data/tianhang_soybean/165549702...,1.3175
1,33,/794/1655497027_1655496664_4.jpg,2022-06-18 04,2022/6/18,419.0,19.2,19.3,19.1,18.8,18.4,...,7.0,7.0,991.2,5.93,17.18,268.7,2.67,1.3175,/home/junsheng/data/tianhang_soybean/165549702...,1.3175
2,34,/794/1655497027_1655496664_4.jpg,2022-06-18 04,2022/6/18,418.0,19.1,19.2,19.1,18.8,18.4,...,6.0,6.0,991.1,2.52,17.26,274.3,3.75,1.3175,/home/junsheng/data/tianhang_soybean/165549702...,1.3175
3,35,/794/1655497027_1655496664_4.jpg,2022-06-18 04,2022/6/18,418.0,19.1,19.2,19.1,18.8,18.4,...,7.0,7.0,991.2,5.93,17.18,268.7,2.67,1.3175,/home/junsheng/data/tianhang_soybean/165549702...,1.3175
4,36,/794/1655504185_1655503864_4.jpg,2022-06-18 06,2022/6/18,419.0,18.8,19.0,18.9,18.7,18.3,...,5.0,5.0,991.9,8.84,17.75,248.6,2.07,1.3175,/home/junsheng/data/tianhang_soybean/165550418...,1.3175


数据检查

In [37]:
# 检查图片下载的全不全
pic = df_tianhang.image_path.map(lambda x:x.split('/')[-1]).unique()
print(len(pic))
file_ls = os.listdir("/home/junsheng/data/tianhang_soybean")
print(len(file_ls))
ret = list(set(pic) ^ set(file_ls))
print(len(ret)) #差集
# assert len(pic)==len(file_ls),"请检查下载的图片，缺了{}个".format(len(pic)-len(file_ls))


648
811
163


归一化非object列

In [38]:
list(df_tianhang)

['index',
 'pic_key',
 'date_hour',
 'date',
 'co2',
 'stemp',
 'stemp2',
 'stemp3',
 'stemp4',
 'stemp5',
 'shumi',
 'shumi2',
 'shumi3',
 'shumi4',
 'shumi5',
 'ts',
 'insert_time',
 'humi',
 'pm10',
 'pm25',
 'press',
 'solar',
 'temp',
 'wind_d',
 'wind_sp',
 'LAI',
 'image_path',
 'label']

In [39]:
number_title = []
recorder = {}
for title in df_tianhang:
    # print(df_xiangguan[title].head())
    if title == 'raw_label':
        continue
    if df_tianhang[title].dtype != "object":
        
        number_title.append(title)
        x_min = df_tianhang[title].min()
        x_max = df_tianhang[title].max()
        # print(x_min,x_max)
        recorder[title] = (x_min,x_max)
        df_tianhang[title] = df_tianhang[title].map(lambda x:(x-x_min)/(x_max - x_min))
print(number_title)
print(recorder)

['index', 'co2', 'stemp', 'stemp2', 'stemp3', 'stemp4', 'stemp5', 'shumi', 'shumi2', 'shumi3', 'shumi4', 'shumi5', 'humi', 'pm10', 'pm25', 'press', 'solar', 'temp', 'wind_d', 'wind_sp', 'LAI', 'label']
{'index': (32, 3161), 'co2': (341.0, 751.0), 'stemp': (14.0, 29.0), 'stemp2': (14.8, 27.5), 'stemp3': (15.5, 25.7), 'stemp4': (15.6, 24.6), 'stemp5': (16.0, 24.3), 'shumi': (44.6, 75.7), 'shumi2': (36.5, 71.3), 'shumi3': (38.9, 71.7), 'shumi4': (43.6, 75.0), 'shumi5': (61.6, 80.0), 'humi': (31.0, 100.0), 'pm10': (0.0, 1333.0), 'pm25': (0.0, 1333.0), 'press': (981.1, 1014.8), 'solar': (0.0, 200.0), 'temp': (7.25, 32.0), 'wind_d': (0.0, 359.8), 'wind_sp': (0.0, 10.27), 'LAI': (1.3175, 2.23), 'label': (1.3175, 2.23)}


In [40]:
df_tianhang['stemp4'].dtype

dtype('float64')

In [41]:
# xiangguan_sensor = ['temperature', 'humidity', 'illuminance', 'soil_temperature', 'soil_humidity', 'pressure', 'wind_speed', 'photosynthetic', 'sun_exposure_time', 'COz', 'soil_ph']
tianhang_sensor = ['co2', 'stemp', 'stemp2', 'stemp3', 'stemp4', 'stemp5', 'shumi', 'shumi2', 'shumi3', 'shumi4', 'shumi5', 'humi', 'pm10', 'pm25', 'press', 'solar', 'temp', 'wind_d', 'wind_sp']
# tianhang_sensor = ['co2', 'stemp', 'stemp2', 'stemp3', 'stemp5', 'shumi', 'shumi2', 'shumi3', 'shumi5', 'humi', 'pm10', 'pm25', 'press', 'solar', 'temp', 'wind_d', 'wind_sp']

df_tianhang['sensor'] = df_tianhang[tianhang_sensor].values.tolist()
print("input dim:",len(tianhang_sensor))

input dim: 19


In [42]:
df=df_tianhang
if config.debug:
    df = df[:100]
df.shape

(2658, 29)

In [43]:
df_tianhang.to_csv("test.csv",index=False)

create folds

In [44]:
skf = StratifiedKFold(n_splits=config.n_fold, shuffle=True, random_state=config.seed)  
for fold, (train_idx, val_idx) in enumerate(skf.split(df,df.date)):
    df.loc[val_idx, 'fold'] = fold
df.groupby(['fold'])['label'].count()# ???

fold
0.0    532
1.0    532
2.0    532
3.0    531
4.0    531
Name: label, dtype: int64

In [45]:
df.head()
df.to_csv("test_fold.csv",index=False)

# dataset


In [46]:
myTransforms = transforms.Compose([
    transforms.Resize((config.img_size,config.img_size)),
    transforms.ToTensor(),
    transforms.Normalize(
    mean=[0.7136, 0.7118, 0.6788],
    std=[0.3338, 0.3453, 0.3020],
    
)
])

def load_img(path):
    img =  Image.open(path).convert('RGB')
    img = myTransforms(img)
    return img

class BuildDataset(torch.utils.data.Dataset):
    def __init__(self, df, label=True, transforms=None):
        self.df         = df
        self.label      = label
        self.sensors = df['sensor'].tolist()
        self.img_paths  = df['image_path'].tolist()   
        if self.label:
            self.labels = df['label'].tolist()
        self.transforms = transforms
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        img_path  = self.img_paths[index]
        img = load_img(img_path)
        sensor = self.sensors[index]
        sensor = torch.tensor(sensor).unsqueeze(0) #[1,n]
        if self.label:
            label = self.labels[index]
            return torch.tensor(img).to(torch.float), torch.tensor(sensor).to(torch.float),torch.tensor(label).to(torch.float)
        else:
            return torch.tensor(img).to(torch.float), torch.tensor(sensor).to(torch.float)

# dataloader

In [47]:
def fetch_dataloader(fold:int):
    train_df = df.query("fold!=@fold").reset_index(drop=True)

    valid_df = df.query("fold==@fold").reset_index(drop=True)
    print("train_df.shape:",train_df.shape)
    print("valid_df.shape:",valid_df.shape)

    train_data  = BuildDataset(df=train_df,label=True)
    valid_data = BuildDataset(df=valid_df,label=True)

    train_loader = DataLoader(train_data, batch_size=config.train_batch_size,shuffle=True)
    valid_loader = DataLoader(valid_data, batch_size=config.valid_batch_size,shuffle=False)
    # test_loader = DataLoader(test_data, batch_size=config.test_batch_size,shuffle=False)
    return train_loader,valid_loader


In [48]:
# train_dataset = BuildDataset(df=df)
# train_loader = DataLoader(train_dataset, batch_size=config.train_batch_size,shuffle=True)
# valid_loader = DataLoader(train_dataset, batch_size=config.valid_batch_size,shuffle=True)
train_loader,valid_loader = fetch_dataloader(fold=0)


train_df.shape: (2126, 30)
valid_df.shape: (532, 30)


In [49]:
img,sensor,label = next(iter(train_loader))
print(img.shape)
print(sensor.shape)
print(label.shape)

  return torch.tensor(img).to(torch.float), torch.tensor(sensor).to(torch.float),torch.tensor(label).to(torch.float)


torch.Size([32, 3, 384, 384])
torch.Size([32, 1, 19])
torch.Size([32])


# model

sensorViLOnlyTransformerSS-仅vit

In [50]:
class sensorViLOnlyTransformerSS(nn.Module):

    def __init__(self, sensor_class_n, output_class_n):
        super().__init__()
        self.token_type_embeddings = nn.Embedding(2, config.hidden_size)
        self.token_type_embeddings.apply(objectives.init_weights)
        self.transformer = getattr(vit, config.vit)(
            pretrained=True, config=vars(config)
        )
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()
        self.pooler = heads.Pooler(config.hidden_size)
        self.classifier = nn.Linear(config.hidden_size, output_class_n)

    def infer(
        self,
        batch,
        mask_image=False,
        image_token_type_idx=1,
        image_embeds=None,
        image_masks=None,
    ):

        if image_embeds is None and image_masks is None:
            img = batch["image"].to(config.device)

            (
                image_embeds,  # torch.Size([1, 217, 768])
                image_masks,  # torch.Size([1, 217])
                patch_index,
                image_labels,
            ) = self.transformer.visual_embed(
                img,
                max_image_len=config.max_image_len,
                mask_it=mask_image,
            )
        else:
            patch_index, image_labels = (
                None,
                None,
            )
        # 用embedding对数据输入预处理，降低维度
        image_embeds = image_embeds + self.token_type_embeddings(
            torch.full_like(image_masks, image_token_type_idx)
        )
        # sensor_masks = batch['sensor_masks'] # 序列数量
        batch_size = img.shape[0]
        sensor_masks = torch.ones(batch_size, 1).to(config.device)  # 序列数量
        image_masks = image_masks.to(config.device)
        co_embeds = image_embeds
        co_masks = image_masks

        x = co_embeds.to(config.device)  # torch.Size([1, 145, 768])

        for i, blk in enumerate(self.transformer.blocks):
            blk = blk.to(config.device)
            x, _attn = blk(x, mask=co_masks)  # co_masks = torch.Size([1, 211])

        x = self.transformer.norm(x)  # torch.Size([1, 240, 768])
        image_feats = x
        cls_feats = self.pooler(x)  # torch.Size([1, 768])
        # cls_feats = self.dense(x)
        # cls_feats = self.activation(cls_feats)
        cls_output = self.classifier(cls_feats)
        # m = nn.Softmax(dim=1)

        m = nn.Sigmoid()
        cls_output = m(cls_output)

        ret = {
            "image_feats": image_feats,
            "cls_feats": cls_feats,  # class features
            "raw_cls_feats": x[:, 0],
            "image_labels": image_labels,
            "image_masks": image_masks,

            "patch_index": patch_index,

            "cls_output": cls_output,
        }

        return ret

    def forward(self, batch):
        ret = dict()

        ret.update(self.infer(batch))
        return ret

sensorViLTransformerSS

In [51]:

class sensorViLTransformerSS(nn.Module):

    def __init__(self,sensor_class_n,output_class_n):
        super().__init__()
        self.sensor_linear = nn.Linear(sensor_class_n,config.hidden_size) 

        self.token_type_embeddings = nn.Embedding(2, config.hidden_size)
        self.token_type_embeddings.apply(objectives.init_weights)

        self.transformer = getattr(vit, config.vit)(
                pretrained=True, config=vars(config)
            )
       
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()


        self.pooler = heads.Pooler(config.hidden_size)

        # self.pooler.apply(objectives.init_weights)
        self.classifier = nn.Linear(config.hidden_size,output_class_n)

        hs = config.hidden_size


    def infer(
        self,
        batch,
        mask_image=False,
        image_token_type_idx=1,
        image_embeds=None,
        image_masks=None,
    ):
        sensor = batch['sensor'].to(config.device)
        sensor_embeds = self.sensor_linear(sensor) # input[1,1,12]  output[1,1,768]
        

        if image_embeds is None and image_masks is None:
            img = batch["image"].to(config.device)
       
            (
                image_embeds, # torch.Size([1, 217, 768])
                image_masks, # torch.Size([1, 217])
                patch_index,
                image_labels,
            ) = self.transformer.visual_embed(
                img,
                max_image_len=config.max_image_len,
                mask_it=mask_image,
            )
        else:
            patch_index, image_labels = (
                None,
                None,
            )
        # 用embedding对数据输入预处理，降低维度
        image_embeds = image_embeds + self.token_type_embeddings(
                torch.full_like(image_masks, image_token_type_idx)
            )
        # sensor_masks = batch['sensor_masks'] # 序列数量
        batch_size = img.shape[0]
        sensor_masks = torch.ones(batch_size,1).to(config.device) # 序列数量
        image_masks = image_masks.to(config.device)
        co_embeds = torch.cat([sensor_embeds, image_embeds], dim=1) # torch.Size([1, 240, 768]) ->240=217+23
        co_masks = torch.cat([sensor_masks, image_masks], dim=1) # torch.Size([1, 240])

        x = co_embeds.to(config.device) # torch.Size([1, 211, 768])

        for i, blk in enumerate(self.transformer.blocks): 
            blk = blk.to(config.device)
            x, _attn = blk(x, mask=co_masks) # co_masks = torch.Size([1, 211])

        x = self.transformer.norm(x) # torch.Size([1, 240, 768])
        sensor_feats, image_feats = ( # torch.Size([1, 23, 768]),torch.Size([1, 217, 768])
            x[:, : sensor_embeds.shape[1]], # 后面字数输出23维
            x[:, sensor_embeds.shape[1] :], # 前面图片输出217维
        )
        cls_feats = self.pooler(x) # torch.Size([1, 768])
        # cls_feats = self.dense(x)
        # cls_feats = self.activation(cls_feats)
        cls_output = self.classifier(cls_feats)
        # m = nn.Softmax(dim=1)
        
        m = nn.Sigmoid()
        cls_output = m(cls_output)
        
        ret = {
           "sensor_feats":sensor_feats,
            "image_feats": image_feats,
            "cls_feats": cls_feats, # class features
            "raw_cls_feats": x[:, 0],
            "image_labels": image_labels,
            "image_masks": image_masks,
           
            "patch_index": patch_index,

            "cls_output":cls_output,
        }

        return ret

    def forward(self, batch):
        ret = dict()
        
        ret.update(self.infer(batch))
        return ret


sensorOnlyViLTransformerSS

In [52]:

class sensorOnlyViLTransformerSS(nn.Module):

    def __init__(self,sensor_class_n,output_class_n):
        super().__init__()
        self.sensor_linear = nn.Linear(sensor_class_n,config.hidden_size) 

        self.token_type_embeddings = nn.Embedding(2, config.hidden_size)
        self.token_type_embeddings.apply(objectives.init_weights)

        self.transformer = getattr(vit, config.vit)(
                pretrained=True, config=vars(config)
            )
       
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()


        self.pooler = heads.Pooler(config.hidden_size)

        # self.pooler.apply(objectives.init_weights)
        self.classifier = nn.Linear(config.hidden_size,output_class_n)

        hs = config.hidden_size


    def infer(
        self,
        batch,
        # mask_image=False,
        # image_token_type_idx=1,
        # image_embeds=None,
        # image_masks=None,
    ):
        sensor = batch['sensor'].to(config.device)
        sensor_embeds = self.sensor_linear(sensor) # input[1,1,12]  output[1,1,768]
        

        # if image_embeds is None and image_masks is None:
        #     img = batch["image"].to(config.device)
       
        #     (
        #         image_embeds, # torch.Size([1, 217, 768])
        #         image_masks, # torch.Size([1, 217])
        #         patch_index,
        #         image_labels,
        #     ) = self.transformer.visual_embed(
        #         img,
        #         max_image_len=config.max_image_len,
        #         mask_it=mask_image,
        #     )
        # else:
        #     patch_index, image_labels = (
        #         None,
        #         None,
        #     )
        # 用embedding对数据输入预处理，降低维度
        # image_embeds = image_embeds + self.token_type_embeddings(
        #         torch.full_like(image_masks, image_token_type_idx)
        #     )
        # sensor_masks = batch['sensor_masks'] # 序列数量
        # batch_size = img.shape[0]
        sensor_masks = torch.ones(sensor_embeds.shape[1],1).to(config.device) # 序列数量
        # image_masks = image_masks.to(config.device)
        # co_embeds = torch.cat([sensor_embeds, image_embeds], dim=1) # torch.Size([1, 240, 768]) ->240=217+23
        # co_masks = torch.cat([sensor_masks, image_masks], dim=1) # torch.Size([1, 240])
        co_embeds = sensor_embeds
        co_masks = sensor_masks

        x = co_embeds.to(config.device) # torch.Size([1, 1, 768])

        for i, blk in enumerate(self.transformer.blocks):
            blk = blk.to(config.device)
            x, _attn = blk(x, mask=co_masks)

        x = self.transformer.norm(x) # torch.Size([1, 240, 768])
        # sensor_feats, image_feats = ( # torch.Size([1, 23, 768]),torch.Size([1, 217, 768])
        #     x[:, : sensor_embeds.shape[1]], # 后面字数输出23维
        #     x[:, sensor_embeds.shape[1] :], # 前面图片输出217维
        # )
        cls_feats = self.pooler(x) # torch.Size([1, 768])
        # cls_feats = self.dense(x)
        # cls_feats = self.activation(cls_feats)
        cls_output = self.classifier(cls_feats)
        # m = nn.Softmax(dim=1)
        
        m = nn.Sigmoid()
        cls_output = m(cls_output)
        
        ret = {
        #    "sensor_feats":sensor_feats,
            # "image_feats": image_feats,
            "cls_feats": cls_feats, # class features
            "raw_cls_feats": x[:, 0],
            # "image_labels": image_labels,
            # "image_masks": image_masks,
           
            # "patch_index": patch_index,

            "cls_output":cls_output,
        }

        return ret

    def forward(self, batch):
        ret = dict()
        
        ret.update(self.infer(batch))
        return ret


sensorResnet50TransformerSS

In [53]:

class sensorResnet50TransformerSS(nn.Module):

    def __init__(self,sensor_class_n,output_class_n):
        super().__init__()
        self.sensor_linear = nn.Linear(sensor_class_n,config.hidden_size) 
        # resnet model
        resnet_model = pretrainedmodels.__dict__["resnet50"](
    num_classes=1000, pretrained='imagenet')
        features = list([resnet_model.conv1, resnet_model.bn1, resnet_model.relu, resnet_model.maxpool, resnet_model.layer1, resnet_model.layer2, resnet_model.layer3,resnet_model.layer4])
        conv = nn.Conv2d(2048, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        bn = nn.BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        relu = nn.ReLU(inplace=True)


        self.resnet_features = nn.Sequential(*features,conv,bn,relu)

        self.token_type_embeddings = nn.Embedding(2, config.hidden_size)
        self.token_type_embeddings.apply(objectives.init_weights)

        self.transformer = getattr(vit, config.vit)(
                pretrained=True, config=vars(config)
            )
       
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()


        self.pooler = heads.Pooler(config.hidden_size)

        # self.pooler.apply(objectives.init_weights)
        self.classifier = nn.Linear(config.hidden_size,output_class_n)

        hs = config.hidden_size


    def infer(
        self,
        batch,
        mask_image=False,
        image_token_type_idx=1,
        image_embeds=None,
        image_masks=None,
    ):
        sensor = batch['sensor'].to(config.device)
        sensor_embeds = self.sensor_linear(sensor) # input[1,1,12]  output[1,1,768]
        img = batch["image"].to(config.device)
        image_embeds = self.resnet_features(img) 
        image_embeds = image_embeds.flatten(2).transpose(1, 2)
        image_masks = torch.ones(image_embeds.shape[0],image_embeds.shape[1],dtype=torch.int64).to(config.device)

        # 用embedding对数据输入预处理，降低维度
        image_embeds = image_embeds + self.token_type_embeddings(
                torch.full_like(image_masks, image_token_type_idx)
            )
        # sensor_masks = batch['sensor_masks'] # 序列数量
        batch_size = img.shape[0]
        sensor_masks = torch.ones(batch_size,1).to(config.device) # 序列数量
        image_masks = image_masks.to(config.device)
        co_embeds = torch.cat([sensor_embeds, image_embeds], dim=1) # torch.Size([1, 240, 768]) ->240=217+23
        co_masks = torch.cat([sensor_masks, image_masks], dim=1) # torch.Size([1, 240])

        x = co_embeds.to(config.device) # torch.Size([1, 211, 768])

        for i, blk in enumerate(self.transformer.blocks): 
            blk = blk.to(config.device)
            x, _attn = blk(x, mask=co_masks) # co_masks = torch.Size([1, 211])

        x = self.transformer.norm(x) # torch.Size([1, 240, 768])
        sensor_feats, image_feats = ( # torch.Size([1, 23, 768]),torch.Size([1, 217, 768])
            x[:, : sensor_embeds.shape[1]], # 后面字数输出23维
            x[:, sensor_embeds.shape[1] :], # 前面图片输出217维
        )
        cls_feats = self.pooler(x) # torch.Size([1, 768])
        # cls_feats = self.dense(x)
        # cls_feats = self.activation(cls_feats)
        cls_output = self.classifier(cls_feats)
        # m = nn.Softmax(dim=1)
        
        m = nn.Sigmoid()
        cls_output = m(cls_output)
        
        ret = {
           "sensor_feats":sensor_feats,
            "image_feats": image_feats,
            "cls_feats": cls_feats, # class features
            "raw_cls_feats": x[:, 0],
            "image_masks": image_masks,
           

            "cls_output":cls_output,
        }

        return ret

    def forward(self, batch):
        ret = dict()
        
        ret.update(self.infer(batch))
        return ret


sensorResnet101TransformerSS

In [54]:

class sensorResnet101TransformerSS(nn.Module):

    def __init__(self,sensor_class_n,output_class_n):
        super().__init__()
        self.sensor_linear = nn.Linear(sensor_class_n,config.hidden_size) 
        # resnet model
        resnet_model = pretrainedmodels.__dict__["resnet101"](
    num_classes=1000, pretrained='imagenet')
        features = list([resnet_model.conv1, resnet_model.bn1, resnet_model.relu, resnet_model.maxpool, resnet_model.layer1, resnet_model.layer2, resnet_model.layer3,resnet_model.layer4])
        conv = nn.Conv2d(2048, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        bn = nn.BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        relu = nn.ReLU(inplace=True)


        self.resnet_features = nn.Sequential(*features,conv,bn,relu)

        self.token_type_embeddings = nn.Embedding(2, config.hidden_size)
        self.token_type_embeddings.apply(objectives.init_weights)

        self.transformer = getattr(vit, config.vit)(
                pretrained=True, config=vars(config)
            )
       
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()


        self.pooler = heads.Pooler(config.hidden_size)

        # self.pooler.apply(objectives.init_weights)
        self.classifier = nn.Linear(config.hidden_size,output_class_n)

        hs = config.hidden_size


    def infer(
        self,
        batch,
        mask_image=False,
        image_token_type_idx=1,
        image_embeds=None,
        image_masks=None,
    ):
        sensor = batch['sensor'].to(config.device)
        sensor_embeds = self.sensor_linear(sensor) # input[1,1,12]  output[1,1,768]
        img = batch["image"].to(config.device)
        image_embeds = self.resnet_features(img) 
        image_embeds = image_embeds.flatten(2).transpose(1, 2)
        image_masks = torch.ones(image_embeds.shape[0],image_embeds.shape[1],dtype=torch.int64).to(config.device)

        # 用embedding对数据输入预处理，降低维度
        image_embeds = image_embeds + self.token_type_embeddings(
                torch.full_like(image_masks, image_token_type_idx)
            )
        # sensor_masks = batch['sensor_masks'] # 序列数量
        batch_size = img.shape[0]
        sensor_masks = torch.ones(batch_size,1).to(config.device) # 序列数量
        image_masks = image_masks.to(config.device)
        co_embeds = torch.cat([sensor_embeds, image_embeds], dim=1) # torch.Size([1, 240, 768]) ->240=217+23
        co_masks = torch.cat([sensor_masks, image_masks], dim=1) # torch.Size([1, 240])

        x = co_embeds.to(config.device) # torch.Size([1, 211, 768])

        for i, blk in enumerate(self.transformer.blocks): 
            blk = blk.to(config.device)
            x, _attn = blk(x, mask=co_masks) # co_masks = torch.Size([1, 211])

        x = self.transformer.norm(x) # torch.Size([1, 240, 768])
        sensor_feats, image_feats = ( # torch.Size([1, 23, 768]),torch.Size([1, 217, 768])
            x[:, : sensor_embeds.shape[1]], # 后面字数输出23维
            x[:, sensor_embeds.shape[1] :], # 前面图片输出217维
        )
        cls_feats = self.pooler(x) # torch.Size([1, 768])
        # cls_feats = self.dense(x)
        # cls_feats = self.activation(cls_feats)
        cls_output = self.classifier(cls_feats)
        # m = nn.Softmax(dim=1)
        
        m = nn.Sigmoid()
        cls_output = m(cls_output)
        
        ret = {
           "sensor_feats":sensor_feats,
            "image_feats": image_feats,
            "cls_feats": cls_feats, # class features
            "raw_cls_feats": x[:, 0],
            "image_masks": image_masks,
           

            "cls_output":cls_output,
        }

        return ret

    def forward(self, batch):
        ret = dict()
        
        ret.update(self.infer(batch))
        return ret


## model build

In [55]:
import pretrainedmodels
from efficientnet_pytorch import EfficientNet

def build_model(model_name: str,pre_train):
    if model_name[:6] == "resnet50":
        model = pretrainedmodels.__dict__[config.model_name](
            num_classes=1000, pretrained='imagenet')
        dim_feats = model.last_linear.in_features  # =2048
        nb_classes = 1
        model.last_linear = nn.Linear(dim_feats, nb_classes)
        return model
    if model_name == "se_resnet50":
        model = pretrainedmodels.__dict__[config.model_name](
            num_classes=1000, pretrained='imagenet')
        model.last_linear = nn.Linear(204800, 1,bias=True)
        return model
    if model_name == "efficientnet-b4": # efficient net
        # refer:https://github.com/lukemelas/EfficientNet-PyTorch#example-classification
        nb_classes = 1
        if pre_train:
            model = EfficientNet.from_pretrained(config.model_name)# 'efficientnet-b4'
        else:
            model = EfficientNet.from_name(config.model_name)# 'efficientnet-b4'
        model._fc = nn.Linear(1792, nb_classes)
        return model

    if model_name == "sensorOnlyViLTransformerSS": #仅传感器
        model = sensorOnlyViLTransformerSS(sensor_class_n= config.senser_input_num,output_class_n = 1)
        return model
    if model_name == "sensorViLOnlyTransformerSS": # 仅vit图像
        model = sensorViLOnlyTransformerSS(sensor_class_n= config.senser_input_num,output_class_n = 1)
        return model
        
    if model_name == "sensorResnet50TransformerSS":
        model = sensorResnet50TransformerSS(sensor_class_n= config.senser_input_num,output_class_n = 1)
        return model
    if model_name == "sensorResnet101TransformerSS":
        model = sensorResnet101TransformerSS(sensor_class_n= config.senser_input_num,output_class_n = 1)
        return model

    if model_name == "sensorViLTransformerSS":
        model = sensorViLTransformerSS(sensor_class_n= config.senser_input_num,output_class_n = 1)
        return model

model = build_model(config.model_name,True)
model.to(config.device)
print(config.device)
for i,m in enumerate(model.modules()):
    print(i,m)

No pretrained weights exist or were found for this model. Using random initialization.


cuda:0
0 sensorViLOnlyTransformerSS(
  (token_type_embeddings): Embedding(2, 768)
  (transformer): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32))
    )
    (pos_drop): Dropout(p=0.1, inplace=False)
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.1, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU(approximate=none)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (drop): Dropout(

test

In [56]:

# sensor = torch.rand(config.senser_input_num)
# # sensor = torch.ones(config.senser_input_num)
# print(sensor)
# sensor =  torch.tensor(sensor).unsqueeze(0).unsqueeze(0) # torch.Size([1, 1, 3])
# batch = {}
# batch['sensor'] = sensor
# batch['image'] = "/home/junsheng/data/xiangguan/pic/xiangguanD4-2021-05-24-10-00-25.jpeg"
# model(batch)

# 损失函数

In [57]:
criterion = F.mse_loss #均方误差损失函数
# criterion = F.mae_loss

# train one epoch

In [58]:



def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    model.train()
    dataset_size = 0
    running_loss = 0.0
    
    pbar = tqdm(enumerate(dataloader), total=len(dataloader), desc='Train ')
    for step, (img, sensor,label) in pbar:         
        # img = img.to(device, dtype=torch.float)
        # sensor  = sensor.to(device, dtype=torch.float)
        # label  = label.to(device, dtype=torch.float)
        batch_size = img.size(0)
        
        batch = {"image":img,"sensor":sensor}

        y_pred = model(batch)
        label = label.to(config.device).unsqueeze(1)
        loss = criterion(y_pred['cls_output'], label)
        
        #一坨优化
        optimizer.zero_grad()#每一次反向传播之前都要归零梯度
        loss.backward()      #反向传播
        optimizer.step()     #固定写法
        scheduler.step()
     
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size
        mem = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0
        current_lr = optimizer.param_groups[0]['lr']
        pbar.set_postfix(train_loss=f'{epoch_loss:0.4f}',
                        lr=f'{current_lr:0.5f}',
                        gpu_mem=f'{mem:0.2f} GB')

    
        
        
    torch.cuda.empty_cache()
    gc.collect()
    
    return epoch_loss

# valid one epoch

In [59]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, optimizer):
    model.eval()
    
    dataset_size = 0
    running_loss = 0.0
    
    val_scores = []
    
    pbar = tqdm(enumerate(dataloader), total=len(dataloader), desc='Valid ')
    for step, (img, sensor,label) in pbar:               
        
        
        batch_size = img.size(0)
        batch = {"image":img,"sensor":sensor}

        y_pred  = model(batch)
        label = label.to(config.device).unsqueeze(1)

        loss = criterion(y_pred['cls_output'], label)
        
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        
        epoch_loss = running_loss / dataset_size
        
        
        mem = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0
        current_lr = optimizer.param_groups[0]['lr']
        pbar.set_postfix(valid_loss=f'{epoch_loss:0.4f}',
                        lr=f'{current_lr:0.5f}',
                        gpu_memory=f'{mem:0.2f} GB')
    torch.cuda.empty_cache()
    gc.collect()
    
    return epoch_loss

# train

In [60]:

def run_training(model, optimizer, scheduler, device, num_epochs):
     # init wandb
    run = wandb.init(project="vilt",
                    config={k: v for k, v in dict(vars(config)).items() if '__' not in k},
                    # config={k: v for k, v in dict(config).items() if '__' not in k},
                    anonymous=anonymous,
                    # name=f"vilt|fold-{config.valid_fold}",
                    name=config.wandb_name,
                    # group=config.wandb_group,
                    )
    wandb.watch(model, log_freq=100)

    best_loss = 9999
    best_valid_loss = 9999
    history = defaultdict(list)
    if torch.cuda.is_available():
        print("cuda: {}\n".format(torch.cuda.get_device_name()))
    
    for epoch in range(1, num_epochs + 1): 
        gc.collect()
        print(f'Epoch {epoch}/{num_epochs}', end='')
        train_loss = train_one_epoch(model, optimizer, scheduler, 
                                           dataloader=train_loader, 
                                           device=device, epoch=epoch)
        val_loss = valid_one_epoch(model,valid_loader,device=device,optimizer=optimizer)
        history['Train Loss'].append(train_loss)
        history['Valid Loss'].append(val_loss)

        wandb.log({"Train Loss": train_loss,
                    "Valid Loss": val_loss,
                "lr": scheduler.get_last_lr()[0]
                })
        if best_valid_loss > val_loss:
            best_valid_loss = val_loss
            # model_file_path = os.path.join(wandb.run.dir,"epoch-{}-{}.bin".format(epoch,wandb.run.id))
            model_file_path = os.path.join(wandb.run.dir,"epoch-best.bin")
            run.summary["Best Epoch"] = epoch
            torch.save(model.state_dict(), model_file_path)
            print("model save to", model_file_path)
            
    os.system("cp /home/junsheng/ViLT/my_vilt_tianhang_soybean.ipynb {}".format(wandb.run.dir))
    run.finish()
    return model, history

optimizer

In [61]:
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
scheduler = lr_scheduler.CosineAnnealingLR(optimizer,T_max=config.T_max, 
                                                   eta_min=1e-5)


run train

In [62]:

model, history = run_training(model, optimizer, scheduler,device=config.device,num_epochs=config.max_epoch)



0,1
Train Loss,█▆▆▃▄▂▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
Valid Loss,█▆▆▅▄▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
lr,████▇▇▇▆▆▅▅▄▄▃▃▃▂▂▂▁▁▁▁▁▁▁

0,1
Best Epoch,26.0
Train Loss,0.01645
Valid Loss,0.01765
lr,4e-05


cuda: NVIDIA GeForce RTX 3090

Epoch 1/50

  return torch.tensor(img).to(torch.float), torch.tensor(sensor).to(torch.float),torch.tensor(label).to(torch.float)
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Train : 100%|██████████| 67/67 [02:34<00:00,  2.30s/it, gpu_mem=7.07 GB, lr=0.00100, train_loss=0.2876]
Valid : 100%|██████████| 133/133 [00:38<00:00,  3.42it/s, gpu_memory=3.35 GB, lr=0.00100, valid_loss=0.2937]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 2/50

Train : 100%|██████████| 67/67 [02:32<00:00,  2.27s/it, gpu_mem=7.02 GB, lr=0.00098, train_loss=0.2933]
Valid : 100%|██████████| 133/133 [00:38<00:00,  3.41it/s, gpu_memory=3.34 GB, lr=0.00098, valid_loss=0.2937]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 3/50

Train : 100%|██████████| 67/67 [02:32<00:00,  2.27s/it, gpu_mem=7.02 GB, lr=0.00096, train_loss=0.2933]
Valid : 100%|██████████| 133/133 [00:39<00:00,  3.34it/s, gpu_memory=3.34 GB, lr=0.00096, valid_loss=0.2937]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 4/50

Train : 100%|██████████| 67/67 [02:33<00:00,  2.29s/it, gpu_mem=7.02 GB, lr=0.00093, train_loss=0.2933]
Valid : 100%|██████████| 133/133 [00:39<00:00,  3.35it/s, gpu_memory=3.34 GB, lr=0.00093, valid_loss=0.2936]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 5/50

Train : 100%|██████████| 67/67 [02:31<00:00,  2.27s/it, gpu_mem=7.02 GB, lr=0.00089, train_loss=0.2059]
Valid : 100%|██████████| 133/133 [00:39<00:00,  3.35it/s, gpu_memory=3.34 GB, lr=0.00089, valid_loss=0.0785]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 6/50

Train : 100%|██████████| 67/67 [02:34<00:00,  2.30s/it, gpu_mem=7.02 GB, lr=0.00085, train_loss=0.0812]
Valid : 100%|██████████| 133/133 [00:38<00:00,  3.48it/s, gpu_memory=3.34 GB, lr=0.00085, valid_loss=0.0779]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 7/50

Train : 100%|██████████| 67/67 [02:32<00:00,  2.27s/it, gpu_mem=7.02 GB, lr=0.00080, train_loss=0.0806]
Valid : 100%|██████████| 133/133 [00:39<00:00,  3.34it/s, gpu_memory=3.34 GB, lr=0.00080, valid_loss=0.0838]


Epoch 8/50

Train : 100%|██████████| 67/67 [02:30<00:00,  2.25s/it, gpu_mem=7.02 GB, lr=0.00074, train_loss=0.0811]
Valid : 100%|██████████| 133/133 [00:40<00:00,  3.32it/s, gpu_memory=3.34 GB, lr=0.00074, valid_loss=0.0778]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 9/50

Train : 100%|██████████| 67/67 [02:32<00:00,  2.28s/it, gpu_mem=7.02 GB, lr=0.00068, train_loss=0.0804]
Valid : 100%|██████████| 133/133 [00:38<00:00,  3.45it/s, gpu_memory=3.34 GB, lr=0.00068, valid_loss=0.0780]


Epoch 10/50

Train : 100%|██████████| 67/67 [02:33<00:00,  2.29s/it, gpu_mem=7.02 GB, lr=0.00061, train_loss=0.0795]
Valid : 100%|██████████| 133/133 [00:40<00:00,  3.32it/s, gpu_memory=3.34 GB, lr=0.00061, valid_loss=0.0815]


Epoch 11/50

Train : 100%|██████████| 67/67 [02:33<00:00,  2.28s/it, gpu_mem=7.02 GB, lr=0.00055, train_loss=0.0796]
Valid : 100%|██████████| 133/133 [00:39<00:00,  3.38it/s, gpu_memory=3.34 GB, lr=0.00055, valid_loss=0.0817]


Epoch 12/50

Train : 100%|██████████| 67/67 [02:32<00:00,  2.28s/it, gpu_mem=7.02 GB, lr=0.00048, train_loss=0.0790]
Valid : 100%|██████████| 133/133 [00:38<00:00,  3.43it/s, gpu_memory=3.34 GB, lr=0.00048, valid_loss=0.0849]


Epoch 13/50

Train : 100%|██████████| 67/67 [02:31<00:00,  2.27s/it, gpu_mem=7.02 GB, lr=0.00042, train_loss=0.0788]
Valid : 100%|██████████| 133/133 [00:37<00:00,  3.56it/s, gpu_memory=3.34 GB, lr=0.00042, valid_loss=0.0775]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 14/50

Train : 100%|██████████| 67/67 [02:24<00:00,  2.16s/it, gpu_mem=7.02 GB, lr=0.00035, train_loss=0.0790]
Valid : 100%|██████████| 133/133 [00:36<00:00,  3.68it/s, gpu_memory=3.34 GB, lr=0.00035, valid_loss=0.0781]


Epoch 15/50

Train : 100%|██████████| 67/67 [02:22<00:00,  2.13s/it, gpu_mem=7.02 GB, lr=0.00029, train_loss=0.0782]
Valid : 100%|██████████| 133/133 [00:37<00:00,  3.56it/s, gpu_memory=3.34 GB, lr=0.00029, valid_loss=0.0802]


Epoch 16/50

Train : 100%|██████████| 67/67 [02:24<00:00,  2.15s/it, gpu_mem=7.02 GB, lr=0.00023, train_loss=0.0788]
Valid : 100%|██████████| 133/133 [00:37<00:00,  3.58it/s, gpu_memory=3.34 GB, lr=0.00023, valid_loss=0.0761]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 17/50

Train : 100%|██████████| 67/67 [02:25<00:00,  2.18s/it, gpu_mem=7.02 GB, lr=0.00018, train_loss=0.0678]
Valid : 100%|██████████| 133/133 [00:37<00:00,  3.54it/s, gpu_memory=3.34 GB, lr=0.00018, valid_loss=0.0520]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 18/50

Train : 100%|██████████| 67/67 [02:25<00:00,  2.17s/it, gpu_mem=7.02 GB, lr=0.00013, train_loss=0.0343]
Valid : 100%|██████████| 133/133 [00:36<00:00,  3.64it/s, gpu_memory=3.34 GB, lr=0.00013, valid_loss=0.0260]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 19/50

Train : 100%|██████████| 67/67 [02:25<00:00,  2.17s/it, gpu_mem=7.02 GB, lr=0.00009, train_loss=0.0249]
Valid : 100%|██████████| 133/133 [00:36<00:00,  3.65it/s, gpu_memory=3.34 GB, lr=0.00009, valid_loss=0.0212]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 20/50

Train : 100%|██████████| 67/67 [02:21<00:00,  2.11s/it, gpu_mem=7.02 GB, lr=0.00006, train_loss=0.0182]
Valid : 100%|██████████| 133/133 [00:34<00:00,  3.87it/s, gpu_memory=3.34 GB, lr=0.00006, valid_loss=0.0170]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 21/50

Train : 100%|██████████| 67/67 [02:20<00:00,  2.09s/it, gpu_mem=7.02 GB, lr=0.00003, train_loss=0.0151]
Valid : 100%|██████████| 133/133 [00:37<00:00,  3.58it/s, gpu_memory=3.34 GB, lr=0.00003, valid_loss=0.0136]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 22/50

Train : 100%|██████████| 67/67 [02:26<00:00,  2.18s/it, gpu_mem=7.02 GB, lr=0.00002, train_loss=0.0144]
Valid : 100%|██████████| 133/133 [00:37<00:00,  3.52it/s, gpu_memory=3.34 GB, lr=0.00002, valid_loss=0.0135]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 23/50

Train : 100%|██████████| 67/67 [02:25<00:00,  2.17s/it, gpu_mem=7.02 GB, lr=0.00001, train_loss=0.0142]
Valid : 100%|██████████| 133/133 [00:37<00:00,  3.58it/s, gpu_memory=3.34 GB, lr=0.00001, valid_loss=0.0134]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 24/50

Train : 100%|██████████| 67/67 [02:25<00:00,  2.16s/it, gpu_mem=7.02 GB, lr=0.00001, train_loss=0.0139]
Valid : 100%|██████████| 133/133 [00:36<00:00,  3.63it/s, gpu_memory=3.34 GB, lr=0.00001, valid_loss=0.0130]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 25/50

Train : 100%|██████████| 67/67 [02:24<00:00,  2.16s/it, gpu_mem=7.02 GB, lr=0.00002, train_loss=0.0140]
Valid : 100%|██████████| 133/133 [00:37<00:00,  3.54it/s, gpu_memory=3.34 GB, lr=0.00002, valid_loss=0.0135]


Epoch 26/50

Train : 100%|██████████| 67/67 [02:26<00:00,  2.18s/it, gpu_mem=7.02 GB, lr=0.00004, train_loss=0.0142]
Valid : 100%|██████████| 133/133 [00:37<00:00,  3.57it/s, gpu_memory=3.34 GB, lr=0.00004, valid_loss=0.0169]


Epoch 27/50

Train : 100%|██████████| 67/67 [02:24<00:00,  2.16s/it, gpu_mem=7.02 GB, lr=0.00007, train_loss=0.0142]
Valid : 100%|██████████| 133/133 [00:37<00:00,  3.55it/s, gpu_memory=3.34 GB, lr=0.00007, valid_loss=0.0262]


Epoch 28/50

Train : 100%|██████████| 67/67 [02:25<00:00,  2.17s/it, gpu_mem=7.02 GB, lr=0.00011, train_loss=0.0144]
Valid : 100%|██████████| 133/133 [00:36<00:00,  3.65it/s, gpu_memory=3.34 GB, lr=0.00011, valid_loss=0.0124]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 29/50

Train : 100%|██████████| 67/67 [02:24<00:00,  2.16s/it, gpu_mem=7.02 GB, lr=0.00015, train_loss=0.0135]
Valid : 100%|██████████| 133/133 [00:37<00:00,  3.57it/s, gpu_memory=3.34 GB, lr=0.00015, valid_loss=0.0109]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 30/50

Train : 100%|██████████| 67/67 [02:25<00:00,  2.17s/it, gpu_mem=7.02 GB, lr=0.00020, train_loss=0.0188]
Valid : 100%|██████████| 133/133 [00:37<00:00,  3.54it/s, gpu_memory=3.34 GB, lr=0.00020, valid_loss=0.0121]


Epoch 31/50

Train : 100%|██████████| 67/67 [02:26<00:00,  2.18s/it, gpu_mem=7.02 GB, lr=0.00025, train_loss=0.0115]
Valid : 100%|██████████| 133/133 [00:37<00:00,  3.53it/s, gpu_memory=3.34 GB, lr=0.00025, valid_loss=0.0072]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 32/50

Train : 100%|██████████| 67/67 [02:25<00:00,  2.18s/it, gpu_mem=7.02 GB, lr=0.00031, train_loss=0.0081]
Valid : 100%|██████████| 133/133 [00:36<00:00,  3.64it/s, gpu_memory=3.34 GB, lr=0.00031, valid_loss=0.0122]


Epoch 33/50

Train : 100%|██████████| 67/67 [02:24<00:00,  2.15s/it, gpu_mem=7.02 GB, lr=0.00037, train_loss=0.0056]
Valid : 100%|██████████| 133/133 [00:37<00:00,  3.58it/s, gpu_memory=3.34 GB, lr=0.00037, valid_loss=0.0066]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 34/50

Train : 100%|██████████| 67/67 [02:23<00:00,  2.15s/it, gpu_mem=7.02 GB, lr=0.00044, train_loss=0.0093]
Valid : 100%|██████████| 133/133 [00:37<00:00,  3.58it/s, gpu_memory=3.34 GB, lr=0.00044, valid_loss=0.0054]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 35/50

Train : 100%|██████████| 67/67 [02:24<00:00,  2.16s/it, gpu_mem=7.02 GB, lr=0.00051, train_loss=0.0051]
Valid : 100%|██████████| 133/133 [00:36<00:00,  3.63it/s, gpu_memory=3.34 GB, lr=0.00051, valid_loss=0.0065]


Epoch 36/50

Train : 100%|██████████| 67/67 [02:15<00:00,  2.02s/it, gpu_mem=7.02 GB, lr=0.00057, train_loss=0.0088]
Valid : 100%|██████████| 133/133 [00:34<00:00,  3.82it/s, gpu_memory=3.34 GB, lr=0.00057, valid_loss=0.0077]


Epoch 37/50

Train : 100%|██████████| 67/67 [02:17<00:00,  2.05s/it, gpu_mem=7.02 GB, lr=0.00064, train_loss=0.0038]
Valid : 100%|██████████| 133/133 [00:33<00:00,  3.94it/s, gpu_memory=3.34 GB, lr=0.00064, valid_loss=0.0073]


Epoch 38/50

Train : 100%|██████████| 67/67 [02:16<00:00,  2.04s/it, gpu_mem=7.02 GB, lr=0.00070, train_loss=0.0101]
Valid : 100%|██████████| 133/133 [00:33<00:00,  3.94it/s, gpu_memory=3.34 GB, lr=0.00070, valid_loss=0.0052]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 39/50

Train : 100%|██████████| 67/67 [02:15<00:00,  2.02s/it, gpu_mem=7.02 GB, lr=0.00076, train_loss=0.0044]
Valid : 100%|██████████| 133/133 [00:33<00:00,  3.96it/s, gpu_memory=3.34 GB, lr=0.00076, valid_loss=0.0029]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 40/50

Train : 100%|██████████| 67/67 [02:15<00:00,  2.03s/it, gpu_mem=7.02 GB, lr=0.00081, train_loss=0.0051]
Valid : 100%|██████████| 133/133 [00:33<00:00,  3.97it/s, gpu_memory=3.34 GB, lr=0.00081, valid_loss=0.0069]


Epoch 41/50

Train : 100%|██████████| 67/67 [02:15<00:00,  2.03s/it, gpu_mem=7.02 GB, lr=0.00086, train_loss=0.0150]
Valid : 100%|██████████| 133/133 [00:33<00:00,  3.98it/s, gpu_memory=3.34 GB, lr=0.00086, valid_loss=0.0102]


Epoch 42/50

Train : 100%|██████████| 67/67 [02:15<00:00,  2.02s/it, gpu_mem=7.02 GB, lr=0.00091, train_loss=0.0063]
Valid : 100%|██████████| 133/133 [00:34<00:00,  3.89it/s, gpu_memory=3.34 GB, lr=0.00091, valid_loss=0.0047]


Epoch 43/50

Train : 100%|██████████| 67/67 [02:14<00:00,  2.00s/it, gpu_mem=7.02 GB, lr=0.00094, train_loss=0.0043]
Valid : 100%|██████████| 133/133 [00:34<00:00,  3.89it/s, gpu_memory=3.34 GB, lr=0.00094, valid_loss=0.0032]


Epoch 44/50

Train : 100%|██████████| 67/67 [02:16<00:00,  2.03s/it, gpu_mem=7.02 GB, lr=0.00097, train_loss=0.0036]
Valid : 100%|██████████| 133/133 [00:34<00:00,  3.85it/s, gpu_memory=3.34 GB, lr=0.00097, valid_loss=0.0058]


Epoch 45/50

Train : 100%|██████████| 67/67 [02:12<00:00,  1.97s/it, gpu_mem=7.02 GB, lr=0.00099, train_loss=0.0047]
Valid : 100%|██████████| 133/133 [00:33<00:00,  3.97it/s, gpu_memory=3.34 GB, lr=0.00099, valid_loss=0.0050]


Epoch 46/50

Train : 100%|██████████| 67/67 [02:15<00:00,  2.02s/it, gpu_mem=7.02 GB, lr=0.00100, train_loss=0.0057]
Valid : 100%|██████████| 133/133 [00:34<00:00,  3.90it/s, gpu_memory=3.34 GB, lr=0.00100, valid_loss=0.0039]


Epoch 47/50

Train : 100%|██████████| 67/67 [02:16<00:00,  2.03s/it, gpu_mem=7.02 GB, lr=0.00100, train_loss=0.0051]
Valid : 100%|██████████| 133/133 [00:33<00:00,  4.02it/s, gpu_memory=3.34 GB, lr=0.00100, valid_loss=0.0088]


Epoch 48/50

Train : 100%|██████████| 67/67 [02:16<00:00,  2.04s/it, gpu_mem=7.02 GB, lr=0.00099, train_loss=0.0055]
Valid : 100%|██████████| 133/133 [00:33<00:00,  3.94it/s, gpu_memory=3.34 GB, lr=0.00099, valid_loss=0.0049]


Epoch 49/50

Train : 100%|██████████| 67/67 [02:13<00:00,  2.00s/it, gpu_mem=7.02 GB, lr=0.00098, train_loss=0.0043]
Valid : 100%|██████████| 133/133 [00:33<00:00,  3.94it/s, gpu_memory=3.34 GB, lr=0.00098, valid_loss=0.0023]


model save to /home/junsheng/ViLT/wandb/offline-run-20221102_172029-dlghdwkd/files/epoch-best.bin
Epoch 50/50

Train : 100%|██████████| 67/67 [02:17<00:00,  2.05s/it, gpu_mem=7.02 GB, lr=0.00095, train_loss=0.0033]
Valid : 100%|██████████| 133/133 [00:33<00:00,  3.98it/s, gpu_memory=3.34 GB, lr=0.00095, valid_loss=0.0028]


0,1
Train Loss,████▃▃▃▃▃▃▃▃▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Valid Loss,████▃▃▃▃▃▃▃▃▃▂▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,████▇▇▆▆▅▄▄▃▃▂▂▂▁▁▁▁▁▁▂▂▃▃▄▄▅▅▆▆▇▇██████

0,1
Best Epoch,49.0
Train Loss,0.00325
Valid Loss,0.00279
lr,0.00095


# infer

In [63]:
for (img,sensor,label) in valid_loader:
    print(img.shape,sensor.shape,label)
    break

torch.Size([4, 3, 384, 384]) torch.Size([4, 1, 19]) tensor([0., 0., 0., 0.])


  return torch.tensor(img).to(torch.float), torch.tensor(sensor).to(torch.float),torch.tensor(label).to(torch.float)


In [64]:
# torch.save(model.state_dict(), 'embedding_test_dict.pt')
# print(model)

# model.load_state_dict(torch.load("/home/junsheng/ViLT/wandb/offline-run-20220811_120519-nzfb1xoz/files/epoch-best.bin"))
model.eval()
device = config.device
model.to(device)
def infer(img_filename, sensor):
    try:
        img_path = os.path.join('pictures',img_filename)
        image = Image.open(img_path).convert("RGB")
        img = pixelbert_transform(size=384)(image) # 将图像数据归一化torch.Size([3, 384, 576])
        img = torch.tensor(img)
        img = torch.unsqueeze(img, 0) # torch.Size([1, 3, 384, 576])
        img = img.to(device)
        print("img.shape:",img.shape)
    except :
        print("图片加载失败！")
        raise

    batch = dict()
    batch["image"] = img

    batch['sensor_masks'] = torch.ones(1,1).to(device)
    with torch.no_grad():
        batch['sensor'] = sensor.to(device)       
        infer = model(batch)

        print(infer)
        sensor_emb, img_emb = infer["sensor_feats"], infer["image_feats"]# torch.Size([1, 23, 768]) torch.Size([1, 217, 768])
        cls_output = infer['cls_output']
        

    return [cls_output]


random test

In [65]:

examples=[
            "/home/junsheng/data/xiangguan/pic/xiangguanD4-2021-05-24-10-00-25.jpeg", #0
            
            "/home/junsheng/data/xiangguan/pic/xiangguanD4-2021-07-18-04-22-30-preset-18.jpeg", # 3
    ]



n = 1
sensor = torch.rand(config.senser_input_num)
# sensor = torch.ones(config.senser_input_num)
print(sensor)
sensor =  torch.tensor(sensor).unsqueeze(0).unsqueeze(0) # torch.Size([1, 1, 3])
out = infer(examples[0],sensor)
# print("out:",out,"000\n")
# print("out0.shape:",out[0].shape)
# cv2.imwrite('output.png',out[0])



tensor([0.2798, 0.3580, 0.3258, 0.9675, 0.3362, 0.1932, 0.6399, 0.4880, 0.2876,
        0.2520, 0.2133, 0.5669, 0.0398, 0.7086, 0.0027, 0.2921, 0.3442, 0.8358,
        0.7115])
img.shape: torch.Size([1, 3, 352, 608])
{'image_feats': tensor([[[-3.0109e+00, -8.9953e-03, -2.2351e-05,  ...,  4.6877e-02,
          -2.8691e-03, -3.2883e-03],
         [-1.2397e+00,  1.8484e-02, -3.1946e-02,  ..., -4.2533e-02,
           7.0037e-03, -3.2790e-04],
         [-1.6883e+00, -1.7783e-02,  2.9024e-03,  ...,  9.2604e-02,
          -3.3002e-02, -7.4470e-02],
         ...,
         [-1.6554e+00, -1.3973e-02, -1.1536e-02,  ...,  1.7800e-02,
           1.6437e-02,  1.9664e-02],
         [-1.7469e+00, -1.9370e-02,  1.4804e-02,  ...,  2.5505e-02,
          -3.4030e-02, -3.8892e-02],
         [-2.3801e+00, -1.9123e-02,  1.1407e-02,  ...,  5.9326e-03,
           9.1222e-03, -5.7441e-02]]], device='cuda:0'), 'cls_feats': tensor([[-9.4028e-01,  7.3397e-01,  6.6973e-01,  6.4067e-04, -4.8336e-01,
         -8.6946

  sensor =  torch.tensor(sensor).unsqueeze(0).unsqueeze(0) # torch.Size([1, 1, 3])


KeyError: 'sensor_feats'

In [None]:
out

[tensor([[0.8166]], device='cuda:0')]

In [None]:
print(out[0].cpu().numpy()[0][0])
#0.00031266143

0.8166


test by valid

选择三组生长期不同的数据去验证训练的结果

In [None]:
df_test = df.query("fold==0").reset_index(drop=True)
df_test.to_csv("test_by_valid.csv",index=False)
sensor_test_list = df_test.sensor.tolist()
image_test_list = df_test.image_path.tolist()

In [None]:
idx = 64
sensor =  torch.tensor(sensor_test_list[idx]).unsqueeze(0).unsqueeze(0)
out = infer(image_test_list[idx],sensor)

img.shape: torch.Size([1, 3, 352, 608])
{'sensor_feats': tensor([[[ 2.7024e-01,  8.4886e-01,  3.5035e-01,  5.3281e-02,  5.2474e-02,
          -1.7046e-01,  7.3329e-02,  3.0590e-02,  6.7037e-02, -3.7755e-01,
          -8.7652e-01,  1.3844e-01,  2.5090e-01, -3.3451e-01, -1.6991e-01,
          -2.3502e-02, -6.9722e-02, -1.1445e-01, -1.7391e-01,  3.5070e-01,
          -1.2979e+00,  2.7135e-01, -6.4320e-02, -9.8884e-01,  3.2600e-02,
          -4.4981e-01,  1.7270e-01,  3.7485e-01, -8.4396e-01, -6.4098e-02,
           1.6703e-01,  3.6598e-01, -4.2784e-02, -1.1087e-01,  1.1336e-02,
          -2.0030e+00, -7.8494e-01, -9.4766e-02, -1.2019e-01,  1.6362e-01,
          -1.2572e-01,  2.2219e-01, -3.8874e-02,  6.9456e-01, -6.6659e-01,
          -7.5572e-02, -6.5935e-02, -2.7468e-02,  1.0221e+00, -2.5591e-01,
           1.8912e-01, -5.2756e-01,  4.2142e-02,  1.3391e+00,  6.7435e-02,
           9.3807e-01,  1.4467e+00,  4.8933e-01,  2.6360e-01, -6.8080e-02,
           1.7264e-01,  7.1070e-02,  3.0559

  img = torch.tensor(img)


In [None]:
idx = 876
sensor =  torch.tensor(sensor_test_list[idx]).unsqueeze(0).unsqueeze(0)
out = infer(image_test_list[idx],sensor)

IndexError: list index out of range

In [None]:
idx = 1817
sensor =  torch.tensor(sensor_test_list[idx]).unsqueeze(0).unsqueeze(0)
out = infer(image_test_list[idx],sensor)