In [None]:
import os
import gc
import cv2
import math
import copy
import time
import random
from collections import OrderedDict


# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import StratifiedKFold

# For Image Models
import timm

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2

import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
# Global configuration shared by every cell below.
CONFIG = {
    # --- general ---
    "seed": 42,  # random seed
    "img_size": 768,  # image size passed to A.Resize (height and width)
    # backbone name: tf_efficientnet_b6_ns, tf_efficientnetv2_l_in21k, eca_nfnet_l2
    "model_name": "tf_efficientnet_b6_ns",
    "num_classes": 15587,  # number of classes (ArcFace output size)
    "embedding_size": 512,  # embedding dimension
    "train_batch_size": 64,  # batch size of the train loader
    "valid_batch_size": 64,  # batch size of the valid / test loaders
    "n_fold": 5,  # number of stratified folds
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),  # gpu / cpu
    # parallel-GPU flag; when set, the first 7 characters of every checkpoint
    # key are stripped while loading weights
    "gpu_parallel": True,
    "image_data": "fullbody",  # image variant: backfins, fullbody (anything else -> full images)
    "debug": True,  # debug mode, see the overrides below
    "num_workers": 10,  # DataLoader worker count

    # --- ArcFace hyperparameters ---
    "s": 30.0,  # scale
    "m": 0.30,  # margin
    "ls_eps": 0.0,  # label smoothing
    "easy_margin": False,  # easy-margin mode

    # --- KNN ---
    "KNN": 850,  # number of neighbours retrieved per query
}

# Debug-mode overrides.
# NOTE(review): debug is enabled above, so img_size is 512 for every transform
# below -- confirm this is intended when running real inference.
if CONFIG["debug"]:
    CONFIG.update(
        img_size=512,
        model_name="tf_efficientnet_b0_ns",
        train_batch_size=32,
        valid_batch_size=64,
    )

In [None]:
def seed_everything(seed):
    '''
    Seed every random number generator used in this notebook.

    Seeds Python's ``random``, sets ``PYTHONHASHSEED``, seeds NumPy and
    PyTorch (CPU and every CUDA device), and switches cuDNN to
    deterministic, non-benchmark mode.

    Args:
        seed: integer seed applied to all generators.
    '''
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every GPU, not only the current device; it is a
    # silent no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False # set True to be faster
# Seed all generators once, up front, with the configured seed.
seed_everything(CONFIG['seed'])

In [None]:
# BASE_DIR is where the trained checkpoints are looked up; ROOT_DIR holds the
# csv files and the image folders.
BASE_DIR = './'
ROOT_DIR = './'

# Folder suffix per image variant; any other value of CONFIG["image_data"]
# falls back to the full (uncropped) images.
_IMAGE_FOLDER_SUFFIX = {
    "backfins": "backfins_images",
    "fullbody": "fullbody_images",
}
_folder_suffix = _IMAGE_FOLDER_SUFFIX.get(CONFIG["image_data"], "images")

TRAIN_DIR = f'{ROOT_DIR}/train_{_folder_suffix}'
TEST_DIR = f'{ROOT_DIR}/test_{_folder_suffix}'

In [None]:
# Read train.csv and attach the path of every training image
def get_train_file_path(id):
    """Return the path of training image ``id`` inside ``TRAIN_DIR``."""
    return "{}/{}".format(TRAIN_DIR, id)

df = pd.read_csv(f"{ROOT_DIR}/train.csv")
df['file_path'] = df['image'].map(get_train_file_path) # one path per image filename
df.head()

In [None]:
# Label-encode individual_id: fit the encoder on the column, then replace the
# column with its integer codes.
# (Alternative kept from the original notebook: restore a previously fitted
#  encoder with joblib from f'{ROOT_DIR}/le.pkl' and only call transform.)
encoder = LabelEncoder().fit(df['individual_id'])
df['individual_id'] = encoder.transform(df['individual_id'])

In [None]:
# Stratified K-fold: tag every row with the fold in which it is held out.
skf = StratifiedKFold(n_splits=CONFIG['n_fold'])
# split() yields *positional* indices, so collect them in an array rather than
# using them as .loc labels (that is only correct for a default RangeIndex).
# This also gives "kfold" an integer dtype instead of float.
fold_ids = np.full(len(df), -1, dtype=np.int64)
for fold, (_, val_) in enumerate(skf.split(X=df, y=df.individual_id)):
    fold_ids[val_] = fold
df["kfold"] = fold_ids

In [None]:
class HappyWhaleDataset(Dataset):
    '''
    Torch dataset over a dataframe of images.

    The dataframe must provide the columns ``image`` (filename),
    ``file_path`` (path handed to ``cv2.imread``) and ``individual_id``
    (label, converted to a ``torch.long`` tensor).

    Args:
        df: dataframe with the columns listed above.
        transforms: optional callable invoked as ``transforms(image=img)``
            whose result is indexed with ``["image"]``.
    '''
    def __init__(self, df, transforms=None):
        self.df = df # source dataframe
        self.ids = df['image'].values # image filenames
        self.file_names = df['file_path'].values # image paths
        self.labels = df['individual_id'].values # labels
        self.transforms = transforms # augmentation / preprocessing pipeline

    def __len__(self):
        return len(self.df) # dataset length

    def __getitem__(self, index):
        '''
        Load one sample.

        Returns:
            dict with ``image`` (RGB image, transformed if transforms are set),
            ``label`` (``torch.long`` tensor) and ``id`` (filename).

        Raises:
            FileNotFoundError: if ``cv2.imread`` returns ``None`` for the path.
        '''
        idx = self.ids[index]  # filename of this sample
        img_path = self.file_names[index] # path of this sample
        img = cv2.imread(img_path) # read image (BGR)
        if img is None:
            # cv2.imread signals a missing/unreadable file by returning None;
            # fail here with the path instead of a cryptic cvtColor error.
            raise FileNotFoundError(f"Could not read image '{img_path}'")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # BGR to RGB
        label = self.labels[index]

        if self.transforms:
            img = self.transforms(image=img)["image"] # apply the transform pipeline

        return {
            'image': img,
            'label': torch.tensor(label, dtype=torch.long),
            'id': idx
        }

In [None]:
# Image preprocessing pipelines (train and valid are configured identically)
def _build_resize_normalize_pipeline():
    """Return a new Resize -> Normalize -> ToTensorV2 pipeline.

    Resizes to CONFIG['img_size'] on both sides. The mean/std values look like
    the standard ImageNet statistics.
    """
    side = CONFIG['img_size']
    steps = [
        A.Resize(side, side),  # Resize
        # Normalization
        A.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        ),
        ToTensorV2(),
    ]
    return A.Compose(steps, p=1.)


# One independent pipeline object per split.
data_transforms = {
    split: _build_resize_normalize_pipeline() for split in ("train", "valid")
}

In [None]:
# GeM pooling (see the lecture notes for details)
class GeM(nn.Module):
    """Generalized-mean pooling over the last two dimensions.

    Computes ``avg_pool(clamp(x, min=eps) ** p) ** (1 / p)`` with a pooling
    window that spans the whole feature map; ``p`` is a learnable parameter.
    """

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.p = nn.Parameter(torch.ones(1) * p)  # learnable exponent
        self.eps = eps  # lower clamp applied before the power

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        """Apply generalized-mean pooling to ``x`` with exponent ``p``."""
        window = (x.size(-2), x.size(-1))
        powered = x.clamp(min=eps).pow(p)
        pooled = F.avg_pool2d(powered, window)
        return pooled.pow(1. / p)

    def __repr__(self):
        exponent = self.p.data.tolist()[0]
        return f"{self.__class__.__name__}(p={exponent:.4f}, eps={self.eps})"

In [None]:
# ArcFace
class ArcMarginProduct(nn.Module):
    r"""Additive angular margin (ArcFace) head producing cos(theta + m) logits.

    Args:
        in_features: size of each input sample (embedding dimension)
        out_features: size of each output sample (number of classes)
        s: scale applied to the final logits
        m: angular margin added to the target-class angle
        easy_margin: if True, apply the margin only where cos(theta) > 0
        ls_eps: label-smoothing factor applied to the one-hot targets
    """
    def __init__(self, in_features, out_features, s=30.0,
                 m=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features  # input dimension
        self.out_features = out_features # output dimension
        self.s = s # re-scale
        self.m = m # margin
        self.ls_eps = ls_eps  # label smoothing
        # class-centre weights, Xavier-initialised
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin # easy-margin mode
        self.cos_m = math.cos(m) # cos(m)
        self.sin_m = math.sin(m) # sin(m)
        self.threshold = math.cos(math.pi - m) # cos(pi - m) = -cos(m)
        self.mm = math.sin(math.pi - m) * m # sin(pi - m)*m = sin(m)*m

    def forward(self, input, label):
        """Return the scaled ArcFace logits.

        Args:
            input: 2-D tensor of embeddings, one row per sample.
            label: integer class index per sample (reshaped to a column).

        Returns:
            Tensor with one row per sample and ``out_features`` columns, on the
            same device as ``input``.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight)) # cos(theta)
        # sin(theta); the clamp stops rounding (cos^2 marginally above 1) from
        # putting a negative value under the square root and producing NaN
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0.0, 1.0))
        phi = cosine * self.cos_m - sine * self.sin_m # cos(theta)cos(m) - sin(theta)sin(m) = cos(theta + m)
        phi = phi.float() # phi to float
        cosine = cosine.float() # cosine to float
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            # Keeps theta + m inside [0, pi]:
            # if cos(theta) > cos(pi - m) then theta + m < pi, so phi = cos(theta + m);
            # otherwise theta + m >= pi and the linear fallback cos(theta) - m*sin(m) is used.
            phi = torch.where(cosine > self.threshold, phi, cosine - self.mm) # https://github.com/ronghuaiyang/arcface-pytorch/issues/48
        # --------------------------- convert label to one-hot ---------------------
        # e.g. batch of 2 with 3 classes: labels [1, 2] -> [[0,1,0],[0,0,1]].
        # Allocated on the logits' own device rather than a global setting, so
        # the layer works wherever the module lives.
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # label smoothing
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # margin logit for the target class, plain cosine for every other class
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        # re-scale
        output = output * self.s

        return output

In [None]:
class HappyWhaleModel(nn.Module):
    """timm backbone -> GeM pooling -> BatchNorm + Linear embedding -> ArcFace head.

    Args:
        model_name: timm model name; must contain 'efficientnet' or 'nfnet'.
        embedding_size: output dimension of the embedding layer.
        pretrained: forwarded to ``timm.create_model``.

    Raises:
        ValueError: if ``model_name`` belongs to neither supported family.
    """
    def __init__(self, model_name, embedding_size, pretrained=True):
        super(HappyWhaleModel, self).__init__()
        is_efficientnet = 'efficientnet' in model_name
        is_nfnet = 'nfnet' in model_name
        if not (is_efficientnet or is_nfnet):
            # Fail before building the backbone; previously this surfaced
            # later as a NameError on `in_features`.
            raise ValueError(
                f"Unsupported model_name '{model_name}': expected a name "
                "containing 'efficientnet' or 'nfnet'"
            )

        self.model = timm.create_model(model_name, pretrained=pretrained) # backbone
        # Read in_features, then replace the classifier and global pooling with Identity
        if is_efficientnet:
            in_features = self.model.classifier.in_features
            self.model.classifier = nn.Identity()
            self.model.global_pool = nn.Identity()
        else:
            in_features = self.model.head.fc.in_features
            self.model.head.fc = nn.Identity()
            self.model.head.global_pool = nn.Identity()

        self.pooling = GeM() # GeM pooling
        # BatchNorm + dense layer
        self.embedding = nn.Sequential(
                            nn.BatchNorm1d(in_features),
                            nn.Linear(in_features, embedding_size)
                            )
        # ArcFace head
        self.fc = ArcMarginProduct(embedding_size,  # in_features
                                   CONFIG["num_classes"], # out_features
                                   s=CONFIG["s"],  # scale
                                   m=CONFIG["m"],  # margin
                                   easy_margin=CONFIG["easy_margin"],  # easy-margin mode
                                   ls_eps=CONFIG["ls_eps"]) # label smoothing

    def forward(self, images, labels):
        '''
        Train / valid path: embed the images, then apply the ArcFace head.
        '''
        return self.fc(self.extract(images), labels)

    def extract(self, images):
        '''
        Test path: return the embedding only (no ArcFace head).
        '''
        features = self.model(images) # backbone
        pooled_features = self.pooling(features).flatten(1) # GeM pooling
        embedding = self.embedding(pooled_features) # embedding
        return embedding

In [None]:
@torch.inference_mode()
def get_embeddings(model, dataloader, device):
    """Run ``model.extract`` over a dataloader and collect the results.

    Args:
        model: module exposing ``extract(images)``; it is moved to ``device``
            and switched to eval mode.
        dataloader: sized iterable of batches, each a dict with the keys
            ``image``, ``label`` and ``id``.
        device: device the model and the image batches are moved to.

    Returns:
        ``(EMBEDS, LABELS, IDS)`` as numpy arrays: vertically stacked
        embeddings, concatenated labels, concatenated ids.
    """
    # Use the caller's device, not the global CONFIG, so the model and the
    # batches always end up on the same device.
    model.to(device)
    model.eval() # eval mode

    LABELS = []
    EMBEDS = []
    IDS = []

    for data in tqdm(dataloader, total=len(dataloader)):
        images = data['image'].to(device, dtype=torch.float) # images to device
        labels = data['label'].to(dtype=torch.long) # labels are only collected, never fed to the model

        outputs = model.extract(images) # embeddings for this batch

        LABELS.append(labels.cpu().numpy())
        EMBEDS.append(outputs.cpu().numpy())
        IDS.append(data['id']) # filenames

    EMBEDS = np.vstack(EMBEDS) # merge embeddings
    LABELS = np.concatenate(LABELS) # merge labels
    IDS = np.concatenate(IDS) # merge filenames

    return EMBEDS, LABELS, IDS

In [None]:
def prepare_loaders(df, fold):
    """Build the train and valid DataLoaders for one fold.

    Rows whose ``kfold`` equals ``fold`` form the validation split, all other
    rows the training split. Neither loader shuffles.

    Returns:
        ``(train_loader, valid_loader)``
    """
    def _make_loader(frame, split, batch_size):
        # Dataset over the re-indexed split, wrapped in an ordered loader
        dataset = HappyWhaleDataset(frame.reset_index(drop=True),
                                    transforms=data_transforms[split])
        return DataLoader(dataset,
                          batch_size=batch_size,
                          num_workers=CONFIG["num_workers"],
                          shuffle=False,
                          pin_memory=False)

    train_loader = _make_loader(df[df.kfold != fold], "train", CONFIG['train_batch_size'])
    valid_loader = _make_loader(df[df.kfold == fold], "valid", CONFIG['valid_batch_size'])
    return train_loader, valid_loader

In [None]:
# Test dataframe: one row per test image.
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and everything derived from it) reproducible across runs and machines.
# NOTE(review): filenames are listed from `test_images` while file_path points
# into TEST_DIR -- confirm both folders contain the same filenames.
test_df = pd.DataFrame({"image": sorted(os.listdir(f"{ROOT_DIR}/test_images"))})
test_df["file_path"] = test_df["image"].apply(lambda x: f"{TEST_DIR}/{x}")
test_df["individual_id"] = -1  # dummy value

In [None]:
# Fold 0 supplies the train / valid loaders.
train_loader, valid_loader = prepare_loaders(df, fold=0)

# Test data uses the "valid" transform pipeline and an ordered loader.
test_dataset = HappyWhaleDataset(test_df, transforms=data_transforms["valid"])
_test_loader_options = dict(
    batch_size=CONFIG['valid_batch_size'],
    num_workers=CONFIG["num_workers"],
    shuffle=False,
    pin_memory=False,
)
test_loader = DataLoader(test_dataset, **_test_loader_options)

In [None]:
# Trained checkpoints to ensemble: three parallel lists, one entry per model
model_weight_dict = {
    "weights_dir":[f"{BASE_DIR}/tf_efficientnet_b6_ns_fold1.pth",
                   f"{BASE_DIR}/tf_efficientnet_b6_ns_fold3.pth",
                   f"{BASE_DIR}/tf_efficientnetv2_l_in21k_fold0.pth",
                   f"{BASE_DIR}/tf_efficientnetv2_l_in21k_fold1.pth",
                   f"{BASE_DIR}/tf_efficientnetv2_l_in21k_fold2.pth",
                   f"{BASE_DIR}/tf_efficientnetv2_l_in21k_fold4.pth",
                   f"{BASE_DIR}/eca_nfnet_l2_fold0.pth",
                   f"{BASE_DIR}/eca_nfnet_l2_fold1.pth",
                   f"{BASE_DIR}/eca_nfnet_l2_fold4.pth",
                   ],
    "model_name": ["tf_efficientnet_b6_ns",
                   "tf_efficientnet_b6_ns",
                   "tf_efficientnetv2_l_in21k",
                   "tf_efficientnetv2_l_in21k",
                   "tf_efficientnetv2_l_in21k",
                   "tf_efficientnetv2_l_in21k",
                   "eca_nfnet_l2",
                   "eca_nfnet_l2",
                   "eca_nfnet_l2",
                   ],
    "embedding_size":[512, 512, 512, 512, 512, 512, 512, 512, 512],
}

train_embeds_list = []
valid_embeds_list = []
test_embeds_list = []
train_labels_list = []
valid_labels_list = []
train_ids_list = []
valid_ids_list = []
test_ids_list = []


def _strip_key_prefix(state, prefix="module."):
    """Return a copy of ``state`` with ``prefix`` removed from keys that start with it."""
    return OrderedDict(
        (key[len(prefix):] if key.startswith(prefix) else key, value)
        for key, value in state.items()
    )


_n_models = len(model_weight_dict["weights_dir"])
assert all(len(entries) == _n_models for entries in model_weight_dict.values()), \
    "model_weight_dict lists must all have the same length"

for weights_dir, model_name, embedding_size in zip(model_weight_dict["weights_dir"],
                                                   model_weight_dict["model_name"],
                                                   model_weight_dict["embedding_size"]):
    # pretrained=False: every weight is overwritten by the checkpoint right
    # below, so fetching pretrained weights would be wasted work.
    model = HappyWhaleModel(model_name, embedding_size, pretrained=False)
    # map_location lets checkpoints saved on a GPU load on any machine;
    # the model is moved to the target device afterwards.
    state = torch.load(weights_dir, map_location="cpu")

    if CONFIG['gpu_parallel']:
        # Drop the 7-character "module." key prefix (presumably added by
        # nn.DataParallel), but only from keys that actually carry it.
        state = _strip_key_prefix(state)
    model.load_state_dict(state)
    model.to(CONFIG['device']) # model to device

    # Labels and ids are identical for every model, so only the last copy is kept.
    train_embeds, train_labels, train_ids = get_embeddings(model, train_loader, CONFIG['device'])
    valid_embeds, valid_labels, valid_ids = get_embeddings(model, valid_loader, CONFIG['device'])
    test_embeds, _, test_ids = get_embeddings(model, test_loader, CONFIG['device'])

    train_embeds_list.append(train_embeds)
    valid_embeds_list.append(valid_embeds)
    test_embeds_list.append(test_embeds)

    # Release this model before the next one is built.
    del model, state
    _ = gc.collect()
    torch.cuda.empty_cache()

# Ensemble by concatenating each model's embedding along the feature axis.
# NOTE(review): per-model embeddings are concatenated without normalisation --
# confirm that is the intended weighting for the cosine KNN below.
train_embeds = np.concatenate(train_embeds_list,axis=1) # train_embeds
valid_embeds = np.concatenate(valid_embeds_list,axis=1) # valid_embeds
test_embeds = np.concatenate(test_embeds_list,axis=1) # test_embeds


In [None]:
from sklearn.neighbors import NearestNeighbors # knn
# Cosine KNN fitted on the train embeddings and queried with the valid embeddings.
# kneighbors rejects n_neighbors larger than the number of fitted samples,
# so cap the configured value.
n_valid_neighbors = min(CONFIG["KNN"], len(train_embeds))
neigh = NearestNeighbors(n_neighbors=n_valid_neighbors,metric='cosine') # define knn
neigh.fit(train_embeds) # fit knn
valid_distances, valid_idxs = neigh.kneighbors(valid_embeds, n_valid_neighbors, return_distance=True) # query knn

In [None]:
# 逆标签编码
train_allowed_labels = encoder.inverse_transform(train_labels) 
valid_allowed_labels = encoder.inverse_transform(valid_labels)

train_allowed_labels_set = set(train_allowed_labels)

In [None]:
# Ground-truth table for validation, with unseen individuals relabelled
val_targets_df = pd.DataFrame(
    np.column_stack([valid_ids, valid_allowed_labels]),
    columns=['image', 'target'],
)
# A validation individual that never occurs in train becomes 'new_individual'
_absent_from_train = ~val_targets_df['target'].isin(train_allowed_labels_set)
val_targets_df.loc[_absent_from_train, 'target'] = 'new_individual'
val_targets_df['target'].value_counts()

In [None]:
# Long-format neighbour table for validation, built in one vectorised step
# instead of creating and concatenating one DataFrame per query image.
_n_found = valid_idxs.shape[1]  # neighbours returned per query
valid_df = pd.DataFrame({
    'image': np.repeat(valid_ids, _n_found),  # query filename, repeated per neighbour
    'target': train_labels[valid_idxs].ravel(),  # encoded label of each neighbour
    'confidence': 1 - valid_distances.ravel().astype(np.float64),  # similarity = 1 - cosine distance
})
valid_df = valid_df.groupby(['image','target']).confidence.max().reset_index() # best similarity per (image, target)
valid_df = valid_df.sort_values('confidence',ascending=False).reset_index(drop=True) # highest similarity first
valid_df['target'] = encoder.inverse_transform(valid_df['target'].astype("int").to_list()) # back to original target names
valid_df.to_csv('val_neighbors.csv')
valid_df.image.value_counts().value_counts()

In [None]:
# Fallback ids used to pad short prediction lists
# (the five most common individuals, per the original author's note)
sample_list = ['938b7e931166', '5bf17305f073', '7593d2aee842', '7362d7a01d00','956562ff2888']

def get_predictions(test_df, threshold=0.2):
    """Turn a neighbour table into five ranked predictions per image.

    Args:
        test_df: dataframe with the columns ``image``, ``target`` and
            ``confidence``. Rows are consumed in order, so the frame is
            expected to be sorted by descending confidence.
        threshold: if the first row seen for an image has confidence above
            this value, its target is ranked ahead of 'new_individual';
            otherwise 'new_individual' comes first.

    Returns:
        dict mapping image -> list of predictions. Lists shorter than five are
        padded with entries of ``sample_list`` that the list does not already
        contain.
    """
    predictions = {} # image -> ranked predictions
    for row in tqdm(test_df.itertuples(index=False), total=len(test_df)):
        ranked = predictions.get(row.image)
        if ranked is None:
            # First row for this image: place 'new_individual' around its target
            if row.confidence > threshold:
                predictions[row.image] = [row.target, 'new_individual']
            else:
                predictions[row.image] = ['new_individual', row.target]
        elif len(ranked) < 5:
            ranked.append(row.target) # keep collecting until five are stored

    for image, ranked in predictions.items():
        if len(ranked) < 5:
            # Pad with fallback ids not already predicted for THIS image.
            # (Testing membership in the image-keyed dict itself was always
            # true and could introduce duplicate predictions.)
            fillers = [y for y in sample_list if y not in ranked]
            predictions[image] = (ranked + fillers)[:5]

    return predictions

In [None]:
# Official evaluation metric
def map_per_image(label, predictions):
    """Score one image as the reciprocal rank of its true label.

    Parameters
    ----------
    label : string
            The true label of the image
    predictions : list
            Ranked predictions; only the first five are considered

    Returns
    -------
    score : double
            1 / rank of the first match within the top five, 0.0 if absent
    """
    for rank, guess in enumerate(predictions[:5], start=1):
        if guess == label:
            return 1 / rank
    return 0.0

In [None]:
# CV score for each KNN similarity threshold
best_th = 0
best_cv = 0
# 0.0, 0.1, ..., 1.0 -- rounded so the column labels are clean
# (0.3 rather than 0.30000000000000004)
for th in [round(0.1 * x, 1) for x in range(11)]:
    print("threshold:", th)
    all_preds = get_predictions(valid_df,threshold=th) # predictions at this threshold
    # One score per validation image, stored in a column named after the
    # threshold (a single column assignment instead of per-cell .loc writes)
    val_targets_df[th] = [
        map_per_image(target, all_preds[image])
        for image, target in zip(val_targets_df['image'], val_targets_df['target'])
    ]
    cv = val_targets_df[th].mean() # mean score at this threshold
    print(f"CV at threshold {th}: {cv}")
    if cv>best_cv:
        best_th = th
        best_cv = cv

In [None]:
# Report the best unadjusted threshold and its CV, then summarise the score columns
print(f"Best threshold {best_th}")
print(f"Best cv {best_cv}")
val_targets_df.describe()

In [None]:
# Preview the validation targets together with the per-threshold score columns.
val_targets_df.head()

In [None]:
## Adjustment: re-weight the CV by an assumed share of 'new_individual' images
## (Be careful for the private LB)
# NOTE(review): the original comment cites "nearly 10%" new individuals on the
# public LB, but the weights used are 0.15 / 0.85 -- confirm which is intended.
NEW_INDIVIDUAL_WEIGHT = 0.15
KNOWN_INDIVIDUAL_WEIGHT = 0.85

val_targets_df['is_new_individual'] = val_targets_df.target=='new_individual' # True where the target is new_individual
print(val_targets_df.is_new_individual.value_counts().to_dict())  # how many targets are / are not new_individual
# Mean score per threshold, separately for new and known individuals.
# numeric_only=True keeps the string columns ('image', 'target') out of the
# mean; recent pandas raises on them otherwise.
val_scores = val_targets_df.groupby('is_new_individual').mean(numeric_only=True).T
val_scores['adjusted_cv'] = (val_scores[True]*NEW_INDIVIDUAL_WEIGHT
                             + val_scores[False]*KNOWN_INDIVIDUAL_WEIGHT) # re-weighted cv
best_threshold_adjusted = val_scores['adjusted_cv'].idxmax() # threshold with the best adjusted cv
print("best_threshold", best_threshold_adjusted)
val_scores

# Inference

In [None]:
# For test inference the reference set is train + valid
all_embeds = np.concatenate([train_embeds, valid_embeds]) # merged train and valid embeddings
all_labels = np.concatenate([train_labels, valid_labels]) # merged train and valid labels
print(all_embeds.shape, all_labels.shape)

from sklearn.neighbors import NearestNeighbors
# kneighbors rejects n_neighbors larger than the number of fitted samples,
# so cap the configured value.
n_test_neighbors = min(CONFIG["KNN"], len(all_embeds))
neigh = NearestNeighbors(n_neighbors=n_test_neighbors,metric='cosine') # define knn
neigh.fit(all_embeds) # fit knn
test_distances, test_idxs = neigh.kneighbors(test_embeds, n_test_neighbors, return_distance=True) # query knn with the test embeddings

In [None]:
sample_submission = pd.read_csv(f'{ROOT_DIR}/sample_submission.csv', index_col='image') # read sample_submission.csv
print("test_ids len:",len(test_ids), "sample_submission len:",len(sample_submission))

# Long-format neighbour table for the test set, built in one vectorised step
# instead of creating and concatenating one DataFrame per test image.
_n_found = test_idxs.shape[1]  # neighbours returned per query
test_df = pd.DataFrame({
    'image': np.repeat(test_ids, _n_found),  # query filename, repeated per neighbour
    'target': all_labels[test_idxs].ravel(),  # encoded label of each neighbour
    'confidence': 1 - test_distances.ravel().astype(np.float64),  # similarity = 1 - cosine distance
})
test_df = test_df.groupby(['image','target']).confidence.max().reset_index() # best similarity per (image, target)
test_df = test_df.sort_values('confidence',ascending=False).reset_index(drop=True) # highest similarity first
test_df['target'] = encoder.inverse_transform(test_df['target'].astype("int").to_list()) # back to original target names
test_df.to_csv('test_neighbors.csv')
test_df.image.value_counts().value_counts()

In [None]:
# Rank five predictions per test image. Rows are consumed in order, and
# test_df is sorted by descending confidence.
predictions = {} # image -> ranked predictions
for row in tqdm(test_df.itertuples(index=False), total=len(test_df)):
    ranked = predictions.get(row.image)
    if ranked is None:
        # First row for this image: place 'new_individual' around its target
        if row.confidence>best_threshold_adjusted:
            predictions[row.image] = [row.target,'new_individual']
        else:
            predictions[row.image] = ['new_individual',row.target]
    elif len(ranked) < 5:
        ranked.append(row.target) # keep collecting until five are stored

for image, ranked in predictions.items():
    if len(ranked) < 5:
        # Pad with fallback ids not already predicted for THIS image.
        # (Testing membership in the image-keyed dict itself was always true
        # and could put duplicate ids into a submission row.)
        fillers = [y for y in sample_list if y not in ranked]
        ranked = (ranked + fillers)[:5]
    predictions[image] = ' '.join(ranked)

# Write submission.csv
predictions = pd.Series(predictions).reset_index()
predictions.columns = ['image','predictions']
predictions.to_csv('submission.csv',index=False)
predictions.head()