# Architecture : Eca-nfnet-l0 / Efficient-B3 + Arcface Module


In [1]:
import sys
sys.path.append('/mnt/hdd1/wearly/kaggle/shopee/pytorch-image-models-master')

In [2]:
import numpy as np 
import pandas as pd 

import math
import random 
import os 
import cv2
import timm

from tqdm import tqdm 

import albumentations as A 
from albumentations.pytorch.transforms import ToTensorV2

import torch 
from torch.utils.data import Dataset 
from torch import nn
import torch.nn.functional as F 

import gc
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

# Config

In [3]:
GET_CV = True # kaggle commit할때는 false로 지정
CHECK_SUB = False

df = cudf.read_csv('/mnt/hdd1/wearly/kaggle/shopee/test.csv')
# If we are comitting, replace train set for test set and dont get cv
if len(df) > 3:
    GET_CV = False
del df

class CFG:
    
    img_size = 512
    batch_size = 12
    seed = 2020
    
    device = 'cuda'
    classes = 11014
    
    model_name1 = 'eca_nfnet_l0'
    model_path1 = '/mnt/hdd1/wearly/kaggle/shopee/Shopee-pytorch-models/arcface_512x512_nfnet_l0 (mish).pt'
    
    model_name2 = 'efficientnet_b3'
    model_path2 = '/mnt/hdd1/wearly/kaggle/shopee/Shopee-pytorch-models/arcface_512x512_eff_b3.pt'
    
    model_name3 = 'tf_efficientnet_b5_ns'
    model_path3 = '/mnt/hdd1/wearly/kaggle/shopee/Shopee-pytorch-models/arcface_512x512_eff_b5_.pt' 
    
    scale = 30 
    margin = 0.5

# Utils

In [4]:
def seed_torch(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_torch(CFG.seed)

In [5]:
def f1_score(y_true, y_pred):
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x.split()))
    intersection = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    len_y_pred = y_pred.apply(lambda x: len(x)).values
    len_y_true = y_true.apply(lambda x: len(x)).values
    f1 = 2 * intersection / (len_y_pred + len_y_true)
    return f1

In [6]:
def read_dataset():
    if GET_CV:
        df = pd.read_csv('/mnt/hdd1/wearly/kaggle/shopee/train.csv')
        tmp = df.groupby(['label_group'])['posting_id'].unique().to_dict()
        df['matches'] = df['label_group'].map(tmp)
        df['matches'] = df['matches'].apply(lambda x: ' '.join(x))
        if CHECK_SUB:
            df = pd.concat([df, df], axis = 0)
            df.reset_index(drop = True, inplace = True)
        df_cu = cudf.DataFrame(df)
        image_paths = '/mnt/hdd1/wearly/kaggle/shopee/train_images/' + df['image']
    else:
        df = pd.read_csv('/mnt/hdd1/wearly/kaggle/shopee/test.csv')
        df_cu = cudf.DataFrame(df)
        image_paths = '/mnt/hdd1/wearly/kaggle/shopee/test_images/' + df['image']
        
    return df, df_cu, image_paths

# Augmentation

In [7]:
def get_test_transforms():

    return A.Compose(
        [
            A.Resize(CFG.img_size,CFG.img_size,always_apply=True),
            A.Normalize(),
        ToTensorV2(p=1.0)
        ]
    )

# Dataset

In [8]:
class ShopeeDataset(Dataset):
    def __init__(self, image_paths, transforms=None):

        self.image_paths = image_paths
        self.augmentations = transforms

    def __len__(self):
        return self.image_paths.shape[0]

    def __getitem__(self, index):
        image_path = self.image_paths[index]
        
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        if self.augmentations:
            augmented = self.augmentations(image=image)
            image = augmented['image']       
    
        return image,torch.tensor(1)

# Arcface

In [9]:
class ArcMarginProduct(nn.Module):
    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # one_hot = torch.zeros(cosine.size(), requires_grad=True, device='cuda')
        one_hot = torch.zeros(cosine.size(), device='cuda')
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # -------------torch.where(out_i = {x_i if condition_i else y_i) -------------
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale

        return output

# Model

In [10]:
class ShopeeModel(nn.Module):

    def __init__(
        self, model_name,
        n_classes = CFG.classes,
        fc_dim = 512,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = True,
        pretrained = False):


        super(ShopeeModel,self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.backbone = timm.create_model(model_name, pretrained=pretrained)

        if model_name == 'resnext50_32x4d':
            final_in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name == 'efficientnet_b3':
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name == 'tf_efficientnet_b5_ns':
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()
        
        elif model_name == 'eca_nfnet_l0':
            final_in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
            self.backbone.head.global_pool = nn.Identity()

        self.pooling =  nn.AdaptiveAvgPool2d(1)

        self.use_fc = use_fc

        self.dropout = nn.Dropout(p=0.0)
        self.fc = nn.Linear(final_in_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        final_in_features = fc_dim

        self.final = ArcMarginProduct(
            final_in_features,
            n_classes,
            scale = scale,
            margin = margin,
            easy_margin = False,
            ls_eps = 0.0
        )

    def _init_params(self):
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label):
        feature = self.extract_feat(image)
        #logits = self.final(feature,label)
        return feature

    def extract_feat(self, x):
        batch_size = x.shape[0]
        x = self.backbone(x)
        x = self.pooling(x).view(batch_size, -1)

        if self.use_fc:
            x = self.dropout(x)
            x = self.fc(x)
            x = self.bn(x)
        return x

# Mish

In [11]:
class Mish_func(torch.autograd.Function):
    
    """from: https://github.com/tyunist/memory_efficient_mish_swish/blob/master/mish.py"""
    
    @staticmethod
    def forward(ctx, i):
        result = i * torch.tanh(F.softplus(i))
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        i = ctx.saved_variables[0]
  
        v = 1. + i.exp()
        h = v.log() 
        grad_gh = 1./h.cosh().pow_(2) 

        # Note that grad_hv * grad_vx = sigmoid(x)
        #grad_hv = 1./v  
        #grad_vx = i.exp()
        
        grad_hx = i.sigmoid()

        grad_gx = grad_gh *  grad_hx #grad_hv * grad_vx 
        
        grad_f =  torch.tanh(F.softplus(i)) + i * grad_gx 
        
        return grad_output * grad_f 


class Mish(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        pass
    def forward(self, input_tensor):
        return Mish_func.apply(input_tensor)


def replace_activations(model, existing_layer, new_layer):
    
    """A function for replacing existing activation layers"""
    
    for name, module in reversed(model._modules.items()):
        if len(list(module.children())) > 0:
            model._modules[name] = replace_activations(module, existing_layer, new_layer)

        if type(module) == existing_layer:
            layer_old = module
            layer_new = new_layer
            model._modules[name] = layer_new
    return model

# get_image_predictions

In [12]:
def get_image_predictions(df, embeddings,threshold = 0.0):
    
    if len(df) > 3:
        KNN = 50
    else : 
        KNN = 3
    KNN=3
    print(f'KNN={KNN}')
    model = NearestNeighbors(n_neighbors = KNN, metric = 'cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    
    if GET_CV:        
        thresholds = list(np.arange(0.2, 3.0, 0.01)) # image prediction threshold 범위를 여기서 지정할 것 
        scores = []
        for threshold in thresholds:
            predictions = []
            for k in range(embeddings.shape[0]):
                idx = np.where(distances[k,] < threshold)[0]
                ids = indices[k,idx]
                posting_ids = ' '.join(df['posting_id'].iloc[ids].values)
                predictions.append(posting_ids)
            df['pred_matches'] = predictions
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {threshold} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')
        
        predictions = []
        for k in tqdm(range(embeddings.shape[0])):
            idx = np.where(distances[k,] < best_threshold)[0]
            ids = indices[k,idx]
            posting_ids = df['posting_id'].iloc[ids].values
            predictions.append(posting_ids)

        del model, distances, indices
        gc.collect()
        return predictions # best threshold로 예측한 것을 return
    
    else: # test 데이터로 예측 - threshold는 수동으로 지정해야함.
        predictions = []
        for k in tqdm(range(embeddings.shape[0])):
            idx = np.where(distances[k,] < threshold)[0] # train으로 best cv나온 threshold값을 함수 인자로서 수동으로 지정.
            ids = indices[k,idx]
            posting_ids = df['posting_id'].iloc[ids].values
            predictions.append(posting_ids)

        del model, distances, indices
        gc.collect()
        return predictions

# get_image_embeddings

In [13]:
def get_image_embeddings(image_paths, model_name, model_path):
    embeds = []
    
    if model_name == 'eca_nfnet_l0':
        model = ShopeeModel(model_name=model_name)
        model.eval()
        model = replace_activations(model, torch.nn.SiLU, Mish())
        model.load_state_dict(torch.load(model_path))
        model = model.to(CFG.device)
        
    elif model_name == 'tf_efficientnet_b5_ns':
        model = ShopeeModel(model_name=model_name, use_fc=False)
        model.eval()
        model = replace_activations(model, torch.nn.SiLU, Mish())
        model.load_state_dict(torch.load(model_path))
        model = model.to(CFG.device)
    
    else:
        model = ShopeeModel(model_name=model_name)
        model.eval()
        model.load_state_dict(torch.load(model_path))
        model = model.to(CFG.device)

    image_dataset = ShopeeDataset(image_paths=image_paths,transforms=get_test_transforms())
    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4
    )
    
    
    with torch.no_grad():
        for img,label in tqdm(image_loader): 
            img = img.cuda()
            label = label.cuda()
            feat = model(img,label)
            image_embeddings = feat.detach().cpu().numpy()
            embeds.append(image_embeddings)
    
    
    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

# get_text_embeddings

In [14]:
def get_text_predictions(df, max_features=25_000):
    
    model = TfidfVectorizer(stop_words='english',binary=True,max_features=max_features)
    text_embeddings = model.fit_transform(df_cu['title']).toarray()

    print('Finding similar titles...')
    CHUNK = 1024 * 4
    CTS = len(df) // CHUNK
    if (len(df)%CHUNK) != 0:
        CTS += 1
    thresholds = list(np.arange(0.55,0.8,0.025)) # text threshold 범위를 여기서 지정할것.
    scores = []
    if GET_CV:    
        for threshold in thresholds:
            preds = []
            for j in range( CTS ):
                a = j * CHUNK
                b = (j+1) * CHUNK
                b = min(b, len(df))
                print('chunk', a, 'to', b)

                # COSINE SIMILARITY DISTANCE
                cts = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T
                for k in range(b-a):
                    IDX = cupy.where(cts[k,]>threshold)[0]
                    o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
                    o = ' '.join(o)
                    preds.append(o)
            df['pred_matches'] = preds
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {threshold} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')
        print(' ')
        print(f'Final text embeddings prediction using best threshold {best_threshold}')
        preds = []
        for j in range( CTS ):
            a = j * CHUNK
            b = (j+1) * CHUNK
            b = min(b, len(df))
            print('chunk', a, 'to', b)
            # COSINE SIMILARITY DISTANCE
            cts = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T
            for k in range(b-a):
                IDX = cupy.where(cts[k,]>best_threshold)[0]
                o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
                preds.append(o)
        del model,text_embeddings
        gc.collect()
        return preds # best cv score를 기록한 threshold값으로 return
    
    
    else: # test 데이터로 예측 - threshold는 수동으로 지정해야함.
        model = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
        text_embeddings = model.fit_transform(df_cu['title']).toarray()
        preds = []
        CHUNK = 1024*4

        print('Finding similar titles...')
        CTS = len(df)//CHUNK
        if len(df)%CHUNK!=0: CTS += 1
        for j in range( CTS ):

            a = j*CHUNK
            b = (j+1)*CHUNK
            b = min(b,len(df))
            print('chunk',a,'to',b)

            # COSINE SIMILARITY DISTANCE
            cts = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T

            for k in range(b-a):
                IDX = cupy.where(cts[k,]>0.75)[0] # 현재 defualt는 0.75. 여기서 train best cv score의 threshold를 수동으로 지정.
                o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
                preds.append(o)

        del model,text_embeddings
        gc.collect()
        return preds

# Final inference

In [15]:
df,df_cu,image_paths = read_dataset()
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,train_129225211 train_2278313361
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,train_3386243561 train_3423213080
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,train_2288590299 train_3803689425
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,train_2406599165 train_3342059966
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,train_3369186413 train_921438619


In [17]:
image_embeddings1 = get_image_embeddings(image_paths.values, CFG.model_name1, CFG.model_path1)
image_embeddings2 = get_image_embeddings(image_paths.values, CFG.model_name2, CFG.model_path2)
image_embeddings3 = get_image_embeddings(image_paths.values, CFG.model_name3, CFG.model_path3)
#image_embeddings = (image_embeddings1 + image_embeddings2)/2

Building Model Backbone for eca_nfnet_l0 model


100%|██████████| 2855/2855 [04:10<00:00, 11.41it/s]


Our image embeddings shape is (34250, 512)
Building Model Backbone for efficientnet_b3 model


100%|██████████| 2855/2855 [02:19<00:00, 20.47it/s]


Our image embeddings shape is (34250, 512)
Building Model Backbone for tf_efficientnet_b5_ns model


100%|██████████| 2855/2855 [06:01<00:00,  7.91it/s]


Our image embeddings shape is (34250, 2048)


In [18]:
image_embeddings = np.concatenate([image_embeddings1,image_embeddings2],axis=1)
print(f'embeddings ensemble method (concatenate axis=1) shape : {image_embeddings.shape}')

embeddings ensemble method (concatenate axis=1) shape : (34250, 1024)


In [20]:
image_embeddings2 = np.concatenate([image_embeddings1,image_embeddings2,image_embeddings3],axis=1)
print(f'embeddings ensemble method (concatenate axis=1) shape : {image_embeddings2.shape}')

embeddings ensemble method (concatenate axis=1) shape : (34250, 3072)


In [19]:
image_predictions = get_image_predictions(df, image_embeddings, threshold = 0.36) # 0.36 임베딩 1,2만 concat

KNN=3
Our f1 score for threshold 0.2 is 0.6369967817416988
Our f1 score for threshold 0.21000000000000002 is 0.6409133125957238
Our f1 score for threshold 0.22000000000000003 is 0.6448629298975983
Our f1 score for threshold 0.23000000000000004 is 0.6492503288902197
Our f1 score for threshold 0.24000000000000005 is 0.6536395785640392
Our f1 score for threshold 0.25000000000000006 is 0.6576041386100375
Our f1 score for threshold 0.26000000000000006 is 0.6619585391185512
Our f1 score for threshold 0.2700000000000001 is 0.6655330396574697
Our f1 score for threshold 0.2800000000000001 is 0.6693885171104487
Our f1 score for threshold 0.2900000000000001 is 0.6731721001426857
Our f1 score for threshold 0.3000000000000001 is 0.6770120422420773
Our f1 score for threshold 0.3100000000000001 is 0.6807460861431948
Our f1 score for threshold 0.3200000000000001 is 0.6843006148914904
Our f1 score for threshold 0.3300000000000001 is 0.6879486731943992
Our f1 score for threshold 0.34000000000000014 is 0

Our f1 score for threshold 1.410000000000001 is 0.6960890098536552
Our f1 score for threshold 1.420000000000001 is 0.6960890098536552
Our f1 score for threshold 1.430000000000001 is 0.6960890098536552
Our f1 score for threshold 1.440000000000001 is 0.6960890098536552
Our f1 score for threshold 1.450000000000001 is 0.6960890098536552
Our f1 score for threshold 1.460000000000001 is 0.6960890098536552
Our f1 score for threshold 1.470000000000001 is 0.6960890098536552
Our f1 score for threshold 1.480000000000001 is 0.6960890098536552
Our f1 score for threshold 1.490000000000001 is 0.6960890098536552
Our f1 score for threshold 1.500000000000001 is 0.6960890098536552
Our f1 score for threshold 1.5100000000000011 is 0.6960890098536552
Our f1 score for threshold 1.5200000000000011 is 0.6960890098536552
Our f1 score for threshold 1.5300000000000011 is 0.6960890098536552
Our f1 score for threshold 1.5400000000000011 is 0.6960890098536552
Our f1 score for threshold 1.5500000000000012 is 0.6960890

Our f1 score for threshold 2.6400000000000023 is 0.6960890098536552
Our f1 score for threshold 2.650000000000002 is 0.6960890098536552
Our f1 score for threshold 2.6600000000000024 is 0.6960890098536552
Our f1 score for threshold 2.6700000000000026 is 0.6960890098536552
Our f1 score for threshold 2.6800000000000024 is 0.6960890098536552
Our f1 score for threshold 2.690000000000002 is 0.6960890098536552
Our f1 score for threshold 2.7000000000000024 is 0.6960890098536552
Our f1 score for threshold 2.7100000000000026 is 0.6960890098536552
Our f1 score for threshold 2.7200000000000024 is 0.6960890098536552
Our f1 score for threshold 2.730000000000002 is 0.6960890098536552
Our f1 score for threshold 2.7400000000000024 is 0.6960890098536552
Our f1 score for threshold 2.7500000000000027 is 0.6960890098536552
Our f1 score for threshold 2.7600000000000025 is 0.6960890098536552
Our f1 score for threshold 2.7700000000000022 is 0.6960890098536552
Our f1 score for threshold 2.7800000000000025 is 0.

  8%|▊         | 2624/34250 [00:00<00:01, 26238.65it/s]

Our f1 score for threshold 2.990000000000003 is 0.6960890098536552
Our best score is 0.7426301734892266 and has a threshold 0.5700000000000003


100%|██████████| 34250/34250 [00:01<00:00, 25618.27it/s]


In [21]:
image_predictions2 = get_image_predictions(df, image_embeddings2, threshold = 0.36) #1,2,3

KNN=3
Our f1 score for threshold 0.2 is 0.6381790182659983
Our f1 score for threshold 0.21000000000000002 is 0.6419644872380234
Our f1 score for threshold 0.22000000000000003 is 0.6463404624781984
Our f1 score for threshold 0.23000000000000004 is 0.6505730706264684
Our f1 score for threshold 0.24000000000000005 is 0.654854147327336
Our f1 score for threshold 0.25000000000000006 is 0.6592160960372013
Our f1 score for threshold 0.26000000000000006 is 0.6633005941329243
Our f1 score for threshold 0.2700000000000001 is 0.6669880063203227
Our f1 score for threshold 0.2800000000000001 is 0.6707014028499619
Our f1 score for threshold 0.2900000000000001 is 0.6751254528561728
Our f1 score for threshold 0.3000000000000001 is 0.67846224682229
Our f1 score for threshold 0.3100000000000001 is 0.6823022524251661
Our f1 score for threshold 0.3200000000000001 is 0.6861373727814667
Our f1 score for threshold 0.3300000000000001 is 0.6899364054175792
Our f1 score for threshold 0.34000000000000014 is 0.69

Our f1 score for threshold 1.410000000000001 is 0.6963203956682527
Our f1 score for threshold 1.420000000000001 is 0.6963203956682527
Our f1 score for threshold 1.430000000000001 is 0.6963203956682527
Our f1 score for threshold 1.440000000000001 is 0.6963203956682527
Our f1 score for threshold 1.450000000000001 is 0.6963203956682527
Our f1 score for threshold 1.460000000000001 is 0.6963203956682527
Our f1 score for threshold 1.470000000000001 is 0.6963203956682527
Our f1 score for threshold 1.480000000000001 is 0.6963203956682527
Our f1 score for threshold 1.490000000000001 is 0.6963203956682527
Our f1 score for threshold 1.500000000000001 is 0.6963203956682527
Our f1 score for threshold 1.5100000000000011 is 0.6963203956682527
Our f1 score for threshold 1.5200000000000011 is 0.6963203956682527
Our f1 score for threshold 1.5300000000000011 is 0.6963203956682527
Our f1 score for threshold 1.5400000000000011 is 0.6963203956682527
Our f1 score for threshold 1.5500000000000012 is 0.6963203

Our f1 score for threshold 2.6300000000000026 is 0.6963203956682527
Our f1 score for threshold 2.6400000000000023 is 0.6963203956682527
Our f1 score for threshold 2.650000000000002 is 0.6963203956682527
Our f1 score for threshold 2.6600000000000024 is 0.6963203956682527
Our f1 score for threshold 2.6700000000000026 is 0.6963203956682527
Our f1 score for threshold 2.6800000000000024 is 0.6963203956682527
Our f1 score for threshold 2.690000000000002 is 0.6963203956682527
Our f1 score for threshold 2.7000000000000024 is 0.6963203956682527
Our f1 score for threshold 2.7100000000000026 is 0.6963203956682527
Our f1 score for threshold 2.7200000000000024 is 0.6963203956682527
Our f1 score for threshold 2.730000000000002 is 0.6963203956682527
Our f1 score for threshold 2.7400000000000024 is 0.6963203956682527
Our f1 score for threshold 2.7500000000000027 is 0.6963203956682527
Our f1 score for threshold 2.7600000000000025 is 0.6963203956682527
Our f1 score for threshold 2.7700000000000022 is 0.

 15%|█▍        | 5075/34250 [00:00<00:01, 25385.76it/s]

Our f1 score for threshold 2.990000000000003 is 0.6963203956682527
Our best score is 0.7426692021203819 and has a threshold 0.5600000000000003


100%|██████████| 34250/34250 [00:01<00:00, 25513.34it/s]


In [58]:
text_predictions = get_text_predictions(df, max_features = 25_000) # 25_000

Finding similar titles...
chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32768
chunk 32768 to 34250
Our f1 score for threshold 0.55 is 0.7956654441430413
chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32768
chunk 32768 to 34250
Our f1 score for threshold 0.5750000000000001 is 0.7573823909853572
chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32768
chunk 32768 to 34250
Our f1 score for threshold 0.6000000000000001 is 0.7221073872232033
chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32768
chunk 32768 to 34250
Our f1 score for threshold 0.6250000000000001 is 0.691089455

In [59]:
def combine_predictions(row):
    x = np.concatenate([row['image_predictions'], row['text_predictions']])
    return ' '.join( np.unique(x))

In [60]:
# Function to combine predictions
def combine_predictions_oof(row):
    x = np.concatenate([row['image_predictions'], row['text_predictions'], row['oof_text'], row['oof_hash']])
    return ' '.join( np.unique(x) )

In [61]:
# Concatenate image predctions with text predictions
tmp = df.groupby('image_phash').posting_id.agg('unique').to_dict()
#df['oof_hash'] = df.image_phash.map(tmp)
if GET_CV:
    df['image_predictions'] = image_predictions
    df['text_predictions'] = text_predictions
    df['pred_matches'] = df.apply(combine_predictions, axis = 1)
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    score = df['f1'].mean()
    print(f'Our final f1 cv score is {score}')
    df['matches'] = df['pred_matches']
    #df[['posting_id', 'matches']].to_csv('submission.csv', index = False)
else:
    df['image_predictions'] = image_predictions
    df['text_predictions'] = text_predictions
    #df['matches'] = df.apply(combine_predictions_oof, axis = 1)
    df['matches'] = df.apply(combine_predictions, axis = 1)
    df[['posting_id', 'matches']].to_csv('submission.csv', index = False)

Our final f1 cv score is 0.9889929025026313


In [62]:
df[['posting_id', 'matches']]

Unnamed: 0,posting_id,matches
0,train_129225211,train_129225211 train_2278313361
1,train_3386243561,train_1816968361 train_1831941588 train_212059...
2,train_2288590299,train_2288590299 train_3803689425
3,train_2406599165,train_1508100548 train_1744956981 train_204309...
4,train_3369186413,train_3369186413 train_921438619
...,...,...
34245,train_4028265689,train_4028265689
34246,train_769054909,train_1463059254 train_2530102819 train_293057...
34247,train_614977732,train_1264798465 train_2325457554 train_269046...
34248,train_3630949769,train_1431563868 train_3419392575 train_363094...


Thanks you so much for reading this notebook. If you have any suggestions or ideas on ensembling models together then do let me know. 😁