In [None]:
# Import the required packages
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml import PCA
from cuml.neighbors import NearestNeighbors
import tensorflow as tf
import efficientnet.tfkeras as efn
import math
from shutil import copyfile

import tokenization
import tensorflow_hub as hub

import os
import cv2
import random
from tqdm import tqdm

import albumentations
from albumentations.pytorch.transforms import ToTensorV2

import torch
import timm
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
import fasttext as ft

In [None]:
# Factory for a row-wise F1 score function
def getMetric(col):
    """Build a scorer that compares ``row.target`` with ``row[col]``.

    The returned callable takes one row (anything exposing a ``target``
    attribute and item access by ``col``) and returns
    ``2 * |target ∩ prediction| / (|target| + |prediction|)``.
    """
    def f1score(row):
        predicted = row[col]
        overlap = np.intersect1d(row.target, predicted).size
        total = len(row.target) + len(predicted)
        return 2 * overlap / total
    return f1score

In [None]:
# Settings for the TensorFlow input pipeline and the TensorFlow image model.
# For tf.dataset: lets tf.data choose parallelism / prefetch depth itself
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
# Batch size used for tf.data batching and for the BERT `model.predict` call
BATCH_SIZE = 8
# Target [height, width] that every image is resized to before inference
IMAGE_SIZE = [512, 512]
# Seed
SEED = 42
# Verbosity
VERBOSE = 1
# Number of classes
# NOTE(review): this is 11011 while CFG.classes and the BERT head use 11014;
# presumably it matches the checkpoint loaded by the TF image model — confirm.
N_CLASSES = 11011

In [None]:
# All configuration values for the PyTorch image models
class CFG:
    """Static configuration shared by the PyTorch models and data loaders.

    Every attribute is defined exactly once (the original listed most of them
    twice; the values kept here are the ones that were effective).
    """

    # Input resolution and loader settings
    img_size = 512
    fc_dim = 512
    batch_size = 12
    seed = 2020

    # Fall back to CPU when no GPU is visible
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    classes = 11014

    # ArcFace head hyper-parameters
    scale = 30
    margin = 0.5

    # timm architecture names
    model_name1 = 'tf_efficientnet_b4'
    model_name2 = 'eca_nfnet_l0'
    model_name3 = 'resnext50_32x4d'
    model_name4 = 'tf_efficientnet_b5_ns'
    model_name5 = 'efficientnet_b3'

    # Default architecture; ShopeeModel's signature reads ``CFG.model_name``,
    # which was previously undefined.
    model_name = model_name1

    # Checkpoint paths, index-aligned with the model names above
    model_path1 = './utils-shopee/arcface_512x512_tf_efficientnet_b4_LR.pt'
    model_path2 = './shopee-pytorch-models/arcface_512x512_nfnet_l0 (mish).pt'
    model_path3 = './shopee-pytorch-models/arcface_512x512_resnext32x4d.pt'
    model_path4 = './shopee-pytorch-models/arcface_512x512_eff_b5_.pt'
    model_path5 = './shopee-pytorch-models/arcface_512x512_eff_b3.pt'

In [None]:
# Cap TensorFlow's GPU memory (LIMIT is in GB) so the rest of the GPU stays
# available to the other libraries used in this notebook.
LIMIT = 2.0
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # memory_limit is passed as 1024*LIMIT, i.e. LIMIT expressed in MB
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
        # NOTE(review): logical_gpus is never read afterwards
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    except RuntimeError as e:
        # The error is only printed; execution continues without the cap
        print(e)
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
# The message hard-codes a 16 GB card — adjust if running on different hardware
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

In [None]:
# Flag to check ram allocations (debug); when True, read_dataset doubles the train frame
CHECK_SUB = False

df = cudf.read_csv('./data/match/test.csv')
# If we are committing, replace train set for test set and dont get cv
if len(df) > 3:
    GET_CV = False
del df
# Chooses between test-set output (False) and train-set cross-validation (True).
# NOTE(review): this unconditional assignment makes the `if` above redundant;
# GET_CV is always False after this cell.
GET_CV = False

# Function to get f1 score
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of whitespace-separated id strings.

    Each string is split into a set of ids; the result is a NumPy array with
    ``2 * |true ∩ pred| / (|true| + |pred|)`` for every row.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))
    common = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    sizes = pred_sets.apply(lambda s: len(s)).values + true_sets.apply(lambda s: len(s)).values
    return 2 * common / sizes


# Function to combine predictions
def combine_predictions(row):
    """Merge the four per-row prediction lists into one space-separated string.

    Ids from ``image_predictions``, ``text_predictions``, ``oof_text`` and
    ``oof_hash`` are concatenated, de-duplicated and sorted by ``np.unique``.
    """
    sources = ('image_predictions', 'text_predictions', 'oof_text', 'oof_hash')
    merged = np.concatenate([row[key] for key in sources])
    return ' '.join(np.unique(merged))

# Function to read dataset
def read_dataset():
    """Load the train (GET_CV) or test CSV and derive the image paths.

    In CV mode two label columns are added: ``matches`` (space-joined posting
    ids sharing the row's ``label_group``) and ``target`` (the same ids as an
    array). With CHECK_SUB set, the train frame is stacked on itself.

    Returns:
        (pandas DataFrame, cudf DataFrame built from it, Series of image paths)
    """
    if not GET_CV:
        df = pd.read_csv('./data/match/test.csv')
        return df, cudf.DataFrame(df), './data/images/test_images/' + df['image']

    df = pd.read_csv('./data/match/train.csv')
    # One lookup serves both columns: label_group -> array of unique posting ids
    ids_by_group = df.groupby('label_group')['posting_id'].unique().to_dict()
    df['matches'] = df['label_group'].map(ids_by_group).apply(lambda ids: ' '.join(ids))
    df['target'] = df['label_group'].map(ids_by_group)
    if CHECK_SUB:
        df = pd.concat([df, df], axis = 0).reset_index(drop = True)
    return df, cudf.DataFrame(df), './data/images/train_images/' + df['image']


# Function to decode our images
def decode_image(image_data):
    """Decode JPEG bytes to a 3-channel float32 tensor resized to IMAGE_SIZE and divided by 255."""
    decoded = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(decoded, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# Function to read our test image and return image
def read_image(image):
    """Read the file at path ``image`` and return it decoded by ``decode_image``."""
    raw_bytes = tf.io.read_file(image)
    return decode_image(raw_bytes)

# Function to get our dataset that read images
def get_dataset(image):
    """Build a batched, prefetching tf.data pipeline over a collection of image paths."""
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls = AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

In [None]:
# Mish activation function
class Mish_func(torch.autograd.Function):

    """Mish activation, ``x * tanh(softplus(x))``, with a hand-written backward.

    Only the input is saved for the backward pass; the intermediate values are
    recomputed there.

    from: https://github.com/tyunist/memory_efficient_mish_swish/blob/master/mish.py
    """

    @staticmethod
    def forward(ctx, i):
        """Return ``i * tanh(softplus(i))`` and stash ``i`` for backward."""
        result = i * torch.tanh(F.softplus(i))
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``grad_output`` times the analytic derivative of Mish at the saved input."""
        # `saved_tensors` replaces the deprecated `saved_variables` accessor
        i = ctx.saved_tensors[0]

        v = 1. + i.exp()
        h = v.log()                      # softplus(i)
        grad_gh = 1./h.cosh().pow_(2)    # d tanh(h) / dh = sech^2(h)

        # d softplus(i) / di = sigmoid(i)  (i.e. (1/v) * exp(i))
        grad_hx = i.sigmoid()

        grad_gx = grad_gh * grad_hx

        # Product rule on i * tanh(softplus(i))
        grad_f = torch.tanh(F.softplus(i)) + i * grad_gx

        return grad_output * grad_f
    
class Mish(nn.Module):
    """``nn.Module`` wrapper that applies ``Mish_func`` to its input."""

    def __init__(self, **kwargs):
        # Keyword arguments are accepted and ignored
        super(Mish, self).__init__()

    def forward(self, input_tensor):
        activated = Mish_func.apply(input_tensor)
        return activated
    
    
def replace_activations(model, existing_layer, new_layer):

    """Recursively swap every sub-module of exact type ``existing_layer`` for ``new_layer``.

    The model is modified in place and also returned. The same ``new_layer``
    object is installed at every replaced position.
    """

    for child_name, child in reversed(model._modules.items()):
        # Descend into containers first
        if any(True for _ in child.children()):
            model._modules[child_name] = replace_activations(child, existing_layer, new_layer)

        # Exact type match only; subclasses are left untouched
        if type(child) == existing_layer:
            model._modules[child_name] = new_layer
    return model

In [None]:
# Tensorflow Arcmargin model
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    The layer takes ``[features, labels]`` and returns scaled cosine logits in
    which the entry of each sample's label class has the angular margin ``m``
    applied.

    Args:
        n_classes: number of classes (columns of the weight matrix).
        s: scale multiplied onto the final logits.
        m: angular margin.
        easy_margin: selects the fallback used where the margin is not applied
            (see ``call``).
        ls_eps: label-smoothing epsilon; 0 disables smoothing.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants for cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Cosine threshold and penalty used by the non-easy-margin fallback
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m
        
    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""

        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        """Create the class-weight matrix ``W`` of shape (feature_dim, n_classes)."""
        # input_shape is a pair of shapes; the first belongs to the feature input
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)
    
    def call(self, inputs):
        """Compute margin-adjusted, scaled cosine logits from ``(X, y)``."""
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        # Cosine between L2-normalised features (rows) and class weights (columns)
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # NOTE(review): no clipping before the sqrt — rounding that pushes
        # cosine**2 above 1 would produce NaN here; confirm whether it matters.
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
        # phi = cos(theta + m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            # Apply the margin only where cosine is positive
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            # Below the threshold fall back to a linear penalty on cosine
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            # Label smoothing over the one-hot targets
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin logit for the label class, plain cosine for all other classes
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [None]:
# image embeddings for tensorflow
def get_image_embeddings_tensorflow(image_paths):
    """Compute image embeddings with the TensorFlow EfficientNet-B3 + ArcFace checkpoint.

    The training graph (backbone -> global average pooling -> ArcMargin ->
    softmax) is rebuilt so the saved weights can be loaded, then cut at
    ``model.layers[-4]`` to obtain the embedding model.

    Args:
        image_paths: sliceable collection of image file paths.

    Returns:
        NumPy array with one embedding row per input path.
    """
    embeds = []

    margin = ArcMarginProduct(
            n_classes = N_CLASSES,
            s = 30,
            m = 0.5,
            name='head/arc_margin',
            dtype='float32'
            )

    # Layer creation order is kept exactly as trained: the checkpoint is
    # loaded into this graph and the cut below is positional.
    inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3), name = 'inp1')
    label = tf.keras.layers.Input(shape = (), name = 'inp2')
    x = efn.EfficientNetB3(weights = None, include_top = False)(inp)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = margin([x, label])

    output = tf.keras.layers.Softmax(dtype='float32')(x)

    model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
    model.load_weights('efficientnet/EfficientNetB3_512_42.h5')
    # Inference model: image input only, output taken four layers from the end
    model = tf.keras.models.Model(inputs = model.input[0], outputs = model.layers[-4].output)

    # Predict in chunks. The chunk count is derived from the paths actually
    # passed in (previously it read the unrelated global `df`).
    chunk = 5000
    n_chunks = int(np.ceil(len(image_paths) / chunk))
    for j in range(n_chunks):
        a = j * chunk
        b = (j + 1) * chunk
        image_dataset = get_dataset(image_paths[a:b])
        image_embeddings = model.predict(image_dataset)
        embeds.append(image_embeddings)
    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

# Return tokens, masks and segments from a text array or series
def bert_encode(texts, tokenizer, max_len=512):
    """Tokenise texts into fixed-length BERT inputs.

    Every text is tokenised, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and zero-padded to ``max_len``.

    Returns:
        Three NumPy arrays (token ids, attention masks, segment ids), each
        with one row of length ``max_len`` per text. Segment ids are all zero.
    """
    token_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:max_len - 2]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        n_real = len(sequence)
        n_pad = max_len - n_real

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids += [0] * n_pad

        token_rows.append(ids)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
def get_text_embeddings(df, max_len = 70):
    """Compute title embeddings with the BERT + ArcFace checkpoint.

    The training graph is rebuilt so './Bert_123.h5' can be loaded, then cut at
    ``model.layers[-4]`` so the model takes only the three BERT inputs.

    Args:
        df: frame with a 'title' column.
        max_len: token length every title is truncated / padded to.

    Returns:
        NumPy array with one embedding row per title.
    """
    embeds = []
    # NOTE(review): looks like a local directory holding the TF-Hub BERT module — confirm
    module_url = "bert-en-uncased-l24-h1024-a16-1"
    # NOTE(review): trainable=True although this function only runs inference;
    # presumably kept to mirror the training graph — verify before changing.
    bert_layer = hub.KerasLayer(module_url, trainable = True)
    # The tokenizer is configured from assets shipped inside the hub module
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
    # text is a (token ids, masks, segment ids) tuple of arrays
    text = bert_encode(df['title'].values, tokenizer, max_len = max_len)
    
    # The class count is hard-coded here rather than taken from a shared constant
    margin = ArcMarginProduct(
            n_classes = 11014, 
            s = 30, 
            m = 0.5, 
            name='head/arc_margin', 
            dtype='float32'
            )
    input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    label = tf.keras.layers.Input(shape = (), name = 'label')

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Take the first position of the sequence output (where bert_encode puts [CLS])
    clf_output = sequence_output[:, 0, :]
    x = margin([clf_output, label])
    output = tf.keras.layers.Softmax(dtype='float32')(x)
    model = tf.keras.models.Model(inputs = [input_word_ids, input_mask, segment_ids, label], outputs = [output])
    
    model.load_weights('./Bert_123.h5')
    # Inference model: drop the label input, output taken four layers from the end
    model = tf.keras.models.Model(inputs = model.input[0:3], outputs = model.layers[-4].output)
    # Predict in chunks of 5000 rows
    chunk = 5000
    iterator = np.arange(np.ceil(len(df) / chunk))
    for j in iterator:
        a = int(j * chunk)
        b = int((j + 1) * chunk)
        text_chunk = ((text[0][a:b], text[1][a:b], text[2][a:b]))
        text_embeddings = model.predict(text_chunk, batch_size = BATCH_SIZE)
        embeds.append(text_embeddings)
    del model
    text_embeddings = np.concatenate(embeds)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del embeds
    gc.collect()
    return text_embeddings

In [None]:
def _select_neighbor_ids(dist_row, index_row, threshold, trim_outliers):
    """Pick the neighbour indices of one query row.

    Keeps neighbours whose distance is below ``threshold``. With
    ``trim_outliers`` and more than one hit, the kept distances (excluding the
    first) are filtered to those within two standard deviations of their mean,
    and the selection is re-cut at the last surviving distance.
    """
    idx = np.where(dist_row < threshold)[0]
    ids = index_row[idx]
    if trim_outliers and len(idx) > 1:
        arr = dist_row[idx][1:]
        mean = np.mean(arr)
        standard_deviation = np.std(arr)
        if standard_deviation > 0:
            max_deviations = 2
            not_outlier = abs(arr - mean) < max_deviations * standard_deviation
            max_dist = arr[not_outlier][-1]
            idx = np.where(dist_row <= max_dist)[0]
            ids = index_row[idx]
    return ids


def get_neighbors(df, embeddings, KNN = 50, image = True):
    """Find matching postings for every row via k-nearest-neighbour search.

    Args:
        df: frame with 'posting_id' (and 'matches' when GET_CV is set).
        embeddings: one embedding row per row of ``df``.
        KNN: neighbours to retrieve; forced to 1 when ``df`` has fewer than 10 rows.
        image: True for image embeddings (threshold 3.3), False for text
            embeddings (threshold 20.0 in CV mode, 16.0 otherwise, plus
            outlier trimming).

    Returns:
        ``(df, predictions)`` where ``predictions[k]`` is an array of posting
        ids matched to row ``k``. In CV mode ``df`` additionally receives
        'pred_matches' and 'f1' columns from the threshold scan.
    """
    if len(df) < 10:
        KNN = 1
    model = NearestNeighbors(n_neighbors = KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    n_rows = embeddings.shape[0]

    if GET_CV:
        # Scan a few thresholds and report their CV score. The best threshold
        # is only printed; the fixed values below are what gets used.
        if image:
            thresholds = list(np.arange(4.5, 5.0, 0.1))
        else:
            thresholds = list(np.arange(31, 35, 1))
        scores = []
        for threshold in thresholds:
            predictions = []
            for k in range(n_rows):
                idx = np.where(distances[k,] < threshold)[0]
                ids = indices[k,idx]
                posting_ids = ' '.join(df['posting_id'].iloc[ids].values)
                predictions.append(posting_ids)
            df['pred_matches'] = predictions
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {threshold} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

        final_threshold = 3.3 if image else 20.0
        rows = range(n_rows)
    else:
        # Because we are predicting the test set that have 70K images and
        # different label groups, confidence should be smaller
        final_threshold = 3.3 if image else 16.0
        rows = tqdm(range(n_rows))

    predictions = []
    for k in rows:
        ids = _select_neighbor_ids(distances[k,], indices[k,], final_threshold, trim_outliers = not image)
        predictions.append(df['posting_id'].iloc[ids].values)

    del model, distances, indices
    gc.collect()
    return df, predictions

In [None]:
# Results obtained from the TensorFlow models: load the dataset, then compute
# image embeddings and title embeddings.
df, df_cu, image_paths = read_dataset()
image_embeddings = get_image_embeddings_tensorflow(image_paths)
text_embeddings = get_text_embeddings(df)
gc.collect()

In [None]:
class ArcMarginProductTorch(nn.Module):
    """ArcFace margin head (PyTorch).

    Args:
        in_features: embedding dimension.
        out_features: number of classes.
        scale: multiplier applied to the final logits.
        margin: angular margin added to the label-class angle.
        easy_margin: selects the fallback used where the margin is not applied.
        ls_eps: label-smoothing epsilon; 0 disables smoothing.
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProductTorch, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        # Constants for cos(theta + m) and for the non-easy-margin fallback
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

    def forward(self, input, label):
        """Return scaled logits of shape (batch, out_features) with the margin applied at ``label``."""
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Clamp before the sqrt: rounding can push cosine**2 slightly above 1,
        # which would otherwise yield NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(min=0.0))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate on the same device as the logits instead of hard-coding
        # 'cuda', so the layer also works on CPU.
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Margin logit for the label class, plain cosine for all other classes
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale

        return output
    
class ShopeeModel(nn.Module):
    """timm backbone + optional FC/BN neck + ArcFace head.

    ``forward`` returns only the pooled feature vector; the ArcFace head
    (``self.final``) is constructed so that checkpoints containing its
    parameters load, but it is not applied.

    Args:
        n_classes: number of classes for the ArcFace head.
        model_name: timm architecture name. Supported families are those whose
            name contains 'nfnet', 'efficientnet' or 'resnext'.
        fc_dim: output size of the optional FC neck.
        margin, scale: ArcFace hyper-parameters.
        use_fc: apply dropout -> fc -> bn after pooling when True.
        pretrained: forwarded to ``timm.create_model``.

    Raises:
        ValueError: if ``model_name`` belongs to no supported family.
    """

    def __init__(
        self,
        n_classes = CFG.classes,
        model_name = CFG.model_name1,
        fc_dim = 512,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = False,
        pretrained = False):


        super(ShopeeModel,self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.backbone = timm.create_model(model_name, pretrained=pretrained)

        # Strip the classifier and the built-in pooling so the backbone emits
        # a feature map. Matching is by family so every architecture listed in
        # CFG (e.g. tf_efficientnet_b4, eca_nfnet_l0) is covered.
        # NOTE(review): attribute names follow the per-model branches this
        # replaces; confirm against the installed timm version.
        if 'nfnet' in model_name:
            final_in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
            self.backbone.head.global_pool = nn.Identity()

        elif 'efficientnet' in model_name:
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif 'resnext' in model_name:
            final_in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        else:
            raise ValueError('Unsupported model_name for ShopeeModel: {}'.format(model_name))

        self.pooling =  nn.AdaptiveAvgPool2d(1)

        self.use_fc = use_fc

        self.dropout = nn.Dropout(p=0.0)
        self.fc = nn.Linear(final_in_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        final_in_features = fc_dim

        self.final = ArcMarginProductTorch(
            final_in_features,
            n_classes,
            scale = scale,
            margin = margin,
            easy_margin = False,
            ls_eps = 0.0
        )

    def _init_params(self):
        """Initialise the FC neck (Xavier-normal weights, zero bias) and BN (weight 1, bias 0)."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label):
        """Return the feature vector for ``image``; ``label`` is accepted but unused."""
        feature = self.extract_feat(image)
        return feature

    def extract_feat(self, x):
        """Backbone -> adaptive average pool -> flatten, then the FC neck if ``use_fc``."""
        batch_size = x.shape[0]
        x = self.backbone(x)
        x = self.pooling(x).view(batch_size, -1)

        if self.use_fc:
            x = self.dropout(x)
            x = self.fc(x)
            x = self.bn(x)
        return x

In [None]:
def get_image_embeddings_torch(image_paths, model_name = CFG.model_name1, model_path = CFG.model_path1):
    """Compute image embeddings with a PyTorch ``ShopeeModel`` checkpoint.

    Args:
        image_paths: array of image file paths (passed to ``ShopeeDataset``).
        model_name: timm architecture name for ``ShopeeModel``.
        model_path: path of the state-dict checkpoint to load.

    Returns:
        NumPy array with one embedding row per image.
    """
    embeds = []

    model = ShopeeModel(model_name = model_name)
    model.eval()

    # For this architecture SiLU is swapped for Mish before loading the weights
    if model_name == 'eca_nfnet_l0':
        model = replace_activations(model, torch.nn.SiLU, Mish())
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine
    model.load_state_dict(torch.load(model_path, map_location=CFG.device))
    model = model.to(CFG.device)

    image_dataset = ShopeeDataset(image_paths=image_paths,transforms=get_test_transforms())
    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4
    )

    with torch.no_grad():
        for img,label in tqdm(image_loader):
            # Send the batch to the same device as the model (was a hard-coded .cuda())
            img = img.to(CFG.device)
            label = label.to(CFG.device)
            feat = model(img,label)
            image_embeddings = feat.detach().cpu().numpy()
            embeds.append(image_embeddings)

    del model

    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

In [None]:
import albumentations as A 
from albumentations.pytorch.transforms import ToTensorV2
class ShopeeDataset(Dataset):
    """Dataset that loads images from disk as RGB and applies optional transforms.

    Every item is ``(image, torch.tensor(1))``; the second element is a
    constant placeholder label.
    """

    def __init__(self, image_paths, transforms=None):
        self.image_paths, self.augmentations = image_paths, transforms

    def __len__(self):
        n_images = self.image_paths.shape[0]
        return n_images

    def __getitem__(self, index):
        # OpenCV reads BGR; convert to RGB before any transform
        bgr = cv2.imread(self.image_paths[index])
        image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

        if not self.augmentations:
            return image, torch.tensor(1)
        return self.augmentations(image=image)['image'], torch.tensor(1)
    
def get_test_transforms():
    """Return the inference pipeline: resize to CFG.img_size, normalise, convert to tensor."""
    side = CFG.img_size
    steps = [
        A.Resize(side, side, always_apply=True),
        A.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps)

In [None]:
def _knn_matches_below(df, embeddings, KNN, threshold):
    """Return, per row, the list of posting ids whose kNN distance is below ``threshold``."""
    model = NearestNeighbors(n_neighbors = KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        idx = np.where(distances[k,] < threshold)[0]
        ids = indices[k,idx]
        predictions.append(list(df['posting_id'].iloc[ids]))

    del model, distances, indices
    gc.collect()
    return predictions


def get_image_predictions_torch(df, embeddings1, embeddings4, threshold = 3.4):
    """Union of kNN matches from two PyTorch image-embedding sets.

    Args:
        df: frame with a 'posting_id' column, row-aligned with both embeddings.
        embeddings1: first embedding matrix; matched with distance < 2.2.
        embeddings4: second embedding matrix; matched with distance < 5.2.
        threshold: ignored. Kept for backward compatibility; the per-model
            thresholds above are fixed inside the function, as before.

    Returns:
        One list of de-duplicated posting ids per row.
    """
    KNN = 50 if len(df) > 3 else 1

    predictions1 = _knn_matches_below(df, embeddings1, KNN, 2.2)
    predictions4 = _knn_matches_below(df, embeddings4, KNN, 5.2)

    predictions = [list(set(a + b)) for a, b in zip(predictions1, predictions4)]

    return predictions

In [None]:
# Neighbour matches from the TensorFlow image embeddings (25 neighbours per row)
df, image_predictions = get_neighbors(df, image_embeddings, KNN = 25, image = True)

In [None]:
# Neighbour matches from the BERT title embeddings (25 neighbours per row)
df, text_predictions = get_neighbors(df, text_embeddings, KNN = 25, image = False)

In [None]:
from cuml.feature_extraction.text import TfidfVectorizer

# Binary TF-IDF title vectors on the GPU, vocabulary capped at 25000 terms,
# converted to a dense array.
# NOTE(review): the name `model` is reused for unrelated objects across cells.
model = TfidfVectorizer(stop_words=None, binary=True, max_features=25000)
text_embeddings2 = model.fit_transform(df_cu.title).toarray()
print('text embeddings shape',text_embeddings2.shape)

In [None]:
# For every title, collect the posting ids whose TF-IDF dot product with it
# exceeds 0.75. Rows are processed in chunks of CHUNK.
preds = []
CHUNK = 1024*4

print('Finding similar titles...')
# Number of chunks = ceil(len(df_cu) / CHUNK)
CTS = len(df_cu)//CHUNK
if len(df_cu)%CHUNK!=0: CTS += 1
for j in range( CTS ):
    
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(df_cu))
    print('chunk',a,'to',b)
    
    # COSINE SIMILARITY DISTANCE
    # NOTE(review): a dot product equals cosine similarity only if the TF-IDF
    # rows are L2-normalised — presumably the vectorizer default; confirm.
    # cts = np.dot( text_embeddings, text_embeddings[a:b].T).T
    cts = cupy.matmul(text_embeddings2, text_embeddings2[a:b].T).T
    
    for k in range(b-a):
        # IDX = np.where(cts[k,]>0.7)[0]
        IDX = cupy.where(cts[k,]>0.75)[0]
        # Move the matching row indices to host memory and look up their posting ids
        o = df_cu.iloc[cupy.asnumpy(IDX)].posting_id.to_pandas().values
        preds.append(o)
        
del model, text_embeddings2

In [None]:
# Store the TF-IDF matches (one array of posting ids per row) on the cudf frame.
# NOTE(review): assumes cudf accepts a list of string arrays as a column — confirm.
df_cu['oof_text'] = preds

In [None]:
def read_dataset():
    """Load the competition test CSV from the Kaggle input directory.

    Returns:
        (pandas DataFrame, cudf DataFrame built from it, Series of image paths)
    """
    test_df = pd.read_csv('../input/shopee-product-matching/test.csv')
    test_paths = '../input/shopee-product-matching/test_images/' + test_df['image']
    return test_df, cudf.DataFrame(test_df), test_paths

In [None]:
def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs and request deterministic cuDNN."""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed all RNGs with the configured seed before running the PyTorch models
seed_torch(CFG.seed)

In [None]:
def get_image_predictions(df, embeddings,threshold = 0.0):
    """Cosine-kNN matching: per row, the posting ids whose distance is below ``threshold``.

    Uses 50 neighbours when ``df`` has more than 3 rows, otherwise 3.

    Returns:
        List with one array of posting ids per row of ``embeddings``.
    """
    KNN = 50 if len(df) > 3 else 3

    knn = NearestNeighbors(n_neighbors = KNN, metric = 'cosine')
    knn.fit(embeddings)
    distances, indices = knn.kneighbors(embeddings)

    all_ids = df['posting_id']
    predictions = []
    for row in tqdm(range(embeddings.shape[0])):
        close = np.where(distances[row,] < threshold)[0]
        predictions.append(all_ids.iloc[indices[row, close]].values)

    del knn, distances, indices
    gc.collect()
    return predictions

In [None]:
# Reload the data with the most recently defined read_dataset and preview it.
# Note that this rebinds the global `image_paths`.
df_image,df_image_cu,image_paths = read_dataset()
df_image.head()

In [None]:
# TensorFlow image embeddings for `df_image`, followed by cosine-kNN matching.
# The second call now receives the embeddings computed on the line above; it
# previously referenced `image_embeddings1`, a name that is never defined.
image_embeddings_tensorflow1 = get_image_embeddings_tensorflow(image_paths.values)
image_predictions_tensorflow = get_image_predictions(df_image, image_embeddings_tensorflow1, threshold = 0.3)

In [None]:
# PyTorch image embeddings from two checkpoints (model 1 and model 4 in CFG)
image_embeddings_torch1= get_image_embeddings_torch(image_paths.values, CFG.model_name1,CFG.model_path1)
image_embeddings_torch4 = get_image_embeddings_torch(image_paths.values, CFG.model_name4,CFG.model_path4)

In [None]:
# Union of the kNN matches from both PyTorch embedding sets.
# NOTE(review): get_image_predictions_torch overwrites `threshold` internally,
# so the 1.7 passed here has no effect.
image_predictions_torch = get_image_predictions_torch(df_image, image_embeddings_torch1, image_embeddings_torch4, threshold = 1.7)

In [None]:
def combine_predictions(row):
    """Join the unique, sorted ids found in the row's four prediction columns with spaces."""
    parts = [
        row['image_predictions'],
        row['text_predictions'],
        row['oof_text'],
        row['oof_hash'],
    ]
    unique_ids = np.unique(np.concatenate(parts))
    return ' '.join(unique_ids)

In [None]:
def combine_predictions_submit(row):
    """Join the unique, sorted ids found in all six prediction columns with spaces."""
    columns = (
        'image_predictions',
        'text_predictions',
        'oof_text',
        'oof_hash',
        'oof_image',
        'image_predictions_new',
    )
    pooled = np.concatenate([row[name] for name in columns])
    return ' '.join(np.unique(pooled))

In [None]:
# Postings that share a perceptual hash are treated as matches of each other
tmp = df.groupby('image_phash').posting_id.agg('unique').to_dict()
df['oof_hash'] = df.image_phash.map(tmp)
if GET_CV:
    # Cross-validation: combine the four sources, score against the ground
    # truth, then write the combined matches.
    # `image_predictions` is the per-row match list from get_neighbors; the
    # previous right-hand side, `image_embeddings`-style array
    # `image_predictions_tensorflow1`, held embeddings rather than posting ids.
    df['image_predictions'] = image_predictions
    df['text_predictions'] = text_predictions
    df['oof_text'] = df_cu['oof_text'].to_pandas().values
    df['pred_matches'] = df.apply(combine_predictions, axis = 1)
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    score = df['f1'].mean()
    print(f'Our final f1 cv score is {score}')
    df['matches'] = df['pred_matches']
    df[['posting_id', 'matches']].to_csv('submission.csv', index = False)
else:
    # Submission: union of all six sources.
    # NOTE(review): `image_predictions_torch` and `image_predictions_tensorflow`
    # were computed on `df_image`; this assumes it is row-aligned with `df` — confirm.
    df['image_predictions_new'] = image_predictions_torch
    df['image_predictions'] = image_predictions_tensorflow
    df['oof_text'] = df_cu['oof_text'].to_pandas().values
    df['text_predictions'] = text_predictions
    # Was `image_predictions1`, which is never defined. The get_neighbors image
    # matches are the one image source not otherwise used in this branch —
    # presumably the intended value; verify.
    df['oof_image'] = image_predictions
    df['matches'] = df.apply(combine_predictions_submit, axis = 1)
    df[['posting_id', 'matches']].to_csv('submission.csv', index = False)