In [1]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import cv2,math,gc

import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.nn import Parameter

!pip install "../input/efficient-net/dist/efficientnet_pytorch-0.7.0.tar"
from efficientnet_pytorch import EfficientNet

!pip install "../input/faissgpuwheel/faiss_gpu-1.7.0-cp37-cp37m-manylinux2014_x86_64.whl"
import faiss

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

import warnings
warnings.simplefilter('ignore')

torch.backends.cudnn.benchmark = True

Processing /kaggle/input/efficient-net/dist/efficientnet_pytorch-0.7.0.tar
Building wheels for collected packages: efficientnet-pytorch
  Building wheel for efficientnet-pytorch (setup.py) ... [?25l- \ done
[?25h  Created wheel for efficientnet-pytorch: filename=efficientnet_pytorch-0.7.0-py3-none-any.whl size=16033 sha256=4f3723ef7d75dbc632570d258c4ebad92c6f85c6bf82f82ad7c8f3cc979adc9a
  Stored in directory: /root/.cache/pip/wheels/af/8c/80/1bf8cc2fa471c320978f34c5290675daaa96446e1b9ba45555
Successfully built efficientnet-pytorch
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.7.0
Processing /kaggle/input/faissgpuwheel/faiss_gpu-1.7.0-cp37-cp37m-manylinux2014_x86_64.whl
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.0


In [2]:
class cfg:
    """Static settings for the image-embedding ensemble."""

    # Target size handed to cv2.resize for every input image.
    img_size = (380, 380)
    # Not referenced by the code visible in this notebook.
    feavec_num1 = 512
    feavec_num2 = 1280
    # Passed to ArcMarginProduct as scale `s` and margin `m` by the image Model.
    fea_norm = 64
    margin = 0.35
    # DataLoader batch size for image inference.
    batch = 50

    # The three lists below are index-aligned: entry i describes ensemble member i.
    wpath = [
        "../input/shopee-weight/w_eff6_s380_cl8812_fold1_v2.pt",
        "../input/shopee-weight/w_effb3_s380_cl8811_fold2_0.80.pt",
        "../input/shopee-weight/w_effb5_s380_cl8811_fold3.pt",
        "../input/shopee-weight/w_effb4_s380_cl8811_fold4.pt",
        "../input/shopee-weight/w_effb3_s380_cl8811_fold5_m0.35.pt",
    ]
    mname = [
        'efficientnet-b6',
        'efficientnet-b3',
        'efficientnet-b5',
        'efficientnet-b4',
        'efficientnet-b3',
    ]
    # Number of classes of each checkpoint's ArcFace head.
    clsize = [8812, 8811, 8811, 8811, 8811]

In [3]:
# True  -> score the pipeline on train.csv (a `target` column is built from label_group)
# False -> run inference on test.csv and write a submission
COMPUTE_CV = False

if COMPUTE_CV:
    df = pd.read_csv("../input/shopee-product-matching/train.csv")
    # label_group -> array of every posting_id that shares that label
    tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
    # Ground truth per row: space-separated posting_ids of its label group.
    df['target'] = df.label_group.map(tmp).apply(' '.join)
else:
    df = pd.read_csv("../input/shopee-product-matching/test.csv")
    # A three-row frame gets a matching batch size
    # (presumably the public placeholder test set - confirm).
    if len(df) == 3:
        cfg.batch = 3

# GPU copy of the frame, read later by the cuML TF-IDF step.
df_cu = cudf.DataFrame(df)

print('df shape is', df.shape)
df.head()

df shape is (3, 4)


Unnamed: 0,posting_id,image,image_phash,title
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng


# Use Image Embeddings

In [4]:
class ArcMarginProduct(nn.Module):
    """Additive angular margin (ArcFace-style) classification head.

    Computes the cosine between L2-normalised inputs and L2-normalised class
    weights, replaces the target-class cosine with cos(theta + m), and scales
    every logit by `s`.

    Args:
        in_features: size of each input embedding.
        out_features: number of classes.
        s: scale applied to the final logits.
        m: angular margin in radians added to the target-class angle.
        easy_margin: if True, the margin is applied only where cosine > 0;
            otherwise it is applied where cosine > cos(pi - m) and the
            remaining positions fall back to ``cosine - sin(pi - m) * m``.
    """

    def __init__(self, in_features, out_features, s=30.0, m=0.30, easy_margin=False):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        # Constants for cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m).
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Threshold and fallback offset used when easy_margin is False.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        """Return scaled margin logits of shape (batch, out_features).

        Args:
            input: (batch, in_features) embeddings.
            label: integer class index per sample; reshaped to (batch, 1).
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # clamp guards against tiny negative values from rounding before the sqrt
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Build the mask from `cosine` itself so it always lives on the same
        # device and has the same dtype as the logits, instead of relying on a
        # module-level `device` global.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long().to(cosine.device), 1)
        # Target column takes phi, every other column keeps the plain cosine.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output


class Model(nn.Module):
    """Image encoder: EfficientNet backbone, linear projection, ArcFace head.

    Called without labels it returns L2-normalised embeddings of size
    `feavec`; called with labels it returns the margin logits.

    NOTE: a second class named `Model` (the text encoder) is defined later in
    this notebook and rebinds the name, so instantiate this class before that
    cell runs.
    """

    def __init__(self, name, clustersize, feavec=512):
        super().__init__()
        # Architecture only; the weights are loaded from a checkpoint by the caller.
        self.eff = EfficientNet.from_name(name)
        # Projects the backbone's 1000-wide output down to the embedding size.
        self.out = nn.Linear(1000, feavec)
        self.margin = ArcMarginProduct(
            in_features=feavec,
            out_features=clustersize,
            s=cfg.fea_norm,
            m=cfg.margin,
        )

    def forward(self, x, labels=None):
        embedding = self.out(self.eff(x))
        if labels is None:
            # Inference path: unit-length embeddings.
            return F.normalize(embedding, dim=1)
        # Training path: ArcFace logits.
        return self.margin(embedding, labels)

In [5]:
def _load_image_model(fold, model_cls=Model):
    """Build ensemble member `fold` (0-based) in half precision and load its checkpoint.

    `model_cls` is bound when this cell runs because the name `Model` is
    redefined further down the notebook (text encoder); binding it here keeps
    the helper pointing at the image encoder.
    """
    net = model_cls(name=cfg.mname[fold], clustersize=cfg.clsize[fold]).to(device).half()
    # load_state_dict is strict by default, so a key mismatch raises here.
    net.load_state_dict(torch.load(cfg.wpath[fold], map_location=device))
    return net


# The names model1..model5 are read as globals by image_embeddings().
model1, model2, model3, model4, model5 = (_load_image_model(fold) for fold in range(5))

<All keys matched successfully>

In [6]:
# make image Datasets
def load_image(file_name):
    """Read one competition image and return it as a float tensor.

    The file is looked up in the train image folder when COMPUTE_CV is set,
    otherwise in the test image folder. The image is converted BGR -> RGB,
    resized to cfg.img_size, permuted to channel-first and scaled to [0, 1].

    Args:
        file_name: image file name relative to the image folder.

    Returns:
        torch.FloatTensor with the channel axis first.

    Raises:
        FileNotFoundError: if OpenCV cannot read the file.
    """
    if COMPUTE_CV:
        file_path = f'/kaggle/input/shopee-product-matching/train_images/{file_name}'
    else:
        file_path = f'/kaggle/input/shopee-product-matching/test_images/{file_name}'

    img = cv2.imread(file_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with the offending path rather than with an opaque cvtColor error.
    if img is None:
        raise FileNotFoundError(f'cv2.imread could not read image: {file_path}')
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, cfg.img_size)
    tensor_img = torch.tensor(img)
    # HWC uint8 -> CHW float in [0, 1]
    tensor_img = tensor_img.permute((2, 0, 1)).float() / 255.0
    return tensor_img

class valDataset(Dataset):
    """Map-style dataset yielding one preprocessed image tensor per row of `df`."""

    def __init__(self, df):
        # Only the file names are kept; pixels are read lazily in __getitem__.
        self.img = df['image'].values

    def __len__(self):
        return len(self.img)

    def __getitem__(self, idx):
        file_name = self.img[idx]
        return load_image(file_name)

In [7]:
def image_embeddings(df):
    """Embed every image listed in `df` with the five ensemble models.

    Reads the module-level models `model1` .. `model5` and `cfg.batch`.
    Each batch is loaded once and pushed through all five models in order.

    Args:
        df: frame with an `image` column of file names.

    Returns:
        Tuple of five float32 arrays, one per model, each of shape
        (len(df), 512) in the row order of `df`.
    """
    dataset = valDataset(df)
    loader = DataLoader(dataset,
                        batch_size=cfg.batch,
                        shuffle=False,
                        num_workers=2,
                        pin_memory=True,
                        drop_last=False)

    ensemble = (model1, model2, model3, model4, model5)
    for net in ensemble:
        net.eval()
    print('start collection')
    feavec = 512
    # Collect per-batch outputs in lists and concatenate once at the end;
    # np.append inside the loop re-copies everything gathered so far on every
    # batch. Each list is seeded with an empty float32 array so the result is
    # float32 (the half-precision outputs are promoted) and is a well-formed
    # (0, feavec) array when the loader yields nothing.
    chunks = [[np.empty((0, feavec), dtype='float32')] for _ in ensemble]
    seen = 0
    with torch.no_grad():
        for idx, images in enumerate(loader):
            images = images.to(device, non_blocking=True).half()
            for bucket, net in zip(chunks, ensemble):
                bucket.append(net(images).cpu().numpy())
            seen += images.shape[0]

            if idx % 100 == 0:
                print(idx, len(loader))
                # One running shape per model, as in the original progress log.
                for _ in ensemble:
                    print((seen, feavec))
    return tuple(np.concatenate(bucket, axis=0) for bucket in chunks)

In [8]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two series of space-separated id strings.

    Each string is split on whitespace and treated as a set; the score for a
    row is 2 * |true & pred| / (|true| + |pred|).

    Returns:
        numpy array with one F1 value per row.
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]
    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    total = np.array([len(p) + len(t) for t, p in zip(true_sets, pred_sets)])
    return 2 * overlap / total

def predict_img(df,embeddings,topk=50,threshold=0.63):
    """Match rows by nearest-neighbour search over `embeddings`.

    Builds a flat L2 faiss index on all visible GPUs, queries each row's
    `topk` neighbours and keeps those whose reported distance is below
    `threshold`. The matched posting_ids are written to df['pred_images']
    (the frame is modified in place). When COMPUTE_CV is set the function
    also stores 'pred_imgonly' / 'f1_img' and prints the mean F1.

    Returns:
        The same `df` object.
    """
    _, dim = embeddings.shape
    index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(dim))
    index.add(embeddings)
    distances, neighbours = index.search(x=embeddings, k=topk)

    posting_ids = df['posting_id']
    # One array of matched posting_ids per query row.
    df['pred_images'] = [
        posting_ids.iloc[row_neighbours[row_distances < threshold]].values
        for row_distances, row_neighbours in zip(distances, neighbours)
    ]
    if COMPUTE_CV:
        df['pred_imgonly'] = df.pred_images.apply(' '.join)
        df['f1_img'] = f1_score(df['target'], df['pred_imgonly'])
        score = df['f1_img'].mean()
        print(f'Our f1 score for threshold {threshold} is {score}')
    return df

def predict_text(df,embeddings,topk=50,threshold=0.63):
    """Match rows by nearest-neighbour search over text `embeddings`.

    Same procedure as the image variant: flat L2 faiss index on all visible
    GPUs, `topk` neighbours per row, keep neighbours whose reported distance
    is below `threshold`. Results go to df['pred_text'] (in place). When
    COMPUTE_CV is set, 'pred_textonly' / 'f1_text' are stored and the mean F1
    is printed.

    Returns:
        The same `df` object.
    """
    N,D = embeddings.shape
    cpu_index = faiss.IndexFlatL2(D)
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)
    gpu_index.add(embeddings)
    cluster_distance,cluster_index = gpu_index.search(x=embeddings, k=topk)

    pred = []
    for k in range(N):
        idx = np.where(cluster_distance[k,] < threshold)[0]
        ids = cluster_index[k,idx]
        posting_ids = df['posting_id'].iloc[ids].values
        pred.append(posting_ids)
    df['pred_text'] = pred
    if COMPUTE_CV:
        # Score the text predictions written just above. This previously read
        # df.pred_images, which scored the wrong column (or failed when no
        # image predictions existed yet).
        df['pred_textonly'] = df.pred_text.apply(lambda x: ' '.join(x))
        df['f1_text'] = f1_score(df['target'], df['pred_textonly'])
        score = df['f1_text'].mean()
        print(f'Our f1 score for threshold {threshold} is {score}')
    return df

# Use Text Embeddings

In [9]:
def get_text_predictions(df, max_features = 25000,threshold=0.7):
    """Match rows whose TF-IDF title vectors have a dot product above `threshold`.

    Titles are vectorised on the GPU with cuML's TfidfVectorizer and compared
    in chunks of 4096 rows. The matched posting_ids are written to
    df['pred_text'] (in place). When COMPUTE_CV is set, 'pred_textonly' /
    'f1_text' are stored and the mean F1 is printed.

    Args:
        df: pandas frame with `title` and `posting_id` columns.
        max_features: vocabulary cap passed to TfidfVectorizer.
        threshold: minimum similarity for two titles to match.

    Returns:
        The same `df` object.
    """
    # Local import: get_deeptext_predictions imports sklearn's class of the same name.
    from cuml.feature_extraction.text import TfidfVectorizer
    model = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
    # Vectorise the titles of the frame that was passed in, not the module-level
    # `df_cu`, so the predictions always line up with `df`'s rows.
    titles = cudf.from_pandas(df['title'])
    text_embeddings = model.fit_transform(titles).toarray()
    preds = []
    CHUNK = 1024*4
    n_rows = len(df)
    posting_ids = df['posting_id'].values

    print('Finding similar titles...')
    for a in range(0, n_rows, CHUNK):
        b = min(a + CHUNK, n_rows)
        print('chunk',a,'to',b)

        # COSINE SIMILARITY DISTANCE
        # (a dot product; equals cosine similarity if the TF-IDF rows are
        # L2-normalised, which is presumably the vectoriser default - confirm)
        cts = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(cts[k,]>threshold)[0]
            preds.append(posting_ids[cupy.asnumpy(IDX)])
    df['pred_text'] = preds
    # Release the dense GPU matrix before later stages run.
    del model,text_embeddings
    gc.collect()
    if COMPUTE_CV:
        df['pred_textonly'] = df.pred_text.apply(lambda x: ' '.join(x))
        df['f1_text'] = f1_score(df['target'], df['pred_textonly'])
        score = df['f1_text'].mean()
        print(f'Our f1 score for threshold {threshold} is {score}')
    return df

In [10]:
class textvalDataset(Dataset):
    """Map-style dataset over a sequence of feature rows, served as float32 tensors."""

    def __init__(self, textlist):
        self.text = textlist

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        # Copy the row into a new tensor and cast to float32 in one step.
        return torch.tensor(self.text[idx], dtype=torch.float32)

class Model(nn.Module):
    """Text encoder: two stacked linear layers, dropout, ArcFace head.

    NOTE: this rebinds the name `Model`, which earlier in the notebook refers
    to the image encoder; image models must already be instantiated.

    The input width 24939 matches the `max_features` used by the TF-IDF
    vectoriser in get_deeptext_predictions.
    """

    def __init__(self, clustersize, feavec=512):
        super().__init__()
        self.linear1 = nn.Linear(24939, 4000)
        self.linear2 = nn.Linear(4000, feavec)
        self.dropout = nn.Dropout(p=0.5)
        # Defined but not applied in forward().
        self.relu = nn.ReLU()
        self.margin = ArcMarginProduct(
            in_features=feavec,
            out_features=clustersize,
            s=64,
            m=0.7,
        )

    def forward(self, x, labels=None):
        # linear1 -> linear2 -> dropout, with no activation in between.
        hidden = self.dropout(self.linear2(self.linear1(x)))
        if labels is None:
            return F.normalize(hidden, dim=1)
        return self.margin(hidden, labels)
    

def get_deeptext_predictions(df):
    """Embed the titles in `df` with the two linear text models.

    A scikit-learn TF-IDF vectoriser (binary, max 24939 features) is fitted
    on the titles of `df` plus train.csv; the titles of `df` are then
    transformed and passed through both checkpoints.

    Args:
        df: pandas frame with a `title` column.

    Returns:
        (embedded1, embedded2): two float32 arrays of shape (len(df), 512),
        one per checkpoint, in the row order of `df`.
    """
    # Local import so the cuML class of the same name imported at the top of
    # the notebook is left untouched.
    from sklearn.feature_extraction.text import TfidfVectorizer
    df_t = pd.read_csv("../input/shopee-product-matching/train.csv")
    vectorizer = TfidfVectorizer(stop_words = 'english', binary = True, max_features = 24939)
    vectorizer.fit(pd.concat([df,df_t],axis=0).title)
    text = vectorizer.transform(df.title).toarray()
    batch = 3 if len(df)==3 else 100
    test_dataset = textvalDataset(text)
    test_loader = DataLoader(test_dataset,
                            batch_size=batch,
                            shuffle=False,
                            num_workers=2,
                            pin_memory=True)
    nets = []
    for weight_path in ('../input/shopee-weight/w_lin_e5_fold1.pt',
                        '../input/shopee-weight/w_lin_e5_fold2.pt'):
        net = Model(8811).to(device)
        # map_location keeps loading consistent with the image models and lets
        # checkpoints be restored on whatever `device` resolved to.
        net.load_state_dict(torch.load(weight_path, map_location=device))
        net.eval()
        nets.append(net)
    print('start collection')
    # Gather per-batch outputs and concatenate once; np.append in the loop
    # re-copies everything collected so far on every batch. The empty seed
    # array keeps the (0, 512) float32 result when the loader yields nothing.
    chunks = [[np.empty((0,512),dtype='float32')] for _ in nets]
    seen = 0
    with torch.no_grad():
        for idx,features in enumerate(test_loader):
            features = features.to(device,non_blocking=True)
            for bucket, net in zip(chunks, nets):
                bucket.append(net(features).cpu().numpy())
            seen += features.shape[0]

            if idx%100==0:
                print(idx,len(test_loader))
                for _ in nets:
                    print((seen, 512))
    embedded1, embedded2 = (np.concatenate(bucket, axis=0) for bucket in chunks)
    print(embedded1.shape,embedded2.shape)
    return embedded1,embedded2

In [11]:
# Deep text embeddings of the titles in df: one array per text-model checkpoint.
text_embeddings1, text_embeddings2 = get_deeptext_predictions(df)

start collection
0 1
(3, 512)
(3, 512)
(3, 512) (3, 512)


# Carry out image prediction

In [12]:
# One pass over the images; image_embeddings() returns one array per ensemble model.
image_embeddings1,image_embeddings2,image_embeddings3, image_embeddings4, image_embeddings5 = image_embeddings(df)

# NOTE(review): the two calls below are stale - image_embeddings() takes only `df`,
# and cfg defines no wpath2/mname2/wpath3/mname3 attributes.
#image_embeddings2 = image_embeddings(df,cfg.wpath2,cfg.mname2,cfg.feavec_num1)
#image_embeddings3 = image_embeddings(df,cfg.wpath3,cfg.mname3,cfg.feavec_num1)

start collection
0 1
(3, 512)
(3, 512)
(3, 512)
(3, 512)
(3, 512)


In [13]:
# Set to True to skip inference and load embeddings saved by an earlier run.
embed_reuse = False
if embed_reuse:
    _cached_embedding_paths = (
        "../input/shopeeinferoutput/fold1_512.npy",
        "../input/shopeeinferoutput/fold2_512.npy",
        "../input/shopeeinferoutput/fold3_512.npy",
        "../input/shopeeinferoutput/fold4_512.npy",
        "../input/shopeeinferoutput/fold5_512.npy",
    )
    (image_embeddings1,
     image_embeddings2,
     image_embeddings3,
     image_embeddings4,
     image_embeddings5) = map(np.load, _cached_embedding_paths)

In [14]:
# Blend weights for the five image models; the blend is their weighted mean.
w = np.array([1.2,0.8,1.1,0.8,0.7])
# NOTE: this rebinds `image_embeddings`, which up to here named the
# embedding-extraction function. Re-run the cell that defines the function
# before calling it again in the same kernel.
image_embeddings = (w[0]*image_embeddings1+w[1]*image_embeddings2+w[2]*image_embeddings3+w[3]*image_embeddings4+w[4]*image_embeddings5)/w.sum()
# Equal-weight mean of the two text-model embeddings.
wt = np.array([1,1])
text_embeddings = (wt[0]*text_embeddings1+wt[1]*text_embeddings2)/wt.sum()
# Joint embedding: image blend plus 0.4 x text blend, divided by the weight total 1.4.
# NOTE(review): none of the blends is re-normalised to unit length; the distance
# thresholds used with them presumably assume this scale - confirm before changing.
img_text_embeddings = (image_embeddings + 0.4*text_embeddings)/1.4

In [15]:
# Cross-validation report: predict_img prints the mean F1 for each single
# image model, then for the image blend and for the image+text blend.
# Every call overwrites df['pred_images'] with its own predictions.
if COMPUTE_CV:
    for fold_embeddings in (image_embeddings1,
                            image_embeddings2,
                            image_embeddings3,
                            image_embeddings4,
                            image_embeddings5):
        df = predict_img(df, fold_embeddings, topk=50, threshold=0.88)
    df = predict_img(df, image_embeddings, topk=50, threshold=0.196)
    df = predict_img(df, img_text_embeddings, topk=50, threshold=0.138)

In [16]:
if COMPUTE_CV:
    np.save('fold1_512.npy', image_embeddings1)
    np.save('fold2_512.npy', image_embeddings2)
    np.save('fold3_512.npy', image_embeddings3)
    np.save('fold4_512.npy', image_embeddings4)
    np.save('fold5_512.npy', image_embeddings5)

In [17]:
# Image-side matches that feed the final submission: image+text blend with
# distance threshold 0.12. The commented-out calls are earlier experiments
# with other model subsets / thresholds.
#2,3,4
#df = predict_img(df,image_embeddings,topk=50,threshold=0.13)
df = predict_img(df,img_text_embeddings,topk=50,threshold=0.12)
#3,4
#df = predict_img(df,image_embeddings,topk=50,threshold=0.30)

#df = predict_img(df,image_embeddings,topk=50,threshold=0.60)

In [18]:
# Candidate distance thresholds for the image+text blend (CV exploration only).
theresholds=np.linspace(0.13,0.15,10)
if COMPUTE_CV:
    for threshold in theresholds:
        # predict_img writes 'pred_images' into the frame it is given, so run
        # the sweep on a copy. Assigning the result back to `df` left the last
        # sweep threshold's predictions in place and silently replaced the
        # ones computed earlier for the final combination step.
        predict_img(df.copy(), img_text_embeddings, topk=50, threshold=threshold)

# Carry out text predictions

In [19]:
# TF-IDF title matching with similarity threshold 0.75; fills df['pred_text'].
df = get_text_predictions(df, max_features = 25000,threshold=0.75)
df.head()

Finding similar titles...
chunk 0 to 3


Unnamed: 0,posting_id,image,image_phash,title,pred_images,pred_text
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...,[test_2255846744],[test_2255846744]
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...,[test_3588702337],[test_3588702337]
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng,[test_4015706929],[test_4015706929]


# Combine predictions

In [20]:
def combine_predictions(row):
    """Merge a row's image and text matches into one space-separated string.

    The two id arrays are unioned, which removes duplicates and sorts the ids.
    """
    merged = np.union1d(row['pred_images'], row['pred_text'])
    return ' '.join(merged)

In [21]:
# Final matches per row: union of the image-side and text-side predictions.
df['matches'] = df.apply(combine_predictions, axis=1)
if COMPUTE_CV:
    df['f1'] = f1_score(df['target'], df['matches'])
    score = df['f1'].mean()
    print(f'Final f1 score is {score}')
else:
    # Let pandas write the two-column CSV (header `posting_id,matches`) instead
    # of formatting rows by hand; this also handles quoting if a field ever
    # needs it.
    df[['posting_id', 'matches']].to_csv('submission.csv', index=False)

In [22]:
# df_t = pd.read_csv("submission.csv")
# print(df_t)