In [None]:
# File locations: CSV_PATH is the folder the train/test CSV files are read from,
# DATA_PATH is the folder containing the train_images/ and test_images/ directories.
CSV_PATH = './data/match/'
DATA_PATH = './data/images/'

In [None]:
import psutil

import numpy as np
import pandas as pd
import cv2, matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import gc

In [None]:
# getMetric builds the row-wise F1 scorer used to measure final performance
def getMetric(col):
    """Create an F1 scoring function for the prediction column `col`.

    The returned callable takes one row that exposes `row.target` (the ground
    truth ids) and `row[col]` (the predicted ids) and returns
    2 * |target ∩ prediction| / (|target| + |prediction|).
    The intersection is computed with np.intersect1d, i.e. over unique values.
    """
    def f1score(row):
        expected = row.target
        predicted = row[col]
        hits = np.intersect1d(expected, predicted).size
        return 2 * hits / (len(expected) + len(predicted))
    return f1score

In [None]:
# True  -> load the training set (labels available, so CV scores can be computed)
# False -> load the test set
COMPUTE_CV = True

# Pick the CSV file and the image sub-folder for the selected split.
if COMPUTE_CV:
    csv_name, image_dir = 'train.csv', 'train_images/'
else:
    csv_name, image_dir = 'test.csv', 'test_images/'

train = pd.read_csv(CSV_PATH + csv_name)
# Turn the bare file names into paths under DATA_PATH.
train['image'] = DATA_PATH + image_dir + train['image']

if COMPUTE_CV:
    # Ground truth per row: every posting_id that shares the row's label_group.
    group_members = train.groupby('label_group')['posting_id'].unique().to_dict()
    train['target'] = train['label_group'].map(group_members)

print('train shape is', train.shape)
train.head()

In [None]:
# Baseline: postings whose image_phash is identical are predicted as matches.
train['oof_hash'] = train['image_phash'].map(
    train.groupby('image_phash')['posting_id'].unique().to_dict()
)

# Hash comparison alone; the final F1 reaches about 0.553
if COMPUTE_CV:
    score_hash = getMetric('oof_hash')
    train['f1'] = train.apply(score_hash, axis=1)
    print('CV score for baseline =', train['f1'].mean())

In [None]:
from PIL import Image

import torch

# Seed PyTorch's global random number generator with a fixed value.
torch.manual_seed(0)
# NOTE(review): deterministic=False together with benchmark=True lets cuDNN pick
# the fastest kernels it finds, which presumably trades run-to-run reproducibility
# for speed despite the seed above — confirm this trade-off is intended.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset

In [None]:
class ShopeeImageDataset(Dataset):
    """Dataset that yields transformed RGB images read from a list of file paths.

    Args:
        img_path: indexable collection of image file paths.
        transform: callable applied to each loaded PIL image; its return value
            is what the dataset yields for that index.
    """

    def __init__(self, img_path, transform):
        self.img_path = img_path
        self.transform = transform

    def __getitem__(self, index):
        """Open the image at `index`, convert it to RGB and apply the transform."""
        # The context manager closes the underlying file as soon as convert()
        # has produced its own in-memory image, instead of leaving the handle
        # open until garbage collection (matters with many files and workers).
        with Image.open(self.img_path[index]) as raw:
            img = raw.convert('RGB')
        return self.transform(img)

    def __len__(self):
        """Return the number of image paths held by the dataset."""
        return len(self.img_path)

In [None]:
# Read the image data
# Preprocessing: resize to 512x512, convert to a tensor, then normalise each
# channel with the given mean / std (presumably the usual ImageNet statistics).
preprocess = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
    transforms.Normalize(
        [0.485, 0.456, 0.406],
        [0.229, 0.224, 0.225],
    ),
])

imagedataset = ShopeeImageDataset(train['image'].values, preprocess)

# shuffle=False keeps the batches in the same order as the rows of `train`.
imageloader = torch.utils.data.DataLoader(
    imagedataset,
    batch_size=40,
    shuffle=False,
    num_workers=2,
)

In [None]:
# Use a pretrained ResNet-18 as the image feature extractor
class ShopeeImageEmbeddingNet(nn.Module):
    """ResNet-18 trunk (max-pooled, last child removed) used to embed images."""

    def __init__(self):
        super().__init__()

        backbone = models.resnet18(True)
        # Swap the pooling layer for adaptive max pooling with a 1x1 output.
        backbone.avgpool = nn.AdaptiveMaxPool2d(output_size=(1, 1))
        # Keep every child module except the final one.
        layers = list(backbone.children())
        self.model = nn.Sequential(*layers[:-1])
        # Put the trunk in evaluation mode; the network is only used for inference.
        self.model.eval()

    def forward(self, img):
        """Run `img` through the trunk and return its output unchanged."""
        return self.model(img)

In [None]:
# Compute the embedding of every image
# Prefer the GPU, but fall back to the CPU so this cell does not crash on
# machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

imgmodel = ShopeeImageEmbeddingNet()
imgmodel = imgmodel.to(DEVICE)

# One numpy array of features per batch, in loader order.
imagefeat = []
with torch.no_grad():  # inference only, no gradients needed
    for data in tqdm_notebook(imageloader):
        data = data.to(DEVICE)
        feat = imgmodel(data)
        # Drop the trailing 1x1 spatial dimensions: (batch, channels, 1, 1) -> (batch, channels)
        feat = feat.reshape(feat.shape[0], feat.shape[1])
        feat = feat.cpu().numpy()

        imagefeat.append(feat)

In [None]:
from sklearn.preprocessing import normalize

# Normalise the embeddings: stack the per-batch arrays into one matrix and
# scale every row with sklearn's normalize (L2 norm by default), so that the
# dot products computed later act as cosine similarities.
imagefeat = np.vstack(imagefeat)
imagefeat = normalize(imagefeat)

# Move to the notebook-wide DEVICE (the one the model ran on) rather than a
# hard-coded .cuda(), so device selection lives in a single place.
imagefeat = torch.from_numpy(imagefeat).to(DEVICE)

In [None]:
# For every image, collect the posting_ids whose embedding similarity exceeds 0.95.
preds = []
CHUNK = 1024*4

print('Finding similar images...')
n_images = len(imagefeat)
# Process the query images in slices of CHUNK rows to bound the size of the
# similarity matrix held in memory at once.
for a in range(0, n_images, CHUNK):
    b = min(a + CHUNK, n_images)
    print('chunk',a,'to',b)

    # Rows: query images a..b, columns: all images.
    sims = torch.matmul(imagefeat, imagefeat[a:b].T).T
    sims = sims.data.cpu().numpy()

    for sim_row in sims:
        IDX = np.where(sim_row > 0.95)[0]
        preds.append(train.iloc[IDX].posting_id.values)

# Drop the references to the embeddings and the model; they are not used below.
del imagefeat, imgmodel

In [None]:
# Matching on the image embeddings alone; F1 reaches about 0.653
train['oof_cnn'] = preds

if COMPUTE_CV:
    score_cnn = getMetric('oof_cnn')
    train['f1'] = train.apply(score_cnn, axis=1)
    print('CV score for baseline =', train['f1'].mean())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Binary-term TF-IDF over the product titles, limited to 55000 features.
model = TfidfVectorizer(
    max_features=55000,
    binary=True,
    stop_words=None,
)
title_tfidf = model.fit_transform(train['title'])
# NOTE(review): .toarray() materialises a dense (rows x up to 55000) matrix;
# this can need many GB of RAM for large inputs — confirm it fits.
text_embeddings = title_tfidf.toarray()
print('text embeddings shape', text_embeddings.shape)

In [None]:
# Move the dense TF-IDF matrix to the notebook-wide DEVICE (set in the image
# embedding cell) instead of a hard-coded .cuda(), so device selection stays
# in one place.
text_embeddings = torch.from_numpy(text_embeddings).to(DEVICE)

In [None]:
# For every title, collect the posting_ids whose TF-IDF similarity exceeds 0.7.
preds = []
CHUNK = 1024*4

print('Finding similar titles...')
n_titles = len(train)
# Process the query titles in slices of CHUNK rows to bound the size of the
# similarity matrix held in memory at once.
for a in range(0, n_titles, CHUNK):
    b = min(a + CHUNK, n_titles)
    print('chunk',a,'to',b)

    # Cosine similarity: dot products between TF-IDF rows (presumably unit
    # length, as TfidfVectorizer applies L2 normalisation by default).
    # Rows: query titles a..b, columns: all titles.
    cts = torch.matmul(text_embeddings, text_embeddings[a:b].T).T
    cts = cts.data.cpu().numpy()

    for k in range(b - a):
        IDX = np.where(cts[k,] > 0.7)[0]
        preds.append(train.iloc[IDX].posting_id.values)

In [None]:
# Store the title-based matches and, on the training set, report their F1.
train['oof_text'] = preds

if COMPUTE_CV:
    score_text = getMetric('oof_text')
    train['f1'] = train.apply(score_text, axis=1)
    print('CV score for baseline =', train['f1'].mean())

In [None]:
def combine_for_sub(row):
    """Return the row's merged matches as one space-separated string (submission format)."""
    return ' '.join(combine_for_cv(row))

def combine_for_cv(row):
    """Return the sorted unique union of the row's text, CNN and hash matches."""
    sources = (row.oof_text, row.oof_cnn, row.oof_hash)
    merged = np.concatenate(sources)
    return np.unique(merged)

In [None]:
# Union of all three match sources: about 0.734 on the training set, 0.72 on the test set
if COMPUTE_CV:
    # Rebuild the ground truth: all posting_ids sharing the row's label_group.
    group_members = train.groupby('label_group')['posting_id'].unique().to_dict()
    train['target'] = train['label_group'].map(group_members)
    train['oof'] = train.apply(combine_for_cv, axis=1)
    score_combined = getMetric('oof')
    train['f1'] = train.apply(score_combined, axis=1)
    print('CV Score =', train['f1'].mean())

# Space-separated match string per posting, as written to the submission file.
train['matches'] = train.apply(combine_for_sub, axis=1)

In [None]:
# Write posting_id / matches to disk, then read the file back to preview it.
submission = train.loc[:, ['posting_id', 'matches']]
submission.to_csv('submission.csv', index=False)

sub = pd.read_csv('submission.csv')
sub.head()