In [None]:
from itertools import chain

import numpy as np
import pandas as pd
import scipy.misc

import torch.nn as nn
from torch.autograd import Variable
import torch
import torchvision.models as models

from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count

from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score

import cv2

In [None]:
# Root directory of the dataset; the CSV files and image folders used below are
# addressed relative to it.
DATA_DIR = '/home/user/data/amazon_planet'
# Sub-directory of DATA_DIR that holds the training images.
TRAIN_DIR = 'train-jpg'
# Training labels CSV inside DATA_DIR.
TRAIN_DATA = 'train_v2.csv'
# NOTE(review): TEST_DIR and IMG_EXT are not referenced by the cells visible here;
# the test folder name and the '.jpg' suffix are written out literally where used.
TEST_DIR = 'test-jpg'
IMG_EXT = '.jpg'

In [None]:
# Read the training label table and preview its first ten rows.
labels_df = pd.read_csv('/'.join([DATA_DIR, TRAIN_DATA]))
labels_df.head(10)

In [None]:

# Flatten every row's space-separated tag string into one list of tags
# (duplicates kept), then reduce it to the set of distinct tags.
labels_list = [tag for tags in labels_df['tags'].values for tag in tags.split(" ")]
labels_set = set(labels_list)

In [None]:
# Sorted tag names; a tag's position in this list is its class index.
labels = sorted(labels_set)
# tag name -> class index
labels_map = dict(zip(labels, range(len(labels))))
# class index -> tag name (inverse of labels_map)
y_map = dict(enumerate(labels))

In [None]:
# Display the class-index -> tag-name mapping.
y_map

In [None]:
# Number of occurrences of each tag over all rows of labels_df.
labels_s = pd.Series(labels_list).value_counts()

In [None]:
# Candidate rotation angles in degrees (0, 45, ..., 315) for training augmentation.
angles = np.arange(0, 360, 45)
# Candidate crop offsets in pixels (0..11) for the random crop.
offsets = np.arange(0, 12)
# Target image size handed to the image-loading helpers.
img_resize = (224,224)

In [None]:
def get_imgs(*args):
    """
    Load one image from disk and return it as a float RGB array scaled to [0, 1].

    Invoked through ``pool.map`` with a single ``(path, size_, mode)`` tuple,
    which is why the arguments arrive packed in ``args[0]``.

    :param args: ``args[0]`` is ``(path, size_, mode)``: the image file path, the
        target ``(width, height)`` and the mode. ``'val'`` resizes the whole image
        to ``size_``; every other mode takes a crop of that size at a random
        offset drawn from ``offsets``; ``'train'`` additionally applies random
        flips and a random rotation drawn from ``angles``.
    :return: the processed image as a numpy array (height, width, channel)
    :raises IOError: if the file cannot be read as an image
    """
    path, size_, mode = args[0]
    bgr_img = cv2.imread(path)
    if bgr_img is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise IOError('Could not read image: {}'.format(path))
    # OpenCV loads images as BGR; reorder the channels to RGB.
    img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)

    # resize / crop to the requested size (previously the global img_resize was
    # used here and the size_ argument was ignored)
    width, height = size_
    if mode == 'val':
        img = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)
    else:
        x, y = np.random.choice(offsets, 2)
        # rows span the height, columns span the width
        img = img[x:x + height, y:y + width]

    # scale
    img = img / 255.0

    # augment
    if mode == 'train':
        # range() instead of xrange() so the code runs on Python 2 and 3
        for axis in range(2):
            if np.random.randint(2) == 1:
                img = np.flip(img, axis)
        num_rows, num_cols = img.shape[:2]
        # floor division keeps the Python 2 centre on Python 3 as well
        center = (num_cols // 2, num_rows // 2)
        angle = float(np.random.choice(angles))
        rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1)
        img = cv2.warpAffine(img, rotation_matrix, (num_cols, num_rows), borderMode=cv2.BORDER_REFLECT_101)

    return img

def get_batch(files_path,img_resize,dir_path, mode):
    """
    Load and preprocess a batch of images in parallel.

    One ``(dir_path + name + '.jpg', img_resize, mode)`` job is built per entry
    of ``files_path`` and handed to ``get_imgs`` through a thread pool sized to
    the CPU count.

    :param files_path: image names without directory or extension
    :param img_resize: target size forwarded to ``get_imgs``
    :param dir_path: directory prefix (expected to end with a path separator)
    :param mode: mode string forwarded to ``get_imgs``
    :return: list of the arrays returned by ``get_imgs``, in ``files_path`` order
    """
    jobs = [(dir_path + name + '.jpg', img_resize, mode) for name in files_path]
    with ThreadPoolExecutor(cpu_count()) as pool:
        return list(pool.map(get_imgs, jobs))

In [None]:
# labels_df = pd.read_csv("../input/train_v2.csv")
# Rebuild the sorted tag vocabulary and its tag -> class-index lookup.
labels = sorted({tag for tags in labels_df['tags'].values for tag in tags.split(" ")})
labels_map = dict(zip(labels, range(len(labels))))

# One image name and one multi-hot target vector per row of labels_df.
files_path, y_label = [], []
for file_name, tags in labels_df.values:
    targets = np.zeros(len(labels_map))
    # switch on the positions of every tag present on this image
    targets[[labels_map[t] for t in tags.split(' ')]] = 1
    files_path.append(file_name)
    y_label.append(targets)

In [None]:
# Hold out 10% of the samples for validation; the fixed random_state keeps the
# split identical between runs.
X_train, X_valid, Y_train, Y_valid = train_test_split(files_path, y_label,test_size=0.1,random_state=42)

In [None]:
class Amazon(nn.Module):
    """
    Multi-label classifier built on a backbone network that exposes an ``fc`` head.

    The backbone's ``fc`` layer is replaced with a new ``nn.Linear`` producing
    ``num_classes`` outputs, and the backbone is wrapped in ``nn.DataParallel``.
    ``forward`` returns per-class probabilities.

    :param pretrained_model_1: backbone module with an ``fc`` attribute that has
        ``in_features`` (e.g. a torchvision ResNet)
    :param num_classes: number of output classes (defaults to 17)
    """
    def __init__(self, pretrained_model_1, num_classes=17):
        super(Amazon, self).__init__()
        # Size the new head from the backbone that was actually passed in; the
        # previous code read a notebook-level global of a similar name instead.
        in_features = pretrained_model_1.fc.in_features
        pretrained_model_1.fc = nn.Linear(in_features, num_classes)
        self.pretrained_model_1 = nn.DataParallel(pretrained_model_1)

    def forward(self, x):
        """
        :param x: batch of input images
        :return: tensor of per-class probabilities in [0, 1]
        """
        # The outputs are fed to nn.BCELoss and compared against probability
        # thresholds, both of which need values in [0, 1], so squash the logits.
        return torch.sigmoid(self.pretrained_model_1(x))

# #pretrained_model1 = models.densenet169(pretrained=True)
# pretrained_model1 = models.resnet50(pretrained=True)#in fact, this should be set as true

# model = Amazon(pretrained_model1)

In [None]:
# Directory holding the training images, with a trailing slash so image names
# can be appended directly.
dir_path = ''.join([DATA_DIR, '/', TRAIN_DIR, '/'])

In [None]:
# path = './model_3.pkl'
# Load a pretrained ResNet-50 and freeze all of its parameters.
pretrained_model1 = models.resnet50(pretrained=True)
for param in pretrained_model1.parameters():
    param.requires_grad = False
# Wrap the frozen backbone. This has to come after the freeze loop: Amazon installs
# a new fc layer, whose freshly created parameters are not touched by the loop above.
model = Amazon(pretrained_model1)
# if path:
#     model.load_state_dict(torch.load(path))

In [None]:
# pretrained_model1
# Display the wrapped model's module structure.
model

In [None]:
# input_ = Variable(torch.from_numpy(np.transpose(get_batch(X_train[0:32],(224,224),dir_path,'val'), (0, 3,1, 2)))).float()
# o = model(input_)
# o.size()

In [None]:
def exp_lr_scheduler(optimizer, epoch, init_lr=0.0001, lr_decay_epoch=30):
    """
    Apply a step-decay learning rate to every parameter group of ``optimizer``.

    The rate is ``init_lr`` multiplied by 0.1 once per completed block of
    ``lr_decay_epoch`` epochs. The new rate is printed on the first epoch of
    each block.

    :param optimizer: object exposing ``param_groups`` (a list of dicts)
    :param epoch: current epoch number
    :param init_lr: learning rate used during the first block
    :param lr_decay_epoch: number of epochs between two decays
    :return: the same ``optimizer`` object, with ``'lr'`` updated in place
    """
    completed_decays, epochs_into_block = divmod(epoch, lr_decay_epoch)
    lr = init_lr * (0.1**completed_decays)

    if epochs_into_block == 0:
        print('LR is set to {}'.format(lr))

    for group in optimizer.param_groups:
        group['lr'] = lr

    return optimizer

In [None]:
# model.pretrained_model_1.classifier

In [None]:
def train(train_x,train_y,valid_x, valid_y,epoch,num_model,img_resize,dir_path,lr_scheduler,model,batch_size):
    """
    Train the ``fc`` head of ``model`` with SGD and keep the best checkpoint.

    Every 30 steps the model is scored on the validation data (samples-averaged
    F2 at a 0.2 threshold); whenever the score improves, the state dict is saved
    to ``'./model_<num_model>.pkl'``.

    :param train_x: numpy array of training image names
    :param train_y: numpy array of multi-hot training targets
    :param valid_x: numpy array of validation image names
    :param valid_y: validation targets, aligned with ``valid_x``
    :param epoch: number of epochs to run
    :param num_model: identifier used in the checkpoint file name
    :param img_resize: image size forwarded to ``get_batch``
    :param dir_path: directory prefix of the image files
    :param lr_scheduler: callable ``(optimizer, epoch) -> optimizer``
    :param model: network exposing ``pretrained_model_1.module.fc``; because the
        criterion is ``nn.BCELoss`` its outputs must be probabilities in [0, 1]
    :param batch_size: number of images per training step
    """
    torch.cuda.set_device(0)
    criterion = nn.BCELoss().cuda()
    # Only the replaced head is optimised.
    optimizer = torch.optim.SGD(model.pretrained_model_1.module.fc.parameters(), lr=1e-04, momentum=0.9)
    model.cuda()
    best_score = 0
    path = './model_'+str(num_model)+'.pkl'
    # A scalar threshold broadcasts over any number of classes.
    score_threshold = 0.2
    # Floor division: a trailing partial batch is skipped, on Python 2 and 3 alike.
    steps_per_epoch = len(train_x) // batch_size
    for epo in range(epoch):
        print('epo: ' + str(epo))
        optimizer = lr_scheduler(optimizer, epo)
        num_shuffle = np.random.permutation(range(len(train_y)))
        for step in range(steps_per_epoch):
            batch_idx = num_shuffle[step*batch_size:(step+1)*batch_size]
            x_batch = np.transpose(get_batch(train_x[batch_idx],img_resize,dir_path,'train'), (0, 3,1, 2))
            input_var = Variable(torch.from_numpy(x_batch)).float().cuda()
            target_var = Variable(torch.from_numpy(train_y[batch_idx])).cuda().float()
            output = model(input_var)
            # clamp() is out-of-place: keep its result (it was previously discarded)
            # so the loss never takes log(0).
            output = output.clamp(min=1e-8,max=1e+8)
            loss = criterion(output, target_var)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step % 30 ==0:
                # Score in eval mode so validation batches neither use batch
                # statistics nor overwrite BatchNorm running statistics.
                model.eval()
                valid_pred = validate(model,valid_x, valid_y,32,img_resize,dir_path)
                model.train()
                score = fbeta_score(np.array(valid_y)[:len(valid_pred)], np.array(valid_pred) > score_threshold, beta=2, average='samples')
                print("epo: "+str(epo)+" step: "+str(step)+"  score: "+str(score))
                # ravel() makes this work whether the loss is 0-dim or 1-element.
                print('loss: '+ str(loss.data.cpu().numpy().ravel()[0].astype(float)))
                if score > best_score:
                    best_score = score
                    torch.save(model.state_dict(), path)
                    print("save in : "+ path)

In [None]:
def validate(model_,x_valid, y_valid,batch_val_size,img_resize,dir_path):
    """
    Run ``model_`` over the validation images and collect its outputs.

    All samples are evaluated, including a final partial batch (the previous
    loop bound dropped the last full batch and any remainder). The model is
    switched to eval mode for the pass and restored to its previous mode after.

    :param model_: network to evaluate (already on the GPU)
    :param x_valid: sliceable sequence of validation image names
    :param y_valid: unused; kept so existing call sites keep working
    :param batch_val_size: number of images per forward pass
    :param img_resize: image size forwarded to ``get_batch``
    :param dir_path: directory prefix of the image files
    :return: list with one output row (numpy float array) per evaluated sample,
        in the order of ``x_valid``
    """
    p_valid = []
    was_training = model_.training
    model_.eval()
    try:
        for start in range(0, len(x_valid), batch_val_size):
            names = x_valid[start:start + batch_val_size]
            x_batch = np.transpose(get_batch(names,img_resize,dir_path,'val'), (0, 3,1, 2))
            input_var = Variable(torch.from_numpy(x_batch)).float().cuda()
            output = model_(input_var).data.cpu().numpy().astype(float)
            p_valid.extend(output)
    finally:
        # leave the caller's train/eval mode as it was
        model_.train(was_training)
    return p_valid

In [None]:
def test_pred(x_test,batch_test_size,path,dir_path, arch='resnet50'):
    """
    Load a saved checkpoint and predict every image in ``x_test``.

    :param x_test: sliceable sequence of image names
    :param batch_test_size: number of images per forward pass
    :param path: checkpoint file written with ``torch.save(model.state_dict(), ...)``
    :param dir_path: directory prefix of the image files
    :param arch: name of the torchvision constructor for the backbone. It must
        match the network the checkpoint was saved from; the default matches the
        ResNet-50 trained in this notebook (the previous hard-coded ``resnet18``
        could not load those checkpoints).
    :return: list with exactly one output row (numpy float array) per entry of
        ``x_test``, in order
    """
    # Weights come from the checkpoint, so no pretrained download is needed.
    backbone = getattr(models, arch)(pretrained=False)
    model = Amazon(backbone)
    model.load_state_dict(torch.load(path))
#     torch.cuda.set_device(2)
    model.cuda()
    # inference: BatchNorm/Dropout must use their eval behaviour
    model.eval()
    p_test = []
    # Striding over the data handles a final partial batch naturally. The old
    # separate "left over" pass used x_test[-0:] when the length was an exact
    # multiple of the batch size, which re-predicted the whole set.
    for step, start in enumerate(range(0, len(x_test), batch_test_size)):
        if step%20==0:
            print(step)
        names = x_test[start:start + batch_test_size]
        x_batch = np.transpose(get_batch(names,img_resize,dir_path,'test'), (0, 3,1, 2))
        input_var = Variable(torch.from_numpy(x_batch)).float().cuda()
        output = model(input_var).data.cpu().numpy().astype(float)
        p_test.extend(output)
    return p_test

In [None]:
# model.pretrained_model_1.module.fc
# Border mode passed to cv2.warpAffine in get_imgs (the incomplete `cv2.BO`
# attribute left here raised AttributeError when the notebook was run top to bottom).
cv2.BORDER_REFLECT_101

In [None]:
# Training settings: number of epochs, id used in the checkpoint file name,
# and mini-batch size.
epoch, num_model, batch_size = 5, 4, 256
train(
    train_x=np.array(X_train),
    train_y=np.array(Y_train),
    valid_x=np.array(X_valid),
    valid_y=np.array(Y_valid),
    epoch=epoch,
    num_model=num_model,
    img_resize=img_resize,
    dir_path=dir_path,
    lr_scheduler=exp_lr_scheduler,
    model=model,
    batch_size=batch_size,
)

In [None]:
# input_var  -- debugging leftover, disabled: input_var only exists as a local inside train()/validate()/test_pred(), so evaluating it at notebook scope raises NameError

In [None]:
# Score the validation images with the checkpoint file named after num_model.
path = ''.join(['./model_', str(num_model), '.pkl'])
y_pred = test_pred(
    x_test=np.array(X_valid),
    batch_test_size=batch_size,
    path=path,
    dir_path=dir_path,
)

In [None]:
import numpy as np
from sklearn.metrics import fbeta_score
def get_optimal_threshhold(true_label, prediction, iterations = 100):
    """
    Search one decision threshold per class that maximises the F2 score.

    Each class is tuned on its own: its threshold is swept over
    ``0, 1/iterations, ..., (iterations-1)/iterations`` while every other class
    stays at 0.2, and the value giving the highest ``fbeta`` is kept. A class for
    which no candidate scores above 0 keeps 0.2.

    :param true_label: multi-hot ground truth, one row per sample
    :param prediction: predicted scores with the same shape as ``true_label``
    :param iterations: number of candidate thresholds tried per class
    :return: list with one threshold per class
    """
    # Accept lists as well as arrays; the class count comes from the data
    # instead of a hard-coded 17.
    prediction = np.asarray(prediction)
    num_classes = prediction.shape[1]
    default_threshhold = 0.2

    best_threshhold = [default_threshhold]*num_classes
    for t in range(num_classes):
        best_fbeta = 0
        temp_threshhold = [default_threshhold]*num_classes
        for i in range(iterations):
            temp_value = i / float(iterations)
            temp_threshhold[t] = temp_value
            temp_fbeta = fbeta(true_label, prediction >temp_threshhold)
            if  temp_fbeta>best_fbeta:
                best_fbeta = temp_fbeta
                best_threshhold[t] = temp_value
    return best_threshhold

def fbeta(true_label, prediction):
    """
    Samples-averaged F-beta score with beta=2.

    :param true_label: ground-truth label indicator array
    :param prediction: predicted label indicator array
    :return: the score returned by ``fbeta_score``
    """
    scoring_options = dict(beta=2, average='samples')
    return fbeta_score(true_label, prediction, **scoring_options)

In [None]:
# Tune one threshold per class on the validation predictions; the labels are cut to
# len(y_pred) so both arrays have the same number of rows.
best_threshhold = get_optimal_threshhold(np.array(Y_valid)[:len(y_pred)], np.array(y_pred), iterations = 100)

In [None]:
# Sample submission table; its image_name column supplies the test image names.
test_sub = pd.read_csv(DATA_DIR + '/sample_submission_v2.csv')

In [None]:
# Array of image names taken from the sample submission.
sample_sub = test_sub.image_name.values

In [None]:
import os
# Keep only as many leading names as there are entries in the test-jpg/ folder.
# NOTE(review): assumes the first rows of the sample submission are exactly the
# images stored in test-jpg/ — confirm.
files_name_test1 = sample_sub[:len(os.listdir(DATA_DIR + "/test-jpg/"))]

In [None]:
# NOTE(review): this loads './model_0.pkl', while the training cell in this notebook
# saves './model_4.pkl' (num_model = 4) — confirm which checkpoint is intended.
path = './model_0.pkl'
predictions = test_pred(files_name_test1,batch_size,path,DATA_DIR + "/test-jpg/")

In [None]:
# NOTE(review): this is the full sample-submission name list, whereas `predictions`
# was computed for files_name_test1 only; the zip() that pairs them below stops at
# the shorter of the two.
x_test_filename = sample_sub

In [None]:
def map_predictions(predictions, labels_map, thresholds):
    """
    Translate score rows into lists of label names.

    A label is emitted for position ``i`` of a row when the row's value at ``i``
    is strictly greater than ``thresholds[i]``.

    :param predictions: iterable of score rows, one per sample
    :param labels_map: mapping from class index to label
    :param thresholds: per-class thresholds, indexed like the score rows
    :return: list holding, for each row, the list of labels that passed
    """
    def _labels_for(scores):
        # labels whose score clears the threshold of the same index
        return [labels_map[idx] for idx, score in enumerate(scores) if score > thresholds[idx]]

    return [_labels_for(row) for row in predictions]

In [None]:
# Turn each prediction row into the list of tag names whose score exceeds the tuned
# per-class threshold (y_map maps class index -> tag name).
predicted_labels = map_predictions(predictions, y_map, best_threshhold)

In [None]:
# One space-separated tag string per prediction row.
tags_list = [' '.join(map(str, tags)) for tags in predicted_labels]
# Pair each image name (the text before its first '.') with its tag string.
final_data = [
    [filename.split(".")[0], tags]
    for filename, tags in zip(x_test_filename, tags_list)
]

In [None]:
# Assemble the submission table and preview its first rows.
final_df = pd.DataFrame(final_data, columns=['image_name', 'tags'])
final_df.head()

In [None]:
# Write the submission file without the DataFrame index column.
final_df.to_csv('./submission_0.csv', index=False)