# iMet Collection 2019 - FGVC6
**Simple baseline for the iMet Collection 2019 competition using fastai v1**

In [1]:
import shutil

import fastai
from fastai.vision import *
fastai.__version__

'1.0.51'

# Setup

In [2]:
# Notebook-wide settings.
BATCH = 126  # batch size handed to the DataBunch
SIZE = 250   # image size handed to the transform step
path = Path('../input/imet-2019-fgvc6/')  # root folder of the iMet competition data

In [3]:
# List the checkpoint files available in the attached pretrained-models dataset.
!ls '../input/imet-my-pretrained-models/'

resnet101_0.584.pth  resnet50v15.pth  resnet50v42.pth
resnet18_0.562.pth   resnet50v27.pth  resnet50v6.pth
resnet50v12.pth      resnet50v3.pth   resnet50v9.pth


In [4]:
# Copy the ResNet-50 checkpoints used by the ensemble into ./models.
# (Presumably this is where `learn.load` resolves checkpoint names, given the
# DataBunch is created with path='.' -- confirm against the fastai version in use.)
# shutil.copy raises if a checkpoint is missing, whereas a failing `!cp` would
# only surface later when the model is loaded.
pretrained_dir = Path('../input/imet-my-pretrained-models/')
checkpoint_names = [
    'resnet50v3',
    'resnet50v6',
    'resnet50v9',
    'resnet50v12',
    'resnet50v42',
    'resnet50v15',
    'resnet50v27',
]
Path('models').mkdir(exist_ok=True)
for checkpoint_name in checkpoint_names:
    shutil.copy(pretrained_dir/f'{checkpoint_name}.pth', 'models/')

In [5]:
# Sanity check: show which checkpoint files are now present in ./models.
os.listdir("models/")

['resnet50v42.pth',
 'resnet50v12.pth',
 'resnet50v9.pth',
 'resnet50v6.pth',
 'resnet50v3.pth',
 'resnet50v15.pth',
 'resnet50v27.pth']

# Data

In [6]:
# Training annotations: one row per image id with its space-separated attribute ids
train_df = pd.read_csv(path.joinpath('train.csv'))
train_df.head(n=5)

Unnamed: 0,id,attribute_ids
0,1000483014d91860,147 616 813
1,1000fe2e667721fe,51 616 734 813
2,1001614cb89646ee,776
3,10041eb49b297c08,51 671 698 813 1092
4,100501c227f8beea,13 404 492 903 1093


In [7]:
# Attribute lookup table: numeric attribute_id alongside its attribute_name
labels_df = pd.read_csv(path.joinpath('labels.csv'))
labels_df.head(n=5)

Unnamed: 0,attribute_id,attribute_name
0,0,culture::abruzzi
1,1,culture::achaemenid
2,2,culture::aegean
3,3,culture::afghan
4,4,culture::after british


In [8]:
# Sample submission: supplies the test image ids and is later filled in and saved
test_df = pd.read_csv(path.joinpath('sample_submission.csv'))
test_df.head(n=5)

Unnamed: 0,id,attribute_ids
0,10023b2cc4ed5f68,0 1 2
1,100fbe75ed8fd887,0 1 2
2,101b627524a04f19,0 1 2
3,10234480c41284c6,0 1 2
4,1023b0e2636dcea8,0 1 2


# Create data object using datablock API

In [9]:
# Source: https://www.kaggle.com/c/human-protein-atlas-image-classification/discussion/78109
class FocalLoss(nn.Module):
    """Focal loss computed from raw logits against 0/1 targets.

    Each element's binary cross-entropy is scaled by
    ``exp(gamma * logsigmoid(-logit * (2 * target - 1)))``, which shrinks the
    contribution of elements the model already classifies confidently.
    For 2-D input the per-element losses are summed along dim 1 before the
    mean is taken; any other rank is averaged over all elements directly.
    """

    def __init__(self, gamma=2):
        super().__init__()
        self.gamma = gamma

    def forward(self, logit, target):
        target = target.float()

        # Binary cross-entropy with logits, written in the max-shifted form so
        # the exponentials cannot overflow.
        shift = (-logit).clamp(min=0)
        log_term = ((-shift).exp() + (-logit - shift).exp()).log()
        bce = logit - logit * target + shift + log_term

        # Focal modulating factor, evaluated in log space.
        signed_target = target * 2.0 - 1.0
        log_weight = F.logsigmoid(-logit * signed_target)
        focal = (log_weight * self.gamma).exp() * bce

        if focal.dim() == 2:
            focal = focal.sum(dim=1)
        return focal.mean()

In [10]:
def prediction(model_path, seed_num):
    """Load one ResNet-50 checkpoint and return its TTA predictions on the test set.

    The DataBunch is rebuilt on every call with the validation split seeded by
    ``seed_num``. The class-to-index mapping of the resulting training set is
    returned alongside the predictions so that callers can line this model's
    output columns up with attribute ids.

    Args:
        model_path: checkpoint name handed to ``learn.load`` (the calls in this
            notebook pass names such as ``"resnet50v3"``).
        seed_num: seed for ``split_by_rand_pct``. NOTE(review): presumably the
            seed the checkpoint was trained with -- confirm.

    Returns:
        ``(test_preds, i2c)``: ``test_preds`` is whatever
        ``learn.TTA(ds_type=DatasetType.Test)`` returns, and ``i2c`` is an integer
        array of ``[index, class]`` rows built from the training set's ``c2i``.
    """
    tfms = get_transforms(do_flip=True, flip_vert=False, max_rotate=0.10, max_zoom=1.5, max_warp=0.2, max_lighting=0.2,
                     xtra_tfms=[(symmetric_warp(magnitude=(-0,0), p=0)),])
    train, test = [ImageList.from_df(df, path=path, cols='id', folder=folder, suffix='.png')
               for df, folder in zip([train_df, test_df], ['train', 'test'])]
    data = (train.split_by_rand_pct(0.1, seed=seed_num)
        .label_from_df(cols='attribute_ids', label_delim=' ')
        .add_test(test)
        .transform(tfms, size=SIZE, resize_method=ResizeMethod.PAD, padding_mode='border',)
        .databunch(path=Path('.'), bs=BATCH).normalize(imagenet_stats))
    learn = cnn_learner(data, base_arch=models.resnet50, loss_func=FocalLoss(), metrics=fbeta, pretrained=False)
    learn.load(model_path)

    def find_best_fixed_threshold(preds, targs, do_plot=True):
        """Scan thresholds 0.00..0.49 and return the one with the highest fbeta score.

        Not called at the moment (the ensemble uses a fixed threshold); kept so a
        per-model threshold can be tuned on validation predictions, e.g.
            valid_preds = learn.get_preds(DatasetType.Valid)
            best_thr = find_best_fixed_threshold(*valid_preds)
        ``do_plot`` is accepted for compatibility but nothing is plotted.
        """
        score = []
        thrs = np.arange(0, 0.5, 0.01)
        for thr in progress_bar(thrs):
            # Score the arguments that were passed in (this used to read an
            # undefined outer `valid_preds`).
            score.append(fbeta(preds, targs, thresh=thr))
        score = np.array(score)
        pm = score.argmax()
        best_thr, best_score = thrs[pm], score[pm].item()
        print(f'thr={best_thr:.3f}', f'F2={best_score:.3f}')
        return best_thr

    # Rows of [index, class]: the position each class occupies in this model's output.
    i2c = np.array([[i, c] for c, i in learn.data.train_ds.y.c2i.items()]).astype(int)
    test_preds = learn.TTA(ds_type=DatasetType.Test)
    return test_preds, i2c

In [11]:
# One row per attribute id; the per-model index columns are merged onto this frame later.
base_df = pd.DataFrame({"label": labels_df['attribute_id']})

In [12]:
# TTA test predictions from checkpoint "resnet50v3", plus that model's [index, class] mapping.
# seed_num seeds the validation split inside prediction(); presumably it matches the training seed -- confirm.
pred1, i2c1 = prediction(model_path="resnet50v3", seed_num=3)

In [13]:
# Per-model lookup: output column position ('index1') of each attribute id ('label').
df1 = pd.DataFrame(i2c1, columns=['index1', 'label'])
# Drop any 'index1' left by an earlier run of this cell; otherwise a re-run would make
# merge emit 'index1_x'/'index1_y' and later base_df['index1'] lookups would fail.
base_df = base_df.drop(columns=['index1'], errors='ignore').merge(df1, on="label", how="left")

In [14]:
# TTA test predictions from checkpoint "resnet50v6", plus that model's [index, class] mapping.
# seed_num seeds the validation split inside prediction(); presumably it matches the training seed -- confirm.
pred2, i2c2 = prediction(model_path="resnet50v6", seed_num=6)

In [15]:
# Per-model lookup: output column position ('index2') of each attribute id ('label').
df2 = pd.DataFrame(i2c2, columns=['index2', 'label'])
# Drop any 'index2' left by an earlier run of this cell; otherwise a re-run would make
# merge emit 'index2_x'/'index2_y' and later base_df['index2'] lookups would fail.
base_df = base_df.drop(columns=['index2'], errors='ignore').merge(df2, on="label", how="left")

In [16]:
# TTA test predictions from checkpoint "resnet50v9", plus that model's [index, class] mapping.
# seed_num seeds the validation split inside prediction(); presumably it matches the training seed -- confirm.
pred3, i2c3 = prediction(model_path="resnet50v9", seed_num=9)

In [17]:
# Per-model lookup: output column position ('index3') of each attribute id ('label').
df3 = pd.DataFrame(i2c3, columns=['index3', 'label'])
# Drop any 'index3' left by an earlier run of this cell; otherwise a re-run would make
# merge emit 'index3_x'/'index3_y' and later base_df['index3'] lookups would fail.
base_df = base_df.drop(columns=['index3'], errors='ignore').merge(df3, on="label", how="left")

In [18]:
# TTA test predictions from checkpoint "resnet50v12", plus that model's [index, class] mapping.
# seed_num seeds the validation split inside prediction(); presumably it matches the training seed -- confirm.
pred4, i2c4 = prediction(model_path="resnet50v12", seed_num=12)

In [19]:
# Per-model lookup: output column position ('index4') of each attribute id ('label').
df4 = pd.DataFrame(i2c4, columns=['index4', 'label'])
# Drop any 'index4' left by an earlier run of this cell; otherwise a re-run would make
# merge emit 'index4_x'/'index4_y' and later base_df['index4'] lookups would fail.
base_df = base_df.drop(columns=['index4'], errors='ignore').merge(df4, on="label", how="left")

In [20]:
# TTA test predictions from checkpoint "resnet50v42", plus that model's [index, class] mapping.
# seed_num seeds the validation split inside prediction(); presumably it matches the training seed -- confirm.
pred5, i2c5 = prediction(model_path="resnet50v42", seed_num=42)

In [21]:
# Per-model lookup: output column position ('index5') of each attribute id ('label').
df5 = pd.DataFrame(i2c5, columns=['index5', 'label'])
# Drop any 'index5' left by an earlier run of this cell; otherwise a re-run would make
# merge emit 'index5_x'/'index5_y' and later base_df['index5'] lookups would fail.
base_df = base_df.drop(columns=['index5'], errors='ignore').merge(df5, on="label", how="left")

In [22]:
# TTA test predictions from checkpoint "resnet50v15", plus that model's [index, class] mapping.
# seed_num seeds the validation split inside prediction(); presumably it matches the training seed -- confirm.
pred6, i2c6 = prediction(model_path="resnet50v15", seed_num=15)

In [23]:
# Per-model lookup: output column position ('index6') of each attribute id ('label').
df6 = pd.DataFrame(i2c6, columns=['index6', 'label'])
# Drop any 'index6' left by an earlier run of this cell; otherwise a re-run would make
# merge emit 'index6_x'/'index6_y' and later base_df['index6'] lookups would fail.
base_df = base_df.drop(columns=['index6'], errors='ignore').merge(df6, on="label", how="left")

In [24]:
# TTA test predictions from checkpoint "resnet50v27", plus that model's [index, class] mapping.
# seed_num seeds the validation split inside prediction(); presumably it matches the training seed -- confirm.
pred7, i2c7 = prediction(model_path="resnet50v27", seed_num=27)

In [25]:
# Per-model lookup: output column position ('index7') of each attribute id ('label').
df7 = pd.DataFrame(i2c7, columns=['index7', 'label'])
# Drop any 'index7' left by an earlier run of this cell; otherwise a re-run would make
# merge emit 'index7_x'/'index7_y' and later base_df['index7'] lookups would fail.
base_df = base_df.drop(columns=['index7'], errors='ignore').merge(df7, on="label", how="left")

## Submission

In [26]:
# Inspect the alignment table: one row per attribute id, one 'indexN' column per model.
# NaN (shown as a float column) marks an attribute absent from that model's class list.
base_df.head()

Unnamed: 0,label,index1,index2,index3,index4,index5,index6,index7
0,0,0,0.0,0.0,0.0,0,0.0,0.0
1,1,1,1.0,1.0,1.0,1,1.0,1.0
2,2,215,214.0,214.0,214.0,215,214.0,215.0
3,3,326,325.0,324.0,325.0,326,325.0,326.0
4,4,437,436.0,435.0,436.0,437,434.0,437.0


In [27]:
#print(base_df[base_df.index2.isnull()])
#print(base_df[base_df.index3.isnull()])
#print(base_df[base_df.index4.isnull()])

In [28]:
#base_df["index2"] = base_df["index2"].fillna(-1).apply(lambda x: int(x))
#base_df["index3"] = base_df["index3"].fillna(-1).apply(lambda x: int(x))
#base_df["index4"] = base_df["index4"].fillna(-1).apply(lambda x: int(x))

In [29]:
#print([i for i in base_df["label"].unique() if i not in base_df["index2"].unique()])
#print([i for i in base_df["label"].unique() if i not in base_df["index3"].unique()])
#print([i for i in base_df["label"].unique() if i not in base_df["index4"].unique()])

In [30]:
#base_df.loc[187, "index2"] = 1102
#base_df.loc[104, "index3"] = 1101
#base_df.loc[281, "index3"] = 1102
#base_df.loc[199, "index4"] = 1102

In [31]:
# For every model, find the attributes that are absent from its class list (NaN after
# the left merges) and point them at trailing column positions. The averaging step
# appends that many all-zero columns to the model's predictions, so those positions
# contribute a probability of 0.
columns = ["index1", "index2", "index3", "index4", "index5", "index6", "index7"]
missing_columns = []  # names of the index columns that had at least one NaN
missing_num = []      # NaN count per entry of `columns`, in the same order
# Highest valid position once a model's output is padded to one column per attribute
# (1102 for the 1103 attributes this notebook was written against).
last_index = len(base_df) - 1
for c in progress_bar(columns):
    is_missing = base_df[c].isnull().values
    n_missing = int(is_missing.sum())
    missing_num.append(n_missing)
    if n_missing >= 1:
        missing_columns.append(c)
        positions = base_df[c].values.copy()
        # First missing attribute -> last_index, second -> last_index - 1, ...
        positions[is_missing] = last_index - np.arange(n_missing)
        base_df[c] = positions.astype(int)

In [32]:
#print(torch.cat([pred2[0].sigmoid(), torch.zeros(len(pred2[0]), 1)], dim=1)[0][base_df.index2])
#print(torch.cat([pred2[0].sigmoid(), torch.zeros(len(pred2[0]), 1)], dim=1)[0][base_df.index2][187])

In [33]:
#missing_columns = ["index2", "index3", "index4"]
#missing_num = [0, 1, 2, 1, 0]

In [34]:
# Average the sigmoid outputs of all models after re-ordering each model's columns
# into attribute-id order (the row order of base_df).
pred_list = [pred1, pred2, pred3, pred4, pred5, pred6, pred7]
n = len(pred_list)
# One column per attribute (1103 for the label set this notebook was written against).
test_preds = torch.zeros(len(test_df), len(base_df))

# Accumulate models with a complete class list first and padded models second,
# the same summation order as the earlier row-by-row implementation.
complete_models = [j for j, c in enumerate(columns) if c not in missing_columns]
padded_models = [j for j, c in enumerate(columns) if c in missing_columns]

for j in progress_bar(complete_models + padded_models):
    c = columns[j]
    # Sigmoid (and padding) is computed once per model rather than once per test row.
    probs = pred_list[j][0].sigmoid()
    if c in missing_columns:
        # Append all-zero columns for attributes this model has no output for;
        # base_df[c] points those attributes at these trailing positions.
        probs = torch.cat([probs, torch.zeros(len(probs), int(missing_num[j]))], dim=1)
    assert len(probs) == len(test_preds), f"{c}: prediction rows do not match test_df"
    label_order = torch.as_tensor(base_df[c].values, dtype=torch.long)
    test_preds += probs[:, label_order] / n

In [35]:
#pred1.to_csv("pred1.csv", index=False)
#pred2.to_csv("pred2.csv", index=False)
#pred3.to_csv("pred3.csv", index=False)
#pred4.to_csv("pred4.csv", index=False)
#pred5.to_csv("pred5.csv", index=False)

In [36]:
#test_df.to_csv('submission.csv', index=False) 

In [37]:
# Fixed decision threshold applied to the averaged ensemble probabilities.
threshold = 0.270
preds = test_preds > threshold
# One space-separated string of selected attribute ids per test row.
# NOTE: this rebinds `prediction`, which earlier in the notebook names the inference
# function; that function must be redefined before it can be called again.
prediction = [
    " ".join(map(str, np.argwhere(row == 1.0).reshape(-1).tolist()))
    for row in preds
]

In [38]:
# Fill the sample-submission frame with the ensemble's label strings and save it.
# The assignment is positional: entry i of `prediction` goes to row i of test_df,
# the same frame the test images were listed from.
#pred_df = pd.DataFrame({"id": [i.split('.')[0] for i in os.listdir('../input/imet-2019-fgvc6/test/')]})
#pred_df["attribute_ids"] = prediction
test_df["attribute_ids"] = prediction
#submission = pd.DataFrame(test_df["id"]).merge(pred_df, on="id", how="left")
#submission.to_csv("submission.csv", index=False)
#submission.head()
# Written without the index column, leaving the columns: id, attribute_ids.
test_df.to_csv("submission.csv", index=False)
test_df.head()

Unnamed: 0,id,attribute_ids
0,10023b2cc4ed5f68,195 223 289 344 369 766 1059
1,100fbe75ed8fd887,93 231 872 1039
2,101b627524a04f19,79 420 784
3,10234480c41284c6,147 480 483 725 738 776 813 830 923 1046
4,1023b0e2636dcea8,147 322 477 584 671 737 738 776 813 954 1046 1092
