## 0. Download dataset from Google Drive

In [None]:
import os
# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
id = '1_6kHRoaob9xdO45ReB9hXOxlUr7IGOFK'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('2020s1comp5329assignment2.zip')
if not os.path.exists('/content/COMP5329S1A2Dataset'):
    !unzip -d /content/ /content/2020s1comp5329assignment2.zip > /dev/null
    print('File unzipped')
else:
    print('File existed')

File unzipped


## 1.Load dataset

In [None]:
# Acquire file names and corresponding labels from train.csv and test.csv
import re
import pandas as pd
from io import StringIO
import numpy as np
data_path = "/content/COMP5329S1A2Dataset/"

def read_caption_csv(csv_path):
  """Read one of the dataset CSV files into a DataFrame.

  Before parsing, every double quote that has a non-comma character in front
  of it and more text after it on the same line (roughly: a quote embedded
  inside a caption) is prefixed with '|', and pandas is told to treat '|' as
  the escape character.

  Args:
    csv_path: path of the CSV file to read.

  Returns:
    The parsed pandas DataFrame.
  """
  with open(csv_path,'r') as f:
    lines = [re.sub(r'([^,])"(\s*[^\n])', r'\1|"\2', line) for line in f]
  return pd.read_csv(StringIO(''.join(lines)), escapechar="|")

df_train = read_caption_csv(data_path + 'train.csv')
train_files = [data_path + 'data/' + name for name in df_train.ImageID.to_list()]
train_labels = df_train.Labels.to_list()
train_caption = df_train.Caption.to_list()

df_test = read_caption_csv(data_path + 'test.csv')
test_files = [data_path + 'data/' + name for name in df_test.ImageID.to_list()]
test_caption = df_test.Caption.to_list()

# Distinct label ids found in the training set, in ascending order.
# Each entry of train_labels is a space-separated string of integer ids.
unique_label = sorted({int(lb) for lbs in train_labels for lb in lbs.split(' ')})
n_classes = len(unique_label)

# Map every label id to its column index in the encoded label vector
unique_label_dict = {j:i for i,j in enumerate(unique_label)}
print('the number of classes: ',n_classes)

the number of classes:  18


## 2.Data pre-processing

### 2.1 Train-Validation split

In [None]:
# Hold out 10% of the labelled samples for validation (fixed random_state).
# Note: train_files / train_caption / train_labels are rebound to the training
# part, so running this cell a second time splits the already reduced lists again.
from sklearn.model_selection import train_test_split
(train_files, val_files,
 train_caption, val_caption,
 train_labels, val_labels) = train_test_split(
    train_files, train_caption, train_labels,
    test_size=0.1, random_state=1,
)

# Report the size of every split
for description, subset in (
    ('size of trainning set: ', train_files),
    ('size of training labels: ', train_labels),
    ('size of validation set: ', val_files),
    ('size of validation labels: ', val_labels),
    ('size of test set: ', test_files),
):
  print(description, len(subset))

size of trainning set:  27000
size of training labels:  27000
size of validation set:  3000
size of validation labels:  3000
size of test set:  10000


### 2.2 Image transformation & normalization

In [None]:
from torch.utils.data import Dataset,DataLoader
from PIL import Image 
import torch
import torchvision
import torchvision.transforms as transforms

# Define customized dataset so that it can be accepted by DataLoader
class ImageDataset(Dataset):
  """Dataset that loads images from disk, optionally paired with labels.

  Args:
    img_names: sequence of image file paths.
    labels: optional sequence of raw labels aligned with ``img_names``.
      When it is None or empty, items are returned without a label.
    transform: optional callable applied to the RGB PIL image.
    target_transform: optional callable applied to the raw label. It must
      return a numpy array, which is then converted to a float tensor.
  """
  def __init__(self,img_names,labels=None,transform=None,target_transform=None):
    self.img_names = img_names
    self.labels = labels
    self.transform = transform
    self.target_transform = target_transform

  def __getitem__(self,index):
    """Return ``img`` for an unlabelled dataset, otherwise ``(img, label)``."""
    # The context manager closes the underlying file as soon as the pixel
    # data has been converted, instead of waiting for garbage collection.
    with Image.open(self.img_names[index]) as img_file:
      img = img_file.convert('RGB')
    if self.transform:
      img = self.transform(img)

    # Explicit None/length check: a plain truth test on ``self.labels``
    # raises for numpy arrays holding more than one element.
    if self.labels is None or len(self.labels) == 0:
      return img

    label = self.labels[index]
    if self.target_transform:
      label = torch.from_numpy(self.target_transform(label)).float()
    return img,label

  def __len__(self):
    return len(self.img_names)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def _eval_transform():
  """Build the deterministic pipeline shared by the validation and test sets:
  resize to 256x256, take the central 224x224 crop, convert to a tensor and
  normalize each channel."""
  return transforms.Compose([
      transforms.Resize([256,256]),
      transforms.CenterCrop([224,224]),
      transforms.ToTensor(),
      transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])
  ])

transform = {
    # Training pipeline with random augmentation
    'train': transforms.Compose([
        # Resize images to 256x256
        transforms.Resize([256,256]),
        # transforms.ColorJitter(brightness=1,saturation=1,contrast=1,hue=0.5),
        # Mirror images horizontally at random
        transforms.RandomHorizontalFlip(),
        # Take a random crop and resize it to 224x224
        transforms.RandomResizedCrop([224,224]),
        # PIL Image / numpy.ndarray (H x W x C) in [0, 255]
        # -> torch.FloatTensor (C x H x W) in [0.0, 1.0]
        transforms.ToTensor(),
        # Per-channel normalization with the given mean and std
        # (presumably the ImageNet statistics expected by the pre-trained backbone)
        transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])
    ]),
    'validation': _eval_transform(),
    'test': _eval_transform(),
}
import numpy as np
def encode_label(label, label_index=None):
  """Encode a label string as a multi-hot vector.

  Args:
    label: string of integer label ids separated by whitespace, e.g. "1 3".
    label_index: optional mapping from label id to vector position. When
      omitted, the module-level ``unique_label_dict`` is used.

  Returns:
    A float numpy array with one entry per known label: 1 at the positions
    of the ids listed in ``label`` and 0 elsewhere.

  Raises:
    KeyError: if ``label`` contains an id missing from the mapping.
    ValueError: if a token of ``label`` is not an integer.
  """
  if label_index is None:
    label_index = unique_label_dict
  multi_hot = np.zeros(len(label_index))
  # split() without an argument tolerates repeated, leading and trailing
  # spaces, which would turn into int('') errors with split(' ').
  for token in label.split():
    multi_hot[label_index[int(token)]] = 1
  return multi_hot

def decode_label(one_hot_label, labels=None):
  """Turn a multi-hot vector back into a space-separated label string.

  Args:
    one_hot_label: sequence of 0/1 flags, one per known label.
    labels: optional ordered sequence of label ids matching the flag
      positions. When omitted, the module-level ``unique_label`` is used.

  Returns:
    The ids whose flag is set, joined by single spaces ('' when none is set).
  """
  if labels is None:
    labels = unique_label
  # Select by the flag alone and print the id itself: multiplying flag and id
  # dropped a label whose id is 0 and rendered float flags as e.g. "3.0".
  return ' '.join(str(label_id) for flag, label_id in zip(one_hot_label, labels) if flag)

img_batch_size = 128
num_workers = 0 # For our dataset, it costs less training time when "num_workers" equals to 0
# Options shared by every image DataLoader below
_img_loader_kwargs = dict(num_workers=num_workers, pin_memory=True)

# Training set: augmented images, shuffled on every pass
train_dataset = ImageDataset(train_files, labels=train_labels,
                             transform=transform['train'], target_transform=encode_label)
train_loader = DataLoader(train_dataset, batch_size=img_batch_size, shuffle=True,
                          **_img_loader_kwargs)

# Validation set: deterministic transform, original order, batches of 16
val_dataset = ImageDataset(val_files, labels=val_labels,
                           transform=transform['validation'], target_transform=encode_label)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False,
                        **_img_loader_kwargs)

# Test set: no labels, so each batch holds images only
test_dataset = ImageDataset(test_files, transform=transform['test'])
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False,
                         **_img_loader_kwargs)

### 2.3 Caption texts transformation

#### 2.3.1 Pre-process caption texts

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords as sw
from nltk.stem.porter import *
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk import word_tokenize
from tqdm import tqdm
# Keep the stop words as dict keys for fast membership tests (the values are unused).
# NOTE(review): sw.words() is called without a language argument, which
# presumably pulls the stop words of every language in the corpus - confirm
# that this is intended rather than sw.words('english').
stopwords_dict = dict.fromkeys(sw.words(), '')
def tknzr(caption):
  """Tokenize captions and drop stop words and non-alphabetic tokens.

  Args:
    caption: iterable of caption strings.

  Returns:
    A list with one token list per caption. Captions are lower-cased first,
    and a token is kept only if it consists solely of the letters a-z and is
    not a key of ``stopwords_dict``.
  """
  return [
      [word for word in word_tokenize(sentence.lower())
       if re.fullmatch(r'[a-z]+',word) and word not in stopwords_dict]
      for sentence in tqdm(caption)
  ]
train_caption_tknzed = tknzr(train_caption)
val_caption_tknzed = tknzr(val_caption)
test_caption_tknzed = tknzr(test_caption)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


100%|██████████| 27000/27000 [00:03<00:00, 8886.94it/s]
100%|██████████| 3000/3000 [00:00<00:00, 8856.92it/s]
100%|██████████| 10000/10000 [00:01<00:00, 8801.97it/s]


#### 2.3.2 Acquire word embeddings

In [None]:
# Count how many training captions have each token length, most frequent first.
# Almost all captions have at most 10 tokens, which motivates the padding size used below.
from collections import Counter
Counter(len(tokens) for tokens in train_caption_tknzed).most_common()

[(5, 7989),
 (6, 6756),
 (4, 4892),
 (7, 3489),
 (3, 1542),
 (8, 1380),
 (9, 446),
 (2, 194),
 (10, 163),
 (11, 68),
 (12, 31),
 (13, 16),
 (1, 10),
 (14, 8),
 (15, 7),
 (17, 3),
 (16, 2),
 (19, 2),
 (21, 1),
 (27, 1)]

In [None]:
# Load pre-trained embedding vectors
# The model named 'glove-wiki-gigaword-300' is fetched through gensim's downloader.
# Later cells rely on the `.vectors` array and the `.get_vector(word)` method of the returned object.
import gensim.downloader as api
embedding_model = api.load('glove-wiki-gigaword-300')



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
def gen_embeddinig(caption_tknzed,embedding_model,padding_size=10,padding_vector=None):
  """Turn tokenized captions into fixed-length sequences of word vectors.

  Every caption is represented by exactly ``padding_size`` vectors: longer
  captions are truncated, and positions past the end of a shorter caption are
  filled with the padding vector. Words unknown to the embedding model are
  replaced by the padding vector as well.

  Args:
    caption_tknzed: iterable of token lists.
    embedding_model: object exposing ``vectors`` (2-D array, one row per
      word) and ``get_vector(word)``.
    padding_size: number of vectors kept per caption.
    padding_vector: optional filler vector. When omitted, one is drawn
      uniformly from [-0.25, 0.25) with a fixed seed.

  Returns:
    numpy array of shape (number of captions, padding_size, vector size).
  """
  if padding_vector is None:
    # Fixed seed: all calls (train / validation / test) must share the same
    # filler. Drawing it from the global RNG on every call gave each split a
    # different padding vector and made results vary between runs.
    rng = np.random.RandomState(0)
    padding_vector = rng.uniform(-0.25, 0.25, embedding_model.vectors.shape[1]).round(6)
  input_embeddings = []
  for sentence in tqdm(caption_tknzed):
    tmp_embeddings = []
    for idx in range(padding_size):
      try:
        tmp_embeddings.append(embedding_model.get_vector(sentence[idx]))
      except (IndexError, KeyError):
        # IndexError: the caption has fewer than padding_size tokens.
        # KeyError: the word is not part of the embedding vocabulary.
        tmp_embeddings.append(padding_vector)
    input_embeddings.append(tmp_embeddings)
  return np.array(input_embeddings)
# Embed the three caption splits, in the order train, validation, test
train_embeddings, val_embeddings, test_embeddings = (
    gen_embeddinig(tokens, embedding_model)
    for tokens in (train_caption_tknzed, val_caption_tknzed, test_caption_tknzed)
)

100%|██████████| 27000/27000 [00:00<00:00, 67587.40it/s]
100%|██████████| 3000/3000 [00:00<00:00, 112466.92it/s]
100%|██████████| 10000/10000 [00:00<00:00, 120271.61it/s]


In [None]:
from torch.utils.data import Dataset,DataLoader
# define customized dataset so that it can be accepted by DataLoader
class EmbeddingDataset(Dataset):
  """Dataset over pre-computed caption embeddings, optionally with labels.

  Args:
    embeddings: indexable collection of numpy arrays, one per caption.
    labels: optional sequence of raw labels aligned with ``embeddings``.
      When it is None or empty, items are returned without a label.
    transform: optional callable applied to the float embedding tensor.
    target_transform: optional callable applied to the raw label. It must
      return a numpy array, which is then converted to a float tensor.
  """
  def __init__(self,embeddings,labels=None,transform=None,target_transform=None):
    self.embeddings = embeddings
    self.labels = labels
    self.transform = transform
    self.target_transform = target_transform

  def __getitem__(self,index):
    """Return ``embedding`` for an unlabelled dataset, otherwise ``(embedding, label)``."""
    embedding = torch.from_numpy(self.embeddings[index]).float()
    if self.transform:
      # Apply the transform to the embedding; the previous code referenced an
      # undefined name ``img`` here and raised NameError.
      embedding = self.transform(embedding)

    # Explicit None/length check: a plain truth test on ``self.labels``
    # raises for numpy arrays holding more than one element.
    if self.labels is None or len(self.labels) == 0:
      return embedding

    label = self.labels[index]
    if self.target_transform:
      label = torch.from_numpy(self.target_transform(label)).float()
    return embedding,label

  def __len__(self):
    return len(self.embeddings)

emb_batch_size = 64
num_workers = 0
# Options shared by every embedding DataLoader below
_emb_loader_kwargs = dict(batch_size=emb_batch_size, num_workers=num_workers, pin_memory=True)

# Training set: shuffled on every pass
train_emb_dataset = EmbeddingDataset(train_embeddings, labels=train_labels,
                                     target_transform=encode_label)
train_emb_loader = DataLoader(train_emb_dataset, shuffle=True, **_emb_loader_kwargs)

# Validation set: original order
val_emb_dataset = EmbeddingDataset(val_embeddings, labels=val_labels,
                                   target_transform=encode_label)
val_emb_loader = DataLoader(val_emb_dataset, shuffle=False, **_emb_loader_kwargs)

# Test set: no labels, so each batch holds embeddings only
test_emb_dataset = EmbeddingDataset(test_embeddings)
test_emb_loader = DataLoader(test_emb_dataset, shuffle=False, **_emb_loader_kwargs)


## 3.Define classifier

This class provides interfaces for model training, prediction and plotting curves.

In [None]:
from sklearn.metrics import accuracy_score,f1_score
import torch.optim as optim
import torch.nn.functional as F
import plotly.graph_objects as go
import pandas as pd
import time
from torch.utils.tensorboard import SummaryWriter
class Classifier(object):
  """Training / evaluation helper around a multi-label PyTorch model.

  The model outputs are treated as logits: predictions are obtained by
  applying a sigmoid and rounding to 0/1.

  Args:
    model: the network to train or evaluate.
    loss_func: loss applied to (outputs, targets); needed by fit().
    optimizer: optimizer updating the model parameters; needed by fit().
    lr_scheduler: optional scheduler, stepped once per epoch by fit().

  Note: batches are moved to the module-level ``device``; the model itself
  is never moved by this class.
  """
  def __init__(self,model,loss_func=None,optimizer=None,lr_scheduler=None):
    self.model = model
    self.criterion = loss_func
    self.optimizer = optimizer
    self.lr_scheduler = lr_scheduler
    # Metric history: fit() appends one value to every list per report.
    self.logs = {
        'epoch':[],
        'batch':[],
        'lr':[],
        'train_loss':[],
        'train_acc':[],
        'train_f1_score':[],
        'val_loss':[],
        'val_acc':[],
        'val_f1_score':[],
      }

  def model_name(self):
    """Return the class name of the wrapped model."""
    return self.model.__class__.__name__

  @staticmethod
  def _binarize(outputs):
    """Convert a batch of logits to 0/1 predictions (nested Python lists)."""
    # torch.sigmoid avoids depending on ``nn``, which the notebook only
    # imports in a later cell.
    return torch.round(torch.sigmoid(outputs)).cpu().detach().numpy().astype(int).tolist()

  def _evaluate(self,loader):
    """Return (mean loss, accuracy, samples-averaged F1) of the model on ``loader``."""
    self.model.eval()
    losses = list()
    preds = list()
    targets = list()
    with torch.no_grad():
      for X,y in loader:
        X_torch = X.to(device)
        y_torch = y.to(device)
        outputs = self.model(X_torch)
        losses.append(self.criterion(outputs, y_torch).item())
        preds += self._binarize(outputs)
        targets += y_torch.cpu().numpy().astype(int).tolist()
    return (np.mean(losses),
            accuracy_score(targets,preds),
            f1_score(targets,preds,average='samples'))

  def fit(self,epochs,train_loader,val_loader,show_interval,lr_scheduler_on=True):
    """Train the model and record metrics every ``show_interval`` batches.

    Args:
      epochs: number of passes over ``train_loader``.
      train_loader: yields (inputs, targets) batches for training.
      val_loader: yields (inputs, targets) batches for validation.
      show_interval: report after every ``show_interval`` training batches.
        Training metrics cover the batches seen since the previous report
        and are computed in eval mode after each parameter update.
      lr_scheduler_on: when False the scheduler is not stepped.
    """
    for epoch in range(epochs):
      print('Epoch: ',epoch + 1)
      start_time = time.time()
      train_loss = list()
      train_preds = list()
      train_targets = list()
      for i,(X,y) in enumerate(train_loader):
        # forward + backward + optimize, in training mode
        self.model.train()
        input_batch_torch = X.to(device)
        target_batch_torch = y.to(device)
        # Clear gradients before the backward pass so that leftovers (e.g.
        # from an interrupted earlier run) can never leak into this step.
        self.optimizer.zero_grad()
        outputs = self.model(input_batch_torch)
        loss = self.criterion(outputs, target_batch_torch)
        loss.backward()
        self.optimizer.step()

        # Re-score the same batch in eval mode, which 'turns off' the dropout
        self.model.eval()
        with torch.no_grad():
          outputs = self.model(input_batch_torch)
          train_loss.append(self.criterion(outputs, target_batch_torch).item())
          train_preds += self._binarize(outputs)
          train_targets += target_batch_torch.cpu().numpy().astype(int).tolist()

        if i%show_interval != show_interval-1:
          continue

        # Metrics over the training batches since the last report
        train_loss_mean = np.mean(train_loss)
        train_acc = accuracy_score(train_targets,train_preds)
        train_f1_score = f1_score(train_targets,train_preds,average='samples')

        # Metrics on the whole validation set
        val_loss, val_acc, val_f1_score = self._evaluate(val_loader)
        end_time = time.time()
        print(' Batch: %d, train_loss: %.5f, train_acc: %.4f, tran_f1_score: %.4f \n val_loss: %.5f, val_acc: %.4f, val_f1_score: %.4f, time_elapsed: %.2f'
            %(i + 1, train_loss_mean, train_acc, train_f1_score, val_loss, val_acc, val_f1_score, end_time - start_time))

        # Save the scores to plot
        self.logs['epoch'].append(epoch + 1)
        self.logs['batch'].append(i+1)
        self.logs['lr'].append(self.optimizer.param_groups[0]['lr'])
        self.logs['train_loss'].append(train_loss_mean)
        self.logs['train_acc'].append(train_acc)
        self.logs['train_f1_score'].append(train_f1_score)
        self.logs['val_loss'].append(val_loss)
        self.logs['val_acc'].append(val_acc)
        self.logs['val_f1_score'].append(val_f1_score)

        # Start a fresh accumulation window
        train_loss = list()
        train_preds = list()
        train_targets = list()
      print('Total time this epoch: ', time.time() - start_time)
      if self.lr_scheduler and lr_scheduler_on:
        self.lr_scheduler.step()
    print('Finished Training')

  def validate(self,loader):
    """Score a labelled loader.

    Returns:
      (preds, preds_prob, preds_org, targets, f1): rounded 0/1 predictions,
      sigmoid probabilities, raw model outputs and integer targets (all as
      nested lists), plus the samples-averaged F1 score.
    """
    self.model.eval()
    preds = list()
    targets = list()
    preds_prob = list()
    preds_org = list()
    with torch.no_grad():
      for X,y in loader:
        X_torch = X.to(device)
        y_torch = y.to(device)
        outputs = self.model(X_torch)
        preds_org += outputs.cpu().detach().numpy().tolist()
        preds_prob += torch.sigmoid(outputs).cpu().detach().numpy().tolist()
        preds += self._binarize(outputs)
        targets += y_torch.cpu().numpy().astype(int).tolist()
    f1 = f1_score(targets,preds,average='samples')

    return preds,preds_prob,preds_org,targets,f1

  def get_output(self,x):
    """Return the raw model outputs for ``x`` as nested lists."""
    return self.model(x).cpu().detach().numpy().tolist()

  def predict(self,loader):
    """Score an unlabelled loader.

    Returns:
      (preds, preds_prob, preds_org): rounded 0/1 predictions, sigmoid
      probabilities and raw model outputs, all as nested lists.
    """
    self.model.eval()
    preds = list()
    preds_prob = list()
    preds_org = list()
    # Inference only: no_grad avoids building autograd graphs for every batch.
    with torch.no_grad():
      for X in loader:
        X_torch = X.to(device)
        outputs = self.model(X_torch)
        preds_org += outputs.cpu().detach().numpy().tolist()
        preds_prob += torch.sigmoid(outputs).cpu().detach().numpy().tolist()
        preds += self._binarize(outputs)
    return preds,preds_prob,preds_org

  def save_model(self,path='./'):
    """Pickle the whole model to ``path + <model class name> + '.pt'``."""
    full_path = path + self.model.__class__.__name__ + '.pt'
    torch.save(self.model,full_path)

  def get_logs(self):
    """Return the metric history dictionary.

    This accessor used to be a method called ``logs``; that name is taken by
    the instance attribute set in __init__, so the method could never be
    called on an instance. The ``logs`` attribute itself is unchanged.
    """
    return self.logs

  def plot(self,initial_lr,batch_size,save_dir='./'):
    """Write the metric history to a '.log' CSV file and show loss,
    accuracy and F1 curves (training versus validation).

    Args:
      initial_lr: learning rate, used only in the file name.
      batch_size: batch size, used only in the file name.
      save_dir: prefix prepended to the file name.
    """
    df = pd.DataFrame(self.logs)
    df.to_csv(save_dir + '{}_{}_{}.log'.format(self.model_name(),initial_lr,batch_size),index=False)
    # One figure per metric: (column suffix, legend label)
    for column,label in (('loss','loss'),('acc','accuracy'),('f1_score','f1 score')):
      fig = go.Figure()
      fig.add_trace(go.Scatter(x=df.index, y=df['train_' + column],mode='lines+markers',name='training ' + label))
      fig.add_trace(go.Scatter(x=df.index, y=df['val_' + column],mode='lines+markers',name='validation ' + label))
      fig.show()

## 4.Models

### 4.1 ResNeXt50_32x4d

In [None]:
from torchvision import models 
import torch.nn as nn
# Start from the pre-trained ResNeXt50 (32x4d) network
pre_train_model = models.resnext50_32x4d(pretrained=True)
dropout = True
# Swap the final fully connected layer for a new head with one output per
# class, optionally preceded by dropout
in_features = pre_train_model.fc.in_features
fc_layers = [nn.Dropout(p=0.5,inplace=True)] if dropout else []
fc_layers.append(nn.Linear(in_features,n_classes,bias=True))
pre_train_model.fc = nn.Sequential(*fc_layers)

Downloading: "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth" to /root/.cache/torch/checkpoints/resnext50_32x4d-7cdf4587.pth


HBox(children=(FloatProgress(value=0.0, max=100441675.0), HTML(value='')))




### 4.2 LSTM

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class LSTM(nn.Module):
    """Caption classifier: a stacked LSTM followed by a linear layer that is
    applied to the LSTM output of the last time step.

    Args:
        embedding_dim: size of each input word vector.
        hidden_size: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        n_classes: number of output logits.
        dropout: dropout probability between LSTM layers.

    The defaults reproduce the previously hard-coded configuration
    (300 -> 256 hidden units, 3 layers, 18 classes, dropout 0.5).
    """
    def __init__(self, embedding_dim=300, hidden_size=256, num_layers=3, n_classes=18, dropout=0.5):
        super(LSTM, self).__init__()

        self.lstm = nn.LSTM(embedding_dim,hidden_size,num_layers=num_layers,batch_first=True,dropout=dropout)
        self.linear_layers = nn.Sequential(
            nn.Linear(hidden_size,n_classes,bias=True),
        )

    def forward(self,x_emb):
        """Map a (batch, sequence, embedding_dim) tensor to (batch, n_classes) logits."""
        x_emb,_ = self.lstm(x_emb)
        # Keep only the output of the last time step (batch_first layout)
        x_emb = x_emb[:,-1,:]
        x = self.linear_layers(x_emb)
        return x

## 5.Train process

### 5.1 Train with ResNeXt50_32x4d

In [None]:
# Hyperparameters for training the ResNeXt model
epochs = 15
learning_rate = 0.08
momentum = 0.9

pre_train_model = pre_train_model.to(device)
criterion = nn.BCEWithLogitsLoss()
# SGD with Nesterov momentum
optimizer = optim.SGD(
    pre_train_model.parameters(),
    lr=learning_rate,
    momentum=momentum,
    nesterov=True,
)
# Scale the learning rate by 0.1 after every 3 scheduler steps
# (Classifier.fit steps the scheduler once per epoch)
learning_rate_scheduler = optim.lr_scheduler.StepLR(optimizer,step_size=3,gamma=0.1)
# Wrap everything in the training helper
model = Classifier(
    pre_train_model,
    loss_func=criterion,
    optimizer=optimizer,
    lr_scheduler=learning_rate_scheduler,
)

In [None]:
# Train model on the image loaders; training and validation metrics are
# printed and logged every 50 batches (show_interval)
model.fit(epochs=epochs,train_loader=train_loader,val_loader=val_loader,show_interval=50)

In [None]:
# Save model: with the default path the whole network is pickled to
# './<model class name>.pt'
model.save_model()

### 5.2 Train with LSTM

In [None]:
net = LSTM().to(device)
# Hyperparameters for training the LSTM
epochs = 30
learning_rate = 0.0005
momentum = 0.9  # only used by the commented-out SGD alternative below
criterion = nn.BCEWithLogitsLoss()
# Optimizer alternatives that were tried:
# optimizer = optim.SGD(net.parameters(),lr=learning_rate,momentum=momentum,nesterov=True)
optimizer = optim.AdamW(net.parameters(),lr=learning_rate,weight_decay=0.5)
# Learning rate scheduler alternatives that were tried:
# learning_rate_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,milestones=[15,30],gamma=0.1)
learning_rate_sheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,T_max=30)
# Wrap everything in the training helper
model = Classifier(
    net,
    loss_func=criterion,
    optimizer=optimizer,
    lr_scheduler=learning_rate_sheduler,
)

In [None]:
# Train model on the caption-embedding loaders; training and validation
# metrics are printed and logged every 100 batches (show_interval)
model.fit(epochs=epochs,train_loader=train_emb_loader,val_loader=val_emb_loader,show_interval=100)

In [None]:
# Save model: with the default path the whole network is pickled to
# './<model class name>.pt'
model.save_model()


Couldn't retrieve source code for container of type LSTM. It won't be checked for correctness upon loading.



### 5.3 Ensemble ResNext and LSTM

#### 5.3.1 Acquire the outputs of ResNext and LSTM on validation set

In [None]:
# Load model from file
# (`file_id` instead of `id`, to avoid shadowing the builtin id())
file_id = '1V0WJl4_c92-JQi6TkpIXFdHHE-_P1Wck'
downloaded = drive.CreateFile({'id':file_id})
downloaded.GetContentFile('ResNext50.pt')
# map_location places the stored tensors on the active device, so a checkpoint
# saved on a GPU also loads on a CPU-only runtime and matches the batches that
# Classifier moves to `device`.
# NOTE: torch.load unpickles arbitrary objects - only load checkpoints from a trusted source.
resnext50 = torch.load('/content/ResNext50.pt',map_location=device)
res_model = Classifier(resnext50)
# Raw outputs, probabilities, rounded predictions, targets and F1 on the validation images
preds_res,preds_res_prob,preds_res_org,targets,res_f1 = res_model.validate(val_loader)

In [None]:
# Download the saved LSTM checkpoint
# (`file_id` instead of `id`, to avoid shadowing the builtin id())
file_id = '1JyOH9gnxUE2GEUP0X0mKKeSpPBpUciiW'
downloaded = drive.CreateFile({'id':file_id})
downloaded.GetContentFile('LSTM.pt')
# map_location places the stored tensors on the active device, so a checkpoint
# saved on a GPU also loads on a CPU-only runtime and matches the batches that
# Classifier moves to `device`.
# NOTE: torch.load unpickles arbitrary objects - only load checkpoints from a trusted source.
net = torch.load('/content/LSTM.pt',map_location=device)
emb_model = Classifier(net)
# Raw outputs, probabilities, rounded predictions, targets and F1 on the validation captions
preds_emb,preds_emb_prob,preds_emb_org,targets,emb_f1 = emb_model.validate(val_emb_loader)

In [None]:
# Validation F1 (samples-averaged) of the ResNeXt model and of the LSTM model
res_f1,emb_f1

(0.8616196488696488, 0.8218759259259261)

#### 5.3.2 Stacking Ensemble of the two **models**

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Raw validation outputs of the two base models, as arrays
outputs_res = np.array(preds_res_org)
outputs_emb = np.array(preds_emb_org)
print(outputs_res.shape)
print(outputs_emb.shape)
# Concatenate the outputs of the ResNeXt50_32x4d and LSTM models column-wise.
# The result has the shape [N, n_classes * 2].
X_stacking = np.concatenate([outputs_res, outputs_emb], axis=1)

(3000, 18)
(3000, 18)


In [None]:
# Grid search over the random forest hyperparameters (3-fold cross-validation,
# scored by samples-averaged F1). The 'estimator__' prefix routes each setting
# to the forest wrapped by MultiOutputClassifier.
parameters = dict(
    estimator__n_estimators=np.arange(100,1500,200),
    estimator__max_features=["auto","sqrt","log2"],
    estimator__class_weight=['balanced',None],
    estimator__n_jobs=[4],
)
rf = MultiOutputClassifier(RandomForestClassifier())
clf = GridSearchCV(estimator=rf, param_grid=parameters, cv=3, n_jobs=4, verbose=1,
                   scoring='f1_samples')
clf.fit(np.array(X_stacking), np.array(targets))

print('best score: ',clf.best_score_)
print('best estimator: ',clf.best_estimator_)

Fitting 3 folds for each of 42 candidates, totalling 126 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 15.4min
[Parallel(n_jobs=4)]: Done 126 out of 126 | elapsed: 50.6min finished


best score:  0.8731854978354979
best estimator:  MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                       ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='sqrt',
                                                       max_leaf_nodes=None,
                                                       max_samples=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                      

In [None]:
# The first 2000 validation samples train the RF stacking classifier;
# the remaining 1000 samples are held out to score it.
start = time.time()
X, X_holdout = X_stacking[:2000], X_stacking[2000:]
y, y_holdout = targets[:2000], targets[2000:]
base_forest = RandomForestClassifier(n_estimators=900,max_features='sqrt',n_jobs=4)
rf = MultiOutputClassifier(base_forest).fit(X,y)
preds_stacking = rf.predict(X_holdout)

end = time.time()
print('Training time: ',end - start)
# Compare the ensemble with both base models on the same held-out samples
print('Score of stacking ensemble model: ', f1_score(y_holdout,preds_stacking,average='samples'))
print('Score of single model - ResNext: ', f1_score(y_holdout,preds_res[2000:],average='samples'))
print('Sore of single model - LSTM: ', f1_score(y_holdout,preds_emb[2000:],average='samples'))


Training time:  53.778677463531494
Score of stacking ensemble model:  0.864554329004329
Score of single model - ResNext:  0.8539420634920636
Sore of single model - LSTM:  0.8301761904761904


#### 5.3.3 Weighted Average Ensemble of the two models

##### 5.3.3.1 Find optimal weight and threshold via differential evolution algorithm

In [None]:
from scipy.optimize import differential_evolution
def predict_by_threshold(preds_probs,threshold):
  """Binarize probabilities: 1 where ``prob >= threshold``, else 0.

  Args:
    preds_probs: rectangular 2-D collection of probabilities (rows = samples).
    threshold: decision threshold; a scalar or a one-element array.

  Returns:
    A list of lists of ints (0/1) with the same layout as ``preds_probs``.
  """
  # One vectorized comparison instead of a Python loop over every element;
  # this function is evaluated many times by the optimizer.
  return (np.asarray(preds_probs) >= threshold).astype(int).tolist()
def target_func(x):
  """Objective minimized by the optimizer: 1 - F1 of the weighted ensemble.

  Args:
    x: array-like of three values - the raw weights of the ResNeXt and LSTM
      outputs, followed by the decision threshold.

  Returns:
    1 minus the samples-averaged F1 score on the validation targets.
  """
  # Work on a copy: normalizing a slice of ``x`` in place would silently
  # modify the candidate vector owned by the optimizer.
  w = np.array(x[:2],dtype=float)
  # Ensure the sum of weight values equal to 1
  weight_sum = np.linalg.norm(w,1)
  if weight_sum != 0.0:
    w = w / weight_sum
  w1, w2 = w
  # Scalar threshold rather than a one-element slice
  th = x[2]
  preds_weighted = np.array(preds_res_org) * w1 + np.array(preds_emb_org) * w2
  probs = torch.sigmoid(torch.from_numpy(preds_weighted)).numpy()
  f1 = f1_score(targets,predict_by_threshold(probs,th),average='samples')
  return 1 - f1
# Search space: two raw ensemble weights and one decision threshold, each in [0, 1]
bound_weight = [(0.0,1.0)] * 3
# Fixed seed so that the stochastic search gives the same optimum on every run
result = differential_evolution(target_func,bound_weight,maxiter=1000,seed=0)
# Normalized in place on purpose: `w` is a view of result.x, so the solution
# printed below shows weights that sum to 1.
w = result.x[:2]
weight_sum = np.linalg.norm(w,1)
if weight_sum != 0.0:  # guard against both weights being 0
  w /= weight_sum
w1 = w[0]
w2 = w[1]
# Plain float instead of a one-element array
threshold = float(result.x[2])
best_socre = 1 - result.fun
print('Optimal solution: ',result.x)
print('Best score: ',best_socre)

Optimal solution:  [0.65043697 0.34956303 0.33409821]
Best score:  0.8797013708513708


In [None]:
# Blend the raw outputs of both models with the optimized weights
preds_weighted = w1 * np.array(preds_res_org) + w2 * np.array(preds_emb_org)
probs_weighted = nn.Sigmoid()(torch.from_numpy(preds_weighted)).numpy()
weighted_f1 = f1_score(predict_by_threshold(probs_weighted,threshold),np.array(targets),average='samples')

# Compare the ensemble with both base models on the validation set
for description, score in (
    ('Score of weighted ensemble model: ', weighted_f1),
    ('Score of single model - ResNext: ', res_f1),
    ('Sore of single model - LSTM: ', emb_f1),
):
  print(description, score)

Score of weighted ensemble model:  0.8797013708513708
Score of single model - ResNext:  0.8616196488696488
Sore of single model - LSTM:  0.8269235449735449


## 6.Predict on test set

In [None]:
# Load model from file
# (`file_id` instead of `id`, to avoid shadowing the builtin id())
file_id = '1V0WJl4_c92-JQi6TkpIXFdHHE-_P1Wck'
downloaded = drive.CreateFile({'id':file_id})
downloaded.GetContentFile('ResNext50.pt')
# map_location places the stored tensors on the active device, so a checkpoint
# saved on a GPU also loads on a CPU-only runtime and matches the batches that
# Classifier moves to `device`.
# NOTE: torch.load unpickles arbitrary objects - only load checkpoints from a trusted source.
resnext50 = torch.load('/content/ResNext50.pt',map_location=device)
res_model = Classifier(resnext50)
# Predict labels: rounded predictions, probabilities and raw outputs for the test images
preds_res,preds_res_prob,preds_res_org = res_model.predict(test_loader)

In [None]:
# Download the saved LSTM checkpoint
# (`file_id` instead of `id`, to avoid shadowing the builtin id())
file_id = '1JyOH9gnxUE2GEUP0X0mKKeSpPBpUciiW'
downloaded = drive.CreateFile({'id':file_id})
downloaded.GetContentFile('LSTM.pt')
# map_location places the stored tensors on the active device, so a checkpoint
# saved on a GPU also loads on a CPU-only runtime and matches the batches that
# Classifier moves to `device`.
# NOTE: torch.load unpickles arbitrary objects - only load checkpoints from a trusted source.
net = torch.load('/content/LSTM.pt',map_location=device)
emb_model = Classifier(net)
# Rounded predictions, probabilities and raw outputs for the test captions
preds_emb,preds_emb_prob,preds_emb_org = emb_model.predict(test_emb_loader)

In [None]:
def predict_by_threshold(preds_probs,threshold):
  """Binarize probabilities: 1 where ``prob >= threshold``, else 0.

  Args:
    preds_probs: rectangular 2-D collection of probabilities (rows = samples).
    threshold: decision threshold; a scalar or a one-element array.

  Returns:
    A list of lists of ints (0/1) with the same layout as ``preds_probs``.
  """
  # One vectorized comparison instead of a Python loop over every element
  return (np.asarray(preds_probs) >= threshold).astype(int).tolist()

# Ensemble weights and threshold, hard-coded to the optimal solution printed
# by the differential evolution search on the validation set
w1, w2 = 0.65043697, 0.34956303
threshold = 0.33409821
# Blend the raw test outputs of both models, squash them with a sigmoid and
# apply the threshold
preds_weighted = w1 * np.array(preds_res_org) + w2 * np.array(preds_emb_org)
probs_weighted = nn.Sigmoid()(torch.from_numpy(preds_weighted)).numpy()
final_preds = predict_by_threshold(probs_weighted,threshold)

In [None]:
# Turn every prediction vector back into a label string, pair it with the
# image file name (last path component) and write the result as CSV
predictions = [decode_label(flags) for flags in final_preds]
image_id = [path.rsplit('/', 1)[-1] for path in test_files]
df_submission = pd.DataFrame({'ImageID': image_id, 'Labels': predictions})
df_submission.to_csv('/content/Predicted_labels.txt',index=False)