<a href="https://colab.research.google.com/github/alexvishnevskiy/PetFinder/blob/master/PetFinder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the notebook's third-party dependencies; pip's stdout is discarded.
# NOTE(review): versions are unpinned. Later cells import
# `pytorch_lightning.utilities.seed`, define `training_epoch_end` and pass
# `gpus` / `progress_bar_refresh_rate` to the Trainer, which presumably only
# exist in older pytorch-lightning releases — confirm and pin the versions.
!pip install pytorch-lightning timm python-box -U albumentations wandb > /dev/null

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Old data: the PetFinder Pawpularity archive

In [None]:
# Copy the PetFinder archive from the mounted Drive into the working directory, then extract it.
!cp /content/drive/MyDrive/PetFinder/petfinder-pawpularity-score.zip .
!unzip /content/petfinder-pawpularity-score.zip

New data: additional breed datasets and the older PetFinder data

In [None]:
!unzip /content/drive/MyDrive/PetFinder/breeds/cat_breeds.zip -d cat_breeds
!unzip /content/drive/MyDrive/PetFinder/breeds/dog_breeds.zip -d dog_breeds
!unzip /content/drive/MyDrive/PetFinder/breeds/dog-breed-identification.zip -d dog_breeds_2

!mkdir cat_breeds_2
!tar -xvf /content/drive/MyDrive/PetFinder/breeds/annotations.tar.gz --directory cat_breeds_2
!tar -xvf /content/drive/MyDrive/PetFinder/breeds/images.tar.gz --directory cat_breeds_2

!mkdir PetFinder_old
!unzip /content/drive/MyDrive/PetFinder/breeds/PetFinder_old/train.csv.zip -d PetFinder_old
!unzip /content/drive/MyDrive/PetFinder/breeds/PetFinder_old/train_images.zip -d PetFinder_old
!cp /content/drive/MyDrive/PetFinder/breeds/PetFinder_old/breed_labels.csv -d PetFinder_old

In [49]:
from pytorch_lightning import LightningDataModule, LightningModule
from pytorch_lightning.utilities.seed import seed_everything
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import callbacks
import pytorch_lightning as pl

from torch.utils.data import DataLoader, Dataset
from torch.utils.data import WeightedRandomSampler
from albumentations.pytorch.transforms import ToTensorV2
import albumentations as A

from timm import create_model
import torch.optim as optim
import torch.nn as nn
import torch

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np

import os
import cv2
import base64

from box import Box
import matplotlib.pyplot as plt
import wandb
from tqdm import tqdm
tqdm.pandas()

In [5]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

Config

In [6]:
# Central experiment configuration. It is wrapped in a Box at the end of the
# cell so nested entries can be read with attribute access
# (e.g. `cfg.loader.train.batch_size`).
cfg = {
    'seed': 42,
    'dir_path': '/content/',
    # Directory that CustomDataset joins with "<Id>.jpg" to build image paths.
    'photo_path': '/content/train',
    'csv_path': '/content/drive/MyDrive/PetFinder/train_dogs_cats.csv',
    # Directory where CsvWritter stores its "<fold>_sub.csv" files.
    'output_dir': '/content/',
    'logger': {
        # Checkpoints are written under <save_dir>/<model.alias_name>.
        'save_dir': '/content/drive/MyDrive/PetFinder/models',
        'project': 'PetFinder',
        'log_model': True,
    },
    'transform': {
        # Unpacked positionally into A.Resize(*img_size).
        'img_size': (224, 224),
    },
    'loader': {
        'train': {
            'batch_size': 32,
            'num_workers': 4,
            'shuffle': True,
            'pin_memory': False,
        },
        'val': {
            'batch_size': 64,
            'num_workers': 4,
            'pin_memory': False,
        }
    },
    'train_args': {
        # Number of StratifiedKFold splits.
        'n_splits': 5,
        # Passed to the Trainer as max_epochs.
        'epoch': 20,
    },
    'model': {
        # Model name handed to timm.create_model.
        'name': 'swin_base_patch4_window7_224',
        # Used for the checkpoint directory/filename and the wandb run name/group.
        'alias_name': 'swin_base_bn_fc',
        'freeze_layers': 0,
        'dropout_backbone': 0,
        'dropout_fc': 0.1,
        'output_dim': 1
    },
    'loss': {
        # 'module' is eval()'d by CustomModel to build the criterion; 'alias'
        # selects the label / prediction post-processing branches ('bce', 'ce', 'mse').
        'module': 'nn.BCEWithLogitsLoss',
        'alias': 'bce',
    },
    # 'name' is eval()'d in CustomModel.configure_optimizers and called with 'params'.
    'optimizer':{
        'name': 'optim.AdamW',
        'params':{
            'lr': 1e-5,
            'weight_decay': 1e-3,
        },
    },
    # Same convention as 'optimizer': eval()'d name plus keyword parameters.
    'scheduler':{
        'name': 'optim.lr_scheduler.CosineAnnealingWarmRestarts',
        'params':{
            'T_0': 7,
            'eta_min': 1e-6
        },
    },
    # Unpacked as keyword arguments into pl.Trainer(**cfg.trainer).
    'trainer': {
        'gpus': 1,
        'accumulate_grad_batches': 2,
        'auto_lr_find': True,
        'progress_bar_refresh_rate': 3,
        'fast_dev_run': False,
        'num_sanity_val_steps': 2,
        #'overfit_batches': 1,
        'resume_from_checkpoint': None,
    },
    'results_callback': {
        # Number of leading validation rows WandbWritter logs as example images.
        'n_images': 30,
    },
}

cfg = Box(cfg)

In [7]:
seed_everything(cfg.seed, workers=True)

Global seed set to 42


42

In [8]:
def _resize_normalize_to_tensor(img_size):
  """Return fresh instances of the steps shared by both pipelines.

  The steps are: resize to ``img_size``, per-channel normalisation with fixed
  mean/std, and conversion to a tensor via ``ToTensorV2``.
  """
  return [
      A.Resize(*img_size),
      A.Normalize(
          mean=[0.485, 0.456, 0.406],
          std=[0.229, 0.224, 0.225],
          always_apply=True,
      ),
      ToTensorV2(),
  ]


def val_transforms(img_size):
  """Build the deterministic validation pipeline for images of ``img_size``."""
  return A.Compose(_resize_normalize_to_tensor(img_size))


def train_transforms(img_size):
  """Build the training pipeline: random flips / brightness-contrast, then the shared steps."""
  augmentations = [
      A.HorizontalFlip(p=0.5),
      A.VerticalFlip(p=0.4),
      A.RandomBrightnessContrast(p=0.3),
      #A.ShiftScaleRotate(p=0.2),
      #A.RandomResizedCrop(*img_size, scale = (0.7, 1)),
  ]
  return A.Compose(augmentations + _resize_normalize_to_tensor(img_size))


# Store the ready-made pipelines on the config; CustomDataset reads them from there.
cfg.transform.train_transforms = train_transforms(cfg.transform.img_size)
cfg.transform.val_transforms = val_transforms(cfg.transform.img_size)

In [None]:
# df = pd.read_csv(cfg.csv_path)
# dat_v = CustomDataset(cfg, df, 'val')
# dat_tr = CustomDataset(cfg, df, 'train')
# for i in range(10):
#   fig, ax = plt.subplots(1, 2)
#   ax[0].imshow(dat_v[i][0])
#   ax[1].imshow(dat_tr[i][0])

Model and dataset

In [53]:
class CustomDataset(Dataset):
  """Image dataset yielding ``(image, label)`` pairs from a Pawpularity dataframe.

  Args:
    cfg: config object; reads ``cfg.photo_path``, ``cfg.loss.alias`` and
      ``cfg.transform.{train,val}_transforms``.
    df: dataframe with a ``Pawpularity`` column and either a ``path`` column
      or an ``Id`` column from which the path is derived.
    stage: ``'train'`` selects the training transforms, anything else the
      validation transforms.
  """

  def __init__(self, cfg, df: pd.DataFrame, stage: str):
    super().__init__()
    self.cfg = cfg
    self.df = self.prepare_df(df)
    self.stage = stage
    if stage == 'train':
      self.transforms = cfg.transform.train_transforms
    else:
      self.transforms = cfg.transform.val_transforms

  def prepare_df(self, df):
    """Return a private copy of ``df`` that is guaranteed to have a ``path`` column.

    Working on a copy avoids writing into the caller's frame (often an
    ``iloc`` slice, which triggers SettingWithCopyWarning). The explicit
    column check replaces ``hasattr``, which tests attributes, not columns.
    """
    df = df.copy()
    if 'path' not in df.columns:
      df['path'] = [
          os.path.join(self.cfg.photo_path, f'{x}.jpg') for x in df['Id']
      ]
    return df

  def __len__(self):
    return len(self.df)

  def __getitem__(self, index: int):
    """Return the transformed image and its label encoded for the configured loss.

    Label encoding by ``cfg.loss.alias``:
      * ``'ce'``  -> class index ``min(label // 20, 4)`` as an int64 tensor
      * ``'bce'`` -> ``label / 100`` as a float32 tensor
      * ``'mse'`` -> the raw score as a float32 tensor
      * otherwise the raw dataframe value is returned unchanged.
    """
    row = self.df.iloc[index]
    photo_path = row['path']
    label = row['Pawpularity']

    # TODO: rewrite using isinstance on the criterion instead of the alias string
    alias = self.cfg.loss.alias
    if alias == 'ce':
      label = torch.tensor(int(min(label // 20, 4)), dtype=torch.long)
    elif alias == 'bce':
      # explicit float32 so the target dtype matches float32 logits
      label = torch.tensor(label / 100, dtype=torch.float32)
    elif alias == 'mse':
      # the value is a plain scalar here, so the original `label.float()`
      # raised AttributeError; build the float tensor explicitly instead
      label = torch.tensor(label, dtype=torch.float32)

    img = self.prepare_img(photo_path, self.transforms)
    return img, label

  @staticmethod
  def prepare_img(path: str, transforms):
    """Read the image at ``path`` as RGB and apply ``transforms``.

    Raises:
      FileNotFoundError: if OpenCV cannot read the file (``cv2.imread``
        returns ``None`` rather than raising).
    """
    _img = cv2.imread(path)
    if _img is None:
      raise FileNotFoundError(f'Could not read image: {path}')
    _img = cv2.cvtColor(_img, cv2.COLOR_BGR2RGB)
    img = transforms(image=_img)['image']
    return img

In [10]:
class CustomDataModule(LightningDataModule):
  """Lightning data module wrapping the train / validation dataframes.

  Args:
    cfg: config object; reads ``cfg.loader.train`` and ``cfg.loader.val``.
    train_df: rows used by ``train_dataloader``.
    val_df: rows used by ``val_dataloader`` and ``predict_dataloader``.
    sampler: optional sampler for the training loader. When given, shuffling
      is disabled because ``DataLoader`` rejects ``sampler`` with ``shuffle=True``.
  """

  def __init__(self, cfg, train_df: pd.DataFrame, val_df: pd.DataFrame, sampler = None):
    super().__init__()
    self.cfg = cfg
    self.train_df = train_df
    self.val_df = val_df
    self.sampler = sampler

  def train_dataloader(self):
    """Training loader; honours ``shuffle`` / ``pin_memory`` from the config."""
    loader_cfg = self.cfg.loader.train
    train_split = CustomDataset(self.cfg, self.train_df, 'train')
    # The original never passed `shuffle`, so training ran in fixed order
    # whenever no sampler was supplied. Only shuffle when there is no sampler.
    shuffle = self.sampler is None and bool(loader_cfg.get('shuffle', False))
    return DataLoader(
        train_split,
        sampler=self.sampler,
        batch_size=loader_cfg.batch_size,
        shuffle=shuffle,
        num_workers=loader_cfg.num_workers,
        pin_memory=bool(loader_cfg.get('pin_memory', False)),
        drop_last=True,
        )

  def _eval_dataloader(self):
    """Ordered loader over ``val_df`` shared by validation and prediction."""
    loader_cfg = self.cfg.loader.val
    eval_split = CustomDataset(self.cfg, self.val_df, 'val')
    return DataLoader(
        eval_split,
        batch_size=loader_cfg.batch_size,
        shuffle=False,
        num_workers=loader_cfg.num_workers,
        pin_memory=bool(loader_cfg.get('pin_memory', False)),
        )

  def val_dataloader(self):
    """Validation loader over ``val_df`` (no shuffling)."""
    return self._eval_dataloader()

  def predict_dataloader(self):
    """Prediction loader; uses the same rows and settings as validation."""
    return self._eval_dataloader()

In [11]:
class CustomModel(LightningModule):
  """timm backbone plus a dropout + linear head, trained on Pawpularity.

  The criterion, optimizer and scheduler are built from strings in ``cfg``.
  ``cfg.loss.alias`` selects how logits and labels are mapped back to the
  0-100 scale for the RMSE metric: ``'bce'`` (sigmoid * 100), ``'ce'``
  (class index -> (index + 1) * 20); any other alias uses the raw outputs.
  """

  def __init__(self, cfg):
    super().__init__()
    self.cfg = cfg
    self.__build_model(cfg.model.freeze_layers)
    # NOTE(review): eval() executes whatever string the config holds; fine for
    # a local, trusted cfg, but never feed it externally supplied values.
    self._criterion = eval(self.cfg.loss.module)()
    self.save_hyperparameters(cfg)

  def __build_model(self, freeze_layers: int = 0):
    """Create the backbone and the head (dropout followed by a lazy linear layer)."""
    ## add freezing of layers
    self.backbone = self.__create_model(freeze_layers)
    # NOTE(review): LazyLinear is only materialised on the first forward pass;
    # the optimizer is built from self.parameters() before that — confirm this
    # is intended or run a dry forward pass first.
    self.fc = nn.Sequential(
        nn.Dropout(self.cfg.model.dropout_fc),
        nn.LazyLinear(self.cfg.model.output_dim)
    )

  def __create_model(self, freeze_layers):
    """Instantiate the pretrained timm model without its classifier (``num_classes=0``)."""
    model = create_model(
        self.cfg.model.name,
        drop_rate = self.cfg.model.dropout_backbone,
        pretrained=True,
        num_classes=0,
        in_chans=3
        )
    # for p in model.layers[:freeze_layers].parameters():
    #   p.requires_grad = False

    return model

  def forward(self, x):
    """Return the raw head outputs for a batch of images."""
    f = self.backbone(x)
    out = self.fc(f)
    return out

  def configure_optimizers(self):
    """Build the optimizer and LR scheduler named in the config."""
    # NOTE(review): same eval()-on-config caveat as in __init__.
    optimizer = eval(self.cfg.optimizer.name)(
        self.parameters(), **self.cfg.optimizer.params
        )
    scheduler = eval(self.cfg.scheduler.name)(
        optimizer,
        **self.cfg.scheduler.params
        )

    if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
      # ReduceLROnPlateau needs a monitored metric; it goes inside the
      # lr_scheduler dict, which is the documented Lightning format.
      return {
          'optimizer': optimizer,
          'lr_scheduler': {'scheduler': scheduler, 'monitor': 'val_loss'},
      }
    return [optimizer], [scheduler]

  def __share_step(self, batch, stage = 'train'):
    """Compute the loss plus predictions / labels rescaled for the RMSE metric.

    Returns:
      ``(loss, preds, labels)`` with ``preds`` and ``labels`` of shape (batch,).
    """
    img, labels = batch
    logits = self(img)
    # Drop only the trailing singleton dimension: a bare .squeeze() also
    # removes the batch dimension when a batch holds a single sample.
    if logits.ndim > 1 and logits.shape[-1] == 1:
      logits = logits.squeeze(-1)

    alias = self.cfg.loss.alias
    if alias != 'ce':
      # Regression-style targets may arrive as float64 from the default
      # collate; the criterion expects the same dtype as the float32 logits.
      labels = labels.float()
    loss = self._criterion(logits, labels)

    # TODO: rewrite using isinstance on the criterion instead of the alias string
    if alias == 'bce':
      preds = logits.sigmoid() * 100
      labels = labels * 100
    elif alias == 'ce':
      preds = (logits.argmax(dim = -1).float() + 1) * 20
      labels = (labels + 1)* 20
    else:
      # e.g. 'mse': outputs are already on the label scale. The original left
      # `preds` unbound here, which raised UnboundLocalError.
      preds = logits

    return loss, preds, labels

  def __share_epoch(self, outputs, stage):
    """Aggregate the step outputs of one epoch and log ``<stage>_rmse`` (and F1 for 'ce')."""
    # reshape(-1) lets 0-d entries be concatenated as well
    preds = torch.cat([out['pred'].detach().reshape(-1) for out in outputs]).cpu()
    labels = torch.cat([out['labels'].detach().reshape(-1) for out in outputs]).cpu()
    rmse = torch.sqrt(((labels - preds) ** 2).mean())

    if self.cfg.loss.alias == 'ce':
      self.log(f'{stage}_f1', f1_score(labels.numpy(), preds.numpy(), average = 'micro'))
    self.log(f'{stage}_rmse', rmse)

  def training_step(self, batch, batch_idx):
    loss, preds, labels = self.__share_step(batch, 'train')
    self.log('train_loss', loss)
    # Detach the extras so the per-step outputs kept for the epoch-end hook
    # do not hold on to the autograd graph.
    return {'loss': loss, 'pred': preds.detach(), 'labels': labels.detach()}

  def validation_step(self, batch, batch_idx):
    loss, preds, labels = self.__share_step(batch, 'val')
    self.log('val_loss', loss)
    return {'loss': loss, 'pred': preds, 'labels': labels}

  def predict_step(self, batch, batch_idx, dataloader_idx=0):
    """Return raw head outputs; post-processing is done by the prediction writer."""
    img, labels = batch
    return self(img)

  def training_epoch_end(self, outputs):
    self.__share_epoch(outputs, 'train')

  def validation_epoch_end(self, outputs):
    self.__share_epoch(outputs, 'val')

Custom callbacks

In [12]:
class WandbWritter(callbacks.Callback):
  """Logs a small table of (label, prediction, image) examples to W&B after each validation epoch.

  Args:
    cfg: config object; reads ``cfg.results_callback.n_images``,
      ``cfg.loader.val.num_workers`` and ``cfg.loss.alias``.
    df: dataframe whose first ``n_images`` rows are used as examples.
  """

  _TABLE_COLUMNS = ["Pawpularity", "pred", "image"]

  def __init__(self, cfg, df):
    super().__init__()
    self.cfg = cfg
    self.wandb_table = wandb.Table(columns=list(self._TABLE_COLUMNS))
    self.loader = self.configure_loader(cfg, df)

  def configure_loader(self, cfg, df):
    """Build a batch-size-1 loader over the first ``n_images`` rows of ``df``."""
    image_dataset = CustomDataset(
        cfg,
        # copy so the dataset never writes into a slice of the caller's frame
        df.iloc[:cfg.results_callback.n_images].copy(),
        'val'
        )
    image_dataloader = DataLoader(
        image_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=cfg.loader.val.num_workers,
        )
    return image_dataloader

  def on_validation_epoch_end(self, trainer, pl_module):
    """Predict the example images and log them as the ``results`` table."""
    # Build a fresh table each epoch: adding rows to an already-logged table
    # piled up duplicates of the same images across epochs.
    table = wandb.Table(columns=list(self._TABLE_COLUMNS))
    alias = self.cfg.loss.alias

    with torch.no_grad():
      for img, label in self.loader:
        # follow the module's device instead of assuming CUDA
        pred = pl_module(img.to(pl_module.device))[0]

        # TODO: rewrite using isinstance on the criterion instead of the alias string
        if alias == 'bce':
          pred = pred.sigmoid() * 100
          label = label * 100
        elif alias == 'ce':
          pred = (pred.argmax(dim = -1).float() + 1) * 20
          label = (label + 1) * 20

        pred = pred.detach().cpu()
        label = label.detach().cpu()
        table.add_data(label.squeeze(), pred.squeeze(), wandb.Image(img.squeeze()))

    self.wandb_table = table
    trainer.logger.experiment.log({'results': table})

class CsvWritter(callbacks.BasePredictionWriter):
  """Writes the predictions of ``trainer.predict`` to ``<output_dir>/<fold>_sub.csv``.

  Args:
    cfg: config object; reads ``cfg.output_dir`` and ``cfg.loss.alias``.
    fold: fold identifier used in the output file name.
    write_interval: forwarded to ``BasePredictionWriter``.
  """

  def __init__(self, cfg, fold, write_interval='epoch'):
    super().__init__(write_interval)
    self.cfg = cfg
    self.fold = fold
    self.configure_output_dir()

  def configure_output_dir(self):
    """Create the output directory if needed."""
    # makedirs also creates missing parents and tolerates an existing
    # directory, unlike the exists() + mkdir() pair it replaces.
    os.makedirs(self.cfg.output_dir, exist_ok=True)

  def write_on_epoch_end(
      self, trainer, pl_module, predictions, batch_indices
  ):
    """Post-process the collected outputs and save them next to the input rows."""
    df = trainer.predict_dataloaders[0].dataset.df
    # `predictions` is indexed per dataloader here; also accept a flat list of
    # batch tensors in case the trainer passes one.
    batches = predictions[0] if isinstance(predictions[0], (list, tuple)) else predictions
    pred = torch.cat(list(batches)).detach().cpu()
    # Drop only the trailing singleton dimension so a single-row prediction
    # keeps its batch dimension.
    if pred.ndim > 1 and pred.shape[-1] == 1:
      pred = pred.squeeze(-1)

    # TODO: rewrite using isinstance on the criterion instead of the alias string
    if self.cfg.loss.alias == 'bce':
      pred = pred.sigmoid() * 100
    if self.cfg.loss.alias == 'ce':
      pred = (pred.argmax(dim = -1).float() + 1) * 20

    df['predictions'] = pred.numpy()
    df.to_csv(os.path.join(self.cfg.output_dir, f'{self.fold}_sub.csv'), index = False)
    print("prediction's done")

Cross-validated training and predictions

In [71]:
# Load the training table, then derive the stratification key, the
# Pawpularity bins and the per-row sampling weights.
df = pd.read_csv(cfg.csv_path)
skf = StratifiedKFold(
    n_splits=cfg.train_args.n_splits,
    shuffle=True,
    random_state=cfg.seed,
)

# Stratification key "<Pawpularity>_<class>", built row by row.
df["Pawpularity_class"] = df.apply(
    lambda row: f"{row['Pawpularity']}_{row['class']}", axis = 1
)

# Bin index from the 0, 10, ..., 90 edges; digitize is 1-based, hence the -1.
paw_bin_edges = np.arange(0, 100, 10)
df['paw_bins'] = np.digitize(df['Pawpularity'], paw_bin_edges, right=False) - 1

# Weight of a row = inverse relative frequency of its bin.
paw_bin_frequency = df['paw_bins'].value_counts(normalize = True).sort_index()
df['weights'] = df['paw_bins'].map(paw_bin_frequency**(-1.))

In [31]:
# dict_bin = {i: {'index': None, 'query': None} for i in range(10)}
# for bin in df['paw_bins'].unique():
#   dict_bin[bin]['query'] = df[df['paw_bins'] == bin].sample(10)
#   temp_index = []
#   for j in set(df['paw_bins'].unique()) - set([bin]):
#     temp_index.append(df[df['paw_bins'] == j].sample(1))
#   dict_bin[bin]['index'] = pd.concat(temp_index)

In [None]:
# from torch.nn import MarginRankingLoss

# MarginRankingLoss(margin = nn.Parameter(torch.tensor(0.1)))

MarginRankingLoss()

In [None]:
# index 10*10 items from 0 to 100
# query 10*10 items from 0 to 100

In [None]:
# Cross-validated training: one model, one W&B run and one prediction CSV per fold.
for fold, (train_idx, val_idx) in enumerate(skf.split(df["Id"], df["Pawpularity_class"])):
  print(f"{'*'*100}\n{'*'*45} fold: {fold} {'*'*46}\n{'*'*100}")

  #model, data
  train_df, val_df = df.iloc[train_idx], df.iloc[val_idx]
  # Pass a plain array: converting a pandas Series whose index does not start
  # at 0 to a tensor can fail with KeyError: 0 (positional access is
  # interpreted as a label lookup).
  sampler = WeightedRandomSampler(
      weights=train_df['weights'].to_numpy(),
      num_samples=len(train_df),
      )
  datamodule = CustomDataModule(cfg, train_df, val_df, sampler)
  model = CustomModel(cfg)

  #callbacks
  early_stopping = EarlyStopping(monitor="val_rmse", patience = 3)
  lr_monitor = callbacks.LearningRateMonitor()
  csv_writer = CsvWritter(cfg, fold)
  wandb_writter = WandbWritter(cfg, val_df)
  # NOTE(review): every fold uses the same checkpoint filename; later cells
  # load "<name>.ckpt", "<name>-v1.ckpt", ... so they appear to rely on
  # Lightning's automatic version suffix — confirm before renaming.
  loss_checkpoint = callbacks.ModelCheckpoint(
      dirpath = os.path.join(cfg.logger.save_dir, cfg.model.alias_name),
      filename=f"{cfg.model.alias_name}",
      monitor="val_rmse",
      save_top_k=1,
      mode="min",
      save_last=False,
      )

  wandb_logger = WandbLogger(
      log_model = cfg.logger.log_model,
      reinit = True,
      )
  #define metrics to watch for min value
  wandb.init(
      name = f"fold_{fold}_{cfg.model.alias_name}",
      project = cfg.logger.project,
      group = f"{cfg.model.alias_name}",
      )
  wandb.define_metric("val_rmse", summary="min")
  wandb.define_metric("train_rmse", summary="min")

  #trainer
  trainer = pl.Trainer(
      max_epochs=cfg.train_args.epoch,
      logger = wandb_logger,
      callbacks=[
            lr_monitor,
            loss_checkpoint,
            early_stopping,
            csv_writer,
            wandb_writter
            ],
      #deterministic=True,
      **cfg.trainer,
      )

  trainer.fit(model, datamodule=datamodule)
  # NOTE(review): this predicts with the in-memory model as left by fit(),
  # not with the best checkpoint — confirm that this is intended.
  trainer.predict(model, datamodule=datamodule)
  #wandb.finish()
wandb.finish()

In [None]:
# Score the additional data with each saved fold checkpoint; CsvWritter stores
# one "<fold>_sub.csv" per checkpoint.
more_data_df = pd.read_csv('/content/drive/MyDrive/PetFinder/new_data.csv')
datamodule = CustomDataModule(cfg, more_data_df, more_data_df)

for fold in range(5):
  # The first checkpoint has no suffix; the others are "-v1", "-v2", ...
  if fold == 0:
    v = ''
  else:
    v = f'-v{fold}'
  model_path = f'/content/drive/MyDrive/PetFinder/models/swin_base_aug_new_val/swin_base_aug_new_val{v}.ckpt'
  model = CustomModel.load_from_checkpoint(model_path, cfg = cfg)

  csv_writer = CsvWritter(cfg, fold)
  trainer = pl.Trainer(callbacks=[csv_writer], **cfg.trainer)
  trainer.predict(model, datamodule=datamodule)

In [None]:
# train_idx, val_idx = next(iter(skf.split(df["Id"], df["Pawpularity_class"])))
# train_df, val_df = df.iloc[train_idx], df.iloc[val_idx]
# datamodule = CustomDataModule(cfg, train_df, val_df)
# model = CustomModel(cfg)
# #init model
# model.cuda()
# batch = next(iter(datamodule.val_dataloader()))
# model(batch[0].cuda())

# trainer = pl.Trainer(
#       max_epochs=cfg.train_args.epoch,
#       **cfg.trainer,
#       )

# lr_finder = trainer.tuner.lr_find(model, min_lr = 1e-6, max_lr = 1e-2, datamodule=datamodule)

# # Results can be found in
# lr_finder.results

# # Plot with
# fig = lr_finder.plot(suggest=True)
# fig.show()

# # Pick point based on plot, or get suggestion
# new_lr = lr_finder.suggestion()


In [None]:
from tqdm import tqdm

In [None]:
# Override the training-loader settings in place for the cells below.
# NOTE(review): this mutates the shared `cfg`, so any earlier cell that is
# re-run afterwards will see these values instead of the ones in the config cell.
cfg.loader.train.shuffle = False
cfg.loader.train.batch_size = 64
cfg.loader.train.num_workers = 4

In [None]:
# Re-create the fold splits and write predictions for each fold's validation
# rows using the corresponding saved checkpoint.
for fold, (train_idx, val_idx) in enumerate(skf.split(df["Id"], df["Pawpularity_class"])):
  print(f"{'*'*100}\n{'*'*45} fold: {fold} {'*'*46}\n{'*'*100}")

  #model, data — both frames get a fresh 0..n-1 index
  train_df = df.iloc[train_idx].reset_index(drop=True)
  val_df = df.iloc[val_idx].reset_index(drop=True)
  datamodule = CustomDataModule(cfg, train_df, val_df)
  csv_writer = CsvWritter(cfg, fold)

  # The first checkpoint has no suffix; the others are "-v1", "-v2", ...
  if fold == 0:
    v = ''
  else:
    v = f'-v{fold}'
  model_path = f'/content/drive/MyDrive/PetFinder/models/swin_base_aug_new_val/swin_base_aug_new_val{v}.ckpt'
  model = CustomModel.load_from_checkpoint(model_path, cfg = cfg)

  trainer = pl.Trainer(
      max_epochs=cfg.train_args.epoch,
      callbacks=[csv_writer],
      #deterministic=True,
      **cfg.trainer,
      )
  trainer.predict(model, datamodule=datamodule)

In [None]:
# Load the per-fold prediction files and bin both the predictions and the
# true scores with the same 0, 10, ..., 90 edges.
subs = []
for i in range(5):
  sub = pd.read_csv(f'{i}_sub.csv')
  for source_col, bin_col in (('predictions', 'pred_bin'), ('Pawpularity', 'paw_bin')):
    sub[bin_col] = np.digitize(sub[source_col], np.arange(0, 100, 10))
  subs.append(sub)

In [None]:
np.arange(0, 100, 10)

array([ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90])

In [None]:
# Sanity check of the binning: 21, 31 and 41 fall into bins 3, 4 and 5.
# Scratch note kept from the original cell, where it was bare text and made
# the cell a SyntaxError: ">20 <40"
np.digitize(np.array([21, 31, 41]), np.arange(0, 100, 10))

array([3, 4, 5])

In [None]:
from sklearn.metrics import classification_report

# Per-fold classification report on the binned true vs. predicted scores.
for i in range(5):
  print(f"fold: {i}")
  fold_sub = subs[i]
  y_true, y_pred = fold_sub['paw_bin'], fold_sub['pred_bin']
  print(classification_report(y_true, y_pred))

In [None]:
def add_data(df):
    """Add group-level aggregates of ``pred_paw`` to ``df`` (in place) and return it.

    Expects the columns ``pred_paw`` (raw model output), ``top_preds`` and
    ``class``. Adds:
      * ``pred_paw_group``   - bin of ``100 * sigmoid(pred_paw)`` over the
        edges 0, 20, 40, 60, 80, 101
      * ``pred_paw_grouped`` - mean ``pred_paw`` within that bin
      * ``pred_paw_breed``   - mean ``pred_paw`` per ``top_preds`` value
      * ``pred_paw_class``   - mean ``pred_paw`` per ``class`` value
    """
    # The logistic is computed inline: the notebook's `sigmoid` helper is
    # defined in a later cell, so calling it here raised NameError on a
    # fresh Restart & Run All.
    pred_paw_percent = 100 * (1 / (1 + np.exp(-df['pred_paw'])))
    df['pred_paw_group'] = np.digitize(pred_paw_percent, np.array([0, 20, 40, 60, 80, 101]))
    df['pred_paw_grouped'] = df['pred_paw_group'].map(df.groupby(['pred_paw_group'])['pred_paw'].mean())

    df['pred_paw_breed'] = df['top_preds'].map(df.groupby(['top_preds'])['pred_paw'].mean())
    df['pred_paw_class'] = df['class'].map(df.groupby(['class'])['pred_paw'].mean())
    return df

scores_bare = []
scores_grouped = []

# Enrich the per-fold train / val prediction files with the group aggregates
# and write them back to the same paths.
for i in range(5):
  df_train = pd.read_csv(f'{i}_sub_train.csv')
  df_val = pd.read_csv(f'{i}_sub_val.csv')

  # index=False: without it every run adds an "Unnamed: 0" column to the
  # file, which the next read_csv turns into a spurious feature.
  add_data(df_train).to_csv(f'{i}_sub_train.csv', index=False)
  add_data(df_val).to_csv(f'{i}_sub_val.csv', index=False)

In [None]:
from sklearn.metrics import mean_squared_error

def rmse(true, pred):
  """Root mean squared error between ``true`` and ``pred``."""
  residual = pred - true
  mean_squared = np.mean(residual ** 2)
  return np.sqrt(mean_squared)

def sigmoid(x):
  """Logistic function ``1 / (1 + exp(-x))``, applied element-wise."""
  denominator = 1 + np.exp(-x)
  return 1 / denominator

In [None]:
!pip install pytorch-tabnet

In [None]:
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error 
from sklearn.ensemble import RandomForestRegressor
from scipy.special import softmax
#from pytorch_tabnet.tab_model import TabNetRegressor
import joblib

In [None]:
# Columns dropped from the per-fold frames before they are used as model features.
x_columns_to_drop = ['Pawpularity_class', 'path', 'Pawpularity', 'Id']
# Target column(s) the regressor is fitted on.
y_columns = ['Pawpularity']

In [None]:
def _prepare_stacking_frame(frame, n_breed_features=187):
  """Return a copy of ``frame`` with raw model outputs converted for stacking.

  ``pred_paw`` becomes ``100 * sigmoid(pred_paw)``; the ``feature_0`` ..
  ``feature_<n-1>`` columns and the ``cat`` / ``dog`` pair are each replaced
  by their row-wise softmax.
  """
  frame = frame.copy()
  breed_cols = [f'feature_{k}' for k in range(n_breed_features)]
  frame['pred_paw'] = 100 * sigmoid(frame['pred_paw'])
  frame[breed_cols] = softmax(frame[breed_cols].to_numpy(), axis=-1)
  frame[['cat', 'dog']] = softmax(frame[['cat', 'dog']].to_numpy(), axis=-1)
  return frame


scores_gbm = []

# Fit one small LightGBM regressor per fold on the stacked features and
# record its validation RMSE.
for i in range(5):
  gbm = LGBMRegressor(max_depth=3, n_estimators=10, learning_rate= 0.1)
  df_train = _prepare_stacking_frame(pd.read_csv(f'/content/{i}_sub_train.csv'))
  df_val = _prepare_stacking_frame(pd.read_csv(f'/content/{i}_sub_val.csv'))

  # Pass a 1-D target: an (n, 1) DataFrame triggers a column-vector warning.
  gbm.fit(
      df_train.drop(columns = x_columns_to_drop),
      df_train[y_columns].to_numpy().ravel(),
  )
  val_pred = gbm.predict(df_val.drop(columns = x_columns_to_drop))
  rmse_score = rmse(df_val['Pawpularity'], val_pred)

  scores_gbm.append(rmse_score)
  # save model
  #joblib.dump(gbm, f'/content/drive/MyDrive/PetFinder/models/lgbm/lgb_{i}.pkl')

#scores_gbm = np.array(scores_gbm).mean()

In [None]:
df_val

In [None]:
np.array(scores_gbm).mean()

17.909193240484328

In [None]:
np.array(scores_gbm).mean()

17.721596729255783

In [None]:
np.array(scores_gbm).mean()

17.748799584716398

In [None]:
scores_gbm

18.09050279357285

In [None]:
scores_gbm

17.781984339757166

In [None]:
scores_nn

17.863203038121718