# Fitting Description

## Mount Google Drive

In [1]:
# Mount Google Drive at /content/drive so the project folder used by the
# following cells is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Make the project folder the working directory; later cells rely on
# cwd-relative paths such as requirements.txt and './data/...'.
cd /content/drive/MyDrive/digital_breakthrough/task_3

/content/drive/MyDrive/digital_breakthrough/task_3


In [3]:
!cat requirements.txt
!pip install -r requirements.txt

albumentations==0.5.2
tensorflow==2.5.0
numpy>=1.19.5
torch>=1.7.1
pandas>=1.2.4
torchvision>=0.8.2
opencv-python
PyYAML
tqdm==4.56.0
scikit-image
scikit-learn
scipy
matplotlib
python-json-logger>=0.1.11
jupyterlab
seaborn
grad-cam
ttach
transformers
omegaconfCollecting albumentations==0.5.2
  Downloading albumentations-0.5.2-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 759 kB/s 
Collecting pandas>=1.2.4
  Downloading pandas-1.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.5 MB)
[K     |████████████████████████████████| 11.5 MB 8.8 MB/s 
Collecting tqdm==4.56.0
  Downloading tqdm-4.56.0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 987 kB/s 
Collecting python-json-logger>=0.1.11
  Downloading python_json_logger-2.0.2-py3-none-any.whl (7.4 kB)
Collecting jupyterlab
  Downloading jupyterlab-3.1.1-py3-none-any.whl (8.6 MB)
[K     |████████████████████████████████| 8.6 MB 23.2 MB/s 
Collecting grad-cam
  Down

In [1]:
# Working directory must be the project root again (execution counter restarts
# at 1 here): the next cells do sys.path.append('.') and read './data/...'.
cd /content/drive/MyDrive/digital_breakthrough/task_3

/content/drive/MyDrive/digital_breakthrough/task_3


## Load Data

In [2]:
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
sys.path.append('.')
from definitions import ROOT_DIR
from airotica.utils import read_image

In [3]:
# Folder layout, all relative to the project's ROOT_DIR.
DATA_PATH = ROOT_DIR / 'data'  # CSV tables live here
TRAIN_IMAGES = DATA_PATH / 'images'  # image files; file name stem is matched against `guid` below
DOWNLOADED_TRAIN_IMAGES = DATA_PATH / 'downloaded_images'  # presumably images fetched by URL — TODO confirm

In [4]:
# Load every CSV table used by the notebook from DATA_PATH.
train = pd.read_csv(DATA_PATH / 'train.csv')  # used below via its guid / typology columns
train_url_only = pd.read_csv(DATA_PATH / 'train_url_only.csv')
# Has a `url` column (dropped later) in addition to `typology`.
train_url_loaded_images = pd.read_csv(DATA_PATH / 'train_loaded_images.csv')
test = pd.read_csv(DATA_PATH / 'test.csv')  # used below via its guid / description columns
sample_submission = pd.read_csv(DATA_PATH / 'sample_submission.csv')

In [5]:
import os
from os import listdir
# List the image folder and keep the guid (file name up to the first dot)
# of every file that is not zero bytes long.
train_images = listdir(TRAIN_IMAGES)
guid_train_images = [
    file_name.split('.')[0]
    for file_name in train_images
    if os.path.getsize(str(TRAIN_IMAGES / file_name)) != 0
]

In [6]:
# Partition the test rows by what is available for each of them:
# a non-empty image file on disk and/or a text description.
has_image = test.guid.isin(guid_train_images)
dummy = test[has_image]
test_only_description = test[~has_image]

no_description = dummy.description.isna()
test_only_images = dummy[no_description]
test_images_and_description = dummy[~no_description]

# Rows with neither an image nor a description would be unusable; report them first.
print('without image or description:', len(test_only_description[test_only_description.description.isna()]))
print('only description:', test_only_description.shape[0])
print('only images:', test_only_images.shape[0])
print('images and description:', test_images_and_description.shape[0])

without image or description: 0
only description: 106
only images: 547
images and description: 570


## Fit Images

### Prepared Data

In [7]:
# One typology value occurs with a trailing space; map it onto the clean
# spelling so both variants end up as the same class.
typology_fixes = {
    'предметы прикладного искусства, быта и этнографии ': 'предметы прикладного искусства, быта и этнографии',
}
train_url_loaded_images['typology'] = train_url_loaded_images['typology'].replace(typology_fixes)

In [8]:
# Give every typology found in `train` an integer id, numbered in the
# sorted order of the typology names.
train_labels = train.typology.unique()
typology_to_label = {name: idx for idx, name in enumerate(sorted(train_labels))}

In [9]:
# Same numbering scheme for the URL-loaded table, ignoring rows without a typology.
train_labels_url = train_url_loaded_images.typology.dropna().unique()
typology_to_label_url = {name: idx for idx, name in enumerate(sorted(train_labels_url))}

In [10]:
# Keep only the URL-loaded rows whose typology also exists in `train`,
# i.e. the ones that can be mapped through typology_to_label.
known_typology = train_url_loaded_images.typology.isin(list(typology_to_label))
train_url_only_train_labels = train_url_loaded_images[known_typology]
print(len(train_url_only_train_labels))

175076


In [11]:
# Build the combined image-training table.
#
# `add` marks where a row came from: 0 = rows of `train` whose image file is
# present in TRAIN_IMAGES, 1 = rows taken from `train_url_loaded_images`.
# .assign() returns new frames, so nothing is written into a filtered slice of
# the source tables (that write is what raised SettingWithCopyWarning).
dummy_train = train[train.guid.isin(guid_train_images)].assign(add=0)
train_url_only_train_labels = train_url_only_train_labels.assign(add=1)

# The `url` column exists only in the URL-loaded table; drop it before stacking.
full_train = pd.concat((dummy_train, train_url_only_train_labels.drop('url', axis=1)), axis=0)

# Rows without a typology cannot be labelled. Take an explicit copy so the
# new `label` column is added to an independent frame.
full_train = full_train[~(full_train.typology.isna())].copy()
full_train['label'] = full_train['typology'].map(typology_to_label)
full_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0,guid,description,typology,add,label
0,c84c547b-c5c5-45cf-9199-736df1301124,Монета. Екатерина II. Две копейки. 1789 г.,предметы нумизматики,0,7
1,af6fb03f-3d31-484f-ba9d-51e7b4ef55b9,Владельческий конволют. Собрание сочинений. / ...,редкие книги,0,12
2,5ad52d30-8239-4b41-bd56-da99ab5a555b,"Медаль ВДНХ ""За успехи в народном хозяйстве СС...",предметы нумизматики,0,7
3,6ce2ecbe-80d2-45d1-8a4c-7599950a7792,Монета. Михаил Федорович. Копейка,предметы нумизматики,0,7
4,73a48c2d-12c3-40da-b071-f9abd5bed64b,Временное удостоверение №12849 Охотина Р.А. о ...,документы,0,1
...,...,...,...,...,...
175119,0313dde2-b5bd-4a1a-b7ef-63436ad975ad,Фото ч/б матов. групповое с фигурными краями ...,фотографии и негативы,1,14
175120,669ecd91-4197-4734-a9a9-7cb79d4af9d3,Открытое письмо «Петродворец. Вид на Аллею фо...,предметы печатной продукции,1,8
175121,07ca9b7c-efea-49e3-8db8-e69e9a079bb4,Фото черно-белое. Художник Алексей Як. Кольцов...,фотографии и негативы,1,14
175122,c0df1cb4-1ceb-47ea-873e-6d96fc5a1a95,Грампластинка.,прочие,1,11


In [12]:
# Persist the combined table (without the index) so the model section can
# reload it from disk.
# NOTE(review): the path is relative to the current working directory, unlike
# the other tables which are addressed through DATA_PATH — confirm cwd is the project root.
full_train.to_csv('./data/full_images_data.csv', index=False)

### Model

In [56]:
import os
import sys
import torch
import yaml
import logging
import numpy as np
import random
from tqdm import tqdm
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

sys.path.append('.')
from definitions import ROOT_DIR
from airotica.utils import save_checkpoint, convert_dict_to_tuple
from airotica.models import load_model
from airotica.loss_function import get_loss
from airotica.optimizers import (
    get_optimizer,
    get_scheduler
)
from airotica.dataset import get_data_loaders
from airotica.train import train as train_model, validation

# Render matplotlib figures inline and auto-reload edited project modules
# before each cell runs, so changes under airotica/ are picked up without a restart.
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Module-level logger for this notebook.
log = logging.getLogger(__name__)

{"asctime": "2021-07-31 14:15:43", "name": "matplotlib.pyplot", "filename": "pyplot.py", "levelname": "DEBUG", "message": "Loaded backend module://ipykernel.pylab.backend_inline version unknown."}
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
# Paths for the model section (DATA_PATH repeats the definition from the data section).
DATA_PATH = ROOT_DIR / 'data'
CONFIG_PATH = ROOT_DIR / 'config.yml'  # YAML experiment config, parsed in the next cell

In [58]:
# Parse the YAML config and wrap it with the project helper; the result is
# read with attribute access (config.dataset.seed, config.train.n_epoch, ...).
with open(CONFIG_PATH) as f:
    data = yaml.safe_load(f)
config = convert_dict_to_tuple(dictionary=data)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device_name = 'cuda'
else:
    device_name = 'cpu'
device = torch.device(device_name)
print(f'device: {device_name}')

device: cuda


In [59]:
# Seed every RNG in play (PyTorch, NumPy, Python) from the config value so
# repeated runs draw the same random numbers.
seed = config.dataset.seed
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Reproducible cuDNN needs deterministic kernels AND the autotuner switched
# off: with benchmark=True cuDNN may select a different algorithm from run to
# run, which defeats the deterministic flag set on the line above.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# os.environ only accepts strings, so coerce in case the YAML value is an int.
# NOTE(review): torch.cuda.is_available() has already been called in an earlier
# cell, so setting this here may be too late to change the visible GPUs — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = str(config.cuda_id)

In [60]:
# Build everything the training loop needs from the config: network, train and
# validation loss, optimizer, LR scheduler and the epoch progress bar.
print("Loading model...")
net = load_model(config, 
                 device=device_name)
print("Done.")
criterion, criterion_val = get_loss(config, device=device_name)
optimizer = get_optimizer(config, 'SGD', net)

# NOTE(review): n_epoch is not read anywhere in this cell — the progress bar
# below takes its length from config.train.n_epoch instead. Confirm which is intended.
n_epoch = 20
# The scheduler wraps the optimizer, so it has to be created after it.
scheduler = get_scheduler(config, optimizer)
# Iterated by the training-loop cell; one tick per epoch.
train_epoch = tqdm(range(config.train.n_epoch),
                   dynamic_ncols=True,
                   desc='Epochs',
                   position=0)

Loading model...
{"asctime": "2021-07-31 14:15:46", "name": "airotica.models", "filename": "models.py", "levelname": "INFO", "message": "ResNext101_32x8d"}
Done.
{"asctime": "2021-07-31 14:15:48", "name": "airotica.optimizers", "filename": "optimizers.py", "levelname": "INFO", "message": "0.002"}
{"asctime": "2021-07-31 14:15:48", "name": "airotica.optimizers", "filename": "optimizers.py", "levelname": "INFO", "message": "Opt: SGD"}


Epochs:   0%|          | 0/21 [00:00<?, ?it/s]

### Data

In [61]:
# Reload the combined table written by the data-preparation section.
# NOTE(review): cwd-relative path; it only resolves when the working directory
# is the project root — confirm.
trained = pd.read_csv('./data/full_images_data.csv')

In [62]:
# Inverse lookup: integer label id -> typology name.
label_to_typology = {label: typology for typology, label in typology_to_label.items()}

In [63]:
# Expose one DataFrame per class as the module-level names label_0 ... label_14
# and print "<row count> <typology> <label id>" for each.
for i in range(15):
    rows_for_label = trained[trained['label'] == i]
    globals()[f'label_{i}'] = rows_for_label
    print(rows_for_label.shape[0], label_to_typology[i], i)

14027 графика 0
24186 документы 1
3365 живопись 2
695 оружие 3
22153 предметы археологии 4
4074 предметы естественнонаучной коллекции 5
1431 предметы минералогической коллекции 6
21596 предметы нумизматики 7
19836 предметы печатной продукции 8
21262 предметы прикладного искусства, быта и этнографии 9
1532 предметы техники 10
8244 прочие 11
7746 редкие книги 12
505 скульптура 13
28772 фотографии и негативы 14


In [64]:
# Build a roughly class-balanced training subset straight from `trained`,
# instead of spelling out fifteen label_<i> names that only exist because an
# earlier cell injected them through globals().
n = 1000  # per-class cap (first n rows in file order)

# Classes 3, 6, 10 and 13 are taken in full, exactly as before; every other
# class contributes at most its first `n` rows.
uncapped_labels = {3, 6, 10, 13}

# Frames are concatenated in label order 0..14, matching the previous layout.
full = pd.concat([
    trained[trained['label'] == label] if label in uncapped_labels
    else trained[trained['label'] == label][:n]
    for label in range(15)
])

In [80]:
# Guids excluded from training. The list is hand-maintained;
# presumably these are images that failed to load — TODO confirm.
# (A repeated 'b8165817-...' entry was removed; membership is unchanged.)
failed_guid = ['5c989b07-c6a2-4c7b-bd48-e6f9bca7eb27',
               'cbd73791-e011-40e1-8cc7-f72a745bcebb',
               'bebd7545-b84c-4af1-84bd-425e2b6313a9',
               'baa188a2-d630-489f-b92e-6cfaba762b0b',
               '4c39c840-cf9f-4363-82bf-74ea084c4f0d',
               '94c866c7-6230-409f-a5f9-b50f976ab62f',
               '46ecbdac-521d-4967-8d04-78e1811d3403',
               '60cacca9-e773-4932-83ca-e4d61bf5c134',
               '76975b28-9dfd-4cb6-bb54-c10a71b886a8',
               'b8165817-1662-4c17-ab53-fd0525537140',
               '99d35800-949b-445f-a018-d2e60b67d9c6',
               '852f75be-19bf-4148-932c-1637facbe8bc',
               'a733aa01-0e72-4a69-9045-6d262110c766',
               '78d9b613-9ced-4c65-9cb0-6b2d4dc8006c',
               '26e283c4-5919-4586-912c-3a45bfe290da']

# Drop the excluded rows and renumber the index from 0.
full_train_2 = full[~(full.guid.isin(failed_guid))].reset_index(drop=True)

In [81]:
# Hold out 10% of the rows for validation with a fixed random_state.
# NOTE(review): the split is not stratified by `label` although class sizes
# differ, and the seed is hard-coded rather than taken from config — confirm both are intended.
X_train, X_valid = train_test_split(full_train_2, test_size=0.1, random_state=42)
print(f'train: {X_train.shape[0]}, valid: {X_valid.shape[0]}')

train: 13638, valid: 1516


In [82]:
# Create the two loaders via the project helper: `dt` is passed to
# train_model() and `dv` to validation() in the training loop.
# Both image folders are handed over; presumably the helper picks the folder
# per row — TODO confirm against airotica.dataset.
dt, dv = get_data_loaders(train_data=X_train, 
                          valid_data=X_valid, 
                          config=config, 
                          img_path=TRAIN_IMAGES, 
                          download_img_path=DOWNLOADED_TRAIN_IMAGES, 
                          use_data=True)

{"asctime": "2021-07-31 14:26:24", "name": "airotica.dataset", "filename": "dataset.py", "levelname": "INFO", "message": "Preparing train reader..."}
{"asctime": "2021-07-31 14:26:24", "name": "airotica.dataset", "filename": "dataset.py", "levelname": "INFO", "message": "Done."}
{"asctime": "2021-07-31 14:26:24", "name": "airotica.dataset", "filename": "dataset.py", "levelname": "INFO", "message": "here"}
{"asctime": "2021-07-31 14:26:24", "name": "airotica.dataset", "filename": "dataset.py", "levelname": "INFO", "message": "Preparing valid reader..."}
{"asctime": "2021-07-31 14:26:24", "name": "airotica.dataset", "filename": "dataset.py", "levelname": "INFO", "message": "Done."}


  cpuset_checked))


In [83]:
# Checkpoint directory: <ROOT_DIR>/<config.outdir>/resnext152.
# NOTE(review): the folder is called 'resnext152' while the model log line
# reports ResNext101_32x8d — confirm the name is intentional.
out_dir = str(ROOT_DIR / os.path.join(config.outdir, 'resnext152'))
print("Savedir: {}".format(out_dir))
# exist_ok=True makes re-running the cell safe and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(out_dir, exist_ok=True)

Savedir: /content/drive/MyDrive/digital_breakthrough/task_3/EXPERIMENTS/resnext152


In [84]:
# Train for the configured number of epochs, keeping only checkpoints that
# improve on the best validation score seen so far.
f1_best = 0
for epoch in train_epoch:
    # One pass over the training loader.
    train_model(net, dt, criterion, optimizer, config, epoch)
    # Score on the held-out loader; the return value is treated as the metric to maximise.
    f1_val = validation(net, dv, criterion_val, epoch)
    if f1_val > f1_best:
        # Strict improvement only: ties do not overwrite the saved checkpoint.
        save_checkpoint(net, optimizer, scheduler, epoch, out_dir, f1_val)
        f1_best = f1_val
    # Advance the LR schedule once per epoch, after training and validation.
    scheduler.step()


Train:  71%|███████   | 201/284 [03:42<01:24,  1.02s/it][A
Train:  71%|███████   | 202/284 [03:43<01:23,  1.02s/it][A
Train:  71%|███████▏  | 203/284 [03:44<01:22,  1.02s/it][A
Train:  72%|███████▏  | 204/284 [03:45<01:21,  1.02s/it][A
Train:  72%|███████▏  | 205/284 [03:48<01:27,  1.11s/it]


KeyboardInterrupt: ignored