# Fitting Description

## Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/digital_breakthrough/task_3

/content/drive/MyDrive/digital_breakthrough/task_3


In [3]:
!cat requirements.txt
!pip install -r requirements.txt

albumentations==0.5.2
tensorflow==2.5.0
numpy>=1.19.5
torch>=1.7.1
pandas==1.1.5
torchvision>=0.8.2
opencv-python
PyYAML
tqdm==4.56.0
scikit-image
scikit-learn
scipy
matplotlib
python-json-logger>=0.1.11
jupyterlab
seaborn
#grad-cam
#ttach
transformers
omegaconfCollecting albumentations==0.5.2
  Downloading albumentations-0.5.2-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 621 kB/s 
Collecting tqdm==4.56.0
  Downloading tqdm-4.56.0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 963 kB/s 
Collecting python-json-logger>=0.1.11
  Downloading python_json_logger-2.0.2-py3-none-any.whl (7.4 kB)
Collecting jupyterlab
  Downloading jupyterlab-3.1.2-py3-none-any.whl (8.6 MB)
[K     |████████████████████████████████| 8.6 MB 58.7 MB/s 
Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 66.8 MB/s 
[?25hCollecting omegaconf
  Downloading omegaconf-2.1.0-py3-

In [3]:
cd /content/drive/MyDrive/digital_breakthrough/task_3

/content/drive/MyDrive/digital_breakthrough/task_3


## Load Data

In [4]:
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
sys.path.append('.')
from definitions import ROOT_DIR
from airotica.utils import read_image

In [5]:
DATA_PATH = ROOT_DIR / 'data'
TRAIN_IMAGES = DATA_PATH / 'images'
DOWNLOADED_TRAIN_IMAGES = DATA_PATH / 'downloaded_images'

In [6]:
train = pd.read_csv(DATA_PATH / 'train.csv')
train_url_only = pd.read_csv(DATA_PATH / 'train_url_only.csv')
train_url_loaded_images = pd.read_csv(DATA_PATH / 'train_loaded_images.csv')
test = pd.read_csv(DATA_PATH / 'test.csv')
sample_submission = pd.read_csv(DATA_PATH / 'sample_submission.csv')

In [7]:
import os
from os import listdir
train_images = listdir(TRAIN_IMAGES)
guid_train_images = [f.split('.')[0] for f in train_images if os.path.getsize(f"{str(TRAIN_IMAGES / f)}") != 0]

In [8]:
dummy = test[test.guid.isin(guid_train_images)]
test_only_description = test[~(test.guid.isin(guid_train_images))]
test_only_images = dummy[dummy.description.isna()]
test_images_and_description = dummy[~(dummy.description.isna())]
print('without image or description:', len(test_only_description[test_only_description.description.isna()]))
print('only description:', test_only_description.shape[0])
print('only images:', test_only_images.shape[0])
print('images and description:', test_images_and_description.shape[0])

without image or description: 0
only description: 106
only images: 547
images and description: 570


 ## FitImages

### Prepared Data

In [9]:
train_url_loaded_images['typology'] = train_url_loaded_images.typology.replace(
    {'предметы прикладного искусства, быта и этнографии ': 'предметы прикладного искусства, быта и этнографии'}
    )

In [10]:
train_labels = train.typology.unique()
typology_to_label = dict(zip(sorted(train_labels), range(len(train_labels))))

In [11]:
train_labels_url = train_url_loaded_images[~(train_url_loaded_images.typology.isna())].typology.unique()
typology_to_label_url = dict(zip(sorted(train_labels_url), range(len(train_labels_url))))

In [12]:
train_url_only_train_labels = train_url_loaded_images[train_url_loaded_images.typology.isin(typology_to_label.keys())]
print(len(train_url_only_train_labels))

175076


In [13]:
dummy_train = train[train.guid.isin(guid_train_images)]
dummy_train.loc[:, 'add'] = 0
train_url_only_train_labels.loc[:, 'add'] = 1
full_train = pd.concat((dummy_train, train_url_only_train_labels.drop('url', axis=1)), axis=0)
full_train = full_train[~(full_train.typology.isna())]
full_train['label'] = full_train['typology'].map(typology_to_label)
full_train = full_train[full_train.typology != 'прочие']
full_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0,guid,description,typology,add,label
0,c84c547b-c5c5-45cf-9199-736df1301124,Монета. Екатерина II. Две копейки. 1789 г.,предметы нумизматики,0,7
1,af6fb03f-3d31-484f-ba9d-51e7b4ef55b9,Владельческий конволют. Собрание сочинений. / ...,редкие книги,0,12
2,5ad52d30-8239-4b41-bd56-da99ab5a555b,"Медаль ВДНХ ""За успехи в народном хозяйстве СС...",предметы нумизматики,0,7
3,6ce2ecbe-80d2-45d1-8a4c-7599950a7792,Монета. Михаил Федорович. Копейка,предметы нумизматики,0,7
4,73a48c2d-12c3-40da-b071-f9abd5bed64b,Временное удостоверение №12849 Охотина Р.А. о ...,документы,0,1
...,...,...,...,...,...
175117,949efab1-d485-416d-bc60-354e5488ec38,Материалы военкомата. Список именной от 20.12...,документы,1,1
175118,4924f819-1860-411e-bf1d-19a041e9bc1b,"Литография. "" Сингала месджидъ въ Баку"". "" ""К...",предметы печатной продукции,1,8
175119,0313dde2-b5bd-4a1a-b7ef-63436ad975ad,Фото ч/б матов. групповое с фигурными краями ...,фотографии и негативы,1,14
175120,669ecd91-4197-4734-a9a9-7cb79d4af9d3,Открытое письмо «Петродворец. Вид на Аллею фо...,предметы печатной продукции,1,8


In [None]:
full_train.to_csv('./data/full_images_data_2.csv', index=False)

### Model

In [14]:
import os
import sys
import torch
import yaml
import logging
import numpy as np
import random
from tqdm import tqdm
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

sys.path.append('.')
from definitions import ROOT_DIR
from airotica.utils import save_checkpoint, convert_dict_to_tuple
from airotica.models import load_model
from airotica.loss_function import get_loss
from airotica.optimizers import (
    get_optimizer,
    get_scheduler
)
from airotica.dataset import get_data_loaders
from airotica.train import train as train_model, validation

%matplotlib inline
%load_ext autoreload
%autoreload 2

log = logging.getLogger(__name__)

{"asctime": "2021-08-04 18:16:37", "name": "matplotlib.pyplot", "filename": "pyplot.py", "levelname": "DEBUG", "message": "Loaded backend module://ipykernel.pylab.backend_inline version unknown."}


In [15]:
DATA_PATH = ROOT_DIR / 'data'
CONFIG_PATH = ROOT_DIR / 'config5.yml'

In [16]:
with open(CONFIG_PATH) as f:
    data = yaml.safe_load(f)
config = convert_dict_to_tuple(dictionary=data)

device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f'device: {device_name}')

device: cuda


In [17]:
seed = config.dataset.seed
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

os.environ['CUDA_VISIBLE_DEVICES'] = config.cuda_id

In [18]:
print("Loading model...")
net = load_model(config, 
                 device=device_name)
print("Done.")
criterion, criterion_val = get_loss(config, device=device_name)
optimizer = get_optimizer(config, 'SGD', net)

n_epoch = 20
scheduler = get_scheduler(config, optimizer)
train_epoch = tqdm(range(config.train.n_epoch),
                   dynamic_ncols=True,
                   desc='Epochs',
                   position=0)

Loading model...
{"asctime": "2021-08-04 18:16:38", "name": "airotica.models", "filename": "models.py", "levelname": "INFO", "message": "ResNext101_32x8d"}
Done.
{"asctime": "2021-08-04 18:16:42", "name": "airotica.optimizers", "filename": "optimizers.py", "levelname": "INFO", "message": "0.002"}
{"asctime": "2021-08-04 18:16:42", "name": "airotica.optimizers", "filename": "optimizers.py", "levelname": "INFO", "message": "Opt: SGD"}


Epochs:   0%|          | 0/21 [00:00<?, ?it/s]

### Data

In [19]:
trained = pd.read_csv('./data/full_images_data_2.csv')

In [20]:
label_to_typology = {v: k for k, v in typology_to_label.items()}

In [21]:
for i in range(15):
    globals()[f'label_{i}'] = trained[trained['label'] == i]
    print(globals()[f'label_{i}'].shape[0], label_to_typology[i], i)

{"asctime": "2021-08-04 18:16:43", "name": "numexpr.utils", "filename": "utils.py", "levelname": "INFO", "message": "NumExpr defaulting to 4 threads."}
14027 графика 0
24186 документы 1
3365 живопись 2
695 оружие 3
22153 предметы археологии 4
4074 предметы естественнонаучной коллекции 5
1431 предметы минералогической коллекции 6
21596 предметы нумизматики 7
19836 предметы печатной продукции 8
21262 предметы прикладного искусства, быта и этнографии 9
1532 предметы техники 10
0 прочие 11
7746 редкие книги 12
505 скульптура 13
28772 фотографии и негативы 14


In [26]:
full = pd.concat(
    [
     label_3,
     label_3,
     label_3,
     label_4,
     label_4,
     label_4,
     label_6,
     label_6,
     label_6,
     label_10,
     label_10,
     label_10,
     label_13,
     label_13,
     label_13,
     label_0,
     label_1,
     label_2,
     label_5,
     label_7,
     label_8,
     label_9,
     label_11,
     label_12,
     label_14
    ]
)

In [27]:
mapper = {0:0,
          1:1,
          2:2,
          3:3,
          4:4,
          5:5,
          6:6,
          7:7,
          8:8,
          9:9,
          10:10,
          12:11,
          13:12,
          14:13
          }

In [28]:
full['label'] = full['label'].map(mapper)

In [None]:
downloaded_train_images = listdir(DOWNLOADED_TRAIN_IMAGES)
guid_downloaded_train_images = [f.split('.')[0] for f in downloaded_train_images 
                                if os.path.getsize(f"{str(DOWNLOADED_TRAIN_IMAGES / f)}") > 100]

In [None]:
failed_guid = ['5c989b07-c6a2-4c7b-bd48-e6f9bca7eb27', 
               'cbd73791-e011-40e1-8cc7-f72a745bcebb', 
               'bebd7545-b84c-4af1-84bd-425e2b6313a9',
               'baa188a2-d630-489f-b92e-6cfaba762b0b',
               '4c39c840-cf9f-4363-82bf-74ea084c4f0d',
               '94c866c7-6230-409f-a5f9-b50f976ab62f',
               '46ecbdac-521d-4967-8d04-78e1811d3403',
               '60cacca9-e773-4932-83ca-e4d61bf5c134',
               '76975b28-9dfd-4cb6-bb54-c10a71b886a8',
               'b8165817-1662-4c17-ab53-fd0525537140',
               '99d35800-949b-445f-a018-d2e60b67d9c6',
               '852f75be-19bf-4148-932c-1637facbe8bc',
               'b8165817-1662-4c17-ab53-fd0525537140',
               'a733aa01-0e72-4a69-9045-6d262110c766',
               '78d9b613-9ced-4c65-9cb0-6b2d4dc8006c',
               '26e283c4-5919-4586-912c-3a45bfe290da',
               '1e7d2f58-d625-415c-ab3e-68b13ac46649',
               '73480476-2ba5-4e2f-8a67-f37db72a3bf0',
               '5de42ac6-4652-4a01-998b-0e9a70b19485',
               'eaa4120d-9375-4215-baed-03d9eb71b1f8',
               '900f9bab-7440-4776-9974-dd1a62687609',
               '3bc9b4a8-e881-46c1-82ce-74045856f3f4',
               '56060482-90ab-4a34-816b-ff0c2d01d6f9',
               '36803a89-50a4-4856-916d-3fb0f8c2874a',
               '8eccaebb-9fef-4e89-b08f-0fbec5feada1',
               '593efb25-7798-45a8-a953-3e0160ab826d',
               'e80f1b44-3a97-424b-b27b-6c5d30934ec1',
               '8be3ede8-1da2-4d56-820a-fee889fc9b19',
               '2aaf7293-bf48-48ce-9924-5a3c8a5d5c12',
               'a8a34b22-cf29-4987-92dc-9615aa63c0d3',
               '90f467ca-061b-4bd1-9222-3ccef2b5a476',
               '9f675417-4575-43b1-8acd-affeeecf5a3d',
               '208199f2-d8a1-4bb1-ab53-ecb14829e3c7',
               '0d7cd8d8-61f5-44f7-a6e1-04a17f34eac6',
               '8363665f-118e-4be7-89e5-42d71effbdd4',
               'e8e87195-6840-4487-a4e8-e834318aae36',
               '51a9e383-813a-4c19-b32c-e3604b171972',
               '00ce78f0-fded-4dc3-949d-3c6a55e9a4c7',
               '500dc4d-371d-4bd2-a1cd-3300991cf237',
               '63c1ee48-4f17-4f63-9cce-df8f6d78bb7c',
               'c073a415-66dd-460e-84b7-8c1a31e2f21e',
               'f500dc4d-371d-4bd2-a1cd-3300991cf237',
               'd9a80d93-4c54-4327-bef3-9191ea18c4f1',
               '1e6a7d50-1472-4b42-94fe-f762832424ae',
               '25d36f15-5b50-4f82-98b4-2142dde916e9',
               'edd9f097-e93b-4b09-8725-ce96a9dace7c',
               '2cd86089-d749-4a3c-8ed2-6de7eceffd6d',
               'e5220363-2b9b-46f3-a0a8-128d47d21aa0']

full_train_2 = (
    full[~(full.guid.isin(failed_guid))]
    .reset_index(drop=True)
)
#full_train_2 = full_train_2[full.guid.isin(guid_downloaded_train_images)]

X_train, X_valid = train_test_split(full_train_2, test_size=0.1, random_state=42)
print(f'train: {X_train.shape[0]}, valid: {X_valid.shape[0]}')

dt, dv = get_data_loaders(train_data=X_train, 
                          valid_data=X_valid, 
                          config=config, 
                          img_path=TRAIN_IMAGES, 
                          download_img_path=DOWNLOADED_TRAIN_IMAGES, 
                          use_data=True)

out_dir = str(ROOT_DIR / os.path.join(config.outdir, 'full_res_2'))
print("Savedir: {}".format(out_dir))
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

f1_best = 0
for epoch in train_epoch:
    train_model(net, dt, criterion, optimizer, config, epoch)
    f1_val = validation(net, dv, criterion_val, epoch)
    if f1_val > f1_best:
        save_checkpoint(net, optimizer, scheduler, epoch, out_dir, f1_val)
        f1_best = f1_val
    scheduler.step()

train: 201384, valid: 22376
{"asctime": "2021-08-04 18:21:23", "name": "airotica.dataset", "filename": "dataset.py", "levelname": "INFO", "message": "Preparing train reader..."}
{"asctime": "2021-08-04 18:21:23", "name": "airotica.dataset", "filename": "dataset.py", "levelname": "INFO", "message": "Done."}
{"asctime": "2021-08-04 18:21:23", "name": "airotica.dataset", "filename": "dataset.py", "levelname": "INFO", "message": "here"}
{"asctime": "2021-08-04 18:21:23", "name": "airotica.dataset", "filename": "dataset.py", "levelname": "INFO", "message": "Preparing valid reader..."}
{"asctime": "2021-08-04 18:21:23", "name": "airotica.dataset", "filename": "dataset.py", "levelname": "INFO", "message": "Done."}


  cpuset_checked))

Train:   0%|          | 0/4195 [00:00<?, ?it/s][A

Savedir: /content/drive/MyDrive/digital_breakthrough/task_3/EXPERIMENTS/full_res_2



Train:   0%|          | 1/4195 [00:19<22:09:41, 19.02s/it][A
Train:   0%|          | 2/4195 [00:22<11:23:15,  9.78s/it][A
Train:   0%|          | 3/4195 [00:23<6:43:38,  5.78s/it] [A
Train:   0%|          | 4/4195 [00:24<4:32:17,  3.90s/it][A
Train:   0%|          | 5/4195 [00:25<3:20:00,  2.86s/it][A
Train:   0%|          | 6/4195 [00:26<2:36:02,  2.24s/it][A
Train:   0%|          | 7/4195 [00:27<2:08:14,  1.84s/it][A
Train:   0%|          | 8/4195 [00:28<1:50:03,  1.58s/it][A
Train:   0%|          | 9/4195 [00:29<1:37:47,  1.40s/it][A
Train:   0%|          | 10/4195 [00:30<1:29:26,  1.28s/it][A
Train:   0%|          | 11/4195 [00:31<1:23:44,  1.20s/it][A
Train:   0%|          | 12/4195 [00:32<1:20:09,  1.15s/it][A
Train:   0%|          | 13/4195 [00:33<1:17:28,  1.11s/it][A
Train:   0%|          | 14/4195 [00:34<1:15:57,  1.09s/it][A
Train:   0%|          | 15/4195 [00:35<1:14:34,  1.07s/it][A
Train:   0%|          | 16/4195 [00:36<1:13:42,  1.06s/it][A
Train:   0%| 


            Epoch: 0; step: 500; loss: 0.8543; acc: 0.74, f1: 0.72742125
            



Train:  12%|█▏        | 502/4195 [20:24<4:36:32,  4.49s/it][A
Train:  12%|█▏        | 503/4195 [20:25<3:32:15,  3.45s/it][A
Train:  12%|█▏        | 504/4195 [20:26<2:47:24,  2.72s/it][A
Train:  12%|█▏        | 505/4195 [20:34<4:26:15,  4.33s/it][A
Train:  12%|█▏        | 506/4195 [20:35<3:25:05,  3.34s/it][A
Train:  12%|█▏        | 507/4195 [20:36<2:42:12,  2.64s/it][A
Train:  12%|█▏        | 508/4195 [20:37<2:12:13,  2.15s/it][A
Train:  12%|█▏        | 509/4195 [20:38<1:51:14,  1.81s/it][A
Train:  12%|█▏        | 510/4195 [20:39<1:36:33,  1.57s/it][A
Train:  12%|█▏        | 511/4195 [20:40<1:26:17,  1.41s/it][A
Train:  12%|█▏        | 512/4195 [20:41<1:19:02,  1.29s/it][A
Train:  12%|█▏        | 513/4195 [20:42<1:13:58,  1.21s/it][A
Train:  12%|█▏        | 514/4195 [20:57<5:20:24,  5.22s/it][A
Train:  12%|█▏        | 515/4195 [21:23<11:48:04, 11.54s/it][A
Train:  12%|█▏        | 516/4195 [21:24<8:34:09,  8.39s/it] [A
Train:  12%|█▏        | 517/4195 [21:25<6:18:26,  6.


            Epoch: 0; step: 1000; loss: 0.7969; acc: 0.76, f1: 0.74980152
            



Train:  24%|██▍       | 1002/4195 [1:10:55<14:56:18, 16.84s/it][A
Train:  24%|██▍       | 1003/4195 [1:10:56<10:43:23, 12.09s/it][A
Train:  24%|██▍       | 1004/4195 [1:10:57<7:46:28,  8.77s/it] [A
Train:  24%|██▍       | 1005/4195 [1:10:58<5:42:36,  6.44s/it][A
Train:  24%|██▍       | 1006/4195 [1:10:59<4:15:55,  4.82s/it][A
Train:  24%|██▍       | 1007/4195 [1:11:00<3:15:13,  3.67s/it][A
Train:  24%|██▍       | 1008/4195 [1:11:01<2:32:49,  2.88s/it][A
Train:  24%|██▍       | 1009/4195 [1:11:02<2:03:06,  2.32s/it][A
Train:  24%|██▍       | 1010/4195 [1:11:18<5:48:57,  6.57s/it][A
Train:  24%|██▍       | 1011/4195 [1:11:19<4:20:20,  4.91s/it][A
Train:  24%|██▍       | 1012/4195 [1:11:20<3:18:20,  3.74s/it][A
Train:  24%|██▍       | 1013/4195 [1:12:32<21:23:59, 24.21s/it][A
Train:  24%|██▍       | 1014/4195 [1:12:33<15:14:36, 17.25s/it][A
Train:  24%|██▍       | 1015/4195 [1:12:34<10:56:10, 12.38s/it][A
Train:  24%|██▍       | 1016/4195 [1:12:35<7:55:28,  8.97s/it] [A
Tr

KeyboardInterrupt: ignored

In [None]:
https://drive.google.com/file/d/1g5FTydmkt5d5nNWWYeFroM3zsA6wlVaM/view?usp=sharing

In [None]:
https://github.com/pytorch/pytorch/issues/1137