In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/innopolis/train.csv
/kaggle/input/innopolis/test.csv




https://github.com/dreamquark-ai/tabnet


In [None]:
!pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.0-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m168.3 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.0
[0m

In [None]:
import warnings
# Install a global "ignore" filter: no warning of any category is shown from here on.
# NOTE(review): this also hides deprecation and numerical RuntimeWarnings - confirm
# that suppressing all of them is intended.
warnings.filterwarnings("ignore")

## Импорт модулей

In [None]:
import json
import itertools
import math
import textwrap
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetClassifier
from scipy.stats import ttest_rel
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier

from tqdm.auto import tqdm

## Настройка путей

In [None]:
# Root directory of the contest dataset as mounted inside the Kaggle kernel.
contest_dir = Path('/kaggle/input/innopolis/')
# The CSV files are read from the contest root itself; kept as a separate name so the
# data location can be changed independently of the contest directory.
data_dir = contest_dir

## Загрузка данных

In [None]:
# Raw contest tables, kept unmodified; create_features() works on copies of them.
# encoding='ascii' makes read_csv fail on any non-ASCII byte in the files.
input_train_data = pd.read_csv(data_dir / 'train.csv', encoding='ascii')
input_test_data = pd.read_csv(data_dir / 'test.csv', encoding='ascii')

## Константы

In [None]:
# Human-readable crop names; the list position is the integer class id used in the
# target column (the training loops look names up as TARGET_NAMES[target_index]).
TARGET_NAMES = [
    'подсолнечник',     # sunflower
    'картофель',        # potato
    'пшеница озимая',   # winter wheat
    'гречиха',          # buckwheat
    'кукуруза',         # corn
    'пшеница яровая',   # spring wheat
    'сахарная свекла'   # sugar beet
]

# Number of classes, derived from the name list so the two can never drift apart.
TARGET_COUNT = len(TARGET_NAMES)

# Name of the label column in the training table.
TARGET_COLUMN = 'crop'

## Генерация фичей и таргетов

In [None]:
def reproject(latitude, longitude):
    """Convert degrees to planar metres with a sinusoidal projection.

    Args:
        latitude: sequence of latitudes in degrees. It is iterated twice, so it
            must be re-iterable (list, array, Series), not a one-shot iterator.
        longitude: sequence of longitudes in degrees, paired with ``latitude``
            by position.

    Returns:
        ``(x, y)`` - two lists of metres. ``y`` has one entry per latitude;
        ``x`` has one entry per (latitude, longitude) pair.
    """
    from math import cos, pi, radians

    # Length of one degree of latitude for a mean Earth radius of 6371009 m.
    metres_per_degree = pi * 6371009 / 180.0

    y = []
    for lat in latitude:
        y.append(lat * metres_per_degree)

    x = []
    for lat, lon in zip(latitude, longitude):
        # A degree of longitude shrinks with the cosine of the latitude.
        x.append(lon * metres_per_degree * cos(radians(lat)))

    return x, y

def create_features(input_data: pd.DataFrame, is_train_data: bool) -> pd.DataFrame:
    """Build the feature table from a raw contest frame.

    The input must contain a ``.geo`` column holding JSON geometries and a set of
    ``nd_mean_<date>`` columns (the hard-coded product features below additionally
    require the 2021-05-15, 05-17, 05-26, 06-16, 07-17 and 08-01 columns).

    Added / changed columns, in order:
      * ``nd_mean.*``            - row-wise statistics of the raw series; the
        ``nonzero`` variants ignore zero entries (NaN when a row has none).
      * ``geo.*`` / ``get.point_count`` - geometry type flags and polygon / ring /
        point counts.
      * ``latitude`` / ``longitude`` (+ ``_delta`` and their ratio) - centre and
        extent of the bounding box of all polygon points.
      * ``nd_mean.eq.0`` / ``nd_mean.lt.*`` - number of raw values equal to zero /
        below each threshold.
      * every ``nd_mean_<date>`` column is replaced by its square.
      * five pairwise products of the (already squared) series columns.
      * ``nd_mean_<date>_delta`` - one-day slope, taken on the day before each
        date (except the first), of the squared series linearly interpolated
        over its non-zero entries.

    Args:
        input_data: raw frame; it is copied and never modified.
        is_train_data: currently unused; kept for interface compatibility.

    Returns:
        A new, defragmented DataFrame with the original and the derived columns.

    Raises:
        ValueError: for an unsupported geometry type, or (from numpy) when a
            geometry contains no polygon points.
    """
    data = input_data.copy()

    nd_mean_columns = sorted(x for x in data.columns if x.startswith('nd_mean_'))
    # The date is whatever follows the 8-character 'nd_mean_' prefix.
    nd_mean_dates = [pd.to_datetime(x[8:]) for x in nd_mean_columns]

    # ---- row-wise statistics of the raw series --------------------------------
    nd_mean = np.array(data[nd_mean_columns])
    data['nd_mean.min'] = np.min(nd_mean, axis=1)
    data['nd_mean.max'] = np.max(nd_mean, axis=1)
    data['nd_mean.mean'] = np.mean(nd_mean, axis=1)
    data['nd_mean.std'] = np.std(nd_mean, axis=1)
    data['nd_mean.sum'] = np.sum(nd_mean, axis=1)

    nd_mean_mask = nd_mean != 0
    data['nd_mean.nonzero.mean'] = np.mean(nd_mean, axis=1, where=nd_mean_mask)
    data['nd_mean.nonzero.std'] = np.std(nd_mean, axis=1, where=nd_mean_mask)
    data['nd_mean.nonzero.count'] = np.sum(nd_mean_mask, axis=1)

    # ---- geometry features -----------------------------------------------------
    geo_data = [json.loads(x) for x in data['.geo']]

    data['geo.is_polygon'] = [x['type'] == 'Polygon' for x in geo_data]
    data['geo.is_multi_polygon'] = [x['type'] == 'MultiPolygon' for x in geo_data]
    data['geo.is_geometry_collection'] = [x['type'] == 'GeometryCollection' for x in geo_data]

    def get_polygons(geo):
        """Return the geometry as a list of polygons (each a list of rings)."""
        if geo['type'] == 'Polygon':
            return [geo['coordinates']]
        if geo['type'] == 'MultiPolygon':
            return geo['coordinates']
        if geo['type'] == 'GeometryCollection':
            # Only the polygon members of a collection are used.
            return [x['coordinates'] for x in geo['geometries'] if x['type'] == 'Polygon']
        raise ValueError('Invalid geometry type: ' + geo['type'])

    geo_polygons = [get_polygons(x) for x in geo_data]

    data['geo.polygon_count'] = [len(x) for x in geo_polygons]
    data['geo.polygon_part_count'] = [sum(len(y) for y in x) for x in geo_polygons]

    def get_bounds(polygons):
        """Return (point_count, lat_min, lat_max, lon_min, lon_max) over all rings."""
        points = [point for polygon in polygons for part in polygon for point in part]
        lon = [point[0] for point in points]
        lat = [point[1] for point in points]
        return len(lat), np.min(lat), np.max(lat), np.min(lon), np.max(lon)

    # Flatten every geometry once and reuse the bounds for all derived columns.
    bounds = [get_bounds(x) for x in geo_polygons]

    data['get.point_count'] = [count for count, _, _, _, _ in bounds]

    data['latitude'] = [(lat_max + lat_min) / 2 for _, lat_min, lat_max, _, _ in bounds]
    data['longitude'] = [(lon_max + lon_min) / 2 for _, _, _, lon_min, lon_max in bounds]

    data['latitude_delta'] = [lat_max - lat_min for _, lat_min, lat_max, _, _ in bounds]
    data['longitude_delta'] = [lon_max - lon_min for _, _, _, lon_min, lon_max in bounds]

    data['latitude_delta / longitude_delta'] = data['latitude_delta'] / data['longitude_delta']

    # ---- threshold counts on the raw series ------------------------------------
    data['nd_mean.eq.0'] = (nd_mean == 0).sum(axis=1)
    for suffix, threshold in (('05', 0.05), ('10', 0.1), ('15', 0.15), ('20', 0.20), ('30', 0.30)):
        data[f'nd_mean.lt.{suffix}'] = (nd_mean < threshold).sum(axis=1)

    # From here on the series columns hold squared values; the product and delta
    # features below are therefore computed on the squares.
    data[nd_mean_columns] = nd_mean * nd_mean

    reference_column = 'nd_mean_2021-05-26'
    for column in ('nd_mean_2021-05-15', 'nd_mean_2021-06-16', 'nd_mean_2021-05-17',
                   'nd_mean_2021-07-17', 'nd_mean_2021-08-01'):
        data[f'{column} * {reference_column}'] = data[column] * data[reference_column]

    # ---- daily slopes of the interpolated series -------------------------------
    # Position of every observation date on a daily axis starting at the first
    # date (the sorted column names are assumed to be distinct calendar days).
    first_date = nd_mean_dates[0]
    nd_mean_date_indexes = np.array([(date - first_date).days for date in nd_mean_dates])
    date_count = int(nd_mean_date_indexes[-1]) + 1

    def get_nd_mean_delta(arr):
        """Return, per row, the one-day change ending on each date but the first."""
        day_axis = np.arange(date_count)
        delta_positions = nd_mean_date_indexes[1:] - 1
        result = []
        for row in arr:
            observed = row.nonzero()[0]  # zeros are treated as missing observations
            if observed.size == 0:
                # np.interp cannot work without sample points; like a row with a
                # single observation, a row without any has zero slope everywhere.
                result.append(np.zeros(len(delta_positions)))
                continue
            f = np.interp(day_axis, nd_mean_date_indexes[observed], row[observed])
            result.append(np.diff(f)[delta_positions])
        return np.array(result)

    nd_mean_delta_columns = [x + '_delta' for x in nd_mean_columns[1:]]
    data[nd_mean_delta_columns] = get_nd_mean_delta(data[nd_mean_columns].to_numpy())

    # Many single-column inserts fragment the frame; a copy consolidates it.
    return data.copy()


# Feature tables for both splits (the raw input frames stay untouched).
train_data = create_features(input_train_data, is_train_data=True)
test_data = create_features(input_test_data, is_train_data=False)

# One boolean one-vs-rest target column per class: target0 ... target{TARGET_COUNT - 1}.
for class_id in range(TARGET_COUNT):
    train_data[f'target{class_id}'] = train_data[TARGET_COLUMN].eq(class_id)
    
# Pairwise distances (metres, in the reprojected plane) between the bounding-box
# centres of all records. Everything here is indexed by record id, so the arrays
# must be large enough for the biggest id in either table; the size is derived
# from the data rather than hard-coded.
record_count = int(max(train_data['id'].max(), test_data['id'].max())) + 1

geo_x = np.zeros(record_count, dtype=float)
geo_y = np.zeros(record_count, dtype=float)

# Train first, then test, so a test record wins if an id occurs in both tables.
for frame in (train_data, test_data):
    frame_ids = frame['id'].to_numpy()
    x, y = reproject(frame['latitude'], frame['longitude'])
    geo_x[frame_ids] = x
    geo_y[frame_ids] = y

geo_matrix = np.empty((record_count, record_count), dtype=float)
for i in range(record_count):
    # The distance is symmetric, so filling every row already fills every column.
    d = np.sqrt((geo_x - geo_x[i]) ** 2 + (geo_y - geo_y[i]) ** 2)
    # A zero distance (the record itself, or a record with an identical centre)
    # is replaced by a huge value so that argmin never selects it as the nearest
    # neighbour.
    d[d == 0] = 1e20
    geo_matrix[i] = d

## Конфигурация

In [None]:
# Columns fed to the models. Active entries: the field 'area', a hand-picked subset
# of the nd_mean_<date> series columns (squared in create_features) and the
# bounding-box centre. Commented-out entries are features that create_features
# produces (or raw columns) but that are currently excluded from the feature set.
DEFAULT_FEATURES = [
    'area',

    'nd_mean_2021-04-15',
    'nd_mean_2021-04-16',
    'nd_mean_2021-04-18',
    'nd_mean_2021-04-19',
    #'nd_mean_2021-04-20',
    #'nd_mean_2021-04-22',
    #'nd_mean_2021-04-23',
    'nd_mean_2021-04-25',
    'nd_mean_2021-04-26',
    'nd_mean_2021-04-27',
    'nd_mean_2021-04-28',
    'nd_mean_2021-04-29',
    'nd_mean_2021-04-30',
    'nd_mean_2021-05-01',
    'nd_mean_2021-05-02',
    'nd_mean_2021-05-03',
    'nd_mean_2021-05-04',
    'nd_mean_2021-05-07',
    'nd_mean_2021-05-08',
    #'nd_mean_2021-05-09',
    'nd_mean_2021-05-10',
    'nd_mean_2021-05-15',
    'nd_mean_2021-05-16',
    'nd_mean_2021-05-17',
    'nd_mean_2021-05-19',
    'nd_mean_2021-05-20',
    'nd_mean_2021-05-21',
    'nd_mean_2021-05-24',
    'nd_mean_2021-05-26',
    'nd_mean_2021-05-27',
    'nd_mean_2021-05-29',
    'nd_mean_2021-06-02',
    'nd_mean_2021-06-03',
    'nd_mean_2021-06-04',
    'nd_mean_2021-06-05',
    'nd_mean_2021-06-06',
    'nd_mean_2021-06-07',
    'nd_mean_2021-06-09',
    'nd_mean_2021-06-10',
    'nd_mean_2021-06-12',
    'nd_mean_2021-06-13',
    'nd_mean_2021-06-16',
    'nd_mean_2021-06-18',
    'nd_mean_2021-06-19',
    'nd_mean_2021-06-20',
    #'nd_mean_2021-06-22',
    #'nd_mean_2021-06-25',
    'nd_mean_2021-06-27',
    'nd_mean_2021-06-28',
    'nd_mean_2021-07-04',
    'nd_mean_2021-07-05',
    'nd_mean_2021-07-07',
    #'nd_mean_2021-07-08',
    'nd_mean_2021-07-09',
    'nd_mean_2021-07-13',
    'nd_mean_2021-07-15',
    'nd_mean_2021-07-17',
    'nd_mean_2021-07-20',
    'nd_mean_2021-07-26',
    'nd_mean_2021-07-27',
    'nd_mean_2021-07-29',
    'nd_mean_2021-07-31',
    'nd_mean_2021-08-01',
    'nd_mean_2021-08-07',
    'nd_mean_2021-08-10',
    'nd_mean_2021-08-11',
    'nd_mean_2021-08-12',
    'nd_mean_2021-08-13',
    'nd_mean_2021-08-23',
    #'nd_mean_2021-08-27',
    
    #'nd_mean_2021-05-15 * nd_mean_2021-05-26',
    #'nd_mean_2021-06-16 * nd_mean_2021-05-26',
    #'nd_mean_2021-05-17 * nd_mean_2021-05-26',
    #'nd_mean_2021-07-17 * nd_mean_2021-05-26',
    #'nd_mean_2021-08-01 * nd_mean_2021-05-26',
    
    #'nd_mean.eq.0',
    #'nd_mean.lt.05',
    #'nd_mean.lt.10',
    #'nd_mean.lt.15',
    #'nd_mean.lt.20',
    #'nd_mean.lt.30',

    #'nd_mean.min',
    #'nd_mean.max',
    #'nd_mean.mean',
    #'nd_mean.std',
    #'nd_mean.sum',

    #'nd_mean.nonzero.mean',
    #'nd_mean.nonzero.std',
    #'nd_mean.nonzero.count',

    #'geo.is_polygon',
    #'geo.is_multi_polygon',
    #'geo.is_geometry_collection',
    #'geo.polygon_count',
    #'geo.polygon_part_count',
    #'get.point_count',

    'latitude',
    'longitude',

    #'latitude_delta',
    #'longitude_delta',
    #'latitude_delta / longitude_delta',

    #'id',
]

# Feature list per class id, consumed by the one-vs-rest models. Every class maps
# to the very same DEFAULT_FEATURES list object; assign a different list to a key
# to specialise one class.
FEATURES = dict.fromkeys(range(TARGET_COUNT), DEFAULT_FEATURES)

# Number of cross-validation repetitions; each uses a different shuffle seed.
SEED_COUNT = 5

# Number of folds in each repetition.
FOLD_COUNT = 5

## Обучение моделей

### Бинарные классификаторы

In [None]:
# One-vs-rest training: for every seed and fold, one binary TabNet per crop class.
# All containers are keyed by class id and hold one entry per (seed, fold) run.
results = defaultdict(list)                           # training history of each model
models = defaultdict(list)                            # fitted models
best_iterations_for_f1 = defaultdict(list)            # best epoch reported by each model
best_iterations_for_recall_macro = defaultdict(list)  # same epochs, kept for bookkeeping
metrics = defaultdict(list)                           # best validation balanced accuracy

# Out-of-fold labels / argmax predictions, one array per (seed, fold).
y_true_values = []
y_pred_values = []

for seed in tqdm(range(1, 1 + SEED_COUNT)):
    print(f'Seed: {seed}')
    # The seed only controls how the rows are shuffled into folds.
    k_fold = KFold(FOLD_COUNT, shuffle=True, random_state=seed)

    for fold, (train_index, val_index) in tqdm(enumerate(k_fold.split(train_data)), total=k_fold.n_splits):
        print(f'  Fold: {fold}')

        train_df = train_data.iloc[train_index]
        val_df = train_data.iloc[val_index]

        predicts = []
        for target_index in range(TARGET_COUNT):
            target_name = TARGET_NAMES[target_index]
            target_column = f'target{target_index}'
            feature_names = FEATURES[target_index]

            train_X = train_df[feature_names].to_numpy(dtype='float32')
            train_y = train_df[target_column].to_numpy()
            val_X = val_df[feature_names].to_numpy(dtype='float32')
            val_y = val_df[target_column].to_numpy()

            # NOTE(review): the model seed is fixed to 1 for every run, so the outer
            # `seed` changes the folds only - confirm this is intended.
            # device_name='auto' uses the GPU when one is available, otherwise the CPU.
            model = TabNetClassifier(seed=1, verbose=0, n_d=8, n_a=8, device_name='auto')
            # patience == max_epochs: training never stops early; the epoch with the
            # best validation balanced accuracy is reported as best_epoch.
            model.fit(
                train_X, train_y,
                max_epochs=300,
                batch_size=1024,
                patience=300,
                eval_set=[(val_X, val_y)],
                eval_name=['valid'],
                eval_metric=['balanced_accuracy'],
                drop_last=False,
            )

            print(f'    {target_name:<20} {model.best_cost:<10} {model.best_epoch}')

            results[target_index].append(model.history)
            models[target_index].append(model)
            best_iterations_for_f1[target_index].append(model.best_epoch)
            metrics[target_index].append(model.best_cost)
            best_iterations_for_recall_macro[target_index].append(model.best_epoch)

            # Score the validation rows with the feature matrix built for THIS class,
            # so the prediction stays correct when FEATURES differ between classes.
            predicts.append(model.predict_proba(val_X)[:,1])

        # Multi-class decision: the class whose binary model is the most confident.
        y_true = np.array(val_df[TARGET_COLUMN])
        y_pred = np.argmax(predicts, axis=0)

        y_true_values.append(y_true)
        y_pred_values.append(y_pred)

        best_score = recall_score(y_true, y_pred, average='macro')

        print('   ', best_score)

    print('  Mean:')
    for target_index in range(TARGET_COUNT):
        target_name = TARGET_NAMES[target_index]
        # The last FOLD_COUNT entries belong to the current seed.
        score = np.mean(metrics[target_index][-FOLD_COUNT:])
        print(f'    {target_name:<20} {score}')

    y_true = np.concatenate(y_true_values[-FOLD_COUNT:])
    y_pred = np.concatenate(y_pred_values[-FOLD_COUNT:])

    print('  Report:')
    report = classification_report(y_true, y_pred, target_names=TARGET_NAMES, digits=3)
    print(textwrap.indent(report, '    '))

print('\n[Summary]')

print('Mean:')
for target_index in range(TARGET_COUNT):
    score = np.mean(metrics[target_index])
    print(f'  {TARGET_NAMES[target_index]:<20} {score}')

print('Min:')
for target_index in range(TARGET_COUNT):
    score = np.min(metrics[target_index])
    print(f'  {TARGET_NAMES[target_index]:<20} {score}')

print('Max:')
for target_index in range(TARGET_COUNT):
    score = np.max(metrics[target_index])
    print(f'  {TARGET_NAMES[target_index]:<20} {score}')

# Overall report on the out-of-fold predictions of all seeds together.
print('Report:')
y_true = np.concatenate(y_true_values)
y_pred = np.concatenate(y_pred_values)
report = classification_report(y_true, y_pred, target_names=TARGET_NAMES, digits=3)
print(textwrap.indent(report, '  '))

print('Metrics:')
print('  Accuracy:', accuracy_score(y_true, y_pred))
print('  Recall (macro):', recall_score(y_true, y_pred, average='macro'))
print('  Precision (macro):', precision_score(y_true, y_pred, average='macro'))
print('  F1 (macro):', f1_score(y_true, y_pred, average='macro'))

  0%|          | 0/5 [00:00<?, ?it/s]

Seed: 1


  0%|          | 0/5 [00:00<?, ?it/s]

  Fold: 0
Stop training because you reached max_epochs = 300 with best_epoch = 298 and best_valid_balanced_accuracy = 0.96243
    подсолнечник         0.962434179721193 298
Stop training because you reached max_epochs = 300 with best_epoch = 91 and best_valid_balanced_accuracy = 1.0
    картофель            1.0        91
Stop training because you reached max_epochs = 300 with best_epoch = 252 and best_valid_balanced_accuracy = 0.97308
    пшеница озимая       0.9730775525472868 252
Stop training because you reached max_epochs = 300 with best_epoch = 127 and best_valid_balanced_accuracy = 1.0
    гречиха              1.0        127
Stop training because you reached max_epochs = 300 with best_epoch = 295 and best_valid_balanced_accuracy = 0.94969
    кукуруза             0.9496913580246913 295
Stop training because you reached max_epochs = 300 with best_epoch = 190 and best_valid_balanced_accuracy = 0.99878
    пшеница яровая       0.9987789987789988 190
Stop training because you reached

  0%|          | 0/5 [00:00<?, ?it/s]

  Fold: 0
Stop training because you reached max_epochs = 300 with best_epoch = 298 and best_valid_balanced_accuracy = 0.96599
    подсолнечник         0.9659914978744686 298
Stop training because you reached max_epochs = 300 with best_epoch = 150 and best_valid_balanced_accuracy = 1.0
    картофель            1.0        150
Stop training because you reached max_epochs = 300 with best_epoch = 226 and best_valid_balanced_accuracy = 0.98055
    пшеница озимая       0.980546185284946 226
Stop training because you reached max_epochs = 300 with best_epoch = 111 and best_valid_balanced_accuracy = 1.0
    гречиха              1.0        111
Stop training because you reached max_epochs = 300 with best_epoch = 173 and best_valid_balanced_accuracy = 0.96306
    кукуруза             0.9630636678024285 173
Stop training because you reached max_epochs = 300 with best_epoch = 187 and best_valid_balanced_accuracy = 0.9966
    пшеница яровая       0.9965986394557823 187
Stop training because you reache

  0%|          | 0/5 [00:00<?, ?it/s]

  Fold: 0
Stop training because you reached max_epochs = 300 with best_epoch = 207 and best_valid_balanced_accuracy = 0.94867
    подсолнечник         0.9486706707547565 207
Stop training because you reached max_epochs = 300 with best_epoch = 270 and best_valid_balanced_accuracy = 1.0
    картофель            1.0        270
Stop training because you reached max_epochs = 300 with best_epoch = 198 and best_valid_balanced_accuracy = 0.98736
    пшеница озимая       0.9873584037990484 198
Stop training because you reached max_epochs = 300 with best_epoch = 107 and best_valid_balanced_accuracy = 1.0
    гречиха              1.0        107
Stop training because you reached max_epochs = 300 with best_epoch = 235 and best_valid_balanced_accuracy = 0.95789
    кукуруза             0.9578932610524349 235
Stop training because you reached max_epochs = 300 with best_epoch = 220 and best_valid_balanced_accuracy = 1.0
    пшеница яровая       1.0        220
Stop training because you reached max_epoc

  0%|          | 0/5 [00:00<?, ?it/s]

  Fold: 0
Stop training because you reached max_epochs = 300 with best_epoch = 225 and best_valid_balanced_accuracy = 0.9679
    подсолнечник         0.9679045776918118 225
Stop training because you reached max_epochs = 300 with best_epoch = 242 and best_valid_balanced_accuracy = 1.0
    картофель            1.0        242
Stop training because you reached max_epochs = 300 with best_epoch = 279 and best_valid_balanced_accuracy = 0.98708
    пшеница озимая       0.9870838347372644 279
Stop training because you reached max_epochs = 300 with best_epoch = 93 and best_valid_balanced_accuracy = 1.0
    гречиха              1.0        93
Stop training because you reached max_epochs = 300 with best_epoch = 219 and best_valid_balanced_accuracy = 0.96477
    кукуруза             0.9647715566199713 219
Stop training because you reached max_epochs = 300 with best_epoch = 152 and best_valid_balanced_accuracy = 1.0
    пшеница яровая       1.0        152
Stop training because you reached max_epochs 

  0%|          | 0/5 [00:00<?, ?it/s]

  Fold: 0
Stop training because you reached max_epochs = 300 with best_epoch = 158 and best_valid_balanced_accuracy = 0.94668
    подсолнечник         0.9466833233627758 158
Stop training because you reached max_epochs = 300 with best_epoch = 275 and best_valid_balanced_accuracy = 0.99603
    картофель            0.996031746031746 275
Stop training because you reached max_epochs = 300 with best_epoch = 180 and best_valid_balanced_accuracy = 0.98807
    пшеница озимая       0.9880664610597094 180
Stop training because you reached max_epochs = 300 with best_epoch = 131 and best_valid_balanced_accuracy = 1.0
    гречиха              1.0        131
Stop training because you reached max_epochs = 300 with best_epoch = 258 and best_valid_balanced_accuracy = 0.96369
    кукуруза             0.963693982854589 258
Stop training because you reached max_epochs = 300 with best_epoch = 153 and best_valid_balanced_accuracy = 0.99941
    пшеница яровая       0.9994138335287222 153
Stop training becaus

### Мульти-классификатор

In [None]:
# Multi-class training: one TabNet per (seed, fold) on the default features plus
# nearest-neighbour features (crop and distance of the closest training-fold field).
# NOTE: this rebinds `results`, `models` and `metrics` from the binary-classifier
# cell; from here on they refer to the multi-class runs.
results = []
models = []
best_iterations = []
metrics = []

# The feature list does not depend on the seed or fold, so it is built once.
neighbor_columns = [f'neighbor.crop{target_index}' for target_index in range(TARGET_COUNT)]
feature_names = DEFAULT_FEATURES + neighbor_columns + ['neighbor.distance']

for seed in tqdm(range(1, 1 + SEED_COUNT)):
    print(f'Seed: {seed}')
    k_fold = KFold(FOLD_COUNT, shuffle=True, random_state=seed)

    for fold, (train_index, val_index) in tqdm(enumerate(k_fold.split(train_data)), total=k_fold.n_splits):
        print(f'  Fold: {fold}')
        
        train_df = train_data.iloc[train_index].reset_index(drop=True)
        val_df = train_data.iloc[val_index].reset_index(drop=True)
        
        train_id = train_df.id.to_numpy()
        val_id = val_df.id.to_numpy()

        # Distances from every row to every training-fold row, computed once and
        # reused for both the nearest neighbour and its distance. Neighbours are
        # always taken from the training fold; geo_matrix holds 1e20 instead of 0,
        # so a training row is never its own neighbour.
        train_distances = geo_matrix[train_id][:,train_id]
        val_distances = geo_matrix[val_id][:,train_id]

        train_neighbor = train_id[train_distances.argmin(axis=1)]
        val_neighbor = train_id[val_distances.argmin(axis=1)]

        crop_by_id = train_df.set_index('id', drop=False)['crop']
        train_crop = crop_by_id.loc[train_neighbor].reset_index(drop=True)
        val_crop = crop_by_id.loc[val_neighbor].reset_index(drop=True)

        # One-hot encoding of the neighbour's crop.
        for target_index in range(TARGET_COUNT):
            train_df[f'neighbor.crop{target_index}'] = train_crop == target_index
            val_df[f'neighbor.crop{target_index}'] = val_crop == target_index

        train_df['neighbor.distance'] = train_distances.min(axis=1)
        val_df['neighbor.distance'] = val_distances.min(axis=1)

        train_X = train_df[feature_names].to_numpy(dtype='float32')
        train_y = train_df[TARGET_COLUMN].to_numpy()
        val_X = val_df[feature_names].to_numpy(dtype='float32')
        val_y = val_df[TARGET_COLUMN].to_numpy()

        # NOTE(review): the model seed is fixed to 1 for every run, so the outer
        # `seed` changes the folds only - confirm this is intended.
        # device_name='auto' uses the GPU when one is available, otherwise the CPU.
        model = TabNetClassifier(seed=1, verbose=0, n_d=8, n_a=8, device_name='auto')
        # patience == max_epochs: training never stops early.
        model.fit(
            train_X, train_y,
            max_epochs=500,
            batch_size=1024,
            patience=500,
            eval_set=[(val_X, val_y)],
            eval_name=['valid'],
            eval_metric=['balanced_accuracy'],
            drop_last=False,
        )

        print(f'    {model.best_cost:<10} {model.best_epoch}')

        results.append(model.history)
        models.append(model)
        best_iterations.append(model.best_epoch)
        metrics.append(model.best_cost)

# The averaged value is each model's best validation balanced accuracy.
print('\n[Summary]')
print('Recall (macro):', np.mean(metrics))

  0%|          | 0/5 [00:00<?, ?it/s]

Seed: 1


  0%|          | 0/5 [00:00<?, ?it/s]

  Fold: 0
Stop training because you reached max_epochs = 500 with best_epoch = 490 and best_valid_balanced_accuracy = 0.96141
    0.961413621393821 490
  Fold: 1
Stop training because you reached max_epochs = 500 with best_epoch = 456 and best_valid_balanced_accuracy = 0.96453
    0.9645279912385598 456
  Fold: 2
Stop training because you reached max_epochs = 500 with best_epoch = 485 and best_valid_balanced_accuracy = 0.96041
    0.960411230965114 485
  Fold: 3
Stop training because you reached max_epochs = 500 with best_epoch = 228 and best_valid_balanced_accuracy = 0.96708
    0.9670808729367586 228
  Fold: 4
Stop training because you reached max_epochs = 500 with best_epoch = 271 and best_valid_balanced_accuracy = 0.9668
    0.9667970865661959 271
Seed: 2


  0%|          | 0/5 [00:00<?, ?it/s]

  Fold: 0
Stop training because you reached max_epochs = 500 with best_epoch = 448 and best_valid_balanced_accuracy = 0.96174
    0.9617410731482471 448
  Fold: 1
Stop training because you reached max_epochs = 500 with best_epoch = 383 and best_valid_balanced_accuracy = 0.94956
    0.9495623669610035 383
  Fold: 2
Stop training because you reached max_epochs = 500 with best_epoch = 463 and best_valid_balanced_accuracy = 0.9646
    0.964600548746981 463
  Fold: 3
Stop training because you reached max_epochs = 500 with best_epoch = 302 and best_valid_balanced_accuracy = 0.96618
    0.9661757409805679 302
  Fold: 4
Stop training because you reached max_epochs = 500 with best_epoch = 192 and best_valid_balanced_accuracy = 0.95491
    0.9549105217044784 192
Seed: 3


  0%|          | 0/5 [00:00<?, ?it/s]

  Fold: 0
Stop training because you reached max_epochs = 500 with best_epoch = 350 and best_valid_balanced_accuracy = 0.95825
    0.9582469665013741 350
  Fold: 1
Stop training because you reached max_epochs = 500 with best_epoch = 458 and best_valid_balanced_accuracy = 0.95996
    0.9599554599886203 458
  Fold: 2
Stop training because you reached max_epochs = 500 with best_epoch = 297 and best_valid_balanced_accuracy = 0.96101
    0.9610071385313566 297
  Fold: 3
Stop training because you reached max_epochs = 500 with best_epoch = 313 and best_valid_balanced_accuracy = 0.9652
    0.9652018943029491 313
  Fold: 4
Stop training because you reached max_epochs = 500 with best_epoch = 364 and best_valid_balanced_accuracy = 0.96296
    0.9629601868190308 364
Seed: 4


  0%|          | 0/5 [00:00<?, ?it/s]

  Fold: 0
Stop training because you reached max_epochs = 500 with best_epoch = 449 and best_valid_balanced_accuracy = 0.96705
    0.9670511653738864 449
  Fold: 1
Stop training because you reached max_epochs = 500 with best_epoch = 488 and best_valid_balanced_accuracy = 0.95944
    0.9594393583551747 488
  Fold: 2
Stop training because you reached max_epochs = 500 with best_epoch = 392 and best_valid_balanced_accuracy = 0.96131
    0.9613107172693639 392
  Fold: 3
Stop training because you reached max_epochs = 500 with best_epoch = 481 and best_valid_balanced_accuracy = 0.96098
    0.9609802046161936 481
  Fold: 4
Stop training because you reached max_epochs = 500 with best_epoch = 402 and best_valid_balanced_accuracy = 0.96472
    0.9647163391521251 402
Seed: 5


  0%|          | 0/5 [00:00<?, ?it/s]

  Fold: 0
Stop training because you reached max_epochs = 500 with best_epoch = 272 and best_valid_balanced_accuracy = 0.95791
    0.9579143762745498 272
  Fold: 1
Stop training because you reached max_epochs = 500 with best_epoch = 352 and best_valid_balanced_accuracy = 0.96069
    0.9606871596103647 352
  Fold: 2
Stop training because you reached max_epochs = 500 with best_epoch = 170 and best_valid_balanced_accuracy = 0.95818
    0.9581827838440505 170
  Fold: 3
Stop training because you reached max_epochs = 500 with best_epoch = 350 and best_valid_balanced_accuracy = 0.95372
    0.9537234943233577 350
  Fold: 4
Stop training because you reached max_epochs = 500 with best_epoch = 335 and best_valid_balanced_accuracy = 0.96406
    0.964055763194131 335

[Summary]
Recall (macro): 0.9613061625119301


## Подготовка submission

In [None]:
def predict_proba(data, models, selection: Optional[List[int]] = None):
    """Average the positive-class probability of several binary models.

    Args:
        data: feature matrix passed unchanged to each model's ``predict_proba``.
        models: iterable of fitted models whose ``predict_proba`` returns an
            array with the positive class in column 1.
        selection: positions (in ``models``) of the models to use; ``None`` or
            an empty list selects every model.

    Returns:
        Array with the mean of column 1 over the selected models.

    Raises:
        ValueError: if no model ends up selected (empty ``models`` or a
            selection that matches none of them).
    """
    total = None
    count = 0
    for i, model in enumerate(models):
        if selection and i not in selection:
            continue
        proba = model.predict_proba(data)[:,1]
        # Accumulate into a new array: an in-place += would write through the
        # column view into the array returned by the first model.
        total = proba if total is None else total + proba
        count += 1
    if count == 0:
        raise ValueError('predict_proba: no models selected, nothing to average')
    return total / count

In [None]:
# Submission from the one-vs-rest ensemble: average each class's binary models and
# pick the class with the highest mean probability.
# The multi-class training cell rebinds `models` to a plain list, so this cell can
# only run while `models` is still the per-class dict of binary models; otherwise it
# is skipped instead of failing with an obscure TypeError.
if isinstance(models, dict):
    predicts = np.array([predict_proba(test_data[FEATURES[target_index]].to_numpy(dtype='float32'),
                                       models[target_index],
                                       selection=None)
                         for target_index in range(TARGET_COUNT)])

    submission = test_data[['id']].copy()
    submission['crop'] = predicts.argmax(axis=0)

    submission.to_csv('/kaggle/working/submission.csv', index=False, encoding='utf-8')
else:
    print('Skipping the binary-ensemble submission: `models` no longer holds the '
          'per-class binary models (run this cell right after the binary training cell).')

In [None]:
def predict_proba_multi(data, models, selection: Optional[List[int]] = None):
    """Average the full class-probability matrix of several multi-class models.

    Args:
        data: feature matrix passed unchanged to each model's ``predict_proba``.
        models: iterable of fitted models with a ``predict_proba`` method.
        selection: positions (in ``models``) of the models to use; ``None`` or
            an empty list selects every model.

    Returns:
        Element-wise mean of the ``predict_proba`` outputs of the selected models.

    Raises:
        ValueError: if no model ends up selected (empty ``models`` or a
            selection that matches none of them).
    """
    total = None
    count = 0
    for i, model in enumerate(models):
        if selection and i not in selection:
            continue
        proba = model.predict_proba(data)
        # Accumulate into a new array: an in-place += would modify the array
        # returned by the first model.
        total = proba if total is None else total + proba
        count += 1
    if count == 0:
        raise ValueError('predict_proba_multi: no models selected, nothing to average')
    return total / count

In [None]:
# Neighbour features for the test rows: the nearest field of the WHOLE training set.
train_id = train_data.id.to_numpy()
test_id = test_data.id.to_numpy()

# Distances from every test record to every training record, computed once and
# reused for both the nearest neighbour and its distance.
test_distances = geo_matrix[test_id][:,train_id]

test_neighbor = train_id[test_distances.argmin(axis=1)]

test_crop = train_data.set_index('id', drop=False).loc[test_neighbor]['crop'].reset_index(drop=True)

# One-hot encoding of the neighbour's crop; the columns are added to test_data itself.
neighbor_columns = [f'neighbor.crop{target_index}' for target_index in range(TARGET_COUNT)]
for target_index, neighbor_column in enumerate(neighbor_columns):
    test_data[neighbor_column] = test_crop == target_index

test_data['neighbor.distance'] = test_distances.min(axis=1)

# Same column order as used when training the multi-class models.
predicts = predict_proba_multi(test_data[DEFAULT_FEATURES + neighbor_columns + ['neighbor.distance']].to_numpy(dtype='float32'), models,
                               selection=None)

submission = test_data[['id']].copy()
submission['crop'] = predicts.argmax(axis=1)

# Writes to the same file as the binary-ensemble submission and replaces it.
submission.to_csv('/kaggle/working/submission.csv', index=False, encoding='utf-8')

### Сохранение предиктов

In [None]:
# Store the averaged class probabilities of the multi-class ensemble, one p<k>
# column per class, next to the record id.
multi_feature_names = (
    DEFAULT_FEATURES
    + [f'neighbor.crop{class_id}' for class_id in range(TARGET_COUNT)]
    + ['neighbor.distance']
)
multi_test_X = test_data[multi_feature_names].to_numpy(dtype='float32')
predicts = predict_proba_multi(multi_test_X, models)

submission = test_data[['id']].copy()

for class_id in range(TARGET_COUNT):
    submission[f'p{class_id}'] = predicts[:, class_id]

submission.to_csv('/kaggle/working/tabnet.csv', index=False, encoding='utf-8')