In [None]:
# Fetch the LightGBM sources together with their git submodules (--recursive).
! git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
# Recreate an empty build directory, configure with GPU support (-DUSE_GPU=1),
# compile with 4 parallel jobs, then install the Python package with the
# --precompile and --gpu flags.
! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;

## Импорт модулей

In [None]:
import json
import itertools
import math
import textwrap
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import ttest_rel
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier

from tqdm.auto import tqdm

## Подключение Google Drive

In [None]:
# Colab-only step: expose Google Drive under /content/drive.
from google.colab import drive

drive.mount('/content/drive')

# Contest working directory on the mounted drive and its data sub-folder.
contest_dir = Path('/content/drive/MyDrive/innopolis')
data_dir = contest_dir.joinpath('data')

Mounted at /content/drive


## Загрузка данных

In [None]:
# Read the four contest tables in a fixed order: the main train/test tables
# first, then the matching elevation tables.
(
    input_train_data,
    input_test_data,
    input_train_elevation_data,
    input_test_elevation_data,
) = (
    pd.read_csv(data_dir / file_name, encoding='ascii')
    for file_name in ('train.csv', 'test.csv', 'train_elevation.csv', 'test_elevation.csv')
)

## Константы

In [None]:
# Crop labels. The position in the list is the class id stored in the label
# column, and the names are used for progress output and reports.
TARGET_NAMES = [
    'подсолнечник',     # sunflower
    'картофель',        # potato
    'пшеница озимая',   # winter wheat
    'гречиха',          # buckwheat
    'кукуруза',         # corn
    'пшеница яровая',   # spring wheat
    'сахарная свекла',  # sugar beet
]

# Number of classes; one binary classifier is trained per class.
TARGET_COUNT = len(TARGET_NAMES)

# Name of the label column in the training table.
TARGET_COLUMN = 'crop'

## Генерация фичей и таргетов


In [None]:
def reproject(latitude, longitude):
    """Project geographic coordinates to metres with a sinusoidal projection.

    Args:
        latitude: sequence of latitudes in degrees.
        longitude: sequence of longitudes in degrees, paired with ``latitude``.

    Returns:
        A tuple ``(x, y)`` of lists: ``x`` holds the east-west coordinates and
        ``y`` the north-south coordinates, both in metres.
    """
    # Length of one degree of latitude on a sphere of radius 6371009 m.
    meters_per_degree = math.pi * 6371009 / 180.0

    def to_x(lat, lon):
        # A degree of longitude shrinks with the cosine of the latitude.
        return lon * meters_per_degree * math.cos(math.radians(lat))

    y = [lat * meters_per_degree for lat in latitude]
    x = [to_x(lat, lon) for lat, lon in zip(latitude, longitude)]
    return x, y


def area_of_polygon(x, y):
    """Return the area of a polygon from its vertex coordinates.

    Uses the shoelace formula; vertices are treated cyclically, so the ring
    may be given open or closed (first vertex repeated at the end).

    Args:
        x: sequence of vertex x coordinates.
        y: sequence of vertex y coordinates, paired with ``x``.

    Returns:
        The non-negative area, independent of the vertex orientation.
    """
    twice_area = 0.0
    # Start at the last vertex (index -1) so that both neighbours of every
    # vertex can be reached through Python's negative indexing.
    for current in range(-1, len(x) - 1):
        following, preceding = current + 1, current - 1
        twice_area += x[current] * (y[following] - y[preceding])
    return abs(twice_area) / 2.0


def perimeter_of_polygon(x, y):
    """Return the perimeter of a polygon from its vertex coordinates.

    Vertices are treated cyclically, like in ``area_of_polygon``: the edge
    from the last vertex back to the first one is always included. For a ring
    that is already closed (first vertex repeated at the end) that edge has
    zero length, so open and closed rings give the same result.

    Args:
        x: sequence of vertex x coordinates.
        y: sequence of vertex y coordinates, paired with ``x``.

    Returns:
        The sum of all edge lengths; ``0.0`` for an empty vertex list.
    """
    perimeter = 0.0
    # i == 0 pairs the first vertex with the last one (index -1), which is the
    # closing edge of the ring.
    for i in range(len(x)):
        perimeter += np.sqrt((x[i] - x[i-1]) ** 2 + (y[i] - y[i-1]) ** 2)
    return perimeter


def create_features(input_data: pd.DataFrame, is_train_data: bool) -> pd.DataFrame:
    """Derive time-series and geometry features from a raw contest table.

    The input must contain the ``nd_mean_<date>`` columns, a ``.geo`` column
    with GeoJSON geometry strings and an ``area`` column. The result is a copy
    of the input with these additions:

    * ``nd_mean.*`` - row-wise statistics of the raw series (min, max, mean,
      std, sum, the same over non-zero values only) and counts of values
      below several thresholds;
    * ``geo.*`` and ``get.point_count`` - geometry type flags and polygon,
      ring and point counts;
    * ``latitude`` / ``longitude`` (centre of the bounding box), the
      ``*_delta`` extents of that box and their ratio;
    * areas and perimeters in projected metres (first ring of each polygon
      only) and perimeter/area ratios;
    * ``nd_mean_<date>_delta`` - for every date but the first, the change of
      the linearly interpolated series between that date and the calendar
      day before it.

    The ``nd_mean_<date>`` columns themselves are overwritten: every value
    ``<= 0.15`` is replaced by 0 and then ignored by the interpolation.

    Args:
        input_data: raw table; it is not modified.
        is_train_data: currently unused; kept so existing calls keep working.

    Returns:
        A new, defragmented DataFrame with the original and derived columns.

    Raises:
        ValueError: if a geometry is not a Polygon, MultiPolygon or
            GeometryCollection.
    """
    data = input_data.copy()

    # Column names carry ISO dates, so lexicographic order is chronological.
    nd_mean_columns = sorted(x for x in data.columns if x.startswith('nd_mean_'))
    nd_mean_dates = [pd.to_datetime(x[8:]) for x in nd_mean_columns]

    # ---- statistics of the raw series ------------------------------------
    nd_mean = np.array(data[nd_mean_columns])
    data['nd_mean.min'] = np.min(nd_mean, axis=1)
    data['nd_mean.max'] = np.max(nd_mean, axis=1)
    data['nd_mean.mean'] = np.mean(nd_mean, axis=1)
    data['nd_mean.std'] = np.std(nd_mean, axis=1)
    data['nd_mean.sum'] = np.sum(nd_mean, axis=1)

    nd_mean_mask = nd_mean != 0
    data['nd_mean.nonzero.mean'] = np.mean(nd_mean, axis=1, where=nd_mean_mask)
    data['nd_mean.nonzero.std'] = np.std(nd_mean, axis=1, where=nd_mean_mask)
    data['nd_mean.nonzero.count'] = np.sum(nd_mean_mask, axis=1)

    # ---- geometry --------------------------------------------------------
    geo_data = list(data['.geo'].apply(json.loads))

    data['geo.is_polygon'] = [x['type'] == 'Polygon' for x in geo_data]
    data['geo.is_multi_polygon'] = [x['type'] == 'MultiPolygon' for x in geo_data]
    data['geo.is_geometry_collection'] = [x['type'] == 'GeometryCollection' for x in geo_data]

    def get_polygons(geo):
        """Normalise a geometry to a list of polygons (each a list of rings)."""
        if geo['type'] == 'Polygon':
            return [geo['coordinates']]
        if geo['type'] == 'MultiPolygon':
            return geo['coordinates']
        if geo['type'] == 'GeometryCollection':
            # Non-polygon members of a collection are ignored.
            return [x['coordinates'] for x in geo['geometries'] if x['type'] == 'Polygon']
        raise ValueError('Invalid geometry type: ' + geo['type'])

    geo_polygons = [get_polygons(x) for x in geo_data]

    data['geo.polygon_count'] = [len(x) for x in geo_polygons]
    data['geo.polygon_part_count'] = [sum(len(y) for y in x) for x in geo_polygons]

    def get_coordinates(polygons):
        """Flatten all rings of all polygons into (latitudes, longitudes)."""
        lon = []
        lat = []
        for polygon in polygons:
            for part in polygon:
                lon.extend(x[0] for x in part)
                lat.extend(x[1] for x in part)
        return lat, lon

    # Flatten every geometry once and reuse the result for all
    # bounding-box features below.
    geo_coordinates = [get_coordinates(x) for x in geo_polygons]

    data['get.point_count'] = [len(lat) for lat, lon in geo_coordinates]

    def get_projected_ring(polygon):
        """Project the first ring of a polygon to metres."""
        lon = [x[0] for x in polygon[0]]
        lat = [x[1] for x in polygon[0]]
        return reproject(lat, lon)

    def get_area(polygon):
        x, y = get_projected_ring(polygon)
        return area_of_polygon(x, y)

    def get_perimeter(polygon):
        x, y = get_projected_ring(polygon)
        return perimeter_of_polygon(x, y)

    data['latitude'] = [(np.max(lat) + np.min(lat)) / 2 for lat, lon in geo_coordinates]
    data['longitude'] = [(np.max(lon) + np.min(lon)) / 2 for lat, lon in geo_coordinates]

    data['latitude_delta'] = [np.max(lat) - np.min(lat) for lat, lon in geo_coordinates]
    data['longitude_delta'] = [np.max(lon) - np.min(lon) for lat, lon in geo_coordinates]

    data['latitude_delta / longitude_delta'] = data['latitude_delta'] / data['longitude_delta']

    data['area_first_polygon'] = [get_area(x[0]) for x in geo_polygons]
    data['area_all_polygons'] = [sum(get_area(y) for y in x) for x in geo_polygons]

    data['perimeter_first_polygon'] = [get_perimeter(x[0]) for x in geo_polygons]
    data['perimeter_all_polygons'] = [sum(get_perimeter(y) for y in x) for x in geo_polygons]

    data['perimeter_first_polygon / area'] = data['perimeter_first_polygon'] / data['area']
    data['perimeter_all_polygons / area'] = data['perimeter_all_polygons'] / data['area']
    data['perimeter_first_polygon / area_first_polygon'] = data['perimeter_first_polygon'] / data['area_first_polygon']
    data['perimeter_all_polygons / area_all_polygons'] = data['perimeter_all_polygons'] / data['area_all_polygons']

    # ---- daily grid ------------------------------------------------------
    # Map every observation date to its offset (in days) from the first date;
    # date_count ends up as the number of calendar days covered.
    date = nd_mean_dates[0]
    nd_mean_date_indexes = []
    date_count = 0
    while date <= nd_mean_dates[-1]:
        if date in nd_mean_dates:
            nd_mean_date_indexes.append(date_count)
        date_count += 1
        date = date + pd.tseries.offsets.Day()
    nd_mean_date_indexes = np.array(nd_mean_date_indexes)

    # ---- threshold counts and clean-up of the series ---------------------
    # Explicit copy: to_numpy() may hand back a read-only view of the frame.
    nd_mean_values = data[nd_mean_columns].to_numpy(copy=True)
    data['nd_mean.eq.0'] = (nd_mean_values == 0).sum(axis=1)
    data['nd_mean.lt.05'] = (nd_mean_values < 0.05).sum(axis=1)
    data['nd_mean.lt.10'] = (nd_mean_values < 0.1).sum(axis=1)
    data['nd_mean.lt.15'] = (nd_mean_values < 0.15).sum(axis=1)
    data['nd_mean.lt.20'] = (nd_mean_values < 0.20).sum(axis=1)
    data['nd_mean.lt.30'] = (nd_mean_values < 0.30).sum(axis=1)
    # Values up to 0.15 are zeroed; zeros are skipped by the interpolation.
    nd_mean_values[nd_mean_values <= 0.15] = 0
    data[nd_mean_columns] = nd_mean_values

    # ---- day-over-day deltas ---------------------------------------------
    def get_nd_mean_delta(arr):
        """Daily change of the interpolated series at every date but the first."""
        result = []
        days = np.arange(date_count)
        for i in range(arr.shape[0]):
            xp_i = arr[i].nonzero()[0]
            if xp_i.size == 0:
                # Nothing to interpolate through (np.interp rejects empty
                # sample points): treat the series as flat.
                result.append(np.zeros(len(nd_mean_date_indexes) - 1))
                continue
            xp = nd_mean_date_indexes[xp_i]
            fp = arr[i][xp_i]
            f = np.interp(days, xp, fp)
            # (f[1:] - f[:-1])[k - 1] == f[k] - f[k - 1]
            result.append((f[1:] - f[:-1])[nd_mean_date_indexes[1:] - 1])
        return np.array(result)

    nd_mean_delta_columns = [x + '_delta' for x in nd_mean_columns[1:]]
    data[nd_mean_delta_columns] = get_nd_mean_delta(data[nd_mean_columns].to_numpy())

    data = data.copy()  # defragmentation
    return data

In [None]:
# Run the same feature pipeline on both tables.
train_data = create_features(input_train_data, True)
test_data = create_features(input_test_data, False)

# One boolean one-vs-rest target column per crop class.
for class_id in range(TARGET_COUNT):
    train_data[f'target{class_id}'] = train_data[TARGET_COLUMN] == class_id

elevation_sources = (
    (train_data, input_train_elevation_data),
    (test_data, input_test_elevation_data),
)

# The elevation tables are joined by position, so verify first that their
# rows are in the same id order as the feature tables.
for feature_table, elevation_table in elevation_sources:
    assert (feature_table['id'] == elevation_table['id']).all()

for feature_table, elevation_table in elevation_sources:
    feature_table['elevation'] = elevation_table['elevation']

In [None]:
# Record ids are used directly as indices into the lookup tables below, so
# the tables must span the largest id that occurs in either data set.
record_count = int(max(train_data['id'].max(), test_data['id'].max())) + 1

# Pairwise distances between records (metres), indexed by record id.
geo_matrix = np.zeros((record_count, record_count), dtype=float)
# Projected centre coordinates of every record, indexed by record id.
geo_x = np.zeros(record_count, dtype=float)
geo_y = np.zeros(record_count, dtype=float)

In [None]:
# Project the centre of every train and test record to metres and store it
# at the position given by the record id.
for frame in (train_data, test_data):
    frame_id = frame['id'].to_numpy()
    x, y = reproject(frame['latitude'], frame['longitude'])
    geo_x[frame_id] = x
    geo_y[frame_id] = y

# Euclidean distance from record i to every record. (a - b) ** 2 equals
# (b - a) ** 2 exactly, so the matrix comes out symmetric from the row
# writes alone and no separate column write is needed.
for i in range(record_count):
    geo_matrix[i] = np.sqrt((geo_x - geo_x[i]) ** 2 + (geo_y - geo_y[i]) ** 2)

# A zero distance means "the record itself" (or a coincident point); make it
# huge so that a nearest-neighbour argmin never selects it.
geo_matrix[geo_matrix == 0] = 1e20

## Конфигурация

In [None]:
# Feature columns fed to every per-class model. Entries that are commented
# out are candidate features that are currently disabled.
DEFAULT_FEATURES = [
    'area',

    # nd_mean value per observation date (a few dates are disabled).
    'nd_mean_2021-04-15',
    'nd_mean_2021-04-16',
    'nd_mean_2021-04-18',
    'nd_mean_2021-04-19',
    #'nd_mean_2021-04-20',
    #'nd_mean_2021-04-22',
    #'nd_mean_2021-04-23',
    'nd_mean_2021-04-25',
    'nd_mean_2021-04-26',
    'nd_mean_2021-04-27',
    'nd_mean_2021-04-28',
    'nd_mean_2021-04-29',
    'nd_mean_2021-04-30',
    'nd_mean_2021-05-01',
    'nd_mean_2021-05-02',
    'nd_mean_2021-05-03',
    'nd_mean_2021-05-04',
    'nd_mean_2021-05-07',
    'nd_mean_2021-05-08',
    #'nd_mean_2021-05-09',
    'nd_mean_2021-05-10',
    'nd_mean_2021-05-15',
    'nd_mean_2021-05-16',
    'nd_mean_2021-05-17',
    'nd_mean_2021-05-19',
    'nd_mean_2021-05-20',
    'nd_mean_2021-05-21',
    'nd_mean_2021-05-24',
    'nd_mean_2021-05-26',
    'nd_mean_2021-05-27',
    'nd_mean_2021-05-29',
    'nd_mean_2021-06-02',
    'nd_mean_2021-06-03',
    'nd_mean_2021-06-04',
    'nd_mean_2021-06-05',
    'nd_mean_2021-06-06',
    'nd_mean_2021-06-07',
    'nd_mean_2021-06-09',
    'nd_mean_2021-06-10',
    'nd_mean_2021-06-12',
    'nd_mean_2021-06-13',
    'nd_mean_2021-06-16',
    'nd_mean_2021-06-18',
    'nd_mean_2021-06-19',
    'nd_mean_2021-06-20',
    #'nd_mean_2021-06-22',
    #'nd_mean_2021-06-25',
    'nd_mean_2021-06-27',
    'nd_mean_2021-06-28',
    'nd_mean_2021-07-04',
    'nd_mean_2021-07-05',
    'nd_mean_2021-07-07',
    #'nd_mean_2021-07-08',
    'nd_mean_2021-07-09',
    'nd_mean_2021-07-13',
    'nd_mean_2021-07-15',
    'nd_mean_2021-07-17',
    'nd_mean_2021-07-20',
    'nd_mean_2021-07-26',
    'nd_mean_2021-07-27',
    'nd_mean_2021-07-29',
    'nd_mean_2021-07-31',
    'nd_mean_2021-08-01',
    'nd_mean_2021-08-07',
    'nd_mean_2021-08-10',
    'nd_mean_2021-08-11',
    'nd_mean_2021-08-12',
    'nd_mean_2021-08-13',
    'nd_mean_2021-08-23',
    #'nd_mean_2021-08-27',

    # Counts of nd_mean values below fixed thresholds (disabled).
    #'nd_mean.eq.0',
    #'nd_mean.lt.05',
    #'nd_mean.lt.10',
    #'nd_mean.lt.15',
    #'nd_mean.lt.20',
    #'nd_mean.lt.30',

    # Row-wise statistics of the nd_mean series (disabled).
    #'nd_mean.min',
    #'nd_mean.max',
    #'nd_mean.mean',
    #'nd_mean.std',
    #'nd_mean.sum',

    # The same statistics over non-zero values only (disabled).
    #'nd_mean.nonzero.mean',
    #'nd_mean.nonzero.std',
    #'nd_mean.nonzero.count',

    # Geometry type flags and polygon / ring / point counts (disabled).
    #'geo.is_polygon',
    #'geo.is_multi_polygon',
    #'geo.is_geometry_collection',
    #'geo.polygon_count',
    #'geo.polygon_part_count',
    #'get.point_count',

    # Field position: centre of the geometry's bounding box and elevation.
    'latitude',
    'longitude',
    'elevation',

    # Bounding-box extents, areas, perimeters and their ratios (disabled).
    #'latitude_delta',
    #'longitude_delta',
    #'latitude_delta / longitude_delta',
    #'area_first_polygon',
    #'area_all_polygons',
    #'perimeter_first_polygon',
    #'perimeter_all_polygons',
    #'perimeter_first_polygon / area',
    #'perimeter_all_polygons / area',
    #'perimeter_first_polygon / area_first_polygon',
    #'perimeter_all_polygons / area_all_polygons',

    # Record identifier (disabled).
    #'id',

    # Day-over-day changes of the interpolated nd_mean series (all disabled).
    #'nd_mean_2021-04-18_delta',
    #'nd_mean_2021-04-19_delta',
    #'nd_mean_2021-04-20_delta',
    #'nd_mean_2021-04-22_delta',
    #'nd_mean_2021-04-23_delta',
    #'nd_mean_2021-04-25_delta',
    #'nd_mean_2021-04-26_delta',
    #'nd_mean_2021-04-27_delta',
    #'nd_mean_2021-04-28_delta',
    #'nd_mean_2021-04-29_delta',
    #'nd_mean_2021-04-30_delta',
    #'nd_mean_2021-05-01_delta',
    #'nd_mean_2021-05-02_delta',
    #'nd_mean_2021-05-03_delta',
    #'nd_mean_2021-05-04_delta',
    #'nd_mean_2021-05-07_delta',
    #'nd_mean_2021-05-08_delta',
    #'nd_mean_2021-05-09_delta',
    #'nd_mean_2021-05-10_delta',
    #'nd_mean_2021-05-15_delta',
    #'nd_mean_2021-05-16_delta',
    #'nd_mean_2021-05-17_delta',
    #'nd_mean_2021-05-19_delta',
    #'nd_mean_2021-05-20_delta',
    #'nd_mean_2021-05-21_delta',
    #'nd_mean_2021-05-24_delta',
    #'nd_mean_2021-05-26_delta',
    #'nd_mean_2021-05-27_delta',
    #'nd_mean_2021-05-29_delta',
    #'nd_mean_2021-06-02_delta',
    #'nd_mean_2021-06-03_delta',
    #'nd_mean_2021-06-04_delta',
    #'nd_mean_2021-06-05_delta',
    #'nd_mean_2021-06-06_delta',
    #'nd_mean_2021-06-07_delta',
    #'nd_mean_2021-06-09_delta',
    #'nd_mean_2021-06-10_delta',
    #'nd_mean_2021-06-12_delta',
    #'nd_mean_2021-06-13_delta',
    #'nd_mean_2021-06-16_delta',
    #'nd_mean_2021-06-18_delta',
    #'nd_mean_2021-06-19_delta',
    #'nd_mean_2021-06-20_delta',
    #'nd_mean_2021-06-22_delta',
    #'nd_mean_2021-06-25_delta',
    #'nd_mean_2021-06-27_delta',
    #'nd_mean_2021-06-28_delta',
    #'nd_mean_2021-07-04_delta',
    #'nd_mean_2021-07-05_delta',
    #'nd_mean_2021-07-07_delta',
    #'nd_mean_2021-07-08_delta',
    #'nd_mean_2021-07-09_delta',
    #'nd_mean_2021-07-13_delta',
    #'nd_mean_2021-07-15_delta',
    #'nd_mean_2021-07-17_delta',
    #'nd_mean_2021-07-20_delta',
    #'nd_mean_2021-07-26_delta',
    #'nd_mean_2021-07-27_delta',
    #'nd_mean_2021-07-29_delta',
    #'nd_mean_2021-07-31_delta',
    #'nd_mean_2021-08-01_delta',
    #'nd_mean_2021-08-07_delta',
    #'nd_mean_2021-08-10_delta',
    #'nd_mean_2021-08-11_delta',
    #'nd_mean_2021-08-12_delta',
    #'nd_mean_2021-08-13_delta',
    #'nd_mean_2021-08-23_delta',
    #'nd_mean_2021-08-27_delta',
]

# Feature list per target class. Every class currently shares the very same
# DEFAULT_FEATURES list object, so consumers must copy it before modifying.
FEATURES = {target_index: DEFAULT_FEATURES for target_index in range(TARGET_COUNT)}

# LightGBM parameters shared by all per-class models; the per-class overrides
# are merged on top of these. Commented-out options are disabled alternatives.
DEFAULT_LIGHTGBM_PARAMS = dict(
    # verbose=-1,
    # device_type='gpu',
    objective='binary',
    num_leaves=31,
    max_depth=-1,
    # colsample_bytree=0.5,
    learning_rate=0.1,
    random_state=1,
    n_jobs=2,
    deterministic=True,
    metric='custom',
    first_metric_only=True,
)

# Per-class parameter overrides; the list position is the class id. The
# trailing notes record overrides that were tried and are currently disabled.
SPECIFIC_LIGHTGBM_PARAMS = [
    dict(num_leaves=13),  # 0 (disabled: is_unbalance=True)
    dict(num_leaves=8),   # 1
    dict(num_leaves=9),   # 2 (disabled: max_bin=64)
    dict(num_leaves=7),   # 3 (disabled: colsample_bytree=0.6)
    dict(num_leaves=21),  # 4 (disabled: colsample_bytree=0.5)
    dict(num_leaves=8),   # 5 (disabled: max_bin=24)
    dict(num_leaves=23),  # 6 (disabled: colsample_bytree=0.5)
]

# Effective parameters per class: the shared defaults overlaid with the
# overrides of that class.
LIGHTGBM_PARAMS = {
    target_index: dict(DEFAULT_LIGHTGBM_PARAMS, **SPECIFIC_LIGHTGBM_PARAMS[target_index])
    for target_index in range(TARGET_COUNT)
}

# Number of K-fold repetitions, each with a different shuffling seed.
SEED_COUNT = 5

# Number of folds per repetition.
FOLD_COUNT = 5

## Обучение моделей

In [None]:
def train_lgb_model(params, data, n_estimators, feature_names, target_name,
                    valid_sets=None, evals_result=None, average='binary',
                    pred_treshold=0.5, predicts_result=None, **kwargs):
    """Train a LightGBM booster and track F1 / recall on the validation sets.

    Args:
        params: LightGBM parameter dict passed to ``lgb.train``.
        data: training frame with the feature columns and the target column.
        n_estimators: number of boosting rounds.
        feature_names: columns of ``data`` (and of the validation frames)
            used as features.
        target_name: label column; it is cast to int32.
        valid_sets: optional list of frames evaluated after every round.
        evals_result: optional dict that receives the metric history, e.g.
            ``evals_result['valid_0']['f1']``.
        average: ``average`` argument for the scikit-learn F1 / recall scores.
        pred_treshold: cut-off applied to the predictions when the objective
            is not ``'multiclass'``.
        predicts_result: optional list; a copy of the prediction array is
            appended on every metric evaluation. With exactly one validation
            set and no evaluation of the training data, entry ``k`` belongs to
            boosting round ``k``.
        **kwargs: forwarded to ``lgb.train``. ``callbacks`` are merged with the
            callbacks created here. A legacy ``verbose_eval`` value is turned
            into a ``log_evaluation`` callback when the installed LightGBM
            provides one, otherwise it is forwarded unchanged.

    Returns:
        Whatever ``lgb.train`` returns (the trained booster).
    """
    def create_dataset(frame):
        return lgb.Dataset(frame[feature_names], label=frame[target_name].astype('int32'))

    dtrain = create_dataset(data)
    if valid_sets is not None:
        valid_sets = [create_dataset(frame) for frame in valid_sets]

    # .get(): a params dict without an explicit objective is not multiclass.
    is_multiclass = params.get('objective') == 'multiclass'

    def feval(preds, eval_data):
        y_true = eval_data.get_label()
        if is_multiclass:
            scores = np.asarray(preds)
            if scores.ndim == 1:
                # Flat layout: all scores of class 0 first, then class 1, ...
                scores = scores.reshape((TARGET_COUNT, -1)).T
            # Otherwise the scores already are (n_samples, n_classes).
            y_pred = scores.argmax(axis=1)
        else:
            y_pred = preds >= pred_treshold
        if predicts_result is not None:
            predicts_result.append(np.array(preds))
        return [
            #('accuracy', accuracy_score(y_true, y_pred), True),
            #('precision', precision_score(y_true, y_pred, zero_division=True, average=average), True),
            ('f1', f1_score(y_true, y_pred, average=average), True),
            ('recall', recall_score(y_true, y_pred, average=average), True),
            ]

    # The evals_result / verbose_eval arguments of lgb.train are gone in
    # LightGBM 4; the callbacks below do the same and exist in older versions.
    callbacks = list(kwargs.pop('callbacks', None) or [])
    if evals_result is not None:
        callbacks.append(lgb.record_evaluation(evals_result))

    if 'verbose_eval' in kwargs and hasattr(lgb, 'log_evaluation'):
        verbose_eval = kwargs.pop('verbose_eval')
        # False -> 0 (silent), True -> every round, n -> every n-th round.
        period = int(verbose_eval) if isinstance(verbose_eval, int) else 0
        callbacks.append(lgb.log_evaluation(period=period))

    return lgb.train(params, dtrain, n_estimators, valid_sets=valid_sets,
                     feval=feval, callbacks=callbacks or None,
                     **kwargs)

### Бинарные классификаторы

In [None]:
# Per-class accumulators; every list grows by one entry per (seed, fold).
results = defaultdict(list)                           # metric histories
models = defaultdict(list)                            # trained boosters
best_iterations_for_f1 = defaultdict(list)            # round with the best validation F1
best_iterations_for_recall_macro = defaultdict(list)  # rounds used for prediction (same as F1 for now)
metrics = defaultdict(list)                           # best validation F1

# Out-of-fold labels and combined predictions, one array per (seed, fold).
y_true_values = []
y_pred_values = []

# Number of boosting rounds trained for every model.
boosting_rounds = 500

# The neighbour-derived columns are identical for all classes: name them once.
neighbor_feature_names = [f'neighbor.crop{i}' for i in range(TARGET_COUNT)] + ['neighbor.distance']

for seed in tqdm(range(1, 1 + SEED_COUNT)):
    print(f'Seed: {seed}')
    k_fold = KFold(FOLD_COUNT, shuffle=True, random_state=seed)

    for fold, (train_index, val_index) in tqdm(enumerate(k_fold.split(train_data)), total=k_fold.n_splits):
        print(f'  Fold: {fold}')

        train_df = train_data.iloc[train_index].reset_index(drop=True)
        val_df = train_data.iloc[val_index].reset_index(drop=True)

        train_id = train_df.id.to_numpy()
        val_id = val_df.id.to_numpy()

        # Distances to the training rows of this fold only, so that neighbour
        # labels never come from the validation part. np.ix_ selects the
        # sub-matrix in one step and each one is reused for argmin and min.
        train_distances = geo_matrix[np.ix_(train_id, train_id)]
        val_distances = geo_matrix[np.ix_(val_id, train_id)]

        train_neighbor = train_id[train_distances.argmin(axis=1)]
        val_neighbor = train_id[val_distances.argmin(axis=1)]

        # Crop of the nearest training record, in row order of each frame.
        crop_by_id = train_df.set_index('id')['crop']
        train_crop = crop_by_id.loc[train_neighbor].reset_index(drop=True)
        val_crop = crop_by_id.loc[val_neighbor].reset_index(drop=True)

        for target_index in range(TARGET_COUNT):
            train_df[f'neighbor.crop{target_index}'] = train_crop == target_index
            val_df[f'neighbor.crop{target_index}'] = val_crop == target_index

        train_df['neighbor.distance'] = train_distances.min(axis=1)
        val_df['neighbor.distance'] = val_distances.min(axis=1)

        # predicts_result[c][k]: validation predictions of class c after round k.
        predicts_result = []

        for target_index in range(TARGET_COUNT):
            target_name = TARGET_NAMES[target_index]
            target_column = f'target{target_index}'
            params = LIGHTGBM_PARAMS[target_index]
            # Concatenation builds a new list; the shared feature list is untouched.
            feature_names = FEATURES[target_index] + neighbor_feature_names

            evals_result = {}
            predicts_result.append([])
            model = train_lgb_model(params, train_df, boosting_rounds,
                                    feature_names=feature_names,
                                    target_name=target_column,
                                    valid_sets=[val_df], evals_result=evals_result,
                                    predicts_result=predicts_result[-1],
                                    verbose_eval=False)

            metric_results = evals_result['valid_0']['f1']
            best_metric = max(metric_results)
            best_iter = np.argmax(metric_results)
            print(f'    {target_name:<20} {best_metric:<10} {best_iter}')

            results[target_index].append(evals_result)
            models[target_index].append(model)
            best_iterations_for_f1[target_index].append(best_iter)
            metrics[target_index].append(best_metric)

        # Combine the binary models: take every class at its best round and
        # predict the class with the highest score.
        predicts = []
        for target_index in range(TARGET_COUNT):
            best_iter = best_iterations_for_f1[target_index][-1]
            predicts.append(predicts_result[target_index][best_iter])
            best_iterations_for_recall_macro[target_index].append(best_iter)

        y_true = np.array(val_df[TARGET_COLUMN])
        y_pred = np.argmax(predicts, axis=0)

        y_true_values.append(y_true)
        y_pred_values.append(y_pred)

        best_score = recall_score(y_true, y_pred, average='macro')
        print('   ', best_score)

    # Summary of the folds of the current seed.
    print('  Mean:')
    for target_index in range(TARGET_COUNT):
        target_name = TARGET_NAMES[target_index]
        score = np.mean(metrics[target_index][-FOLD_COUNT:])
        print(f'    {target_name:<20} {score}')

    y_true = np.concatenate(y_true_values[-FOLD_COUNT:])
    y_pred = np.concatenate(y_pred_values[-FOLD_COUNT:])

    print('  Report:')
    report = classification_report(y_true, y_pred, target_names=TARGET_NAMES, digits=3)
    print(textwrap.indent(report, '    '))

# Summary over all seeds and folds.
print('\n[Summary]')

print('Mean:')
for target_index in range(TARGET_COUNT):
    score = np.mean(metrics[target_index])
    print(f'  {TARGET_NAMES[target_index]:<20} {score}')

print('Min:')
for target_index in range(TARGET_COUNT):
    score = np.min(metrics[target_index])
    print(f'  {TARGET_NAMES[target_index]:<20} {score}')

print('Max:')
for target_index in range(TARGET_COUNT):
    score = np.max(metrics[target_index])
    print(f'  {TARGET_NAMES[target_index]:<20} {score}')

print('Report:')
y_true = np.concatenate(y_true_values)
y_pred = np.concatenate(y_pred_values)
report = classification_report(y_true, y_pred, target_names=TARGET_NAMES, digits=3)
print(textwrap.indent(report, '  '))

print('Metrics:')
print('  Accuracy:', accuracy_score(y_true, y_pred))
print('  Recall (macro):', recall_score(y_true, y_pred, average='macro'))
print('  Precision (macro):', precision_score(y_true, y_pred, average='macro'))
print('  F1 (macro):', f1_score(y_true, y_pred, average='macro'))

  0%|          | 0/5 [00:00<?, ?it/s]

Seed: 1


  0%|          | 0/5 [00:00<?, ?it/s]

  Fold: 0
    подсолнечник         0.9230769230769231 110
    картофель            1.0        41
    пшеница озимая       0.970873786407767 148
    гречиха              0.9910714285714286 41
    кукуруза             0.9302325581395349 281
    пшеница яровая       0.9932432432432432 20
    сахарная свекла      0.9166666666666667 244
    0.969603053996437
  Fold: 1
    подсолнечник         0.9403973509933775 146
    картофель            0.9967213114754099 71
    пшеница озимая       0.9800796812749003 291
    гречиха              1.0        66
    кукуруза             0.930909090909091 165
    пшеница яровая       1.0        27
    сахарная свекла      0.9708029197080291 365
    0.9826493069634035
  Fold: 2
    подсолнечник         0.9415807560137458 142
    картофель            1.0        31
    пшеница озимая       0.9790209790209791 85
    гречиха              1.0        54
    кукуруза             0.9655172413793104 64
    пшеница яровая       0.996078431372549 58
    сахарная свекла

  0%|          | 0/5 [00:00<?, ?it/s]

  Fold: 0
    подсолнечник         0.9520000000000001 125
    картофель            1.0        34
    пшеница озимая       0.9823321554770317 44
    гречиха              1.0        14
    кукуруза             0.960573476702509 173
    пшеница яровая       1.0        53
    сахарная свекла      0.9259259259259259 306
    0.9830794528468948
  Fold: 1
    подсолнечник         0.9122807017543859 46
    картофель            1.0        50
    пшеница озимая       0.989010989010989 331
    гречиха              0.993103448275862 13
    кукуруза             0.9427609427609428 120
    пшеница яровая       1.0        21
    сахарная свекла      0.9527896995708154 128
    0.9724865208777022
  Fold: 2
    подсолнечник         0.9540636042402827 120
    картофель            1.0        29
    пшеница озимая       0.9686411149825784 252
    гречиха              1.0        48
    кукуруза             0.9347826086956522 91
    пшеница яровая       0.995850622406639 113
    сахарная свекла      0.95555555

  0%|          | 0/5 [00:00<?, ?it/s]

  Fold: 0
    подсолнечник         0.9295774647887325 258
    картофель            0.9965870307167235 32
    пшеница озимая       0.9802371541501976 42
    гречиха              1.0        86
    кукуруза             0.9500000000000001 175
    пшеница яровая       1.0        51
    сахарная свекла      0.9460580912863071 71
    0.9766953236558823
  Fold: 1
    подсолнечник         0.9285714285714285 115
    картофель            1.0        34
    пшеница озимая       0.9837133550488599 286
    гречиха              1.0        16
    кукуруза             0.937269372693727 62
    пшеница яровая       1.0        18
    сахарная свекла      0.932475884244373 288
    0.9712611912100403
  Fold: 2
    подсолнечник         0.9389067524115756 476
    картофель            1.0        112
    пшеница озимая       0.9809885931558935 116
    гречиха              1.0        99
    кукуруза             0.9477351916376306 374
    пшеница яровая       0.9962546816479401 138
    сахарная свекла      0.94514

  0%|          | 0/5 [00:00<?, ?it/s]

  Fold: 0
    подсолнечник         0.9571428571428572 223
    картофель            1.0        104
    пшеница озимая       0.9701492537313433 61
    гречиха              1.0        85
    кукуруза             0.951310861423221 106
    пшеница яровая       1.0        64
    сахарная свекла      0.9699248120300753 102
    0.9875434993931655
  Fold: 1
    подсолнечник         0.9236641221374046 95
    картофель            1.0        35
    пшеница озимая       0.9722222222222222 64
    гречиха              1.0        26
    кукуруза             0.9144981412639405 93
    пшеница яровая       0.9921875  18
    сахарная свекла      0.937007874015748 78
    0.9711124299453381
  Fold: 2
    подсолнечник         0.9347079037800686 135
    картофель            1.0        24
    пшеница озимая       0.9793103448275863 93
    гречиха              1.0        11
    кукуруза             0.9565217391304347 181
    пшеница яровая       1.0        19
    сахарная свекла      0.9384615384615385 143
    

  0%|          | 0/5 [00:00<?, ?it/s]

  Fold: 0
    подсолнечник         0.923943661971831 310


## Подготовка submission

In [None]:
def predict_proba(data, models, best_iterations, selection: Optional[List[int]] = None):
    """Average the predictions of several models.

    Args:
        data: feature table passed to every model's ``predict``.
        models: sequence of trained models.
        best_iterations: 0-based best boosting round of every model, paired
            with ``models``; each model predicts with ``best_iter + 1`` rounds.
        selection: optional positions of the models to use. ``None`` (or an
            empty collection) means all models.

    Returns:
        The element-wise mean of the predictions of the models that were
        actually used.

    Raises:
        ValueError: if no model contributes a prediction.
    """
    total = None
    used_count = 0
    for i, (model, best_iter) in enumerate(zip(models, best_iterations)):
        if selection and i not in selection:
            continue
        proba = model.predict(data, num_iteration=(best_iter + 1))
        # Out-of-place sum: never modify an array owned by the model.
        total = proba if total is None else total + proba
        used_count += 1
    if used_count == 0:
        raise ValueError('No models selected for prediction')
    # Divide by the number of models used, not by len(models): models skipped
    # through `selection` must not dilute the average.
    return total / used_count

In [None]:
train_id = train_data.id.to_numpy()
test_id = test_data.id.to_numpy()

# Distances from every test record to every training record; the sub-matrix
# is selected once and reused for both argmin and min.
test_distances = geo_matrix[np.ix_(test_id, train_id)]
test_neighbor = train_id[test_distances.argmin(axis=1)]

# Crop of the nearest training record, in test row order.
test_crop = train_data.set_index('id')['crop'].loc[test_neighbor].reset_index(drop=True)

for target_index in range(TARGET_COUNT):
    test_data[f'neighbor.crop{target_index}'] = test_crop == target_index

test_data['neighbor.distance'] = test_distances.min(axis=1)

# Same neighbour columns, in the same order, as used during training.
neighbor_feature_names = [f'neighbor.crop{i}' for i in range(TARGET_COUNT)] + ['neighbor.distance']

# Row c: class-c score averaged over all models trained for that class.
predicts = np.array([
    predict_proba(test_data[FEATURES[target_index] + neighbor_feature_names],
                  models[target_index],
                  best_iterations_for_recall_macro[target_index],
                  selection=None)
    for target_index in range(TARGET_COUNT)
])

submission = test_data[['id']].copy()
submission['crop'] = predicts.argmax(axis=0)

submission.to_csv(contest_dir / 'submission.csv', index=False, encoding='utf-8')

### Сохранение предиктов

In [None]:
# Same neighbour columns, in the same order, as used during training. They
# must already be present in test_data.
neighbor_feature_names = [f'neighbor.crop{i}' for i in range(TARGET_COUNT)] + ['neighbor.distance']

# One averaged score vector per class.
predicts = [
    predict_proba(test_data[FEATURES[target_index] + neighbor_feature_names],
                  models[target_index],
                  best_iterations_for_recall_macro[target_index],
                  selection=None)
    for target_index in range(TARGET_COUNT)
]

submission = test_data[['id']].copy()

# Column p<c> holds the averaged score of class c.
for i in range(TARGET_COUNT):
    submission[f'p{i}'] = predicts[i]

submission.to_csv(contest_dir / 'lightgbm.csv', index=False, encoding='utf-8')