### Custom libs to be installed:

In [1]:
# Third-party packages behind the xgboost / lightgbm / rgf imports in the next cell.
# The commands are left commented out; uncomment them to install from inside the notebook.
# !pip install rgf_python
# !pip install xgboost
# !pip install lightgbm

In [3]:
import os
import sys
import glob
import zipfile
import pathlib
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from rgf.sklearn import RGFClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Put ../src/ on the import path so the local imports below (data, model_selection,
# quality_estimator) can be resolved -- presumably those modules live there.
# The relative path is resolved against the kernel's current working directory.
sys.path.append(os.path.abspath('../src/'))
    
from data import prepare_submission
from data import data_generator_test
from data import data_generator_train
from data import datasetDecomposition

from model_selection import cross_val_score

from quality_estimator import BaseQualityEstimator

## Data preparation:

#### Please download Dataset.zip to ../data/

In [5]:
# Directory that holds the downloaded archive; the extraction cell below works relative to it.
raw_root_dir = "../data/"
# File name of the dataset archive expected inside raw_root_dir.
FILENAME = "Dataset.zip"

In [6]:
# Unzip the archive into <raw_root_dir>/raw and move the extracted files up into raw_root_dir.
raw_dir = os.path.join(raw_root_dir, "raw")

# exist_ok=True keeps the cell re-runnable: a plain os.mkdir raises FileExistsError
# as soon as the folder is left over from a previous run.
os.makedirs(raw_dir, exist_ok=True)

with zipfile.ZipFile(os.path.join(raw_root_dir, FILENAME), 'r') as zip_ref:
    zip_ref.extractall(raw_dir)

# The archive is expected to unpack into a top-level "Dataset" folder; every entry
# inside it is moved directly under raw_root_dir.
for file_ in glob.glob(os.path.join(raw_dir, "Dataset", "*")):
    # Build the destination from the entry's base name. Stripping the literal
    # "raw/Dataset/" substring only works with "/" separators and silently turns
    # the rename into a no-op when glob returns OS-native separators.
    os.rename(file_, os.path.join(raw_root_dir, os.path.basename(file_)))

In [7]:
# Working directory for the decomposed dataset: used as the output path of
# datasetDecomposition and as the root for the train/test data generators below.
root_dir = "../data/ellipse"

In [8]:
# Decompose the image dataset into multichannel images: the 1st channel holds the
# logical "or" of all instance channels (as in the original markup) and every
# following channel (starting from the 2nd) holds one separate instance.
# NOTE(review): shape is presumably the (height, width) of the images -- confirm in ../src/data.
datasetDecomposition(input_path="../data/DX_TEST_RESULT_FULL.csv",
                     output_path=root_dir,
                     shape=(1024,1024))

# Move OpenPart.csv next to the decomposed data. The move is guarded so the cell can be
# re-run: an unconditional os.rename raises FileNotFoundError once the file has been moved.
open_part_src = "../data/OpenPart.csv"
open_part_dst = "../data/ellipse/OpenPart.csv"
if os.path.exists(open_part_src):
    os.rename(open_part_src, open_part_dst)
elif not os.path.exists(open_part_dst):
    # Neither location has the file: fail here with a clear message rather than
    # later inside the data generators.
    raise FileNotFoundError(
        "OpenPart.csv not found at {} or {}".format(open_part_src, open_part_dst)
    )

In [9]:
# Read and separate data for train and test (submission):
# X, Xy and y are later passed to q_clf.fit; X_test and Xy_test to q_clf.predict.
# NOTE(review): y is presumably the training target and Xy a second input aligned
# with X -- confirm the exact meaning in data_generator_train / data_generator_test.

X, Xy, y = data_generator_train(root_dir=root_dir)
X_test, Xy_test = data_generator_test(root_dir=root_dir)

## Algorithm initialization:

In [11]:
# First-stage meta-learners that are handed to the quality estimator:
lgbm = LGBMClassifier(random_state=0, max_depth=2)
rgf = RGFClassifier(
    algorithm="RGF_Sib",
    max_leaf=400,
    test_interval=100,
    verbose=False,
)
rf = RandomForestClassifier(random_state=0, n_estimators=100)
xgb = XGBClassifier(random_state=0, n_estimators=10, max_depth=2)

# Quality estimator configured with the metric family names used for the feature search;
# meta_clfs maps a short name to each first-stage learner defined above.
q_clf = BaseQualityEstimator(
    metrics=["dice_coefficient", "mae", "mse"],
    unary_metrics=[],
    matching_metrics=["match2tpr", "match2predBinRate", "match2gtBinRate"],
    meta_clfs=dict(lgbm=lgbm, rgf=rgf, rf=rf, xgb=xgb),
)

## Algorithm fitting:

* 1st-stage metrics: generated across the search space
* 2nd-stage metrics: an aggregation of each metric generated in the search space
* feature selection among the aggregated metrics
* fitting of the 1st-level meta-estimators on the selected features
* optimization of the aggregation function over the predictions of the 1st-level meta-estimators

In [12]:
# Fit the quality estimator on the training data (the stages are listed in the markdown above).
# The call is the cell's last expression, so the estimator's repr is displayed as output.
q_clf.fit(X, Xy, y)

BaseQualityEstimator(matching_metrics=['match2tpr', 'match2predBinRate',
                                       'match2gtBinRate'],
                     meta_clfs={'lgbm': LGBMClassifier(max_depth=2,
                                                       random_state=0),
                                'rf': RandomForestClassifier(random_state=0),
                                'rgf': RGFClassifier(algorithm='RGF_Sib',
                                                     max_leaf=400,
                                                     verbose=False),
                                'xgb': XGBClassifier(base_score=0.5,
                                                     booster='gbtree',
                                                     colsample_bylevel=1,
                                                     colsample_bynod...
                                                     interaction_constraints='',
                                                     learning_rate=0.3000

In [13]:
# Predictions of the fitted meta-classifier on the test (submission) data;
# y_pred is passed to prepare_submission further down.
y_pred = q_clf.predict(X_test, Xy_test)

In [14]:
# Example of automatically selected features: the entry stored under the 'rf' key,
# which is the key the random forest was registered under in meta_clfs.
print(q_clf.selected_features['rf'])

['dice_coefficient'
 'match2gtBinRate_0.16000000000000003_coverageCalculation_iou_0.02'
 'match2predBinRate_0.01_iou_iou_0.02'
 'match2predBinRate_0.060000000000000005_dice_score_iou_0.02'
 'match2predBinRate_0.11_dice_score_iou_0.02'
 'match2predBinRate_0.11_iou_iou_0.02'
 'match2predBinRate_0.16000000000000003_coverageCalculation_iou_0.02'
 'match2predBinRate_0.21000000000000002_dice_score_iou_0.02'
 'match2predBinRate_0.66_coverageCalculation_iou_0.02'
 'match2tpr_iou_0.25' 'match2tpr_iou_0.7']


## Submission preparation:

In [15]:
# NOTE(review): root_dir is re-bound here with a trailing slash (it was "../data/ellipse"
# earlier). Presumably prepare_submission appends file names to it directly -- confirm
# before unifying the two values.
root_dir = "../data/ellipse/"

# Build the submission object from the test predictions; it is previewed with .head()
# and written with .to_csv() in the following cells.
SecretPart_427 = prepare_submission(y_pred, root_dir)

In [16]:
# Show the first three rows of the submission as a quick sanity check.
SecretPart_427.head(3)

Unnamed: 0,Case,Sample 1,Sample 2,Sample 3
0,00011827_003.png,1.0,1.0,1.0
1,00011925_072.png,4.25,5.0,3.25
2,00012045_019.png,1.0,5.0,1.0


In [17]:
# Write the submission to ../data/SecretPart_427.csv; index=False leaves the index out of the file.
SecretPart_427.to_csv("../data/SecretPart_427.csv", index=False)