In [1]:
import pandas as pd
import numpy as np

from typing import Tuple

from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# Построние выборок

Все выборки(тренировочная, валидационная, тестовая) строились единственным образом, отличалось только кол-во элементов.

## Генерация эллипсов и гипербол
Для генерации эллипсов и гипербол просто генерировались три коэффициента случайным образом, диапозоном от -100 до 100. И в зависимости от знака дискриминанта принималось решение куда относить эти коэффициенты. Если он равен нулю то преход на следующую итерацию цикла, больше нуля гипербола, меньще нуля парабола.

## Генерация парабол
Из сходя из того что у дискриминат должен быть равен нулю если коническое сечение представляет собой параболу. То генерировать случайным образом коэффициенты - плохое решение. Я генерировал коэффициенты a, b случайным образом а с по по формуле представленной ниже. 

In [2]:
eps = 0.00001

def generate_parabola_coef() -> Tuple[float, float, float]:
    a = np.random.rand() * 200 - 100
    b = np.random.rand() * 200 - 100
    c = b**2 / 4 / a
    
    return a, b, c

In [3]:
np.random.seed(0)

def generate_coeff_set(count_samples: int) -> pd.DataFrame:
    current_count = {
        'ellipse': 0,
        'hyperbola': 0
    }    
    set_is_full = False
    res = pd.DataFrame()
    
    while not set_is_full:
        a = np.random.rand() * 200 - 100 # from -100 to 100
        b = np.random.rand() * 200 - 100
        c = np.random.rand() * 200 - 100
        
        if np.absolute(a) < eps and np.absolute(a) < eps and np.absolute(a) < eps: 
            continue
        
        discriminant = b * b - 4 * a * c
        current_figure = None
        
        if np.absolute(discriminant) < eps:
            continue
        elif discriminant > 0:
            current_figure = 'hyperbola'
        else:
            current_figure = 'ellipse'
        
        if current_count[current_figure] == count_samples:
            continue
        
        current_count[current_figure] += 1
        res = res.append({
            'A': a,
            'B': b,
            'C': c,
            'figure': current_figure 
        }, ignore_index=True)
        
        is_full = True
        for value in current_count.values():
            is_full = is_full and (value == count_samples)
        
        set_is_full = is_full
        
    parabolas_count = 0
    while parabolas_count < count_samples:
        a, b, c = generate_parabola_coef()
        
        if np.absolute(a) < eps and np.absolute(a) < eps and np.absolute(a) < eps: 
            continue
        
        res = res.append({
            'A': a,
            'B': b,
            'C': c,
            'figure': 'parabola' 
        }, ignore_index=True)
        parabolas_count += 1
    
    return res.sample(frac=1).reset_index(drop=True)

In [4]:
def form_sets_from_generated_data(count_examples_per_class, test_ratio):
    data_train = generate_coeff_set(count_examples_per_class)
    data_test = generate_coeff_set(int(count_examples_per_class * test_ratio))
    
    le = LabelEncoder()
    data_train['figure'] = le.fit_transform(data_train['figure'])
    # data_train = pd.get_dummies(data_train, columns=['figure'])
    data_test['figure'] = le.transform(data_test['figure'])
    # data_test = pd.get_dummies(data_test, columns=['figure'])
    x_train, x_val, y_train, y_val = train_test_split(data_train.iloc[:, :3], data_train.iloc[:, 3:], test_size=0.1)
    x_test, y_test = data_test.iloc[:, :3], data_test.iloc[:, 3:]
    
    return x_train, y_train, x_val, y_val, x_test, y_test 

In [5]:
def form_sets_from_file():
    data_train = pd.read_csv('./train.csv')
    data_val = pd.read_csv('./val.csv')
    data_test = pd.read_csv('./test.csv')
    
    x_train, y_train = data_train.iloc[:, :3], data_train.iloc[:, 3:]
    x_val, y_val = data_val.iloc[:, :3], data_val.iloc[:, 3:]
    x_test, y_test = data_test.iloc[:, :3], data_test.iloc[:, 3:]
    
    return x_train, y_train, x_val, y_val, x_test, y_test 

In [21]:
count_examples_per_class = 20_000
test_ratio = 0.1

# x_train, y_train, x_val, y_val, x_test, y_test = form_sets_from_generated_data(count_examples_per_class, test_ratio)
x_train, y_train, x_val, y_val, x_test, y_test = form_sets_from_file()

In [19]:
def save_to_csv(x: pd.DataFrame, y: pd.DataFrame, file_name: str) -> None:
    res = x.join(y)
    res.to_csv(f'./{file_name}.csv', index=False)

In [20]:
# Uncomment in order to save data
# save_to_csv(x_train, y_train, 'train')
# save_to_csv(x_val, y_val, 'val')
# save_to_csv(x_test, y_test, 'test')

In [22]:
print(f'x_train shape = {x_train.shape}')
print(f'x_val shape = {x_val.shape}')
print(f'x_test shape = {x_test.shape}')
print(f'y_test shape = {y_test.shape}')

x_train shape = (54000, 3)
x_val shape = (6000, 3)
x_test shape = (6000, 3)
y_test shape = (6000, 1)


In [23]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [24]:
def calc_metrics(model) -> float:
    res = rf.predict(x_test)
    score = accuracy_score(y_test, res)
    return score

In [25]:
print(f'Random forest accuracy score = {calc_metrics(rf)}')

Random forest accuracy score = 0.9533333333333334


In [26]:
cat_boost = CatBoostClassifier()

In [27]:
cat_boost.fit(x_train, y_train,
              eval_set=(x_val, y_val),
              verbose=False
             )

<catboost.core.CatBoostClassifier at 0x1a25ca0a20>

In [28]:
print(f'CatBoost accuracy score = {calc_metrics(cat_boost)}')

CatBoost accuracy score = 0.9533333333333334


In [29]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [30]:
print(f'Logistic regression accuracy score = {calc_metrics(lr)}')

Logistic regression accuracy score = 0.9533333333333334


In [31]:
xgboost = XGBClassifier()
xgboost.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [32]:
print(f'XGBoost accuracy score = {calc_metrics(xgboost)}')

XGBoost accuracy score = 0.9533333333333334
