In [1]:
import datetime
import random
from datetime import timedelta

from sklearn.preprocessing import StandardScaler, RobustScaler

from sklearn.neighbors import KNeighborsClassifier

from core.composer.gp_composer.gp_composer import \
    GPComposer, GPComposerRequirements
from core.composer.visualisation import ComposerVisualiser
from core.repository.model_types_repository import ModelTypesRepository
from core.repository.quality_metrics_repository import \
    ClassificationMetricsEnum, MetricsRepository
from core.repository.tasks import Task, TaskTypesEnum
from core.utils import probs_to_labels
from examples.utils import create_multi_clf_examples_from_excel


import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score as roc_auc
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report, confusion_matrix
from benchmark.benchmark_utils import get_scoring_case_data_paths
from core.composer.chain import Chain
from core.composer.node import PrimaryNode, SecondaryNode
from core.models.data import InputData


random.seed(1)
np.random.seed(1)


def get_model(train_file_path: str, cur_lead_time: datetime.timedelta = timedelta(minutes=5)):
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)

    # the search of the models provided by the framework
    # that can be used as nodes in a chain for the selected task
    models_repo = ModelTypesRepository()
    available_model_types, _ = models_repo.suitable_model(task_type=task.task_type)

    metric_function = MetricsRepository(). \
        metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_lead_time=cur_lead_time, max_arity=3,
        max_depth=4, pop_size=20, num_of_generations=100, 
        crossover_prob = 0.8, mutation_prob = 0.8, 
        add_single_model_chains = True)

    # Create the genetic programming-based composer, that allow to find
    # the optimal structure of the composite model
    composer = GPComposer()

    # run the search of best suitable model
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                initial_chain=None,
                                                composer_requirements=composer_requirements,
                                                metrics=metric_function, is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose)

    return chain_evo_composed


def apply_model_to_data(model: Chain, data_path: str):
    df, file_path = create_multi_clf_examples_from_excel(data_path, return_df=True)
    dataset_to_apply = InputData.from_csv(file_path, with_target=True)
    evo_predicted = model.predict(dataset_to_apply)
    df['forecast'] = probs_to_labels(evo_predicted.predict)
    return df


def apply_model_to_data_and_predict(model: Chain, data_path: str):
    df, file_path = create_multi_clf_examples_from_excel(data_path, return_df=True)
    
    dataset_to_validate = InputData.from_csv(data_path)
    predicted_labels = model.predict(dataset_to_validate).predict
    
    
    test_data = InputData.from_csv(file_path, with_target=True)
    roc_auc_valid = roc_auc(y_true=test_data.target,
                                  y_score=predicted_labels,
                                  multi_class='ovo',
                                  average='macro')
    
    roc_auc_st = roc_auc(y_true=test_data.target, y_score=predicted_labels.round())
    
    p = precision_score(y_true=test_data.target,y_pred=predicted_labels.round())
    r = recall_score(y_true=test_data.target, y_pred=predicted_labels.round())
    a = accuracy_score(y_true=test_data.target, y_pred=predicted_labels.round())
    
    return roc_auc_valid, roc_auc_st, p, r, a


def validate_model_quality(model: Chain, data_path: str):
    dataset_to_validate = InputData.from_csv(data_path)
    predicted_labels = model.predict(dataset_to_validate).predict

    roc_auc_valid = round(roc_auc(y_true=test_data.target,
                                  y_score=predicted_labels,
                                  multi_class='ovo',
                                  average='macro'), 3)
    
    roc_auc_st = roc_auc(y_true=test_data.target,y_score=predicted_labels)
                              
    p = precision_score(y_true=test_data.target,y_pred=predicted_labels.round())
    r = recall_score(y_true=test_data.target, y_pred=predicted_labels.round())
    a = accuracy_score(y_true=test_data.target, y_pred=predicted_labels.round())
    
    return roc_auc_valid, roc_auc_st, p, r, a


import time

In [2]:
def get_simple_chain():
    first = PrimaryNode(model_type='knn')
    chain = Chain(first)

    return chain

In [3]:
file_path_first = r'./creditcard_scaling_underSample.csv'
create_multi_clf_examples_from_excel(file_path_first, return_df = True);

In [4]:
train_file_path = r'./examples/data/creditcard_scaling_underSample/creditcard_scaling_underSample.csv'
test_file_path = r'./examples/data/creditcard_scaling_underSample/creditcard_scaling_underSample.csv'

In [5]:
train_data = InputData.from_csv(train_file_path)
test_data = InputData.from_csv(test_file_path)

In [6]:
chain = get_simple_chain()

In [7]:
start = time.time()
chain.fit(train_data, use_cache=False)
end = time.time()
print(end-start)

39.58692240715027


In [8]:
before_tuning_predicted = chain.predict(test_data)

In [9]:
bfr_tun_roc_auc = round(roc_auc(y_true=test_data.target,
                          y_score=before_tuning_predicted.predict), 4)

p = round(precision_score(test_data.target,before_tuning_predicted.predict.round()), 4)
r = round(recall_score(y_true=test_data.target, y_pred=before_tuning_predicted.predict.round()), 4)
a = round(accuracy_score(y_true=test_data.target, y_pred=before_tuning_predicted.predict.round()),4 )
f = round(f1_score(y_true=test_data.target, y_pred=before_tuning_predicted.predict.round()), 4)

In [10]:
print(f'ROC_AUC = {bfr_tun_roc_auc} \nPRECISION = {p} \nRECALL = {r} \nACCURACY = {a} \nf1_score = {f}')

ROC_AUC = 0.9896 
PRECISION = 0.9477 
RECALL = 0.9207 
ACCURACY = 0.935 
f1_score = 0.934


# Профайлинг

In [13]:
import cProfile


def profile(func):
    """Decorator for run function profile"""
    def wrapper(*args, **kwargs):
        profile_filename = func.__name__ + '.prof'
        profiler = cProfile.Profile()
        result = profiler.runcall(func, *args, **kwargs)
        profiler.dump_stats(profile_filename)
        return result
    return wrapper


@profile
def knn_fit():
    
    def get_simple_chain():
        first = PrimaryNode(model_type='knn')
        chain = Chain(first)

        return chain

    chain = get_simple_chain()
    
    train_data = InputData.from_csv(r'./examples/data/creditcard_scaling_underSample/creditcard_scaling_underSample.csv')
    
    chain.fit(train_data, use_cache=False)

In [15]:
knn_fit()

In [16]:
import cProfile


def profile(func):
    """Decorator for run function profile"""
    def wrapper(*args, **kwargs):
        profile_filename = func.__name__ + '.prof'
        profiler = cProfile.Profile()
        result = profiler.runcall(func, *args, **kwargs)
        profiler.dump_stats(profile_filename)
        return result
    return wrapper


@profile
def knn_fit_1():
    
    chain.fit(train_data, use_cache=False)

In [17]:
knn_fit_1()

# Applying FEDOT model to full data

In [11]:
test_data = InputData.from_csv(r'./creditcard_scaling.csv')
before_tuning_predicted = chain.predict(test_data)

In [12]:
bfr_tun_roc_auc = round(roc_auc(y_true=test_data.target,
                          y_score=before_tuning_predicted.predict), 4)

p = round(precision_score(test_data.target,before_tuning_predicted.predict.round()), 4)
r = round(recall_score(y_true=test_data.target, y_pred=before_tuning_predicted.predict.round()), 4)
a = round(accuracy_score(y_true=test_data.target, y_pred=before_tuning_predicted.predict.round()), 4)
f = round(f1_score(y_true=test_data.target, y_pred=before_tuning_predicted.predict.round()), 4)

In [13]:
print(f'ROC_AUC = {bfr_tun_roc_auc} \nPRECISION = {p} \nRECALL = {r} \nACCURACY = {a} \nf1_score = {f}')

ROC_AUC = 0.9741 
PRECISION = 0.0329 
RECALL = 0.8963 
ACCURACY = 0.9543 
f1_score = 0.0635


# From Sklearn

In [11]:
train_file_path = r'./examples/data/creditcard_scaling_underSample/train.csv'
test_file_path = r'./examples/data/creditcard_scaling_underSample/test.csv'

In [12]:
df_train = pd.read_csv(train_file_path)
df_test = pd.read_csv(test_file_path)

In [13]:
X_train = df_train.drop(columns = ['Class'])
y_train = df_train.iloc[:,-1]

X_test = df_test.drop(columns = ['Class'])
y_test = df_test.iloc[:,-1]

In [14]:
neigh = KNeighborsClassifier()

In [15]:
start = time.time()
neigh.fit(X_train, y_train)
end = time.time()
print(end-start)

0.004996538162231445


In [16]:
y_pred = neigh.predict(X_test)

In [17]:
bfr_tun_roc_auc = round(roc_auc(y_true=y_test, y_score=y_pred), 4)

p = round(precision_score(y_test, y_pred), 4)
r = round(recall_score(y_test, y_pred), 4)
a = round(accuracy_score(y_test, y_pred), 4)
f = round(f1_score(y_test, y_pred), 4)

In [18]:
print(f'ROC_AUC = {bfr_tun_roc_auc} \nPRECISION = {p} \nRECALL = {r} \nACCURACY = {a} \nf1_score = {f}')

ROC_AUC = 0.9268 
PRECISION = 0.9294 
RECALL = 0.908 
ACCURACY = 0.9289 
f1_score = 0.9186


# Applying SKLEARN model to full data

In [22]:
data = pd.read_csv(r'./creditcard_scaling.csv')
X_all = data.drop(columns=['Class'])
y_all = data.iloc[:,-1]
y_pred_all = neigh.predict(X_all)

In [23]:
bfr_tun_roc_auc = round(roc_auc(y_all, y_pred_all), 4)

p = round(precision_score(y_all, y_pred_all), 4)
r = round(recall_score(y_all, y_pred_all), 4)
a = round(accuracy_score(y_all, y_pred_all), 4)
f = round(f1_score(y_all, y_pred_all), 4)

In [24]:
print(f'ROC_AUC = {bfr_tun_roc_auc} \nPRECISION = {p} \nRECALL = {r} \nACCURACY = {a} \nf1_score = {f}')

ROC_AUC = 0.9317 
PRECISION = 0.027 
RECALL = 0.9207 
ACCURACY = 0.9426 
f1_score = 0.0525
