In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import pickle
import warnings

warnings.filterwarnings("ignore")

In [2]:
# Load the prepared dataset; the first CSV column is used as the row index.
data_path = 'D:/Maga/6th module/MLOps/ml/data/final_df.csv'
df = pd.read_csv(data_path, index_col=0)

df

Unnamed: 0,brand,categories,colors,price,is_available,condition,is_sale,source,shipping,is_special_offer,is_returnable,manufacturer
1284,Forever Young,shoes,black,29.99,1,New,1,Overstock.com,No,1,0,RB-1904
1284,Forever Young,clothing,black,29.99,1,New,1,Overstock.com,No,1,0,RB-1904
1286,Forever Young,shoes,black,49.88,1,New,1,Overstock.com,No,1,0,RB-1904
1286,Forever Young,clothing,black,49.88,1,New,1,Overstock.com,No,1,0,RB-1904
7862,Hush Puppies,shoes,black,23.54,1,New,1,Overstock.com,No,1,0,H509362-635-75 M US
...,...,...,...,...,...,...,...,...,...,...,...,...
33797,Novica,clothing,red,39.99,1,New,1,Overstock.com,No,0,0,276629
33797,Novica,shoes,red,39.99,1,New,1,Overstock.com,No,0,0,276629
33798,Novica,accessories,red,35.99,1,New,0,Overstock.com,No,0,0,276629
33798,Novica,clothing,red,35.99,1,New,0,Overstock.com,No,0,0,276629


In [3]:
# Categorical features:
# brand
# condition
# source
# shipping
# manufacturer
# categories
# colors

# Keep the names of all columns whose dtype is `object`.
is_object_column = (df.dtypes == 'object').to_numpy()
categorical_cols = df.columns[is_object_column]
categorical_cols

Index(['brand', 'categories', 'colors', 'condition', 'source', 'shipping',
       'manufacturer'],
      dtype='object')

In [4]:
# Label-encode every categorical column and save each encoder's classes to a file.
#
# This cell overwrites `df` in place. Columns that are already numeric are
# therefore skipped: without the guard, running the cell a second time would fit
# the encoder on the integer codes and overwrite the saved class files with
# those codes instead of the original labels.
for col in categorical_cols:
    if pd.api.types.is_numeric_dtype(df[col]):
        print(f'Column={col} is already encoded, skipping')
        continue
    encoder = LabelEncoder()
    encoder.fit(df[col])
    save_path = f'D:/Maga/6th module/MLOps/ml/data/encoders/{col}_classes.npy'
    print(f'Save classes for column={col}, path: {save_path}')
    np.save(save_path, encoder.classes_)
    df[col] = encoder.transform(df[col])

df

Save classes for column=brand, path: D:/Maga/6th module/MLOps/ml/data/encoders/brand_classes.npy
Save classes for column=categories, path: D:/Maga/6th module/MLOps/ml/data/encoders/categories_classes.npy
Save classes for column=colors, path: D:/Maga/6th module/MLOps/ml/data/encoders/colors_classes.npy
Save classes for column=condition, path: D:/Maga/6th module/MLOps/ml/data/encoders/condition_classes.npy
Save classes for column=source, path: D:/Maga/6th module/MLOps/ml/data/encoders/source_classes.npy
Save classes for column=shipping, path: D:/Maga/6th module/MLOps/ml/data/encoders/shipping_classes.npy
Save classes for column=manufacturer, path: D:/Maga/6th module/MLOps/ml/data/encoders/manufacturer_classes.npy


Unnamed: 0,brand,categories,colors,price,is_available,condition,is_sale,source,shipping,is_special_offer,is_returnable,manufacturer
1284,281,2,1,29.99,1,1,1,12,1,1,0,1831
1284,281,1,1,29.99,1,1,1,12,1,1,0,1831
1286,281,2,1,49.88,1,1,1,12,1,1,0,1831
1286,281,1,1,49.88,1,1,1,12,1,1,0,1831
7862,324,2,1,23.54,1,1,1,12,1,1,0,1355
...,...,...,...,...,...,...,...,...,...,...,...,...
33797,500,1,6,39.99,1,1,1,12,1,0,0,293
33797,500,2,6,39.99,1,1,1,12,1,0,0,293
33798,500,0,6,35.99,1,1,0,12,1,0,0,293
33798,500,1,6,35.99,1,1,0,12,1,0,0,293


In [5]:
# Separate the feature matrix (every column except `price`) from the target.
X = df.drop(columns=['price']).to_numpy()
y = df['price'].to_numpy()

# Hold out 20% of the rows as the test set; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [6]:
import joblib

# Normalize the target with a min-max scaler fitted on the training part only,
# and save the fitted scaler to a file.
y_train_column = y_train.reshape(-1, 1)  # the scaler is given a single-column 2-D array
scaler = MinMaxScaler()
scaler.fit(y_train_column)

scaler_filename = 'D:/Maga/6th module/MLOps/ml/data/scalers/y_scaler.save'
print(f'Saving y scaler to {scaler_filename}')
joblib.dump(scaler, scaler_filename)

y_train_norm = scaler.transform(y_train_column)

y_train_norm

Saving y scaler to D:/Maga/6th module/MLOps/ml/data/scalers/y_scaler.save


array([[0.0461171 ],
       [0.07938573],
       [0.03633064],
       ...,
       [0.03935931],
       [0.02268164],
       [0.0491391 ]])

Возьму для реализации дерево решений с максимальной глубиной 3

In [7]:
# для удобной печати метрик

from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import r2_score as R2

def show_results():
    """Return the collected metrics as a DataFrame indexed by model name.

    Reads the module-level `results` dictionary. As a side effect, sets the
    pandas `display.float_format` option so floats are rendered with nine
    decimal places.
    """
    metrics_table = pd.DataFrame(results).set_index('model')
    pd.set_option('display.float_format', lambda x: '%.9f' % x)
    return metrics_table

def add_metric(title, rmse, mae, r2):
    """Append one model's metrics to the module-level `results` dictionary.

    Args:
        title: model name, stored under the 'model' key.
        rmse: value stored under the 'RMSE' key.
        mae: value stored under the 'MAE' key.
        r2: value stored under the 'R2' key.
    """
    for key, value in (('model', title), ('RMSE', rmse), ('MAE', mae), ('R2', r2)):
        results[key].append(value)

def calculate_metrics(title, y_train, predict_train, y_test, predict_test):
    """Print train/test RMSE, MAE and R2 for a model and record the test values.

    The test-set metrics are passed to `add_metric`, which stores them under
    `title`; the train-set metrics are only printed.

    Args:
        title: model name, printed as a header and used as the record label.
        y_train: true target values of the training set.
        predict_train: predictions for the training set.
        y_test: true target values of the test set.
        predict_test: predictions for the test set.
    """
    print(title)
    print('RMSE')
    # RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
    # keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so the
    # square root keeps this helper working on both old and new versions.
    rmse = np.sqrt(MSE(y_test, predict_test))
    print('Train: ', np.sqrt(MSE(y_train, predict_train)))
    print('Test: ', rmse)
    print('MAE')
    mae = MAE(y_test, predict_test)
    print('Train: ', MAE(y_train, predict_train))
    print('Test: ', mae)
    print('R2')
    r2 = R2(y_test, predict_test)
    print('Train: ', R2(y_train, predict_train))
    print('Test: ', r2)
    add_metric(title, rmse, mae, r2)
    print()

# Module-level storage used by the metric helpers in this cell:
# `results` maps each metric name to a list with one entry per recorded model.
indexes = []
results = {key: [] for key in ('model', 'RMSE', 'MAE', 'R2')}

In [8]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree of depth 3 on the normalized target and compute its
# metrics (the result is not great, but that does not matter for this task).
model = DecisionTreeRegressor(max_depth=3)
model.fit(X_train, y_train_norm)

# The model predicts in the normalized scale ...
predict_train_norm, predict_test_norm = model.predict(X_train), model.predict(X_test)

# ... so the predictions are mapped back to prices before scoring.
predict_train, predict_test = (
    scaler.inverse_transform(normalized.reshape(-1, 1))
    for normalized in (predict_train_norm, predict_test_norm)
)

calculate_metrics('DecisionTreeRegressor', y_train, predict_train, y_test, predict_test)

DecisionTreeRegressor
RMSE
Train:  49.5894782355055
Test:  52.81906489506599
MAE
Train:  27.251142409268073
Test:  28.584168508426902
R2
Train:  0.28809023855587956
Test:  0.351412336204326



In [9]:
from sklearn.tree import _tree

# Print the fitted tree's decision rules in a readable, Python-like form.
def tree_to_code(tree, feature_names):
    """Print a fitted tree as nested if/else pseudo-code.

    Args:
        tree: estimator exposing a `tree_` attribute with `feature`,
            `threshold`, `children_left`, `children_right` and `value` arrays.
        feature_names: feature names, indexed by the ids stored in
            `tree_.feature`; also printed as the pseudo-function's arguments.
    """
    fitted = tree.tree_

    # Name of the feature tested at each node; leaves have no feature.
    node_names = []
    for feature_id in fitted.feature:
        if feature_id != _tree.TREE_UNDEFINED:
            node_names.append(feature_names[feature_id])
        else:
            node_names.append("undefined!")

    print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "  " * depth
        # Leaf: print the stored value and stop descending.
        if fitted.feature[node] == _tree.TREE_UNDEFINED:
            print("{}return {}".format(indent, fitted.value[node]))
            return
        name = node_names[node]
        threshold = fitted.threshold[node]
        print("{}if {} <= {}:".format(indent, name, threshold))
        recurse(fitted.children_left[node], depth + 1)
        print("{}else:  # if {} > {}".format(indent, name, threshold))
        recurse(fitted.children_right[node], depth + 1)

    recurse(0, 1)

# Feature names are all columns except the target `price`.
feature_columns = df.columns.drop('price')
tree_to_code(model, feature_columns)

def tree(brand, categories, colors, is_available, condition, is_sale, source, shipping, is_special_offer, is_returnable, manufacturer):
  if source <= 23.5:
    if source <= 11.5:
      if manufacturer <= 738.5:
        return [[0.10781685]]
      else:  # if manufacturer > 738.5
        return [[0.05531139]]
    else:  # if source > 11.5
      if source <= 12.5:
        return [[0.03173469]]
      else:  # if source > 12.5
        return [[0.03950162]]
  else:  # if source > 23.5
    if manufacturer <= 501.5:
      if is_sale <= 0.5:
        return [[0.52075897]]
      else:  # if is_sale > 0.5
        return [[0.2144626]]
    else:  # if manufacturer > 501.5
      if is_sale <= 0.5:
        return [[0.22034257]]
      else:  # if is_sale > 0.5
        return [[0.09084402]]


In [14]:
def get_prediction(data, model_params):
    """Walk a nested rule dictionary and return the prediction of the reached leaf.

    Args:
        data: mapping from field name to the value of that field.
        model_params: rule node. A leaf is a dict with a 'prediction' key. Any
            other node has a 'field' key and a 'conditions' dict holding
            'bound', 'lower_or_equal' (taken when the field value is <= bound)
            and 'upper' (taken otherwise).

    Returns:
        The value stored under 'prediction' in the reached leaf.
    """
    node = model_params
    while 'prediction' not in node:
        field_name = node['field']
        rule = node['conditions']
        bound_value = rule['bound']
        if data[field_name] <= bound_value:
            node = rule['lower_or_equal']
        else:
            node = rule['upper']
    return node['prediction']

# Decision rules of the depth-3 tree as a nested dictionary.
# An internal node has a 'field' and a 'conditions' dict with the split 'bound'
# and the two child nodes; a leaf has only a 'prediction'.
DECISION_TREE_PARAMS = {
    'field': 'source',
    'conditions': {
        'bound': 23.5,
        'lower_or_equal': {
            'field': 'source',
            'conditions': {
                'bound': 11.5,
                'lower_or_equal': {
                    'field': 'manufacturer',
                    'conditions': {
                        'bound': 738.5,
                        'lower_or_equal': {
                            'prediction': 0.10781685
                        },
                        'upper': {
                            'prediction': 0.05531139
                        }
                    }
                },
                'upper': {
                    'field': 'source',
                    'conditions': {
                        'bound': 12.5,
                        'lower_or_equal': {
                            'prediction': 0.03173469
                        },
                        'upper': {
                            'prediction': 0.03950162
                        }
                    }
                }
            }
        },
        'upper': {
            'field': 'manufacturer',
            'conditions': {
                'bound': 501.5,
                'lower_or_equal': {
                    'field': 'is_sale',
                    'conditions': {
                        'bound': 0.5,
                        'lower_or_equal': {
                            'prediction': 0.52075897
                        },
                        'upper': {
                            'prediction': 0.2144626
                        }
                    }
                },
                'upper': {
                    'field': 'is_sale',
                    'conditions': {
                        'bound': 0.5,
                        'lower_or_equal': {
                            'prediction': 0.22034257
                        },
                        'upper': {
                            'prediction': 0.09084402
                        }
                    }
                }
            }
        }
    }
}

# File the rule dictionary is pickled to.
DECISION_TREE_PARAMS_PATH = 'D:/Maga/6th module/MLOps/ml/data/params/decision_tree.pkl'


def predict_value(data):
    """Predict a value for one feature row using the hand-written decision rules.

    On the first call after this function is defined, the rule dictionary is
    also pickled to DECISION_TREE_PARAMS_PATH. The rules are constant, so later
    calls skip the write instead of rewriting the same file for every row.

    Args:
        data: one row of feature values, indexable by position, ordered like
            the columns of the module-level `df` with 'price' removed.

    Returns:
        The prediction stored in the leaf that the row reaches.
    """
    # Save the parameters to a file (once, not on every call).
    if not getattr(predict_value, '_params_saved', False):
        with open(DECISION_TREE_PARAMS_PATH, 'wb') as f:
            pickle.dump(DECISION_TREE_PARAMS, f)
        predict_value._params_saved = True

    # Only the column labels are needed, so drop 'price' from the column index
    # rather than copying the whole DataFrame with df.drop(...).
    cols = df.columns.drop('price')
    columns_indexes = {name: position for position, name in enumerate(cols)}

    # The rules only test these three fields.
    data = {
        'source': data[columns_indexes['source']],
        'manufacturer': data[columns_indexes['manufacturer']],
        'is_sale': data[columns_indexes['is_sale']]
    }
    return get_prediction(data, DECISION_TREE_PARAMS)

In [15]:
# предсказание на тестовом множестве
X_test_norm = []

for row in X_test:
    X_test_norm.append(predict_value(row))

print(X_test_norm)

[0.03173469, 0.03173469, 0.03173469, 0.03173469, 0.03173469, 0.03173469, 0.03950162, 0.03173469, 0.03950162, 0.03173469, 0.03173469, 0.03173469, 0.03950162, 0.03173469, 0.03950162, 0.03950162, 0.03950162, 0.03173469, 0.03173469, 0.03173469, 0.03173469, 0.03173469, 0.03173469, 0.03950162, 0.03173469, 0.03950162, 0.03173469, 0.03950162, 0.03173469, 0.03173469, 0.03173469, 0.03950162, 0.03950162, 0.03950162, 0.03950162, 0.03173469, 0.03173469, 0.03950162, 0.03950162, 0.03173469, 0.03173469, 0.03173469, 0.03173469, 0.03950162, 0.03173469, 0.03173469, 0.03173469, 0.03173469, 0.03173469, 0.03950162, 0.03950162, 0.03173469, 0.03173469, 0.03173469, 0.03173469, 0.03950162, 0.03950162, 0.03950162, 0.03173469, 0.03950162, 0.03173469, 0.03173469, 0.03173469, 0.03173469, 0.03173469, 0.03950162, 0.03950162, 0.03950162, 0.03173469, 0.03950162, 0.03173469, 0.03173469, 0.03950162, 0.03950162, 0.03173469, 0.03950162, 0.03950162, 0.03173469, 0.03950162, 0.03173469, 0.03173469, 0.03173469, 0.03173469, 0.0

In [16]:
# денормализация предсказаний
predict_test = scaler.inverse_transform(predict_test_norm.reshape(-1, 1))

predict_test

array([[48.56061882],
       [48.56061882],
       [48.56061882],
       ...,
       [60.20333033],
       [60.20333033],
       [48.56061882]])

In [17]:
# Compute the metrics for the hand-written algorithm.
# The train predictions are recomputed with predict_value too, so that both the
# train and the test numbers describe the hand-written rules rather than the
# scikit-learn model's `predict_train`.
predict_train_alg = scaler.inverse_transform(
    np.array([predict_value(row) for row in X_train]).reshape(-1, 1)
)
calculate_metrics('DecisionTreeRegressor Alg', y_train, predict_train_alg, y_test, predict_test)

DecisionTreeRegressor Alg
RMSE
Train:  49.5894782355055
Test:  52.81906489506599
MAE
Train:  27.251142409268073
Test:  28.584168508426902
R2
Train:  0.28809023855587956
Test:  0.351412336204326



Данные сходятся, значит, алгоритм реализован корректно. Можно переносить его в приложение.