In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tsfresh
from sklearn.compose import ColumnTransformer

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold
import numpy as np
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_regression
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
import numpy as np
import math

def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    
    error_sum = 0.0

    for i in range(len(y_true)):
        error_sum += (math.log1p(max(0, y_pred[i])) - math.log1p(max(0, y_true[i])))**2

    mean_error = np.sqrt(error_sum / len(y_true))
    return mean_error


In [3]:
def metrics(y,y_pred):
    r2 = r2_score(y, y_pred)
    mae= mean_absolute_error(y, y_pred)
    mape =mean_absolute_percentage_error(y, y_pred)
    mse=mean_squared_error(y, y_pred)
    print(f"metrics: \nMAPE: {(mape*100).round(3)}%  R2: {r2.round(3)}  MAE: {mae.round(3)}  MSE: {mse.round(3)}")

Функция RSMLE для Catboost позаимствованна у : https://habr.com/ru/sandbox/163469/ 

In [4]:
import math
class RMSLE(object):
    def calc_ders_range(self, approxes, targets, weights):
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        for index in range(len(targets)):
            val = max(approxes[index], 0)
            der1 = math.log1p(targets[index]) - math.log1p(max(0, approxes[index]))
            der2 = -1 / (max(0, approxes[index]) + 1)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result
class RMSLE_val(object):
    def get_final_error(self, error, weight):
        return np.sqrt(error / (weight + 1e-38))

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            error_sum += w * ((math.log1p(max(0, approx[i])) - math.log1p(max(0, target[i])))**2)

        return error_sum, weight_sum

In [5]:
data=pd.read_csv('train_regr.csv', low_memory=False)
data=data.fillna(0)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22232 entries, 0 to 22231
Columns: 453 entries, ESP1_Feeding_m3_day to lastStartDate
dtypes: float64(429), int64(22), object(2)
memory usage: 76.8+ MB


In [6]:
data['SK_Calendar'] = pd.to_datetime(data['SK_Calendar'])
data['FailureDate'] = pd.to_datetime(data['FailureDate'])

In [7]:
# missing_values_count = data.isna().sum()

# print("Количество пропущенных значений в каждой колонке:")
# for column, count in missing_values_count.items():
#     print(f"{column}: {count}")

In [8]:
data[['SK_Calendar','FailureDate','SK_Well']].head(17317)

Unnamed: 0,SK_Calendar,FailureDate,SK_Well
0,2017-11-20,2018-10-24,0.002841
1,2017-11-21,2018-10-24,0.002841
2,2017-11-22,2018-10-24,0.002841
3,2017-11-23,2018-10-24,0.002841
4,2017-11-24,2018-10-24,0.002841
...,...,...,...
17312,2020-10-03,2020-10-07,0.003247
17313,2020-10-04,2020-10-07,0.003247
17314,2020-10-05,2020-10-07,0.003247
17315,2020-10-06,2020-10-07,0.003247


In [9]:
df=data.drop(['SK_Mounts','SK_ConfirmDateStart', 'sk_calendarprevmonth','FailureDate','SK_Calendar'],axis=1)

In [10]:
data.head()

Unnamed: 0,ESP1_Feeding_m3_day,ESP1_Pressure_m,ESP1_StageCount,ESP1_MaxEfficiency,ESP1_ETT,ESP1_Manufacturer,ESP1_Build_type,ESP1_EquipPropName,ESP1_Overall_group,ESP1_Diameter,...,CurrentTTF,SK_Calendar,SK_Mounts,FailureDate,daysToFailure,daysFromLastStart,FailuresCountFromLastWellWork,SK_ConfirmDateStart,sk_calendarprevmonth,lastStartDate
0,200.0,450.0,39.0,0.57,0.010414,0.00271,0.00521,0.003944,0.004859,0.069,...,339.0,2017-11-20,9017584.0,2018-10-24,338,1,0,20171023,20171020,0.002841
1,160.0,834.0,146.0,0.567058,0.002639,0.00271,0.004114,0.003944,0.003587,0.103,...,339.0,2017-11-21,9026842.0,2018-10-24,337,2,0,20171121,20171021,0.002841
2,160.0,834.0,146.0,0.567058,0.002639,0.00271,0.004114,0.003944,0.003587,0.103,...,339.0,2017-11-22,9026842.0,2018-10-24,336,3,0,20171121,20171022,0.002841
3,160.0,834.0,146.0,0.567058,0.002639,0.00271,0.004114,0.003944,0.003587,0.103,...,339.0,2017-11-23,9026842.0,2018-10-24,335,4,0,20171121,20171023,0.002841
4,160.0,834.0,146.0,0.567058,0.002639,0.00271,0.004114,0.003944,0.003587,0.103,...,339.0,2017-11-24,9026842.0,2018-10-24,334,5,0,20171121,20171024,0.002841


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22232 entries, 0 to 22231
Columns: 448 entries, ESP1_Feeding_m3_day to lastStartDate
dtypes: float64(428), int64(20)
memory usage: 76.0 MB


In [12]:
df['SK_Well']

0        0.002841
1        0.002841
2        0.002841
3        0.002841
4        0.002841
           ...   
22227    0.003226
22228    0.003226
22229    0.003226
22230    0.003226
22231    0.003226
Name: SK_Well, Length: 22232, dtype: float64

In [13]:
X=df.drop(['daysToFailure'],axis=1)
y=np.log(df['daysToFailure'])

In [14]:
from sklearn.preprocessing import MinMaxScaler

# Создаем экземпляр MinMaxScaler
scaler = MinMaxScaler()

# Извлекаем названия колонок
columns = X.columns

# Нормализуем данные в датафрейме X
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=columns)
X=X_normalized

In [15]:
def data_split(X,y):   
    X_train=X[0:17316]
    y_train=y[0:17316]
    X_test=X[17316:]
    y_test=y[17316:]
    return X_train, y_train, X_test, y_test

In [16]:
X

Unnamed: 0,ESP1_Feeding_m3_day,ESP1_Pressure_m,ESP1_StageCount,ESP1_MaxEfficiency,ESP1_ETT,ESP1_Manufacturer,ESP1_Build_type,ESP1_EquipPropName,ESP1_Overall_group,ESP1_Diameter,...,ЗамерыМРМ/Давление буферное (ТМ),ЗамерыМРМ/Давление затрубное (ТМ),ЗамерыМРМ/Давление линейное (ТМ),ЗамерыМРМ/Газовый фактор (ТМ),ЗамерыМРМ/Давление на приеме насоса (ТМ),SK_Well,CurrentTTF,daysFromLastStart,FailuresCountFromLastWellWork,lastStartDate
0,0.139344,0.139706,0.000000,0.631240,0.227682,0.059934,0.05736,0.000000,0.164545,0.000000,...,0.473934,0.440529,0.071130,0.0,0.414634,0.051380,0.303993,0.000000,0.000000,0.051380
1,0.106557,0.327941,0.664596,0.624051,0.057688,0.059934,0.00000,0.000000,0.117107,0.755556,...,0.473934,0.440529,0.071130,0.0,0.406504,0.051380,0.303993,0.001032,0.000000,0.051380
2,0.106557,0.327941,0.664596,0.624051,0.057688,0.059934,0.00000,0.000000,0.117107,0.755556,...,0.473934,0.440529,0.079498,0.0,0.403252,0.051380,0.303993,0.002064,0.000000,0.051380
3,0.106557,0.327941,0.664596,0.624051,0.057688,0.059934,0.00000,0.000000,0.117107,0.755556,...,0.473934,0.440529,0.075314,0.0,0.403252,0.051380,0.303993,0.003096,0.000000,0.051380
4,0.106557,0.327941,0.664596,0.624051,0.057688,0.059934,0.00000,0.000000,0.117107,0.755556,...,0.473934,0.440529,0.071130,0.0,0.403252,0.051380,0.303993,0.004128,0.000000,0.051380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22227,0.016393,0.178922,0.403727,0.406416,0.093264,0.323624,0.05736,0.065391,0.164545,0.000000,...,0.014218,0.017621,0.430962,0.0,0.000000,0.058342,0.278584,0.314757,0.142857,0.058342
22228,0.016393,0.178922,0.403727,0.406416,0.093264,0.323624,0.05736,0.065391,0.164545,0.000000,...,0.023697,0.026432,0.430962,0.0,0.000000,0.058342,0.278584,0.315789,0.142857,0.058342
22229,0.016393,0.178922,0.403727,0.406416,0.093264,0.323624,0.05736,0.065391,0.164545,0.000000,...,0.004739,0.008811,0.426778,0.0,0.000000,0.058342,0.278584,0.316821,0.142857,0.058342
22230,0.016393,0.178922,0.403727,0.406416,0.093264,0.323624,0.05736,0.065391,0.164545,0.000000,...,0.014218,0.017621,0.430962,0.0,0.000000,0.058342,0.278584,0.317853,0.142857,0.058342


In [17]:
f_statistic, p_values = f_regression(X, y)

In [18]:
results = list(zip(f_statistic, X.columns))

results.sort(reverse=True)

top_parameters = results[:10]
for f_stat, param_name in top_parameters:
    print(f"Parameter: {param_name}, F-statistic: {f_stat}")

Parameter: CurrentTTF, F-statistic: 9681.544269473707
Parameter: lastStartDate, F-statistic: 7868.224290587856
Parameter: SK_Well, F-statistic: 7739.432426533852
Parameter: Cluster, F-statistic: 7285.149288579952
Parameter: Motor1_Mark, F-statistic: 6106.085918951366
Parameter: Motor1_MotorETT, F-statistic: 5701.356137626462
Parameter: SourcePumpType, F-statistic: 5507.8564413793865
Parameter: ESP1_Mark, F-statistic: 4992.855623149952
Parameter: ESP2_Mark, F-statistic: 4803.175564819378
Parameter: Separator_Mark, F-statistic: 4589.976663735845


In [19]:
# Вычисляем взаимную информацию
mi_scores = mutual_info_regression(X, y)

# Создаем список, содержащий значения взаимной информации и соответствующие имена признаков
results = list(zip(mi_scores, X.columns))

# Сортируем результаты по значению взаимной информации
results = sorted(results, key=lambda x: x[0], reverse=True)

In [20]:
# Выводим лучшие признаки
top_features = results[:8]
for mi_score, feature_name in top_features:
    print(f"Feature: {feature_name}, Mutual Information: {mi_score}")

Feature: DynamicLevel_m, Mutual Information: 1.292267772941413
Feature: WellFlowPressure_atm, Mutual Information: 1.122394579319944
Feature: TRTargetLfpTech, Mutual Information: 1.0955614711360049
Feature: PumpDeliveryRate_pct, Mutual Information: 1.0468308803987814
Feature: OilRatePot_t_d, Mutual Information: 1.0338134293662886
Feature: daysFromLastStart, Mutual Information: 0.992163386249513
Feature: Free_Gas, Mutual Information: 0.9666654191779216
Feature: OilRate_t_d, Mutual Information: 0.9430121210138269


Feature: DynamicLevel_m, Mutual Information: 1.2909881131607746
Feature: WellFlowPressure_atm, Mutual Information: 1.1218592646709205
Feature: TRTargetLfpTech, Mutual Information: 1.094436294081806
Feature: PumpDeliveryRate_pct, Mutual Information: 1.0482507684644276
Feature: OilRatePot_t_d, Mutual Information: 1.0331277308314153
Feature: daysFromLastStart, Mutual Information: 0.9911171452353145
Feature: Free_Gas, Mutual Information: 0.9680646486039581
Feature: CasingPressure_atm, Mutual Information: 0.9422968798257054

In [21]:
# Создаем пустой словарь для хранения данных
data_dict = {}

# Выводим лучшие признаки
top_features = results[:8]
for mi_score, feature_name in top_features:
    data_dict[feature_name] = data[feature_name]

# Создаем DataFrame из словаря
X_mutual = pd.DataFrame(data_dict)

# Выводим DataFrame с данными
X_mutual.head()

Unnamed: 0,DynamicLevel_m,WellFlowPressure_atm,TRTargetLfpTech,PumpDeliveryRate_pct,OilRatePot_t_d,daysFromLastStart,Free_Gas,OilRate_t_d
0,1447.0,96.990352,36.33,0.73,24.226985,1,3.497414,17.6076
1,1435.0,116.316656,37.43,0.86875,26.785406,2,3.497414,16.7634
2,1435.0,116.316656,37.43,0.86875,26.785406,3,3.497414,16.7634
3,1435.0,116.316656,37.43,0.86875,26.785406,4,3.497414,16.7634
4,1435.0,116.316656,37.43,0.86875,26.785406,5,3.497414,16.7634


In [22]:
X_train, y_train, X_test, y_test=data_split(X_mutual,y);

In [23]:
# model = CatBoostRegressor(iterations=150,
#                           early_stopping_rounds=100,
#                           grow_policy = 'Depthwise',
#                           depth=8,
#                           loss_function=RMSLE(),
                          
#                           random_state=42,
#                           l2_leaf_reg = 1,
#                           learning_rate=0.03,
#                           verbose=10,
#                           eval_metric=RMSLE_val())
# params = {'l2_leaf_reg':[1,4,8],
#           'learning_rate': [0.03,0.5,0.1],
#           'depth':[6,8,10],
#           'n_estimators': range(100,200,20)
#          }
# grid_search_result = model.grid_search(params, 
#                                        X=X_mutual, 
#                                        y=y, 
#                                        plot=True)

In [24]:
# print(grid_search_result['params'])

In [25]:
cat= CatBoostRegressor(iterations=180 , 
                       depth=10,
                       
                       random_seed=42,
                       learning_rate=0.5,
                       l2_leaf_reg=4, 
                       bootstrap_type='Bayesian',
                       
                       early_stopping_rounds=100,
                       grow_policy = 'Depthwise',
                       loss_function=RMSLE(),
                       verbose=10,
                       eval_metric=RMSLE_val())
cat.fit(X_train, y_train)

0:	learn: 1.1629670	total: 1s	remaining: 2m 59s
10:	learn: 0.1024145	total: 1.32s	remaining: 20.3s
20:	learn: 0.0769904	total: 1.62s	remaining: 12.3s
30:	learn: 0.0656350	total: 1.93s	remaining: 9.29s
40:	learn: 0.0571996	total: 2.4s	remaining: 8.15s
50:	learn: 0.0512064	total: 2.83s	remaining: 7.16s
60:	learn: 0.0469988	total: 3.15s	remaining: 6.15s
70:	learn: 0.0438299	total: 3.53s	remaining: 5.42s
80:	learn: 0.0413258	total: 3.86s	remaining: 4.72s
90:	learn: 0.0397426	total: 4.11s	remaining: 4.02s
100:	learn: 0.0389350	total: 4.35s	remaining: 3.4s
110:	learn: 0.0380686	total: 4.59s	remaining: 2.85s
120:	learn: 0.0376299	total: 4.83s	remaining: 2.35s
130:	learn: 0.0371893	total: 5.09s	remaining: 1.9s
140:	learn: 0.0368056	total: 5.34s	remaining: 1.48s
150:	learn: 0.0365975	total: 5.56s	remaining: 1.07s
160:	learn: 0.0363629	total: 5.81s	remaining: 685ms
170:	learn: 0.0361881	total: 6.04s	remaining: 318ms
179:	learn: 0.0360773	total: 6.23s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x2032b932390>

In [26]:
# cat.save_model('cat.cbm')
cat_pred_train = np.exp(cat.predict(X_train))
cat_pred_test = np.exp(cat.predict(X_test))

In [27]:
metrics(np.exp(y_train), cat_pred_train)

metrics: 
MAPE: 4.244%  R2: 0.994  MAE: 6.89  MSE: 217.49


In [28]:
metrics(np.exp(y_test), cat_pred_test)

metrics: 
MAPE: 180.267%  R2: -0.019  MAE: 142.708  MSE: 37376.454


In [30]:
# Преобразование y_test и cat_pred_test в одномерные массивы (если необходимо)
y_test = np.ravel(y_test)
cat_pred_test = np.ravel(cat_pred_test)

# Расчет метрики для обучающего набора
rmsle_train = rmsle(np.exp(y_train), (cat_pred_train))

# Расчет метрики для тестового набора
rmsle_test = rmsle(np.exp(y_test), (cat_pred_test))

In [31]:
rmsle_train

0.08198199327254357

In [32]:
rmsle_test

1.3011616735068237

In [33]:
cat_pred_test-y_test

array([92.02642832, 90.6342077 , 86.22072611, ...,  8.04009779,
        8.44556289,  5.4315577 ])

TSFRESH

In [34]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import settings

In [35]:
X_long_train = pd.DataFrame({0: X_train.values.flatten(),
                          1: np.arange(X_train.shape[0]).repeat(X_train.shape[1])})
print(X_long_train.shape)
X_long_train.head()

(138528, 2)


Unnamed: 0,0,1
0,1447.0,0
1,96.990352,0
2,36.33,0
3,0.73,0
4,24.226985,0


In [36]:
settings_minimal = settings.MinimalFCParameters()
len(settings_minimal)
settings_time = settings.TimeBasedFCParameters()
settings_time.update(settings_minimal)
settings_efficient = settings.EfficientFCParameters()
len(settings_efficient)

73

In [37]:
X_tsfresh_train = extract_features(X_long_train, column_id=1, impute_function=impute, default_fc_parameters=settings_time)
print(X_tsfresh_train.shape)

Feature Extraction:   0%|          | 0/30 [00:00<?, ?it/s]

Feature Extraction: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


(17316, 10)


In [38]:
X_long_test = pd.DataFrame({0: X_test.values.flatten(),
                          1: np.arange(X_test.shape[0]).repeat(X_test.shape[1])})
print(X_long_test.shape)
X_tsfresh_test = extract_features(X_long_test, column_id=1, impute_function=impute, default_fc_parameters=settings_time)
print(X_tsfresh_test.shape)

(39328, 2)


Feature Extraction: 100%|██████████| 30/30 [00:07<00:00,  4.11it/s]


(4916, 10)


In [53]:
X_tsfresh_train.reset_index(drop=True, inplace=True)
X_train.reset_index(drop=True, inplace=True)
features=pd.concat([X_tsfresh_train,X_train],axis=1)

In [56]:
X_tsfresh_test.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
test=pd.concat([X_tsfresh_test,X_test],axis=1)

In [57]:
features

Unnamed: 0,0__sum_values,0__median,0__mean,0__length,0__standard_deviation,0__variance,0__root_mean_square,0__maximum,0__absolute_maximum,0__minimum,DynamicLevel_m,WellFlowPressure_atm,TRTargetLfpTech,PumpDeliveryRate_pct,OilRatePot_t_d,daysFromLastStart,Free_Gas,OilRate_t_d
0,1627.382350,20.917292,203.422794,8.0,470.956483,221800.009016,513.011542,1447.0,1447.0,0.73000,1447.0,96.990352,36.33,0.73000,24.226985,1,3.497414,17.6076
1,1638.661626,21.774403,204.832703,8.0,466.304559,217439.941892,509.309708,1435.0,1435.0,0.86875,1435.0,116.316656,37.43,0.86875,26.785406,2,3.497414,16.7634
2,1639.661626,21.774403,204.957703,8.0,466.250301,217389.343091,509.310321,1435.0,1435.0,0.86875,1435.0,116.316656,37.43,0.86875,26.785406,3,3.497414,16.7634
3,1640.661626,21.774403,205.082703,8.0,466.196271,217338.963040,509.311180,1435.0,1435.0,0.86875,1435.0,116.316656,37.43,0.86875,26.785406,4,3.497414,16.7634
4,1641.661626,21.774403,205.207703,8.0,466.142469,217288.801739,509.312285,1435.0,1435.0,0.86875,1435.0,116.316656,37.43,0.86875,26.785406,5,3.497414,16.7634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17311,2471.022428,21.140319,308.877803,8.0,676.970912,458289.616359,744.106924,2082.0,2082.0,0.22000,2082.0,33.739637,33.74,0.22000,8.541000,304,0.353390,8.4284
17312,2472.022428,21.140319,309.002803,8.0,676.970093,458288.506283,744.158074,2082.0,2082.0,0.22000,2082.0,33.739637,33.74,0.22000,8.541000,305,0.353390,8.4284
17313,2473.022428,21.140319,309.127803,8.0,676.969434,458287.614957,744.209388,2082.0,2082.0,0.22000,2082.0,33.739637,33.74,0.22000,8.541000,306,0.353390,8.4284
17314,2474.022428,21.140319,309.252803,8.0,676.968938,458286.942381,744.260867,2082.0,2082.0,0.22000,2082.0,33.739637,33.74,0.22000,8.541000,307,0.353390,8.4284


In [58]:
cat= CatBoostRegressor(iterations=180 , 
                       depth=10,
                       
                       random_seed=42,
                       learning_rate=0.5,
                       l2_leaf_reg=4, 
                       
                       
                       
                       
                       loss_function=RMSLE(),
                       verbose=10,
                       eval_metric=RMSLE_val())
cat.fit(features, y_train)

0:	learn: 1.1592947	total: 447ms	remaining: 1m 20s
10:	learn: 0.1476412	total: 1.26s	remaining: 19.3s
20:	learn: 0.1170502	total: 2.03s	remaining: 15.4s
30:	learn: 0.0998247	total: 2.8s	remaining: 13.5s
40:	learn: 0.0911706	total: 3.52s	remaining: 11.9s
50:	learn: 0.0826954	total: 4.23s	remaining: 10.7s
60:	learn: 0.0769892	total: 4.98s	remaining: 9.72s
70:	learn: 0.0714284	total: 5.67s	remaining: 8.71s
80:	learn: 0.0676090	total: 6.39s	remaining: 7.82s
90:	learn: 0.0643949	total: 7.12s	remaining: 6.97s
100:	learn: 0.0618352	total: 7.84s	remaining: 6.13s
110:	learn: 0.0589695	total: 8.62s	remaining: 5.36s
120:	learn: 0.0566031	total: 9.38s	remaining: 4.57s
130:	learn: 0.0541140	total: 10.1s	remaining: 3.76s
140:	learn: 0.0524895	total: 10.8s	remaining: 2.98s
150:	learn: 0.0504078	total: 11.5s	remaining: 2.2s
160:	learn: 0.0488578	total: 12.2s	remaining: 1.44s
170:	learn: 0.0472939	total: 12.9s	remaining: 677ms
179:	learn: 0.0457239	total: 13.5s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x2033c688710>

In [61]:
cat_pred_train = np.exp(cat.predict(features))
cat_pred_test = np.exp(cat.predict(test))

In [62]:
metrics(np.exp(y_train), cat_pred_train)

metrics: 
MAPE: 9.018%  R2: 0.968  MAE: 17.087  MSE: 1138.06


In [63]:
metrics(np.exp(y_test), cat_pred_test)

metrics: 
MAPE: 225.241%  R2: -0.341  MAE: 165.505  MSE: 49171.546


In [64]:
# Преобразование y_test и cat_pred_test в одномерные массивы (если необходимо)
y_test = np.ravel(y_test)
cat_pred_test = np.ravel(cat_pred_test)

# Расчет метрики для обучающего набора
rmsle_train = rmsle(np.exp(y_train), (cat_pred_train))

# Расчет метрики для тестового набора
rmsle_test = rmsle(np.exp(y_test), (cat_pred_test))

In [65]:
rmsle_train

0.13500395230642884

In [66]:
rmsle_test

1.6634075322209054

In [72]:
d=cat_pred_test-y_test
d[:100]

array([ 92.15885603,  90.67166356,  92.19936508,  93.96064814,
        98.95524791, 106.80362121,  84.2651483 ,  72.27990089,
       130.88700128, 128.15951187, 123.31247912, 143.87846366,
       181.05457409, 173.26445416, 171.81287872, 171.81999619,
       166.92923336, 193.66322195, 193.67049471, 205.398417  ,
       205.44710685, 195.59575453, 166.69155062, 161.41850185,
       163.40241643, 164.87022335, 155.34302274, 142.40943207,
       141.8955097 , 141.46775117, 143.18086241, 165.94501678,
       183.69550263, 242.60085575, 253.3404592 , 259.46125842,
       275.65212734, 283.08430134, 283.09281203, 279.32204048,
       240.93399743, 240.94273111, 240.95154174, 240.96043069,
       237.4548321 , 214.59987117, 214.60900365, 214.61822031,
       247.93776041, 247.94715015, 230.24168448, 230.25125393,
       230.26091584, 236.81036938, 240.70853154, 240.71848187,
       260.00529442, 260.01544679, 253.76672287, 251.80883802,
       251.81930932, 248.04824909, 248.05894438, 248.06