In [17]:
import pandas as pd
import preprocessors as pp

from faker import Faker
from IPython.display import display, HTML
from manager import save_pipeline
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from urllib.parse import urlencode

# Обучение модели

Обучим модель `RandomForestRegressor` и сделаем `Pipeline` для предобработки признаков.

👨‍💻 В демке не будем чистить данные, искать аномалии и строить графики.

Для примера возьмем DataFrame с ценами на недвижимость в Бостоне.

In [18]:
boston = load_boston()

In [19]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

## Сгруппируем колонки

* `CATEGORICAL_FEATURES` - категорийные фичи
* `NUMERICAL_FEATURES` - дискретные фичи, которые прогоним через `scaler`
* `REDUNDANT_FEATURES` - всякое лишнее, что не нужно для модели (фичи с признаками мультиколлинеарности, малозначимые фичи)
* `TARGET` - целевой признак

In [20]:
CATEGORICAL_FEATURES = ['RAD']
NUMERICAL_FEATURES = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'PTRATIO', 'B', 'LSTAT']
REDUNDANT_FEATURES = ['DT_1', 'DT_2']
TARGET = 'MEDV'

In [21]:
df = pd.read_csv(
    filepath_or_buffer=boston.filename,
    header=1
)

Добавим столбец с датой и временем в виде строки, для предсказания он не нужен.

На нем покажу кастомные обработки полей через pipeline

In [22]:
faker = Faker()
DATETIME_FORMAT = '%d.%m.%Y %H:%M:%S'
DATE_FORMAT = '%Y-%m-%d'

# Добавим столбец, в котором будет дата и время в формате datetime_format
df['DT_1'] = df.apply(lambda x: faker.date_time_between(start_date='-30d', end_date='now').strftime(DATETIME_FORMAT), axis=1)
df['DT_2'] = df.apply(lambda x: faker.date_time_between(start_date='now', end_date='+30d').strftime(DATE_FORMAT), axis=1)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 16 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
 14  DT_1     506 non-null    object 
 15  DT_2     506 non-null    object 
dtypes: float64(11), int64(3), object(2)
memory usage: 63.4+ KB


In [24]:
# разделим данные на обучающую и тестовую выборки

train_df = df.sample(frac=.90)

test_df = df.drop(train_df.index, axis=0)

display(train_df)
display(test_df)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,DT_1,DT_2
145,2.37934,0.0,19.58,0,0.871,6.130,100.0,1.4191,5,403,14.7,172.91,27.80,13.8,03.03.2021 02:53:03,2021-03-22
213,0.14052,0.0,10.59,0,0.489,6.375,32.3,3.9454,4,277,18.6,385.81,9.38,28.1,13.02.2021 23:00:49,2021-03-14
78,0.05646,0.0,12.83,0,0.437,6.232,53.7,5.0141,5,398,18.7,386.40,12.34,21.2,20.02.2021 14:47:11,2021-03-20
299,0.05561,70.0,2.24,0,0.400,7.041,10.0,7.8278,5,358,14.8,371.58,4.74,29.0,12.02.2021 17:15:13,2021-03-23
339,0.05497,0.0,5.19,0,0.515,5.985,45.4,4.8122,5,224,20.2,396.90,9.74,19.0,19.02.2021 20:59:44,2021-03-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,6.53876,0.0,18.10,1,0.631,7.016,97.5,1.2024,24,666,20.2,392.05,2.96,50.0,16.02.2021 02:23:42,2021-03-31
30,1.13081,0.0,8.14,0,0.538,5.713,94.1,4.2330,4,307,21.0,360.17,22.60,12.7,07.03.2021 22:52:07,2021-03-20
169,2.44953,0.0,19.58,0,0.605,6.402,95.2,2.2625,5,403,14.7,330.04,11.32,22.3,11.03.2021 14:50:49,2021-04-04
268,0.54050,20.0,3.97,0,0.575,7.470,52.6,2.8720,5,264,13.0,390.30,3.16,43.5,19.02.2021 10:37:27,2021-03-22


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,DT_1,DT_2
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,11.03.2021 17:13:17,2021-03-14
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9,11.03.2021 03:34:22,2021-04-03
18,0.80271,0.0,8.14,0,0.538,5.456,36.6,3.7965,4,307,21.0,288.99,11.69,20.2,15.02.2021 02:12:07,2021-03-15
35,0.06417,0.0,5.96,0,0.499,5.933,68.2,3.3603,5,279,19.2,396.9,9.68,18.9,08.03.2021 16:24:59,2021-04-09
38,0.17505,0.0,5.96,0,0.499,5.966,30.2,3.8473,5,279,19.2,393.43,10.13,24.7,12.03.2021 14:11:43,2021-03-20
62,0.11027,25.0,5.13,0,0.453,6.456,67.8,7.2255,8,284,19.7,396.9,6.73,22.2,20.02.2021 14:44:18,2021-03-17
65,0.03584,80.0,3.37,0,0.398,6.29,17.8,6.6115,4,337,16.1,396.9,4.67,23.5,28.02.2021 11:50:38,2021-03-30
96,0.11504,0.0,2.89,0,0.445,6.163,69.6,3.4952,2,276,18.0,391.83,11.34,21.4,20.02.2021 21:12:28,2021-03-22
111,0.10084,0.0,10.01,0,0.547,6.715,81.6,2.6775,6,432,17.8,395.59,10.16,22.8,25.02.2021 01:35:02,2021-04-01
115,0.17134,0.0,10.01,0,0.547,5.928,88.2,2.4631,6,432,17.8,344.91,15.76,18.3,23.02.2021 15:47:07,2021-04-10


In [25]:
# Можно раскомментировать # ('Info N', Info()), чтобы увидеть,
# что происходит внутри pipeline-а

pipeline = Pipeline([
    # ('Info 1', Info()),
    ('Datetime parser 1', pp.DateTimeParser(column=REDUNDANT_FEATURES[0], datetime_format=DATETIME_FORMAT)),
    ('Datetime parser 2', pp.DateTimeParser(column=REDUNDANT_FEATURES[1], datetime_format=DATE_FORMAT)),
    # ('Info 2', Info()),
    # ('Info 3', Info()),
    ('Drop', pp.Drop(columns=REDUNDANT_FEATURES)),
    # ('Info 4', Info()),
    ('Scale', pp.Scaler(columns=NUMERICAL_FEATURES)),
    # ('Info 5', Info()),
    ('Forest', RandomForestRegressor(n_estimators=100, max_depth=8, n_jobs=-1))
], verbose=True)

In [26]:
pipeline.fit(train_df.drop(columns=TARGET), train_df[TARGET])
'Ok!'

[Pipeline] . (step 1 of 5) Processing Datetime parser 1, total=   0.0s
[Pipeline] . (step 2 of 5) Processing Datetime parser 2, total=   0.0s
[Pipeline] .............. (step 3 of 5) Processing Drop, total=   0.0s
[Pipeline] ............. (step 4 of 5) Processing Scale, total=   0.0s
[Pipeline] ............ (step 5 of 5) Processing Forest, total=   0.2s


'Ok!'

In [27]:
predicted = pipeline.predict(test_df.drop(columns=TARGET))

rmse = mean_squared_error(test_df[TARGET], predicted) ** .5
mae = mean_absolute_error(test_df[TARGET], predicted)

'RMSE: {:.4f} MAE: {:.4f}'.format(rmse, mae)

'RMSE: 2.6043 MAE: 1.9001'

In [28]:
save_pipeline(pipeline=pipeline, file_path='./trained_model/forest.pickle')

In [30]:
# Эта ссылка нам еще понадобится после запуска сервера

f'http://0.0.0.0:5000/predict?{urlencode(train_df.iloc[0].to_dict())}'

'http://0.0.0.0:5000/predict?CRIM=2.37934&ZN=0.0&INDUS=19.58&CHAS=0&NOX=0.871&RM=6.13&AGE=100.0&DIS=1.4191&RAD=5&TAX=403&PTRATIO=14.7&B=172.91&LSTAT=27.8&MEDV=13.8&DT_1=03.03.2021+02%3A53%3A03&DT_2=2021-03-22'