# Описание проекта

# Импорт библиотек

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

from catboost import CatBoostRegressor
import catboost

import ydata_profiling
import phik

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

from pylab import rcParams
import warnings

# Plot defaults: seaborn "darkgrid" theme, then a default matplotlib figure
# size of (width, height) = (20, 9).
sns.set(style="darkgrid")
rcParams['figure.figsize'] = (20, 9)

import sys

In [2]:
# Make modules stored in ./train importable (presumably the metrics_f1 helper
# imported in a later cell lives there -- confirm).
# Guarded so that re-running this notebook cell does not keep appending
# duplicate entries to sys.path.
if './train' not in sys.path:
    sys.path.append('./train')

In [3]:
from metrics_f1 import calc_f1_score

# Загрузка данных

In [4]:
# Directory that holds the training data files read in the cells below.
path_train = "./train"

## Дислокация вагонов (dislok_wagons)

In [14]:
# Wagon dislocation data.
# convert_dtypes() switches the columns to pandas dtypes that support pd.NA.
dislok = pd.read_parquet(
    path_train + '/dislok_wagons.parquet'
)
dislok = dislok.convert_dtypes()

In [15]:
dislok.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6250933 entries, 0 to 1052924
Data columns (total 16 columns):
 #   Column        Dtype         
---  ------        -----         
 0   plan_date     datetime64[ns]
 1   wagnum        Int64         
 2   date_kap      datetime64[ns]
 3   date_dep      datetime64[ns]
 4   kod_vrab      Int64         
 5   date_pl_rem   datetime64[ns]
 6   id_road_disl  Int64         
 7   st_id_dest    Int64         
 8   id_road_dest  Int64         
 9   st_id_send    Int64         
 10  id_road_send  Int64         
 11  ost_prob      Int64         
 12  isload        Int64         
 13  fr_id         Int64         
 14  last_fr_id    Int64         
 15  distance      Int64         
dtypes: Int64(12), datetime64[ns](4)
memory usage: 882.3 MB


In [40]:
dislok['plan_date'].max()

Timestamp('2023-01-31 00:00:00')

In [42]:
# Preview the first rows recorded on the latest plan_date.
dislok.loc[dislok['plan_date'].eq(dislok['plan_date'].max())].head()

Unnamed: 0,plan_date,wagnum,date_kap,date_dep,kod_vrab,date_pl_rem,id_road_disl,st_id_dest,id_road_dest,st_id_send,id_road_send,ost_prob,isload,fr_id,last_fr_id,distance
30,2023-01-31,23398,2021-10-04,2019-05-24,0,2024-10-04,15,3376,15,6152,28,64018,1,1083,1083,999
61,2023-01-31,31528,NaT,2021-12-16,1,2024-12-16,15,3376,15,6152,28,44948,1,1083,1083,-1
92,2023-01-31,20486,2021-04-04,2019-07-30,0,2024-04-04,44,12651,38,12318,44,7988,0,5032,2336,5025
123,2023-01-31,16526,NaT,2022-05-17,1,2025-05-17,41,22,41,2258,11,114983,1,3266,3266,-1
154,2023-01-31,18061,2022-02-23,2019-08-09,0,2025-02-23,36,22,36,8971,36,104774,0,1639,1639,4051


## Плановые ремонты (pr_rems)

In [16]:
# Repair records from pr_rems.parquet.
# NOTE(review): the original comment called these "current repairs", the same
# label used for tr_rems; presumably these are planned repairs -- confirm.
pr_rem = pd.read_parquet(
    path_train + '/pr_rems.parquet'
)
pr_rem = pr_rem.convert_dtypes()

In [17]:
pr_rem.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10441 entries, 0 to 1515
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   wagnum        10441 non-null  Int64         
 1   rem_month     10441 non-null  datetime64[ns]
 2   rod_id        10441 non-null  Int64         
 3   model         10441 non-null  string        
 4   road_id_send  10441 non-null  Int64         
 5   road_id_rem   10441 non-null  Int64         
 6   kod_vrab      10441 non-null  Int64         
 7   st_id_send    10441 non-null  Int64         
 8   st_id_rem     10441 non-null  Int64         
 9   distance      10441 non-null  Int64         
 10  month         10441 non-null  Int64         
dtypes: Int64(9), datetime64[ns](1), string(1)
memory usage: 1.0 MB


In [36]:
pr_rem['month'].max()

12

In [37]:
# Preview the first rows whose month equals the largest month value.
pr_rem.loc[pr_rem['month'].eq(pr_rem['month'].max())].head()

Unnamed: 0,wagnum,rem_month,rod_id,model,road_id_send,road_id_rem,kod_vrab,st_id_send,st_id_rem,distance,month
2,22548,2022-12-08,1,12-1303-01,6,7,1,4421,3940,842,12
3,24902,2022-12-07,1,12-1303-01,6,7,1,4421,3940,827,12
4,27478,2022-12-06,1,12-7023-02,6,7,1,4421,3940,161,12
5,6458,2022-12-17,0,11-276,6,6,0,3946,3940,96,12
6,6678,2022-12-14,0,11-280,3,3,1,457,420,770,12


## Список вагонов с остаточным пробегом (wagons_probeg_ownersip)

In [18]:
# List of wagons with their remaining mileage as of the forecast date.
wag_prob = pd.read_parquet(
    path_train + '/wagons_probeg_ownersip.parquet'
)
wag_prob = wag_prob.convert_dtypes()

In [19]:
wag_prob.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6249857 entries, 0 to 9249588
Data columns (total 8 columns):
 #   Column          Dtype         
---  ------          -----         
 0   repdate         datetime64[ns]
 1   wagnum          Int64         
 2   ost_prob        Int64         
 3   manage_type     Int64         
 4   rod_id          Int64         
 5   reestr_state    Int64         
 6   ownership_type  Int64         
 7   month           Int64         
dtypes: Int64(7), datetime64[ns](1)
memory usage: 470.9 MB


In [28]:
wag_prob['repdate'].max()

Timestamp('2023-01-31 00:00:00')

In [46]:
wag_prob['repdate'].min()

Timestamp('2022-08-01 00:00:00')

In [33]:
# Preview the first rows reported on the latest repdate.
wag_prob.loc[wag_prob['repdate'].eq(wag_prob['repdate'].max())].head()

Unnamed: 0,repdate,wagnum,ost_prob,manage_type,rod_id,reestr_state,ownership_type,month
183,2023-01-31,33361,150961,0,1,1,0,1
456,2023-01-31,33364,148432,0,1,1,0,1
729,2023-01-31,33366,154991,0,1,1,0,1
1002,2023-01-31,33358,24859,0,1,1,0,1
1275,2023-01-31,33349,144208,0,1,1,0,1


## Параметры вагона (wag_params)

In [20]:
# Wagon parameters.
wag_param = pd.read_parquet(
    path_train + '/wag_params.parquet'
)
wag_param = wag_param.convert_dtypes()

In [29]:
# A wagon's parameters can change, so the same wagon number may appear in
# several rows; keep a single row per wagnum (the last one encountered).
# NOTE(review): "last" is by current row order, the frame is not sorted here --
# confirm that the last row is the most recent record.
wag_param = wag_param.drop_duplicates(
    subset=['wagnum'],
    keep='last',
)

In [30]:
wag_param.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33977 entries, 3218 to 33707
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   wagnum              33977 non-null  Int64         
 1   model               33977 non-null  string        
 2   rod_id              33977 non-null  Int64         
 3   gruz                33977 non-null  Int64         
 4   cnsi_gruz_capacity  33977 non-null  Int64         
 5   cnsi_volumek        33977 non-null  Float64       
 6   tara                33977 non-null  Int64         
 7   date_build          33977 non-null  datetime64[ns]
 8   srok_sl             33977 non-null  datetime64[ns]
 9   zavod_build         33977 non-null  Int64         
 10  date_iskl           116 non-null    datetime64[ns]
 11  cnsi_probeg_dr      33977 non-null  Int64         
 12  cnsi_probeg_kr      33977 non-null  Int64         
 13  kuzov               33977 non-null  Int64  

In [31]:
wag_param.head()

Unnamed: 0,wagnum,model,rod_id,gruz,cnsi_gruz_capacity,cnsi_volumek,tara,date_build,srok_sl,zavod_build,date_iskl,cnsi_probeg_dr,cnsi_probeg_kr,kuzov,telega,tormoz,tipvozd,tippogl,norma_km,ownertype
3218,26318,12-600-04,1,682,682,85.0,240,1992-12-25,2022-04-27,5,2023-02-16,160,160,2,9,3,6,11,110000,0
19128,28344,12-132,1,700,700,88.0,240,2003-08-12,2024-12-24,0,2022-12-14,110,160,2,9,2,1,12,0,0
21526,8099,11-286,0,670,670,138.0,270,1995-08-31,2027-10-01,1,NaT,110,160,2,9,2,1,1,160000,1
32353,33350,12-9850-02,1,750,750,90.0,248,2014-10-27,2047-02-05,19,NaT,250,500,2,11,2,7,12,250000,1
81,5308,11-276,0,680,680,122.0,260,1995-09-17,2027-09-28,1,NaT,110,160,2,9,2,1,11,160000,1


## Текущие ремонты (tr_rems)

In [5]:
# Current (running) repairs of wagons.
tr_rem = pd.read_parquet(
    path_train + '/tr_rems.parquet'
)
tr_rem = tr_rem.convert_dtypes()

In [23]:
tr_rem.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48652 entries, 0 to 7699
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   wagnum        48652 non-null  Int64         
 1   rem_month     48652 non-null  datetime64[ns]
 2   kod_vrab      48652 non-null  Int64         
 3   neis1_kod     48652 non-null  Int64         
 4   neis2_kod     48652 non-null  Int64         
 5   neis3_kod     48652 non-null  Int64         
 6   mod1_kod      48652 non-null  Int64         
 7   mod2_kod      48652 non-null  Int64         
 8   mod3_kod      48652 non-null  Int64         
 9   mod4_kod      48652 non-null  Int32         
 10  mod5_kod      48652 non-null  Int32         
 11  mod6_kod      48652 non-null  Int32         
 12  mod7_kod      48652 non-null  Int32         
 13  road_id_send  48652 non-null  Int64         
 14  gr_probeg     47920 non-null  Int64         
 15  por_probeg    47977 non-null  Int64  

In [44]:
tr_rem['rem_month'].max()

Timestamp('2023-01-01 00:00:00')

In [45]:
# Preview the first rows belonging to the latest rem_month.
tr_rem.loc[tr_rem['rem_month'].eq(tr_rem['rem_month'].max())].head()

Unnamed: 0,wagnum,rem_month,kod_vrab,neis1_kod,neis2_kod,neis3_kod,mod1_kod,mod2_kod,mod3_kod,mod4_kod,mod5_kod,mod6_kod,mod7_kod,road_id_send,gr_probeg,por_probeg,st_id_send
0,29857,2023-01-01,5,0,98,54,7,4,2,0,0,0,0,3,105003,53305,594
1,29857,2023-01-01,3,6,98,54,7,4,2,0,0,0,0,3,105003,53193,392
2,29928,2023-01-01,2,0,98,54,7,4,2,0,0,0,0,16,103036,51268,4247
3,29928,2023-01-01,2,95,98,54,7,4,2,0,0,0,0,16,103036,51268,4247
4,2360,2023-01-01,2,0,98,54,7,4,2,0,0,0,0,44,92501,28550,12437


## Первичный анализ

## target

In [6]:
# Month for which predictions are made; used below to filter the target rows.
month_to_predict = pd.Timestamp('2023-01-01')

In [7]:
# Targets for predicting wagon withdrawal to "ПР" (presumably planned repair)
# within the month and within 10 days.
target = pd.read_csv(path_train + '/target/y_train.csv')
target = target.convert_dtypes()

In [8]:
target.head()

Unnamed: 0,wagnum,month,target_month,target_day
0,33361,2023-01-01,0,0
1,33364,2023-01-01,0,0
2,33366,2023-01-01,0,0
3,33358,2023-01-01,0,0
4,33349,2023-01-01,0,0


In [9]:
# Parse the month column into datetimes so it can be compared with
# month_to_predict.
target['month'] = pd.to_datetime(target['month'])

In [10]:
# Keep only the rows for the month being predicted and drop the month column,
# leaving the wagon number and the two target columns.
target = target.loc[
    target['month'] == month_to_predict,
    ['wagnum', 'target_month', 'target_day'],
]

In [11]:
# Column totals of the monthly and 10-day targets, shown as a tuple.
(target['target_month'].sum(), target['target_day'].sum())

(1676, 461)

In [12]:
target.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33973 entries, 0 to 33972
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   wagnum        33973 non-null  Int64
 1   target_month  33973 non-null  Int64
 2   target_day    33973 non-null  Int64
dtypes: Int64(3)
memory usage: 1.1 MB


# EDA

# Предобработка данных

# Feature engineering

# Baseline

## Важность признаков

# Feature engineering v2