In [3]:
%matplotlib inline
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

from catboost import CatBoostClassifier
import catboost

import ydata_profiling
import phik

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

from pylab import rcParams
import warnings

sns.set(style="darkgrid")
rcParams['figure.figsize'] = 20, 9

import sys

In [4]:
pd.set_option('display.max_columns', None)

In [5]:
sys.path.append('./train')

In [6]:
from metrics_f1 import calc_f1_score

In [7]:
path_train = r"./train"

In [8]:
# список вагонов с остаточным пробегом на момент прогноза
wag_prob_clear = pd.read_parquet(path_train + '/wagons_probeg_ownersip.parquet').convert_dtypes()

In [9]:
path_train_v2 = './train/test'

In [10]:
# список вагонов с остаточным пробегом на момент прогноза
wag_prob_clear_v2 = pd.read_parquet(path_train_v2 + '/wagons_probeg_ownersip.parquet').convert_dtypes()

In [11]:
wag_prob_clear_all = pd.concat([wag_prob_clear, wag_prob_clear_v2])

In [12]:
wag_prob_clear_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7193667 entries, 0 to 9249486
Data columns (total 8 columns):
 #   Column          Dtype         
---  ------          -----         
 0   repdate         datetime64[ns]
 1   wagnum          Int64         
 2   ost_prob        Int64         
 3   manage_type     Int64         
 4   rod_id          Int64         
 5   reestr_state    Int64         
 6   ownership_type  Int64         
 7   month           Int64         
dtypes: Int64(7), datetime64[ns](1)
memory usage: 542.0 MB


In [13]:
wag_prob_clear_all['repdate'].unique()

array(['2022-08-01T00:00:00.000000000', '2022-08-02T00:00:00.000000000',
       '2022-08-03T00:00:00.000000000', '2022-08-04T00:00:00.000000000',
       '2022-08-05T00:00:00.000000000', '2022-08-06T00:00:00.000000000',
       '2022-08-07T00:00:00.000000000', '2022-08-08T00:00:00.000000000',
       '2022-08-09T00:00:00.000000000', '2022-08-10T00:00:00.000000000',
       '2022-08-11T00:00:00.000000000', '2022-08-12T00:00:00.000000000',
       '2022-08-13T00:00:00.000000000', '2022-08-14T00:00:00.000000000',
       '2022-08-15T00:00:00.000000000', '2022-08-16T00:00:00.000000000',
       '2022-08-17T00:00:00.000000000', '2022-08-18T00:00:00.000000000',
       '2022-08-19T00:00:00.000000000', '2022-08-20T00:00:00.000000000',
       '2022-08-21T00:00:00.000000000', '2022-08-22T00:00:00.000000000',
       '2022-08-23T00:00:00.000000000', '2022-08-24T00:00:00.000000000',
       '2022-08-25T00:00:00.000000000', '2022-08-26T00:00:00.000000000',
       '2022-08-27T00:00:00.000000000', '2022-08-28

In [15]:
wag_prob_pred = wag_prob_clear_all.copy()
wag_prob_pred= wag_prob_pred.drop(['manage_type'], axis=1)
wag_prob_pred= wag_prob_pred.drop(['rod_id'], axis=1)
wag_prob_pred= wag_prob_pred.drop(['reestr_state'], axis=1)

In [16]:
wag_prob_pred.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7193667 entries, 0 to 9249486
Data columns (total 5 columns):
 #   Column          Dtype         
---  ------          -----         
 0   repdate         datetime64[ns]
 1   wagnum          Int64         
 2   ost_prob        Int64         
 3   ownership_type  Int64         
 4   month           Int64         
dtypes: Int64(4), datetime64[ns](1)
memory usage: 356.7 MB


In [17]:
wag_prob_pred[(wag_prob_pred['repdate'] <= '2023-03-01')]['repdate'].unique()

array(['2022-08-01T00:00:00.000000000', '2022-08-02T00:00:00.000000000',
       '2022-08-03T00:00:00.000000000', '2022-08-04T00:00:00.000000000',
       '2022-08-05T00:00:00.000000000', '2022-08-06T00:00:00.000000000',
       '2022-08-07T00:00:00.000000000', '2022-08-08T00:00:00.000000000',
       '2022-08-09T00:00:00.000000000', '2022-08-10T00:00:00.000000000',
       '2022-08-11T00:00:00.000000000', '2022-08-12T00:00:00.000000000',
       '2022-08-13T00:00:00.000000000', '2022-08-14T00:00:00.000000000',
       '2022-08-15T00:00:00.000000000', '2022-08-16T00:00:00.000000000',
       '2022-08-17T00:00:00.000000000', '2022-08-18T00:00:00.000000000',
       '2022-08-19T00:00:00.000000000', '2022-08-20T00:00:00.000000000',
       '2022-08-21T00:00:00.000000000', '2022-08-22T00:00:00.000000000',
       '2022-08-23T00:00:00.000000000', '2022-08-24T00:00:00.000000000',
       '2022-08-25T00:00:00.000000000', '2022-08-26T00:00:00.000000000',
       '2022-08-27T00:00:00.000000000', '2022-08-28

In [19]:
from sklearn.linear_model import LinearRegression
from tqdm import tqdm
 
selected_date = '2023-03-01'
 
unique_wagnum = wag_prob_pred['wagnum'].unique()
 
# Создание модели линейной регрессии и прогнозирование для каждого wagnum
forecast_results = []  
 
for wagnum in tqdm(unique_wagnum, desc='Прогресс'):
    data_wagnum = wag_prob_pred[(wag_prob_pred['wagnum'] == wagnum) & (wag_prob_pred['repdate'] <= selected_date)].copy()
    data_wagnum['repdate'] = pd.to_datetime(data_wagnum['repdate'])
    data_wagnum.set_index('repdate', inplace=True)
 
 
    data_wagnum = data_wagnum[data_wagnum['ost_prob'] != 0]
 
    if data_wagnum.empty:
 
        continue
 
 
 
    # Обучение модели линейной регрессии
    X = pd.to_numeric(pd.to_datetime(data_wagnum.index)).values.reshape(-1, 1)
    y = data_wagnum['ost_prob'].values
    model = LinearRegression()
    model.fit(X, y)
 
    # Прогноз на 30 дней
    future_dates = pd.date_range(start=selected_date, periods=30)
    X_future = pd.to_numeric(pd.to_datetime(future_dates)).values.reshape(-1, 1)
    forecast_30 = model.predict(X_future)
 
    # Сохранение прогноза на 30-ый день
    forecast_results.append({'wagnum': wagnum, 'forecast_30': forecast_30[-1]})

Прогресс: 100%|██████████████████████████████████████████████████████████████████| 33977/33977 [44:19<00:00, 12.78it/s]


In [20]:
df = pd.DataFrame(forecast_results)

In [28]:
df.head()

Unnamed: 0,wagnum,forecast_30
0,33361,209362.814887
1,33364,169745.425805
2,33366,198579.255264
3,33358,17584.192255
4,33349,221454.067814


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32970 entries, 0 to 32969
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   wagnum       32970 non-null  int64  
 1   forecast_30  32970 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 515.3 KB


In [23]:
df.to_csv('arima_forecast_prob_end_month')

In [25]:
from sklearn.linear_model import LinearRegression
from tqdm import tqdm
 
selected_date = '2022-12-01'
 
unique_wagnum = wag_prob_pred['wagnum'].unique()
 
# Создание модели линейной регрессии и прогнозирование для каждого wagnum
forecast_results = []  
 
for wagnum in tqdm(unique_wagnum, desc='Прогресс'):
    data_wagnum = wag_prob_pred[(wag_prob_pred['wagnum'] == wagnum) & (wag_prob_pred['repdate'] <= selected_date)].copy()
    data_wagnum['repdate'] = pd.to_datetime(data_wagnum['repdate'])
    data_wagnum.set_index('repdate', inplace=True)
 
 
    data_wagnum = data_wagnum[data_wagnum['ost_prob'] != 0]
 
    if data_wagnum.empty:
 
        continue
 
 
 
    # Обучение модели линейной регрессии
    X = pd.to_numeric(pd.to_datetime(data_wagnum.index)).values.reshape(-1, 1)
    y = data_wagnum['ost_prob'].values
    model = LinearRegression()
    model.fit(X, y)
 
    # Прогноз на 30 дней
    future_dates = pd.date_range(start=selected_date, periods=30)
    X_future = pd.to_numeric(pd.to_datetime(future_dates)).values.reshape(-1, 1)
    forecast_30 = model.predict(X_future)
 
    # Сохранение прогноза на 30-ый день
    forecast_results.append({'wagnum': wagnum, 'forecast_30': forecast_30[-1]})

Прогресс: 100%|██████████████████████████████████████████████████████████████████| 33977/33977 [39:38<00:00, 14.29it/s]


In [26]:
df_train = pd.DataFrame(forecast_results)

In [27]:
df_train.head()

Unnamed: 0,wagnum,forecast_30
0,33361,45185.24478
1,33364,2031.907929
2,33366,-1336.016269
3,33358,25821.751983
4,33349,233161.072414


In [29]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32703 entries, 0 to 32702
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   wagnum       32703 non-null  int64  
 1   forecast_30  32703 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 511.1 KB


In [30]:
df_train.to_csv('train_arima_forecast_prob_end_month.csv')