In [1]:
import numpy
import pandas as pd

#### Загрузка данных и обработка

In [24]:
# Folder with the five hackathon series. Forward slashes are used on purpose:
# the previous '.\hackathon\...' literals contained the invalid escape
# sequences '\h' and '\d' (SyntaxWarning on recent Python versions) and only
# resolved on Windows, whereas '/' works on every platform.
DATA_DIR = './hackathon'

# Each file is read with its 'date' column parsed into datetime64.
data1, data2, data3, data4, data5 = (
    pd.read_csv(f'{DATA_DIR}/data{i}.csv', parse_dates=['date'])
    for i in range(1, 6)
)

In [25]:
# Print the per-column count of missing values for every loaded series
for frame in (data1, data2, data3, data4, data5):
    print(frame.isna().sum())

value    0
date     0
dtype: int64
value    1
date     0
dtype: int64
value    6
date     0
dtype: int64
date     0
value    0
dtype: int64
date     0
value    0
dtype: int64


In [26]:
# Fill the gaps in 'value' with linear interpolation
# (only data2 and data3 reported missing values above)
for frame in (data2, data3):
    frame['value'] = frame['value'].interpolate(method='linear')

#### Займёмся генерацией фичей для Датасета 1

In [27]:
data1.head()

Unnamed: 0,value,date
0,898.0,2022-01-01
1,3167.0,2022-01-02
2,3380.0,2022-01-03
3,3423.0,2022-01-04
4,3373.0,2022-01-05


In [28]:
from datetime import timedelta
import holidays

# Add calendar features derived from the 'date' column

data1['date'] = pd.to_datetime(data1['date'])

data1['year'] = data1['date'].dt.year
data1['month'] = data1['date'].dt.month
data1['day'] = data1['date'].dt.day
data1['dayofweek'] = data1['date'].dt.dayofweek  # Monday=0, Sunday=6
data1['dayofyear'] = data1['date'].dt.dayofyear
data1['weekofyear'] = data1['date'].dt.isocalendar().week  # ISO week number

# Flag public holidays (Russian calendar; swap the country class if needed).
# The calendar is built for every year present in the data: using only the
# first row's year would leave all later years without any holiday flagged.
holiday_years = sorted(int(year) for year in data1['year'].dropna().unique())
holiday_list = holidays.Russia(years=holiday_years)

# The calendar keys are datetime.date objects; cast them to datetime64
# explicitly so that isin compares values of the same type instead of relying
# on implicit casting. normalize() drops any time-of-day part before matching.
holiday_dates = pd.to_datetime(list(holiday_list.keys()))
data1['is_holiday'] = data1['date'].dt.normalize().isin(holiday_dates)

data1.head()

Unnamed: 0,value,date,year,month,day,dayofweek,dayofyear,weekofyear,is_holiday
0,898.0,2022-01-01,2022,1,1,5,1,52,True
1,3167.0,2022-01-02,2022,1,2,6,2,52,True
2,3380.0,2022-01-03,2022,1,3,0,3,1,True
3,3423.0,2022-01-04,2022,1,4,1,4,1,True
4,3373.0,2022-01-05,2022,1,5,2,5,1,True


In [29]:
# Extra calendar flags
# dayofweek runs Monday=0 .. Sunday=6, so values >= 5 are Saturday and Sunday
data1['is_weekend'] = data1['dayofweek'].ge(5)
# Quarter of the year (1-4)
data1['quarter'] = data1['date'].dt.quarter

data1.head()


Unnamed: 0,value,date,year,month,day,dayofweek,dayofyear,weekofyear,is_holiday,is_weekend,quarter
0,898.0,2022-01-01,2022,1,1,5,1,52,True,True,1
1,3167.0,2022-01-02,2022,1,2,6,2,52,True,True,1
2,3380.0,2022-01-03,2022,1,3,0,3,1,True,False,1
3,3423.0,2022-01-04,2022,1,4,1,4,1,True,False,1
4,3373.0,2022-01-05,2022,1,5,2,5,1,True,False,1


In [30]:
# Lag features: the value of the series 1, 2 and 3 steps earlier.
# The first rows have no history, so their missing lags are filled with 0.
for step in (1, 2, 3):
    data1[f'lag_{step}'] = data1['value'].shift(step).fillna(0)

data1.head()

Unnamed: 0,value,date,year,month,day,dayofweek,dayofyear,weekofyear,is_holiday,is_weekend,quarter,lag_1,lag_2,lag_3
0,898.0,2022-01-01,2022,1,1,5,1,52,True,True,1,0.0,0.0,0.0
1,3167.0,2022-01-02,2022,1,2,6,2,52,True,True,1,898.0,0.0,0.0
2,3380.0,2022-01-03,2022,1,3,0,3,1,True,False,1,3167.0,898.0,0.0
3,3423.0,2022-01-04,2022,1,4,1,4,1,True,False,1,3380.0,3167.0,898.0
4,3373.0,2022-01-05,2022,1,5,2,5,1,True,False,1,3423.0,3380.0,3167.0


In [31]:
# Aggregates over a sliding 7-row window: mean, minimum and maximum.
# Rows with fewer than 7 observations yield NaN, which is replaced with 0.
# NOTE(review): the window includes the current row's own 'value' — if 'value'
# is the prediction target this may leak it into the features; confirm.
weekly_window = data1['value'].rolling(window=7)
data1['rolling_mean_7'] = weekly_window.mean().fillna(0)
data1['rolling_min_7'] = weekly_window.min().fillna(0)
data1['rolling_max_7'] = weekly_window.max().fillna(0)

data1.head(10)

Unnamed: 0,value,date,year,month,day,dayofweek,dayofyear,weekofyear,is_holiday,is_weekend,quarter,lag_1,lag_2,lag_3,rolling_mean_7,rolling_min_7,rolling_max_7
0,898.0,2022-01-01,2022,1,1,5,1,52,True,True,1,0.0,0.0,0.0,0.0,0.0,0.0
1,3167.0,2022-01-02,2022,1,2,6,2,52,True,True,1,898.0,0.0,0.0,0.0,0.0,0.0
2,3380.0,2022-01-03,2022,1,3,0,3,1,True,False,1,3167.0,898.0,0.0,0.0,0.0,0.0
3,3423.0,2022-01-04,2022,1,4,1,4,1,True,False,1,3380.0,3167.0,898.0,0.0,0.0,0.0
4,3373.0,2022-01-05,2022,1,5,2,5,1,True,False,1,3423.0,3380.0,3167.0,0.0,0.0,0.0
5,3767.0,2022-01-06,2022,1,6,3,6,1,True,False,1,3373.0,3423.0,3380.0,0.0,0.0,0.0
6,3511.0,2022-01-07,2022,1,7,4,7,1,True,False,1,3767.0,3373.0,3423.0,3074.142857,898.0,3767.0
7,3645.0,2022-01-08,2022,1,8,5,8,1,True,True,1,3511.0,3767.0,3373.0,3466.571429,3167.0,3767.0
8,3543.0,2022-01-09,2022,1,9,6,9,1,False,True,1,3645.0,3511.0,3767.0,3520.285714,3373.0,3767.0
9,2122.0,2022-01-10,2022,1,10,0,10,2,False,False,1,3543.0,3645.0,3511.0,3340.571429,2122.0,3767.0


#### Создадим класс предобработки данных и применим его к остальным временным рядам

In [38]:
class TimeSeriesPreprocessor:
    """Build calendar, lag and rolling-window features for a time series.

    The input is a DataFrame with a 'date' column (anything accepted by
    ``pd.to_datetime``) and a numeric 'value' column. All ``add_*`` methods
    write the new columns into the frame they receive and return that same
    frame; ``preprocess`` returns a new frame with boolean flags cast to 0/1.

    Parameters:
        country: country name/code passed to the ``holidays`` package.
        holiday_years: year(s) to build the holiday calendar for; when empty
            or None, every year found in the data is used.
        interpolate: whether ``preprocess`` linearly interpolates gaps in
            'value' before building features.
    """

    def __init__(self, country='Russia', holiday_years=None, interpolate=True):
        self.country = country
        self.holiday_years = holiday_years
        self.interpolate = interpolate

    def add_time_features(self, df):
        """Add year/month/day/week features plus holiday and weekend flags.

        Modifies ``df`` in place (including converting 'date' to datetime)
        and returns it.
        """
        df['date'] = pd.to_datetime(df['date'])

        # Basic calendar components
        df['year'] = df['date'].dt.year
        df['month'] = df['date'].dt.month
        df['day'] = df['date'].dt.day
        df['dayofweek'] = df['date'].dt.dayofweek  # Monday=0, Sunday=6
        df['dayofyear'] = df['date'].dt.dayofyear
        df['weekofyear'] = df['date'].dt.isocalendar().week  # ISO week number

        # Holiday calendar. Without explicit years, cover every year present
        # in the data; taking only the first row's year would leave all later
        # years without any holiday flagged.
        if self.holiday_years:
            years = self.holiday_years
        else:
            years = sorted(int(year) for year in df['year'].dropna().unique())
        # NOTE(review): recent `holidays` releases appear to prefer
        # `country_holidays` over `CountryHoliday` — confirm against the
        # installed version before switching.
        holiday_list = holidays.CountryHoliday(self.country, years=years)

        # Calendar keys are datetime.date objects; cast them to datetime64
        # explicitly so isin compares like with like. normalize() drops any
        # time-of-day part before matching.
        holiday_dates = pd.to_datetime(list(holiday_list.keys()))
        df['is_holiday'] = df['date'].dt.normalize().isin(holiday_dates)

        # Extra flags
        df['is_weekend'] = df['dayofweek'].isin([5, 6])  # Saturday (5), Sunday (6)
        df['quarter'] = df['date'].dt.quarter  # Quarter of the year

        return df

    def add_lag_features(self, df, lags=(1, 2, 3)):
        """Add ``lag_<k>`` columns holding 'value' shifted by k rows.

        Rows without enough history get 0. Every requested lag is filled —
        the fill is no longer tied to the lag_1/lag_2/lag_3 column names, so
        custom ``lags`` work. Modifies ``df`` in place and returns it.
        """
        for lag in lags:
            df[f'lag_{lag}'] = df['value'].shift(lag).fillna(0)

        return df

    def add_rolling_features(self, df, windows=(7,)):
        """Add rolling mean/min/max of 'value' for each window size.

        Columns are named ``rolling_mean_<w>``, ``rolling_min_<w>`` and
        ``rolling_max_<w>``; rows with fewer than ``w`` observations get 0.
        Modifies ``df`` in place and returns it.

        NOTE(review): the window includes the current row's own 'value' — if
        'value' is the prediction target this may leak it; confirm.
        """
        for window in windows:
            rolling = df['value'].rolling(window=window)
            df[f'rolling_mean_{window}'] = rolling.mean().fillna(0)
            df[f'rolling_min_{window}'] = rolling.min().fillna(0)
            df[f'rolling_max_{window}'] = rolling.max().fillna(0)

        return df

    def preprocess(self, df):
        """Run the full pipeline and return the feature frame.

        Steps: optional linear interpolation of 'value', calendar features,
        lag features (default lags), rolling features (default windows), and
        finally conversion of boolean columns to 0/1 integers.

        The intermediate steps write into the passed ``df``; the returned
        frame is a new object in which the boolean flags are int64.
        """
        if self.interpolate:
            df['value'] = df['value'].interpolate(method='linear')

        df = self.add_time_features(df)
        df = self.add_lag_features(df)
        df = self.add_rolling_features(df)

        # Cast only the boolean columns, explicitly, instead of a frame-wide
        # replace({True: 1, False: 0}) that depends on implicit downcasting.
        bool_columns = df.select_dtypes(include='bool').columns
        df = df.astype({column: 'int64' for column in bool_columns})

        return df


In [39]:
# Smoke-test the preprocessor on the second series

preprocessor = TimeSeriesPreprocessor(
    country='Russia',
    holiday_years=[2022, 2023],
    interpolate=True,
)

data2_processed = preprocessor.preprocess(data2)

data2_processed.head(10)

Unnamed: 0,value,date,year,month,day,dayofweek,dayofyear,weekofyear,is_holiday,is_weekend,quarter,lag_1,lag_2,lag_3,rolling_mean_7,rolling_min_7,rolling_max_7
0,332.0,2022-01-01,2022,1,1,5,1,52,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0
1,1011.0,2022-01-02,2022,1,2,6,2,52,1,1,1,332.0,0.0,0.0,0.0,0.0,0.0
2,1102.0,2022-01-03,2022,1,3,0,3,1,1,0,1,1011.0,332.0,0.0,0.0,0.0,0.0
3,1065.0,2022-01-04,2022,1,4,1,4,1,1,0,1,1102.0,1011.0,332.0,0.0,0.0,0.0
4,819.0,2022-01-05,2022,1,5,2,5,1,1,0,1,1065.0,1102.0,1011.0,0.0,0.0,0.0
5,721.0,2022-01-06,2022,1,6,3,6,1,1,0,1,819.0,1065.0,1102.0,0.0,0.0,0.0
6,759.0,2022-01-07,2022,1,7,4,7,1,1,0,1,721.0,819.0,1065.0,829.857143,332.0,1102.0
7,878.0,2022-01-08,2022,1,8,5,8,1,1,1,1,759.0,721.0,819.0,907.857143,721.0,1102.0
8,598.0,2022-01-09,2022,1,9,6,9,1,0,1,1,878.0,759.0,721.0,848.857143,598.0,1102.0
9,154.0,2022-01-10,2022,1,10,0,10,2,0,0,1,598.0,878.0,759.0,713.428571,154.0,1065.0


In [40]:
# Apply the same preprocessor to the remaining series, in order
data3_processed, data4_processed, data5_processed = (
    preprocessor.preprocess(frame) for frame in (data3, data4, data5)
)