# Feature Extraction

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Colab-only: mount Google Drive so the CSV paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the preprocessed training data on the mounted Drive.
train_path = '/content/drive/MyDrive/Sales Forecast/train_processed1.csv'
# Parse 'Date' as datetime on load so the `.dt` accessors used later work.
train = pd.read_csv(train_path, parse_dates=['Date'])

## Datetime-Based Features

In [None]:
# Calendar features derived from the parsed 'Date' column (plus the existing
# 'DayOfWeek' column for the weekend flag). Columns are added in dict order.
date_accessor = train['Date'].dt
calendar_features = {
    'Year': date_accessor.year,
    'Month': date_accessor.month,
    'Day': date_accessor.day,
    'DayOfYear': date_accessor.dayofyear,
    # ISO-8601 week number (1-53).
    'WeekOfYear': date_accessor.isocalendar().week.astype(int),
    # NOTE(review): assumes DayOfWeek is encoded 1=Mon..7=Sun so that >= 6
    # selects Saturday/Sunday -- confirm against the preprocessing step.
    'IsWeekend': (train['DayOfWeek'] >= 6).astype(int),
    'Quarter': date_accessor.quarter,
    'IsMonthStart': date_accessor.is_month_start.astype(int),
    'IsMonthEnd': date_accessor.is_month_end.astype(int),
}
for feature_name, feature_values in calendar_features.items():
    train[feature_name] = feature_values

## Competition and Promo Duration Features

In [None]:
# Months elapsed between the competitor's opening (year/month columns) and the
# row's date; negative values (competitor not open yet) are clipped to 0.
competition_years = train['Year'] - train['CompetitionOpenSinceYear']
competition_months = train['Month'] - train['CompetitionOpenSinceMonth']
train['CompetitionOpenSince'] = (12 * competition_years + competition_months).clip(lower=0)

In [None]:
# Weeks elapsed between the start of Promo2 (year/week columns) and the row's
# date; negative values (Promo2 not started yet) are clipped to 0.
#
# 'WeekOfYear' is an ISO-8601 week number, so it must be paired with the ISO
# year rather than the calendar year: e.g. 2014-12-29 is ISO week 1 of 2015,
# and combining week 1 with calendar year 2014 would be ~52 weeks too low.
# A year is approximated as 52 weeks (ISO years occasionally have 53).
iso_year = train['Date'].dt.isocalendar().year.astype(int)
train['Promo2Since'] = (
    52 * (iso_year - train['Promo2SinceYear']) +
    (train['WeekOfYear'] - train['Promo2SinceWeek'])
).clip(lower=0)

In [None]:
# Calendar months covered by each 'PromoInterval' code.
promo_month_map = {
    1: [1, 4, 7, 10],   # Jan, Apr, Jul, Oct
    2: [2, 5, 8, 11],   # Feb, May, Aug, Nov
    3: [3, 6, 9, 12]    # Mar, Jun, Sept, Dec
}

# Build a single boolean mask that is True where the row's month belongs to
# the row's promo interval, then store it once as a 0/1 integer flag.
in_promo_month = pd.Series(False, index=train.index)
for interval_code, months in promo_month_map.items():
    in_promo_month |= (train['PromoInterval'] == interval_code) & train['Month'].isin(months)

train['IsPromo2Month'] = in_promo_month.astype('int64')

## Store-level Ridge Trend Features

In [None]:
# Per-store linear trend: fit a Ridge regression of Sales on DateInt for each
# store and keep the fitted slope as the 'StoreTrend' feature.
trend_df = []
for store, group in train.groupby('Store'):
    X = group[['DateInt']]
    y = group['Sales']
    if len(X) >= 30:
        model = Ridge()
        model.fit(X, y)
        slope = model.coef_[0]
    else:
        # Fewer than 30 rows for this store: fall back to a flat trend.
        slope = 0
    trend_df.append((store, slope))

# One row per store; broadcast the slope back onto every row of that store.
store_trends = pd.DataFrame(trend_df, columns=['Store', 'StoreTrend'])
train = train.merge(store_trends, on='Store', how='left')

## Lag and Rolling Features

In [None]:
# Lag and rolling-window sales features, computed independently per store.
# Rows must be in chronological order within each store for shift/rolling.
train = train.sort_values(['Store', 'Date'])

# Sales of the same store 1, 7, 14 and 30 rows (days) earlier.
for lag in [1, 7, 14, 30]:
    train[f'Sales_lag_{lag}'] = train.groupby('Store')['Sales'].shift(lag)

# 7-row rolling mean/std of the *previous* rows' sales (shift(1) keeps the
# current row's own Sales out of its window). `transform` runs the
# shift + rolling inside each store and returns a result aligned to `train`'s
# index, so values land on the correct rows even though the index is no
# longer monotonic after sorting and windows never span two stores.
train['Sales_roll_mean_7'] = train.groupby('Store')['Sales'].transform(
    lambda sales: sales.shift(1).rolling(window=7).mean()
)
train['Sales_roll_std_7'] = train.groupby('Store')['Sales'].transform(
    lambda sales: sales.shift(1).rolling(window=7).std()
)


## Decomposition Features by StoreType (Seasonal-Trend)

In [None]:
# Seasonal-trend decomposition of the average daily sales of each StoreType.
# Every store type contributes three date-keyed columns (trend, seasonal,
# resid) which are merged onto all rows of `train` by 'Date'.
decomposed = []
for stype in train['StoreType'].unique():
    df = train[train['StoreType'] == stype]
    # Daily mean sales across the stores of this type; days without data are
    # forward-filled. `.ffill()` replaces the deprecated
    # `fillna(method='ffill')`, which emits a FutureWarning.
    ts = df.set_index('Date').resample('D')['Sales'].mean().ffill()
    if len(ts) < 60:
        # Series shorter than 60 days are skipped.
        continue
    # Additive model with a 7-day (weekly) seasonal period.
    result = seasonal_decompose(ts, model='additive', period=7)
    tmp = pd.DataFrame({
        'Date': result.trend.index,
        f'StoreType_{stype}_trend': result.trend.values,
        f'StoreType_{stype}_seasonal': result.seasonal.values,
        f'StoreType_{stype}_resid': result.resid.values
    })
    decomposed.append(tmp)

# Combine the per-type frames on 'Date' and attach them to `train`.
# Guarded so that an empty result (every series skipped) does not raise
# IndexError on `decomposed[0]`.
if decomposed:
    decomposed_df = decomposed[0]
    for d in decomposed[1:]:
        decomposed_df = pd.merge(decomposed_df, d, on='Date', how='outer')

    train = pd.merge(train, decomposed_df, on='Date', how='left')

  ts = df.set_index('Date').resample('D')['Sales'].mean().fillna(method='ffill')
  ts = df.set_index('Date').resample('D')['Sales'].mean().fillna(method='ffill')
  ts = df.set_index('Date').resample('D')['Sales'].mean().fillna(method='ffill')


## Decomposition Features by Assortment (Seasonal-Trend)

In [None]:
# Seasonal-trend decomposition of the average daily sales of each Assortment
# level. Every level contributes three date-keyed columns (trend, seasonal,
# resid) which are merged onto all rows of `train` by 'Date'.
decomposed_assort = []
for a_type in train['Assortment'].unique():
    df = train[train['Assortment'] == a_type]
    # Daily mean sales across the stores with this assortment; days without
    # data are forward-filled. `.ffill()` replaces the deprecated
    # `fillna(method='ffill')`, which emits a FutureWarning.
    ts = df.set_index('Date').resample('D')['Sales'].mean().ffill()
    if len(ts) < 60:
        # Series shorter than 60 days are skipped.
        continue
    # Additive model with a 7-day (weekly) seasonal period.
    result = seasonal_decompose(ts, model='additive', period=7)
    tmp = pd.DataFrame({
        'Date': result.trend.index,
        f'Assortment_{a_type}_trend': result.trend.values,
        f'Assortment_{a_type}_seasonal': result.seasonal.values,
        f'Assortment_{a_type}_resid': result.resid.values
    })
    decomposed_assort.append(tmp)

# Combine the per-level frames on 'Date' and attach them to `train`.
# Guarded so that an empty result (every series skipped) does not raise
# IndexError on `decomposed_assort[0]`.
if decomposed_assort:
    decomposed_df_assort = decomposed_assort[0]
    for d in decomposed_assort[1:]:
        decomposed_df_assort = pd.merge(decomposed_df_assort, d, on='Date', how='outer')

    train = pd.merge(train, decomposed_df_assort, on='Date', how='left')


  ts = df.set_index('Date').resample('D')['Sales'].mean().fillna(method='ffill')
  ts = df.set_index('Date').resample('D')['Sales'].mean().fillna(method='ffill')


In [None]:
# Show every column now present on the engineered frame.
list(train.columns)

['Store',
 'DayOfWeek',
 'Date',
 'Sales',
 'Open',
 'Promo',
 'StateHoliday',
 'SchoolHoliday',
 'StoreType',
 'Assortment',
 'CompetitionDistance',
 'CompetitionOpenSinceMonth',
 'CompetitionOpenSinceYear',
 'Promo2',
 'Promo2SinceWeek',
 'Promo2SinceYear',
 'PromoInterval',
 'DateInt',
 'Year',
 'Month',
 'Day',
 'DayOfYear',
 'WeekOfYear',
 'IsWeekend',
 'Quarter',
 'IsMonthStart',
 'IsMonthEnd',
 'CompetitionOpenSince',
 'Promo2Since',
 'IsPromo2Month',
 'StoreTrend',
 'Sales_lag_1',
 'Sales_lag_7',
 'Sales_lag_14',
 'Sales_lag_30',
 'Sales_roll_mean_7',
 'Sales_roll_std_7',
 'StoreType_2_trend',
 'StoreType_2_seasonal',
 'StoreType_2_resid',
 'StoreType_0_trend',
 'StoreType_0_seasonal',
 'StoreType_0_resid',
 'StoreType_3_trend',
 'StoreType_3_seasonal',
 'StoreType_3_resid',
 'StoreType_1_trend',
 'StoreType_1_seasonal',
 'StoreType_1_resid',
 'Assortment_0_trend',
 'Assortment_0_seasonal',
 'Assortment_0_resid',
 'Assortment_2_trend',
 'Assortment_2_seasonal',
 'Assortment_2_r

Remove the days on which a store was closed or recorded zero sales: these days have no sales and do not help the model learn useful patterns.


In [None]:
# Keep only rows where the store was open and actually sold something.
is_open = train['Open'] == 1
has_sales = train['Sales'] > 0
train = train[is_open & has_sales]

In [None]:
# (rows, columns) of the frame after the open / positive-sales filter.
train.shape

(844338, 58)

In [None]:
# Persist the engineered training set to Drive; the index is not written.
train.to_csv(
    path_or_buf='/content/drive/MyDrive/Sales Forecast/train_final1.csv',
    index=False,
)