# Goal

Try to get a good model by spamming infinite features

Process
1. Come up with list of possible features
2. Identify which ones are easiest to add
3. Spam all of them, run lgbm --> see result

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
import itertools
import lightgbm as lgb
import sys

In [2]:
# Root of the data folder; the translated lookup tables live in a sub-folder.
DIRECTORY = './data/'
TRANSLATED = DIRECTORY + 'translated/'

# Every translated CSV carries an 'Unnamed: 0' column (presumably an index that
# was written out when the file was saved -- TODO confirm); drop it on load.
categories, items, shops = (
    pd.read_csv(TRANSLATED + file_name).drop(columns=['Unnamed: 0'])
    for file_name in ('item_categories.csv', 'items.csv', 'shops.csv')
)

# Sales records; the `date` column is parsed using a day.month.year format.
train = pd.read_csv(
    DIRECTORY + 'sales_train.csv',
    parse_dates=['date'],
    date_format='%d.%m.%Y',
)

## Features

1. Groupings of items & shops
    - item first name
    - item first 4, 8, 11 characters
    - shop first name
    - shop name in double quotes (stored as `shop_parenthesis`)
    - item first release date
    - item contains word (Y/N) - for things like PC

    double groupings:
    - item first release date for a given shop
    - item shop release date - item first release date

2. Rolling / lag sales
    grouping combos:
    single groupings
    groupings choose 2 (like shop + overall item age)

    features:
    - lag 1m
    - lag 2m
    - 12m rolling mean
    - 12m rolling fixed for release date
    - ratio 1m/12m rolling

3. Time
    - time features

4. Type of item features
    - contains word

In [3]:
# Outlier clipping: store a copy of the daily count limited to the range [0, 20].
train['item_cnt_day_clipped'] = train['item_cnt_day'].clip(0, 20)

### Part 1: Groupings

In [4]:
# Attach the category row to every item. The outer join keeps unmatched rows from
# either side, so the NaN count printed below exposes any id mismatch.
items_combined = items.merge(categories, on=['item_category_id'], how='outer')
print("Na values:", items_combined.isna().sum().sum()) # check that nothing missing

# lowercase (modifying in place)
for text_col in ('item_name', 'item_category_name'):
    items_combined[text_col] = items_combined[text_col].str.lower()

# item name: remove whitespace / non-word characters, then keep fixed-length prefixes
items_combined['item_name_cleaned'] = items_combined['item_name'].str.replace(r'[\s\W]', '', regex=True)
prefixes = [4,8,11]
cleaned_names = items_combined['item_name_cleaned']
for prefix in prefixes:
    items_combined[f'item_name_{prefix}'] = cleaned_names.str.slice(stop=prefix)
# items_combined.head()

# category name: split once on '-' or '(' and keep the first two pieces
category_parts = items_combined['item_category_name'].str.split(r'[\-\(]', regex=True)
items_combined['category_first'] = category_parts.str[0].str.strip()
items_combined['category_second'] = category_parts.str[1].str.strip()
# items_combined.head()

# shop: lowercase, remove , . ! and then take the first two whitespace-separated words
shops = pd.DataFrame(shops)
shops['shop_name'] = shops['shop_name'].str.lower().replace(r'[\,\.\!]', '', regex=True)
shop_words = shops['shop_name'].str.split()
shops['shop_first'] = shop_words.str[0]
shops['shop_second'] = shop_words.str[1]
# despite the column name, this captures the text between double quotes
shops['shop_parenthesis'] = shops['shop_name'].str.extract(r'\"(.*)\"')
# shops.head()

Na values: 0


### Part 2: Monthly lag features

In [5]:
# Aggregation df --> add all shopxitem combos per month
# For each month, pair every item sold that month with every shop that sold
# anything that month, so that combinations with zero sales also get a row.
combos = []
for month in train['date_block_num'].unique():
    # Filter the month once and reuse it for both id lists.
    month_rows = train.loc[train['date_block_num'] == month, ['item_id', 'shop_id']]
    month_items = month_rows['item_id'].unique()
    month_shops = month_rows['shop_id'].unique()

    combos.append(np.array(list(itertools.product(month_items, month_shops, [month]))))
aggregated = pd.DataFrame(data=np.vstack(combos), columns=['item_id', 'shop_id', 'date_block_num'])

In [6]:
# Total units per (shop, item, month).
monthly_keys = ['shop_id', 'item_id', 'date_block_num']
train_monthly = (
    train.groupby(monthly_keys)['item_cnt_day']
    .sum()
    .to_frame('month_sales')
    .reset_index()
)

# Left-join onto the full grid; combinations without any sales become 0.
aggregated = aggregated.merge(train_monthly, on=monthly_keys, how='left').fillna(0)
# Limit the monthly target to the range [0, 20].
aggregated['month_sales'] = aggregated['month_sales'].clip(0, 20)

In [7]:
# starting aggregation df
# Bring in the item/category and shop feature columns (inner joins on the ids).
aggregated = aggregated.merge(items_combined, on=['item_id'])
aggregated = aggregated.merge(shops, on=['shop_id'])

# The month index is interpreted as a number of seconds to obtain a datetime
# column, which is then used as the sort key.
aggregated['date_as_time'] = pd.to_datetime(aggregated['date_block_num'], unit='s')
aggregated = aggregated.sort_values(by='date_as_time')

# Missing values in these two text columns are replaced by empty strings.
sparse_text_cols = ['category_second', 'shop_parenthesis']
aggregated[sparse_text_cols] = aggregated[sparse_text_cols].fillna('')

In [8]:
%%time
# def median_sales_by(df: pd.DataFrame, group_by_cols: list, aggregate_col):
#     grouped = df.groupby(group_by_cols + ['date_block_num'])[aggregate_col].transform('median')
#     return grouped

# def first_sale_by(df: pd.DataFrame, group_by_cols: list):
#     grouped = df['date_block_num'] - df.groupby(group_by_cols)['date_block_num'].transform('min')
#     return grouped

# def rolling_by(df: pd.DataFrame, group_by_cols: list, time_shift: int):
#     grouped = df.groupby(group_by_cols) \
#                 .rolling(f'{time_shift}s', on='date_as_time', closed='left')['month_sales'] \
#                 .median()
#     return grouped

def lag_by(df: pd.DataFrame, group_by_cols: list, time_shift: int):
    """Return each row's group-mean ``month_sales`` from ``time_shift`` months earlier.

    The mean of ``month_sales`` is computed per (``group_by_cols``, ``date_block_num``),
    the month number of those means is moved forward by ``time_shift``, and the
    result is left-joined back onto ``df``.

    Args:
        df: Frame containing ``group_by_cols``, ``date_block_num`` and ``month_sales``.
        group_by_cols: Column names that define the group.
        time_shift: Number of months to look back.

    Returns:
        A Series named ``lag_sales`` with one value per row of ``df``, in the same
        order and carrying ``df``'s index. Rows with no matching earlier month get -1.
    """
    combined_cols = group_by_cols + ['date_block_num']

    grouped = df.groupby(combined_cols)['month_sales'].mean().to_frame('lag_sales').reset_index()
    # Moving the month forward makes month m's mean line up with rows of month m + time_shift.
    grouped['date_block_num'] = grouped['date_block_num'] + time_shift

    # `grouped` has one row per key, so the left join keeps df's row count and order.
    merged = df.merge(right=grouped, on=combined_cols, how='left')
    res = merged['lag_sales'].fillna(-1)
    # merge() hands back a fresh RangeIndex; restore df's index so the result can be
    # assigned to df directly without being mis-aligned when df is sorted or filtered.
    res.index = df.index
    return res

# Each entry is the list of columns that defines one grouping for the lag features.
groupings = [['item_id'], ['item_category_id'], ['shop_id'], ['item_name_4'], ['item_name_8'], ['item_name_11'], ['shop_first'], ['category_first'], ['category_second']]
# for grouping in groupings:
#     aggregated[f'median_{''.join(grouping)}'] = median_sales_by(aggregated, grouping, 'month_sales')

# for grouping in groupings:
#     aggregated[f'months_since_{''.join(grouping)}'] = first_sale_by(aggregated, grouping)

# One feature column per (lag in months, grouping). The columns are called
# "<lag>_rolling(<group>)" but hold the lagged group mean returned by lag_by.
for window, grouping in itertools.product([1,2,12], groupings):
    print(window, grouping)
    # Build the key outside the f-string: reusing the same quote character inside a
    # replacement field is only accepted from Python 3.12 onwards.
    group_key = ''.join(grouping)
    # .values drops the returned index so the assignment is purely positional.
    aggregated[f'{window}_rolling({group_key})'] = lag_by(aggregated, grouping, window).values

1 ['item_id']
1 ['item_category_id']
1 ['shop_id']
1 ['item_name_4']
1 ['item_name_8']
1 ['item_name_11']
1 ['shop_first']
1 ['category_first']
1 ['category_second']
2 ['item_id']
2 ['item_category_id']
2 ['shop_id']
2 ['item_name_4']
2 ['item_name_8']
2 ['item_name_11']
2 ['shop_first']
2 ['category_first']
2 ['category_second']
12 ['item_id']
12 ['item_category_id']
12 ['shop_id']
12 ['item_name_4']
12 ['item_name_8']
12 ['item_name_11']
12 ['shop_first']
12 ['category_first']
12 ['category_second']
CPU times: user 1min 10s, sys: 55.9 s, total: 2min 6s
Wall time: 2min 27s


In [12]:
# Work on a copy so the compression below does not modify `aggregated`.
final_df = aggregated.copy()
# `date_as_time` is deliberately still present at this point: a bare
# `final_df.drop(...)` without assignment returns a new frame and changes nothing,
# so that no-op is not kept here. The column is removed by the reassigning drop
# in a later cell.

def compress_dataframe(df):
    """Shrink ``df`` in place and return it.

    Object columns are replaced by the integer codes from ``pd.factorize``;
    afterwards every float64 / int64 column (including the fresh codes) is
    downcast with ``pd.to_numeric``.
    """
    text_columns = df.select_dtypes(include=['object']).columns
    for name in text_columns:
        codes, _uniques = pd.factorize(df[name])
        df[name] = codes

    # Selected after the factorize pass so the new integer code columns are included.
    downcast_kind = {'float64': 'float', 'int64': 'integer'}
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    for name in numeric_columns:
        kind = downcast_kind.get(str(df[name].dtype))
        if kind is not None:
            df[name] = pd.to_numeric(df[name], downcast=kind)

    return df

# Report the frame's memory footprint (in MB) before and after compression.
mb_before = final_df.memory_usage().sum() / (1024**2)
print("Memory before:", mb_before, "MB")

final_df = compress_dataframe(final_df)

mb_after = final_df.memory_usage().sum() / (1024**2)
print("Memory after:", mb_after, "MB")

Memory before: 3830.2391052246094 MB
Memory after: 1561.2387657165527 MB


In [13]:
# Remove the datetime helper column before modelling. errors='ignore' keeps this
# cell re-runnable: a second run no longer raises KeyError once the column is gone.
final_df = final_df.drop(columns=['date_as_time'], errors='ignore')

In [14]:
# Need to backtest now

# Split by month number: < 32 trains the model, 32 is the validation month, 33 is held out.
# NOTE(review): this rebinds `train`, which until now held the raw sales frame, so
# earlier cells that read `train` will not behave the same if re-run afterwards.
block_num = final_df['date_block_num']
train = final_df[block_num < 32]
val = final_df[block_num == 32]
test = final_df[block_num == 33]

# Features are every column except the target.
X_train = train.drop(columns='month_sales')
Y_train = train['month_sales']
X_val = val.drop(columns='month_sales')
Y_val = val['month_sales']
X_test = test.drop(columns='month_sales')
Y_test = test['month_sales']

model = lgb.LGBMRegressor(n_estimators=1000, objective='rmse', learning_rate=0.01)
# NOTE(review): these two Dataset objects are not passed to fit() below.
lgb_train = lgb.Dataset(X_train, Y_train)
lgb_val = lgb.Dataset(X_val, Y_val)

model.fit(X_train, Y_train, eval_set=[(X_val, Y_val)])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.294244 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8853
[LightGBM] [Info] Number of data points in the train set: 10457023, number of used features: 43
[LightGBM] [Info] Start training from score 0.299865


In [149]:
# Score the held-out month and keep features, target and prediction side by side.
predictions = model.predict(X_test)
predictions_df = pd.concat([X_test, Y_test], axis=1)
predictions_df['prediction'] = predictions

# scikit-learn metrics take (y_true, y_pred); the arguments were previously swapped.
# The value is unchanged because squared error is symmetric. Note this is MSE, not RMSE.
mean_squared_error(predictions_df['month_sales'], predictions_df['prediction'])

1.042434951335637

In [150]:
# Use model predictions to get full output for test set
test_stores = test['shop_id'].unique()
test_items = test['item_id'].unique()

# Results df: every (shop, item) pair from the held-out month, joined to its
# prediction and actual value; pairs without a match are filled with 0.
all_pairs = itertools.product(test_stores, test_items)
results = pd.DataFrame(data=all_pairs, columns=['shop_id', 'item_id'])
scored_columns = ['shop_id', 'item_id', 'prediction', 'month_sales']
results = results.merge(predictions_df[scored_columns], on=['shop_id', 'item_id'], how='left')
results = results.fillna(0)

In [151]:
# Actual res: MSE over the full shop x item grid.
# Arguments are in scikit-learn's (y_true, y_pred) order; the value is the same
# either way because squared error is symmetric.
mean_squared_error(results['month_sales'], results['prediction'])

1.0424349513356368