# Installing the GPU dependencies (RAPIDS cuDF / cuML) used alongside LightGBM on GPU

In [None]:
%%time
# Install the cuDF and cuML CUDA-12 wheels (24.12 series) into /kaggle/working,
# using NVIDIA's package index as an extra source.
!pip install --target=/kaggle/working --extra-index-url=https://pypi.nvidia.com "cudf-cu12==24.12.*" "cuml-cu12==24.12.*"
# Remove every numpy* entry that ended up in /kaggle/working.
# NOTE(review): presumably so the environment's own numpy keeps being imported
# instead of the copy pulled in as a dependency - confirm.
!rm -rf /kaggle/working/numpy*

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
import itertools
import lightgbm as lgb
import sys
import time

In [None]:
import cudf
import cuml
print(cudf.__version__, cuml.__version__)

In [None]:
# Kaggle input folders: the raw competition files and the translated lookup tables.
DIRECTORY = '/kaggle/input/competitive-data-science-predict-future-sales/'
TRANSLATED = '/kaggle/input/future-sales-translated/'

# Each translated CSV carries an 'Unnamed: 0' column that is dropped right after loading.
categories, items, shops = (
    pd.read_csv(TRANSLATED + filename).drop(columns=['Unnamed: 0'])
    for filename in ('item_categories.csv', 'items.csv', 'shops.csv')
)

train = pd.read_csv(DIRECTORY + 'sales_train.csv')
test = pd.read_csv(DIRECTORY + 'test.csv')

# Dates are parsed as day.month.year.
train['date'] = pd.to_datetime(train['date'], format='%d.%m.%Y')

# Memory handling helpers

In [None]:
# Monitor memory usage: record (name, size in bytes) for every object in the
# current namespace and print the list, largest first.
memory_stats = []
# Iterate over a snapshot, because the loop itself binds new names.
for name, value in list(locals().items()):
    try:
        size = (
            value.memory_usage(deep=True).sum()
            if isinstance(value, cudf.DataFrame)
            else sys.getsizeof(value)  # shallow size for everything else
        )
    except Exception:
        # Best effort: objects whose size cannot be determined are left out.
        continue
    memory_stats.append((name, size))

memory_stats.sort(key=lambda entry: entry[1], reverse=True)
print(memory_stats)

In [None]:
import gc
import rmm
# Run a Python garbage-collection pass, then reinitialize the RMM allocator.
# NOTE(review): rmm.reinitialize() presumably must not run while GPU buffers from
# an earlier initialization are still in use; the cells above this one only
# create pandas objects - confirm before moving this cell further down.
gc.collect()
rmm.reinitialize()

# Categorical Groupings

Note: everything here is done in pandas (so string operations are supported) - it's fast anyway.

In [None]:
# Outlier clipping: limit daily counts to the range [0, 20]
# (values below 0 become 0, values above 20 become 20).
train['item_cnt_day'] = train['item_cnt_day'].clip(lower=0, upper=20)

In [None]:
# Attach the category name to every item; the outer join keeps rows that have
# no partner on the other side, which the NA count below would reveal.
items_combined = items.merge(categories, on=['item_category_id'], how='outer')
print("Na values:", items_combined.isna().sum().sum()) # check that nothing missing

# lowercase (modifying in place)
for text_col in ('item_name', 'item_category_name'):
    items_combined[text_col] = items_combined[text_col].str.lower()

# item name: strip whitespace and non-word characters, then keep fixed-length
# prefixes of the cleaned name as coarse grouping keys
items_combined['item_name_cleaned'] = items_combined['item_name'].str.replace(r'[\s\W]', '', regex=True)
prefixes = [4,8,11]
cleaned_names = items_combined['item_name_cleaned']
for prefix in prefixes:
    items_combined[f'item_name_{prefix}'] = cleaned_names.str.slice(stop=prefix)

# category name: split once on '-' or '(' and keep the first two pieces
category_parts = items_combined['item_category_name'].str.split(r'[\-\(]', regex=True)
items_combined['category_first'] = category_parts.str.get(0).str.strip()
items_combined['category_second'] = category_parts.str.get(1).str.strip()

# shop: lowercase the name, drop ',', '.' and '!', then take the first two
# whitespace-separated words
shops = pd.DataFrame(shops)
shops['shop_name'] = shops['shop_name'].str.lower().replace(r'[\,\.\!]', '', regex=True)
shop_words = shops['shop_name'].str.split()
shops['shop_first'] = shop_words.str.get(0)
shops['shop_second'] = shop_words.str.get(1)
# Despite the column name, the pattern captures the text between double quotes.
shops['shop_parenthesis'] = shops['shop_name'].str.extract(r'\"(.*)\"')

# Setting up Test Df
1. Create all unique store/item combos per month (switch to cudf now)
2. Generate a large number of features (all in cudf)

In [None]:
%%time

def _cartesian_rows(item_ids, shop_ids, month):
    """Return an (n_items * n_shops, 3) array of (item_id, shop_id, month) rows.

    Rows are ordered with items as the outer loop and shops as the inner loop,
    i.e. the same order ``itertools.product(item_ids, shop_ids, [month])``
    yields, but built with vectorised NumPy calls instead of Python tuples.
    Empty inputs give an empty array that still has three columns.
    """
    item_ids = np.asarray(item_ids)
    shop_ids = np.asarray(shop_ids)
    return np.column_stack((
        np.repeat(item_ids, len(shop_ids)),
        np.tile(shop_ids, len(item_ids)),
        np.full(len(item_ids) * len(shop_ids), month, dtype=item_ids.dtype),
    ))


# One row per (item, shop) pair for every month, using only the items and the
# shops that appear in that month's sales.
combos = []
for month in train['date_block_num'].unique():
    month_rows = train[train['date_block_num'] == month]  # filter once per month
    month_items = month_rows['item_id'].unique()
    month_shops = month_rows['shop_id'].unique()
    combos.append(_cartesian_rows(month_items, month_shops, month))

# The rows to predict: every test item crossed with every test shop, as month 34.
month_items = test['item_id'].unique()
month_shops = test['shop_id'].unique()
combos.append(_cartesian_rows(month_items, month_shops, 34))

aggregated = cudf.DataFrame(data=np.vstack(combos), columns=['item_id', 'shop_id', 'date_block_num'])

In [None]:
# Replace every text column of the two lookup tables with integer codes, in
# place, so the tables hold only numbers before they are handed to cudf later.
# Missing values receive the code -1 (pandas' factorize default).
dfs = [items_combined, shops]
for df in dfs:
    text_columns = df.select_dtypes('object').columns
    for col in text_columns:
        codes, _ = df[col].factorize()
        df[col] = codes

In [None]:
# Sum the daily counts into one row per (shop, item, month).
train_monthly = (
    train
    .groupby(['shop_id', 'item_id', 'date_block_num'])['item_cnt_day']
    .sum()
    .to_frame('month_sales')
    .reset_index()
)

# Left-join onto the full grid (on the shared columns); combinations without
# any sale come back as nulls and are turned into 0.
monthly_gpu = cudf.from_pandas(train_monthly)
aggregated = aggregated.merge(monthly_gpu, how='left').fillna(0)
del monthly_gpu  # only needed for the merge

# The monthly target is limited to [0, 20].
aggregated['month_sales'] = np.clip(aggregated['month_sales'], 0, 20)

In [None]:
# Choose what columns we want to add: the raw/cleaned names and the longer
# prefixes are removed, everything else in items_combined is kept.
unused_item_columns = [
    'item_name',
    'item_category_name',
    'item_name_cleaned',
    'item_name_8',
    'item_name_11',
]
items_combined = items_combined.drop(columns=unused_item_columns)

In [None]:
# Remove the full shop name and the quoted-text column from the shop table.
unused_shop_columns = ['shop_name', 'shop_parenthesis']
shops = shops.drop(columns=unused_shop_columns)

In [None]:
%%time
# starting aggregation df: attach the item features, then the shop features.
# Both merges use the default inner join on the id column.
aggregated = (
    aggregated
    .merge(cudf.from_pandas(items_combined), on=['item_id'])
    .merge(cudf.from_pandas(shops), on=['shop_id'])
)

In [None]:
def compress_dataframe(df):
    """Shrink ``df`` in place and return the same object.

    Object (text) columns are replaced by their factorize codes, then every
    float64 column is downcast with ``downcast='float'`` and every int64
    column with ``downcast='integer'``.
    """
    text_columns = df.select_dtypes(include=['object']).columns
    for col in text_columns:
        df[col] = cudf.factorize(df[col])[0]

    # Numeric columns are selected only now, so codes produced above that are
    # int64 take part in the integer downcast as well.
    for source_dtype, target in (('float64', 'float'), ('int64', 'integer')):
        for col in df.select_dtypes(include=[source_dtype]).columns:
            df[col] = cudf.to_numeric(df[col], downcast=target)

    return df

# Report the frame's footprint in MB (bytes / 1024**2) around the compression.
size_before_mb = aggregated.memory_usage().sum() / (1024**2)
print("Memory before:", size_before_mb, "MB")
aggregated = compress_dataframe(aggregated)
size_after_mb = aggregated.memory_usage().sum() / (1024**2)
print("Memory after:", size_after_mb, "MB")

In [None]:
%%time

def lag_by(df: cudf.DataFrame, group_by_cols: list, time_shift: int, new_name):
    """Add the group's mean ``month_sales`` from ``time_shift`` months earlier.

    Args:
        df: frame with ``date_block_num``, ``month_sales`` and the group columns.
        group_by_cols: columns that define a group.
        time_shift: number of months to look back.
        new_name: name of the column that receives the lagged mean.

    Returns:
        A new frame: ``df`` left-merged with the lagged means, with every
        missing value in the result (not only the new column) set to -1.
    """
    keys = [*group_by_cols, 'date_block_num']

    # Mean sales per group and month.
    monthly_mean = df[keys + ['month_sales']].groupby(keys)['month_sales'].mean()
    lagged = monthly_mean.to_frame(new_name).reset_index()

    # Moving the month label forward makes month m's mean line up with the
    # rows of month m + time_shift in the merge below.
    lagged['date_block_num'] = lagged['date_block_num'] + time_shift

    merged = df.merge(right=lagged, on=keys, how='left')
    return merged.fillna(-1)

# Column sets whose lagged mean sales become features.
groupings = [
    ['item_id'], ['shop_id'], ['item_category_id'], ['item_name_4'], ['category_first'], ['shop_first'],
    ['item_id', 'shop_id'], ['item_category_id', 'shop_id'], ['category_first', 'shop_id'], ['item_name_4', 'shop_id'],
]

# Small slice (item 1409, months 0-4); the loop below does not use it.
testing = aggregated[(aggregated.item_id == 1409) & (aggregated.date_block_num < 5)].reset_index(drop=True)

# One lag feature per (window, grouping) pair, windows in the outer position.
for window, grouping in itertools.product([1, 2, 12], groupings):
    print(window, grouping)

    col_name = f'{window}_rolling({"".join(grouping)})'
    aggregated = lag_by(aggregated, grouping, window, col_name)

In [None]:
%%time

# Steps
# 1. Add FIRST SALE column (item id, or item-store combo)
# 2. Shift by 1-12 months, then groupby various ages

# Sales in first month (generalized to age since??)
# ex. sales in first month for an item id (first time item appears)
# ex. sales in first month for item id - store id combo (first time both sold)

# Groupings
# for an item-first 4, item-first 8, item category, sales in first month for (item, item-store combo)

# Important: can only include as col if THIS MONTH NOT FIRST MONTH 

# Month of the first positive sale, per item and per (item, shop) pair.
# Only the three columns needed for the two group-bys are kept in `sold`.
sold = aggregated[['item_id', 'shop_id', 'date_block_num']][aggregated['month_sales'] > 0]
first_sale_item = sold.groupby('item_id')['date_block_num'].min().to_frame('first').reset_index()
first_sale_item_shop = sold.groupby(['item_id', 'shop_id'])['date_block_num'].min().to_frame('first').reset_index()
del sold

# Merge the lookups into `aggregated` itself. Taking the 'first' column out of
# a separate merge and assigning it back relies on the merge returning rows in
# the order of its left input, which cuDF does not guarantee, so the values
# could end up on the wrong rows. Merging the whole frame keeps every value on
# the row it was matched to.
aggregated = aggregated.merge(
    first_sale_item.rename(columns={'first': 'first_item_sale'}),
    on=['item_id'],
    how='left',
)
aggregated = aggregated.merge(
    first_sale_item_shop.rename(columns={'first': 'first_item_shop_sale'}),
    on=['item_id', 'shop_id'],
    how='left',
)

# Age = months since the first sale; it is negative for months before it.
aggregated['item_age'] = aggregated['date_block_num'] - aggregated['first_item_sale']
aggregated['item_shop_age'] = aggregated['date_block_num'] - aggregated['first_item_shop_sale']

# Items / pairs without any positive sale have no first month: mark with -1.
aggregated = aggregated.fillna(-1)

In [None]:
%%time
def lag_age(df: cudf.DataFrame, group_by_cols: list, time_shift: int, age_col):
    """Return, per row of ``df``, the lagged mean sales of same-age group members.

    The mean of ``month_sales`` is taken per (group, month, age) and matched
    to the rows that have the same group and the same age ``time_shift``
    months later.

    Args:
        df: frame with ``date_block_num``, ``month_sales``, ``age_col`` and
            the group columns.
        group_by_cols: columns that define a group.
        time_shift: number of months to look back.
        age_col: name of the age column that must match as well.

    Returns:
        A Series with one value per row of ``df``, in the row order of ``df``
        and with a fresh 0..n-1 index; rows without a match hold -1. ``df``
        itself is not modified.
    """
    combined_cols = group_by_cols + ['date_block_num', age_col]

    grouped = df.groupby(combined_cols)['month_sales'].mean().to_frame('lag_sales').reset_index()
    grouped['date_block_num'] = grouped['date_block_num'] + time_shift

    # Tag every row with its position: callers assign the result back
    # positionally, and a cuDF merge does not promise to keep the row order of
    # its left input, so the order is restored explicitly after the merge.
    left = df[combined_cols].reset_index(drop=True)
    left['_row_pos'] = np.arange(len(left))

    res = left.merge(right=grouped, on=combined_cols, how='left').sort_values('_row_pos')
    return res['lag_sales'].fillna(-1).reset_index(drop=True)

# Column sets for the age-matched lag features.
age_groupings = [
    ['item_category_id'], ['item_name_4'],
    ['item_id', 'shop_id'], ['item_name_4', 'shop_id'], ['item_category_id', 'shop_id'],
    ['category_first', 'shop_id'], ['category_first'],
]

# One feature per (grouping, window, age column) triple, in that nesting order.
for grouping, window, age_col in itertools.product(age_groupings, [1, 2, 12], ['item_age', 'item_shop_age']):
    print(grouping, window, age_col)
    # Hand over only the columns lag_age reads.
    subset = aggregated[grouping + ['date_block_num', age_col, 'month_sales']]
    feature_name = f'age{window}_{age_col}_({"".join(grouping)})'
    # Positional assignment: the values are taken in the order lag_age returns them.
    aggregated[feature_name] = lag_age(subset, grouping, window, age_col).values

# Clean Final Df and Model

In [None]:
# Alias, not a copy: final_df and aggregated name the same frame from here on.
final_df = aggregated

def compress_dataframe(df):
    """Shrink ``df`` in place and return the same object.

    Object (text) columns are replaced by their factorize codes, then every
    float64 column is downcast with ``downcast='float'`` and every int64
    column with ``downcast='integer'``.
    """
    text_columns = df.select_dtypes(include=['object']).columns
    for col in text_columns:
        df[col] = cudf.factorize(df[col])[0]

    # Numeric columns are selected only now, so codes produced above that are
    # int64 take part in the integer downcast as well.
    for source_dtype, target in (('float64', 'float'), ('int64', 'integer')):
        for col in df.select_dtypes(include=[source_dtype]).columns:
            df[col] = cudf.to_numeric(df[col], downcast=target)

    return df

# Report the frame's footprint in MB (bytes / 1024**2) around the compression.
final_before_mb = final_df.memory_usage().sum() / (1024**2)
print("Memory before:", final_before_mb, "MB")
final_df = compress_dataframe(final_df)
final_after_mb = final_df.memory_usage().sum() / (1024**2)
print("Memory after:", final_after_mb, "MB")

In [None]:
# Checkpoint: convert the feature table to pandas and pickle it to
# 'retrying.pkl' (relative to the current working directory).
(
    final_df
    .to_pandas()
    .to_pickle('retrying.pkl')
)

In [None]:
# Prepare dfs: split by month number.
#   months 12..32 -> training (months 0..11 are left out)
#   month 33      -> validation
#   month 34      -> rows to predict
block_num = final_df['date_block_num']
train = final_df[(block_num < 33) & (block_num > 11)]
val = final_df[block_num == 33]
test = final_df[block_num == 34]

# Separate the features from the 'month_sales' target.
X_train, Y_train = train.drop(columns='month_sales'), train['month_sales']
X_val, Y_val = val.drop(columns='month_sales'), val['month_sales']
X_test, Y_test = test.drop(columns='month_sales'), test['month_sales']

# Convert everything to pandas before it is handed to LightGBM.
X_train, Y_train = X_train.to_pandas(), Y_train.to_pandas()
X_val, Y_val = X_val.to_pandas(), Y_val.to_pandas()
X_test, Y_test = X_test.to_pandas(), Y_test.to_pandas()

In [None]:
import lightgbm as lgb

# Prepare the LightGBM Dataset; the validation set reuses the training set as
# its reference.
lgb_train = lgb.Dataset(data=X_train, label=Y_train)
lgb_val = lgb.Dataset(data=X_val, label=Y_val, reference=lgb_train)

# Define parameters for the model
params = dict(
    objective="rmse",         # Same as 'objective' in LGBMRegressor
    boosting_type="gbdt",     # Default boosting type
    learning_rate=0.01,       # Same as in LGBMRegressor
    device="gpu",             # Enable GPU
    metric="rmse",            # Evaluation metric
)

# Stop once the metric has not improved for 50 rounds; log every 100 rounds.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(100),
]

# Train the model using lgb.train()
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=1000,             # Same as n_estimators
    valid_sets=[lgb_train, lgb_val],  # Training and validation data for evaluation
    callbacks=training_callbacks,
)

In [None]:
# Get final results
# The training target was clipped to [0, 20], so the predictions are limited
# to the same range: any value outside it can only be further from a target
# that lies inside it.
predictions = np.clip(model.predict(X_test), 0, 20)
predicitons_df = X_test.copy()
predicitons_df['predictions'] = predictions

In [None]:
# Re-read the official test rows and attach the prediction columns by
# (shop_id, item_id); the left join keeps every test row.
submission_df = pd.merge(
    pd.read_csv(DIRECTORY + 'test.csv'),
    predicitons_df,
    on=['shop_id', 'item_id'],
    how='left',
)

In [None]:
# Keep only ID and the prediction, rename it to the submission column name,
# and write the file without the index.
submission_columns = submission_df[['ID', 'predictions']]
submission_columns.rename(columns={'predictions': 'item_cnt_month'}).to_csv('./submission.csv', index=False)

# Look at Results & What to Improve

In [None]:
# Test-month predictions, kept under the same name for the cells below.
predictions = model.predict(X_test)

# Score on the validation month instead of the test month: the month-34 rows
# have no sales rows to merge with, so their 'month_sales' is only the
# fillna(0) placeholder and an error against it says nothing about the model.
# NOTE(review): assumes sales_train.csv contains no rows for month 34 - confirm.
val_predictions = model.predict(X_val)
mean_squared_error(Y_val, val_predictions)

In [None]:
# Plot the 20 most important features of the trained model.
ax = lgb.plot_importance(booster=model, max_num_features=20)
plt.show()

In [None]:
# Analyse the validation month (33). It is the month that has real targets,
# and the "seen before" cutoff below (< 33) is defined relative to it; the
# month-34 rows only carry a placeholder target.
val_predictions = model.predict(X_val)
results_df = pd.concat([X_val, Y_val], axis=1)
results_df['predictions'] = val_predictions

# Mark the type based on item-id, shop-id
# new items: items have never been seen before date block == 33
# seen before item/shop: item-shop combo is present with date block < 33
# not see before: all else

# (item, shop) pairs with at least one positive monthly sale before month 33.
seen_before = final_df[(final_df['month_sales'] > 0) & (final_df['date_block_num'] < 33)] \
                    [['item_id', 'shop_id']].drop_duplicates().to_pandas()

# The merge indicator tells whether each row found a partner in seen_before;
# the flags are stored positionally (pandas keeps the left row order here).
indicator = results_df.merge(seen_before, on=['item_id', 'shop_id'], how='left', indicator=True)['_merge']
results_df['old_combo'] = np.array(indicator == 'both')

items_new = results_df.merge(seen_before['item_id'].drop_duplicates(), on='item_id', how='left', indicator=True)['_merge']
results_df['new_item'] = np.array(items_new == 'left_only')

In [None]:
results_df.head()

In [None]:
# Label every row: 'old_combo' wins over 'new_item'; rows that are neither get 'zero'.
results_df['type'] = np.select(
    [results_df['old_combo'], results_df['new_item']],
    ['old_combo', 'new_item'],
    default='zero',
)

# Squared error per row, then count / mean / sum per row type.
results_df['mse'] = results_df['month_sales'].sub(results_df['predictions']).pow(2)
results_df.groupby(by='type')[['mse', 'predictions', 'month_sales']].agg(['count', 'mean', 'sum'])