<a href="https://colab.research.google.com/github/carbonpredict/carbonpredict/blob/master/notebooks/lgbm_weight_imputation_trial_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/Compensate-Operations/emission-sample-data.git

Cloning into 'emission-sample-data'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 33 (delta 8), reused 18 (delta 8), pack-reused 13[K
Unpacking objects: 100% (33/33), done.


# Install required packages

LightGBM will be updated to GPU version. It is about 1.5x faster in Colab, but with decent CPU the training will be fast enough anyway so GPU might not be needed. Simply uncomment the lightgbm package and remove device='gpu' parameter from the model training.

In [None]:
!pip3 install lightgbm --upgrade --install-option=--gpu
!pip3 install bayesian-optimization

In [None]:
!for i in /content/emission-sample-data/datasets/textile-v1.0.0/*.tgz; do tar -zxvf "$i" ;done

!ls -lah
!rm ._textile-v1.0.0-5.csv

In [1]:
import pandas as pd
import os


content = sorted(filter(lambda x: x.endswith(".csv"), os.listdir("/content/")))

df = pd.concat((pd.read_csv(f) for f in content))
df

Unnamed: 0,brand,category-1,category-2,category-3,co2_total,colour,fabric_type,ftp_acrylic,ftp_cotton,ftp_elastane,ftp_linen,ftp_other,ftp_polyamide,ftp_polyester,ftp_polypropylene,ftp_silk,ftp_viscose,ftp_wool,gender,label,made_in,season,size,unspsc_code,weight
0,b111,womenswear,uniform,jacket,,blue gray,K,,,,100.0,,,,,,,,W,,TR,,XS,,1.062
1,b82,home,home,curtain,,teal,W,,11.0,5.0,,3.0,,7.0,4.0,,,68.0,,,PK,,XXL,,
2,b107,menswear,headgear,knit-cap,,metal,K,3.0,,4.0,,,,,,89.0,2.0,,M,,PK,,XL,,0.160
3,b111,home,home,curtain,,light grey,K,,,23.0,38.0,22.0,,,8.0,1.0,5.0,,,,TR,,M,,
4,b83,womenswear,footwear,socks,,bondi blue,K,21.0,,,,,43.0,,24.0,,,11.0,W,,VN,,M,,0.029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999995,b90,womenswear,nightwear,pyjama,,pink,K,,9.0,,,,46.0,44.0,,,,,W,,CN,,L,,
2999996,b133,baby,footwear,socks,,wheat,K,,,,,,2.0,90.0,3.0,,1.0,3.0,,,TW,,XL,,
2999997,b1,menswear,outerwear,pants,,gold,W,5.0,7.0,,,,79.0,,3.0,,,4.0,M,,US,,S,,
2999998,b73,menswear,accessory,backpack,,amber,K,14.0,10.0,13.0,51.0,,,,,,10.0,,M,,BD,,XL,,


In [10]:

cat_cols = ["category-1", "category-2", "category-3", 
            "size", "made_in", "gender", "colour", 
            "brand", "fabric_type", "season"]

#df["season"].fillna("no_season", inplace=True)

df[cat_cols] = df[cat_cols].astype("category")

X = df[~df["co2_total"].isna()]
y = X["co2_total"].copy()
X = X.drop("co2_total", axis=1)
#X = X.drop("weight", axis=1)

In [11]:

X['weight'] = X.groupby(['category-1', 'category-2', 'category-3'])['weight'].transform(lambda x: x.fillna(x.mean()))

X[X['weight'].isna()]


Unnamed: 0,brand,category-1,category-2,category-3,colour,fabric_type,ftp_acrylic,ftp_cotton,ftp_elastane,ftp_linen,ftp_other,ftp_polyamide,ftp_polyester,ftp_polypropylene,ftp_silk,ftp_viscose,ftp_wool,gender,label,made_in,season,size,unspsc_code,weight


In [4]:
X_imp

23         0.122000
34         0.937707
51         0.876408
56         0.115386
74         0.876408
             ...   
2999961    0.864440
2999970    0.355742
2999980    0.024577
2999981    0.024577
2999984    0.591435
Name: weight, Length: 1699515, dtype: float64

In [12]:
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score


lgb_clf = None

def run_lgb(X, y, params):
    kf = KFold(n_splits=5, shuffle=True)
    preds = np.zeros(len(X))
    nrounds = 5000
    early_stopping_rounds = 200

    models = []

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
        X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

        trn_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_valid, label=y_valid)

        lgb_clf = lgb.train(params,
                        trn_data,
                        nrounds,
                        valid_sets = [trn_data, val_data],
                        early_stopping_rounds = early_stopping_rounds,
                        verbose_eval = 100)

        preds[val_idx] = lgb_clf.predict(X_valid)

        models.append(lgb_clf)

    s_rmse = np.sqrt(mean_squared_error(y, preds))
    s_r2 = r2_score(y, preds)
    
    print("RMSE Score:", s_rmse)
    print("R^2 Score:", s_r2)

    return models, s_rmse


# 0.7981221901359424

params = {'bagging_fraction': 1.0,
 'bagging_freq': 1,
 'boosting_type': 'gbdt',
 'colsample_bytree': 0.4,
 'lambda_l1': 0.0,
 'lambda_l2': 0.0,
 'learning_rate': 0.1,
 'max_depth': 12,
 'metric': 'rmse',
 'n_jobs': -1,
 'num_leavs': 300.0,
 'objective': 'regression',
 'device': 'gpu',
 'seed': 42}

models, _ = run_lgb(X, y, params)

Training until validation scores don't improve for 200 rounds
[100]	training's rmse: 10.1276	valid_1's rmse: 10.2321
[200]	training's rmse: 9.76296	valid_1's rmse: 10.0189
[300]	training's rmse: 9.59233	valid_1's rmse: 9.96708
[400]	training's rmse: 9.46855	valid_1's rmse: 9.94213
[500]	training's rmse: 9.36225	valid_1's rmse: 9.92739
[600]	training's rmse: 9.26454	valid_1's rmse: 9.91256
[700]	training's rmse: 9.17887	valid_1's rmse: 9.91107
[800]	training's rmse: 9.08791	valid_1's rmse: 9.90815
[900]	training's rmse: 9.01022	valid_1's rmse: 9.90739
[1000]	training's rmse: 8.95247	valid_1's rmse: 9.90896
Early stopping, best iteration is:
[832]	training's rmse: 9.06729	valid_1's rmse: 9.90384
Training until validation scores don't improve for 200 rounds
[100]	training's rmse: 10.0806	valid_1's rmse: 10.2572
[200]	training's rmse: 9.73374	valid_1's rmse: 10.0464
[300]	training's rmse: 9.55194	valid_1's rmse: 9.9904
[400]	training's rmse: 9.41887	valid_1's rmse: 9.95713
[500]	training's

# Weight Imputation For Training

## No imputation

RMSE Score: 9.699965230008726
R^2 Score: 0.8761664984846785

## Imputing with category-3

Category-3 should take us to correct direction, as usually t-shirt weights more than socks.

RMSE Score: 9.736979657961824
R^2 Score: 0.8752196143425424

It seems that the result is not much better, but it's not much worse either so let's continue. 

## Imputing with category-3, season

I added season as it is important variable for weight, e.g. winter clothing typically weights more than summer clothing.

RMSE Score: 9.743931763166684
R^2 Score: 0.8750413668597842

Could be a bug in implementation how season missing values are handles. Needs more tuning.

## Imputing with category-1, category-2, category-3

RMSE Score: 9.893673539775255
R^2 Score: 0.8711712034149216

## Summary

It is possible that imputing the multiple columns might not work as I expect. I'll come back to this. However, with single column category-1 our model doesn't get much worse, so it means that we could keep the 0.87 level simply with imputing with category-1.


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

cols = X.columns.tolist()

feature_imp = pd.DataFrame(
    sorted(zip(models[0].feature_importance(importance_type="gain"), cols)),
    columns=["value", "feature"])

plt.figure(figsize=(20, 10))
sns.barplot(
    x="value",
    y="feature",
    data=feature_imp.sort_values(by="value", ascending=False).head(50))

plt.savefig("features.png")


  import pandas.util.testing as tm


NameError: ignored

In [None]:
def run_lgb_bayesian(num_leaves, max_depth, lambda_l1, lambda_l2, bagging_fraction, bagging_freq, colsample_bytree, learning_rate):
    params = {
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'objective': 'regression',
        'n_jobs': -1,
        'seed': 42,
        'num_leaves': int(num_leaves),
        'learning_rate': learning_rate,
        'max_depth': int(max_depth),
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'bagging_fraction': bagging_fraction,
        'bagging_freq': int(bagging_freq),
        'colsample_bytree': colsample_bytree,
        'verbose': 0,
        'device':'gpu' 
    }
    print("Trying params", params)

    _, score = run_lgb(X, y, params)
    
    return score

if False:
    from bayes_opt import BayesianOptimization

    bounds_lgb = {
        'num_leaves': (20, 300),
        'max_depth': (8, 12),
        'lambda_l1': (0, 5),
        'lambda_l2': (0, 5),
        'bagging_fraction': (0.4, 1),
        'bagging_freq': (1, 10),
        'colsample_bytree': (0.4, 1),
        'learning_rate': (0.025, 0.1),
    }

    lgb_bo = BayesianOptimization(run_lgb_bayesian, bounds_lgb, random_state = 42)
    lgb_bo.maximize(init_points = 20, n_iter = 5, acq = 'ucb', xi = 0.0, alpha = 1e-6)

    params = {
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'objective': 'regression',
        'n_jobs': -1,
        'seed': 42,
        'num_leaves': lgb_bo.max['params']['num_leaves'],
        'learning_rate': lgb_bo.max['params']['learning_rate'],
        'max_depth': int(lgb_bo.max['params']['max_depth']),
        'lambda_l1': lgb_bo.max['params']['lambda_l1'],
        'lambda_l2': lgb_bo.max['params']['lambda_l2'],
        'bagging_fraction': lgb_bo.max['params']['bagging_fraction'],
        'bagging_freq': int(lgb_bo.max['params']['bagging_freq']),
        'colsample_bytree': lgb_bo.max['params']['colsample_bytree']
    }

    print(params)

In [None]:
params

{'bagging_fraction': 0.4,
 'bagging_freq': 10,
 'boosting_type': 'gbdt',
 'colsample_bytree': 0.4,
 'lambda_l1': 0.0,
 'lambda_l2': 0.0,
 'learning_rate': 0.1,
 'max_depth': 12,
 'metric': 'rmse',
 'n_jobs': -1,
 'num_leaves': 300.0,
 'objective': 'regression',
 'seed': 42}

In [None]:
t = df[df["co2_total"].isna()]

t.sample(100).to_csv("test.csv", index=False)
