In [25]:
# Install (or upgrade, -U) upgini and catboost quietly (-q) into the kernel's environment.
# NOTE(review): versions are not pinned, so a re-run may pull newer releases than the
# ones that produced the outputs recorded in this notebook.
%pip install -Uq upgini catboost

In [27]:
from os.path import exists
import pandas as pd

# Prefer a local copy of the dataset when one sits next to the notebook;
# otherwise point pandas at the archive hosted in the upgini GitHub repository.
if exists("train.csv.zip"):
    df_path = "train.csv.zip"
else:
    df_path = "https://github.com/upgini/upgini/raw/main/notebooks/train.csv.zip"


In [28]:
# Load the data, keep a reproducible 19k-row sample, cast the store/item identifiers
# to strings, parse the dates, and order the rows chronologically with a fresh index.
df = (
    pd.read_csv(df_path)
    .sample(n=19_000, random_state=0)
    .astype({"store": str, "item": str})
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date")
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,7,5,5
1,2013-01-01,4,9,19
2,2013-01-01,1,33,37
3,2013-01-01,3,41,14
4,2013-01-01,5,24,26


In [29]:
# Time-based hold-out: rows dated before 2017-01-01 are used for training,
# rows dated on or after that day are used for evaluation.
train = df.loc[df["date"] < "2017-01-01"]
test = df.loc[df["date"] >= "2017-01-01"]

In [30]:
# "sales" is the prediction target; every other column is a feature.
train_target = train["sales"]
test_target = test["sales"]

train_features = train.drop(columns="sales")
test_features = test.drop(columns="sales")

Enrich Values

In [31]:
from upgini import FeaturesEnricher, SearchKey
from upgini.metadata import CVType

# Search for external features using the "date" column as the only search key,
# with time-series cross-validation selected for the search.
enricher = FeaturesEnricher(
    search_keys={"date": SearchKey.DATE},
    cv=CVType.time_series,
)

# Fit on the training period; the hold-out period is passed as an evaluation set.
enricher.fit(
    train_features,
    train_target,
    eval_set=[(test_features, test_target)],
)


Try to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IPv4 to your training dataset
for search through all the available data sources.
See docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history
Detected task type: ModelTaskType.REGRESSION



Column name,Status,Errors
target,All valid,-
date,All valid,-



Running search request, search_id=7b3e230a-5682-4074-8aee-100803b743c6
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com

[92m[1m
56 relevant feature(s) found with the search keys: ['date'][0m


Feature name,SHAP value,Coverage %,Value preview,Provider,Source,Feature type
f_autofe_max_1050,0.0566,100.0,"-0.3335, -0.6918, 0.9577",Upgini,AutoFE: features from Calendar data,Free
f_autofe_min_268,0.0505,100.0,"-0.7624, -0.6918, -0.8786",Upgini,AutoFE: features from Calendar data,Free
f_autofe_max_1129,0.0205,100.0,"0.8249, 0.6918, -0.7325",Upgini,AutoFE: features from Calendar data,Free
f_weather_date_weather_umap_47_5123ef0a,0.0097,100.0,"5.7363, 5.3626, 4.7",Upgini,Weather & climate normals data,Premium
f_weather_date_weather_umap_48_b39cd0c4,0.0096,100.0,"4.5686, 4.7026, 5.4386",Upgini,Weather & climate normals data,Premium
f_weather_date_weather_umap_31_fa6d9a99,0.0082,100.0,"5.1327, 4.7958, 4.9119",Upgini,Weather & climate normals data,Premium
f_autofe_min_291,0.0053,100.0,"-0.004, -0.345, -0.084",Upgini,"AutoFE: features from Calendar data,Markets data",Free
f_autofe_div_1999,0.0043,100.0,"-0.0097, 0.0088, 0.0098",Upgini,"AutoFE: features from Calendar data,Markets data",Free
f_weather_date_weather_umap_34_c3ef5b4f,0.0028,100.0,"4.8112, 4.7247, 5.533",Upgini,Weather & climate normals data,Premium
f_financial_date_gold_7d_to_1y_ae310379,0.0025,100.0,"0.902, 1.0424, 0.8555",Upgini,Markets data,Free


Provider,Source,All features SHAP,Number of relevant features
Upgini,AutoFE: features from Calendar data,0.135,15
Upgini,Weather & climate normals data,0.034,11
Upgini,"AutoFE: features from Calendar data,Markets data",0.017,13
Upgini,Markets data,0.0043,7
Upgini,World economic indicators,0.0012,5
Upgini,AutoFE: features from Markets data,0.0011,3
Upgini,AutoFE: feature from Markets data,0.0004,1
Upgini,Calendar data,0.0002,1


We detected 113 outliers in your sample.
Examples of outliers with maximum value of target:
84    205
47    196
38    187
Name: target, dtype: int64
Outliers will be excluded during the metrics calculation.
Before dropping target outliers size: 19000
After dropping target outliers size: 18887
Calculating accuracy uplift after enrichment...

which makes metrics between the train and eval_set incomparable.


Dataset type,Rows,Mean target,Baseline mean_squared_error,Enriched mean_squared_error,Uplift
Train,15148,49.9541,309.9905,190.2834,119.7071
Eval 1,3739,58.0011,509.2897,387.3258,121.964


In [32]:
from catboost import CatBoostRegressor
from catboost.utils import eval_metric

# Silent, seeded CatBoost regressor that does not write training files to disk.
model = CatBoostRegressor(
    verbose=False,
    allow_writing_files=False,
    random_state=0,
)

# Report baseline vs. enriched mean absolute percentage error for the training
# data and the evaluation set, using the estimator defined above.
enricher.calculate_metrics(
    train_features,
    train_target,
    eval_set=[(test_features, test_target)],
    estimator=model,
    scoring="mean_absolute_percentage_error",
)

Calculating accuracy uplift after enrichment...
-
which makes metrics between the train and eval_set incomparable.


Unnamed: 0,Dataset type,Rows,Mean target,Baseline mean_absolute_percentage_error,Enriched mean_absolute_percentage_error,Uplift
0,Train,15148,49.9541,0.255035,0.156914,0.098122
1,Eval 1,3739,58.0011,0.269488,0.198606,0.070883


In [33]:
# Run the fitted enricher over both splits.
# keep_input=True is passed so that, presumably, the input columns are returned
# together with the new features (confirm against the upgini documentation).
enriched_train_features = enricher.transform(train_features, keep_input=True)
enriched_test_features = enricher.transform(test_features, keep_input=True)

You are trying to launch enrichment for 15213 rows, which will exceed the rest limit 9270.


Button(button_style='danger', description='Request a quote', style=ButtonStyle())

You use Trial access to Upgini data enrichment. Limit for Trial: 10000 rows. You have already enriched: 730 rows.

Try to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IPv4 to your training dataset
for search through all the available data sources.
See docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history


Column name,Status,Errors
date,All valid,-



Running search request, search_id=5079538a-b9ba-498b-bf99-5baa5a4920b2
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com

Retrieving selected features from data sources...


<IPython.core.display.Javascript object>

In [None]:
# Baseline: train on the original features only, predict the hold-out period,
# and score the predictions with SMAPE.
model.fit(
    train_features,
    train_target,
)
preds = model.predict(test_features)
eval_metric(
    test_target.values,
    preds,
    "SMAPE",
)

[37.65141857448004]

In [None]:
def _check_enriched(features, target, split):
    """Raise ValueError unless `features` exists and has one row per `target` value.

    Args:
        features: enriched feature frame returned by the enrichment step (may be None).
        target: target values the frame is expected to line up with.
        split: label ("train" / "test") used in the error message.

    Raises:
        ValueError: if `features` is None or its length differs from `target`.
    """
    if features is None or len(features) != len(target):
        raise ValueError(
            f"Enriched {split} features are missing or not aligned with the {split} target; "
            "check the output of enricher.transform before fitting."
        )


# Fail fast with an actionable message instead of an opaque CatBoost error.
# NOTE(review): the transform cell's recorded output shows a row-limit warning for the
# train set -- presumably that is why the enriched frame can be missing or short; confirm.
_check_enriched(enriched_train_features, train_target, "train")
_check_enriched(enriched_test_features, test_target, "test")

# Retrain the model on the enriched features and score the hold-out period with SMAPE,
# for comparison with the score obtained from the original features alone.
model.fit(enriched_train_features, train_target)
enriched_preds = model.predict(enriched_test_features)
eval_metric(test_target.values, enriched_preds, "SMAPE")

CatBoostError: ignored