# DATA EXPLORATION

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [2]:
train_df = pd.read_csv("D:\\workspace\\Kaggle\\tabular-playground-series\\data\\raw\\train.csv", index_col="row_id")
train_df

Unnamed: 0_level_0,date,country,store,product,num_sold
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911
...,...,...,...,...,...
26293,2018-12-31,Sweden,KaggleMart,Kaggle Hat,823
26294,2018-12-31,Sweden,KaggleMart,Kaggle Sticker,250
26295,2018-12-31,Sweden,KaggleRama,Kaggle Mug,1004
26296,2018-12-31,Sweden,KaggleRama,Kaggle Hat,1441


In [3]:
test_df = pd.read_csv("D:\\workspace\\Kaggle\\tabular-playground-series\\data\\raw\\test.csv", index_col="row_id")
test_df

Unnamed: 0_level_0,date,country,store,product
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
26298,2019-01-01,Finland,KaggleMart,Kaggle Mug
26299,2019-01-01,Finland,KaggleMart,Kaggle Hat
26300,2019-01-01,Finland,KaggleMart,Kaggle Sticker
26301,2019-01-01,Finland,KaggleRama,Kaggle Mug
26302,2019-01-01,Finland,KaggleRama,Kaggle Hat
...,...,...,...,...
32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat
32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker
32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug
32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat


In [4]:
train_df.describe()

Unnamed: 0,num_sold
count,26298.0
mean,387.533577
std,266.076193
min,70.0
25%,190.0
50%,315.0
75%,510.0
max,2884.0


In [5]:
train_df.isnull().sum()

date        0
country     0
store       0
product     0
num_sold    0
dtype: int64

In [6]:
target = "num_sold"
features = [col for col in train_df.columns if col!=target]
X = train_df[features]
y = train_df[target]


# Feature Engineering

In [7]:
X["date"] = pd.to_datetime(X["date"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["date"] = pd.to_datetime(X["date"])


In [8]:
X["nb_of_days"] = (X["date"]-X["date"].min()).dt.days

In [9]:
X["country"].value_counts()

Finland    8766
Norway     8766
Sweden     8766
Name: country, dtype: int64

In [10]:
X["store"].value_counts()

KaggleMart    13149
KaggleRama    13149
Name: store, dtype: int64

In [11]:
X["product"].value_counts()

Kaggle Mug        8766
Kaggle Hat        8766
Kaggle Sticker    8766
Name: product, dtype: int64

There is only 3 categories then we will apply the onehot encoded method to this column.

In [12]:
object_cols = ["country", "store", "product"]
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols = pd.DataFrame(OH_encoder.fit_transform(X[object_cols]))
OH_cols.index = X.index
num_X = X.drop(object_cols, axis=1)
OH_X = pd.concat([num_X, OH_cols], axis=1)
OH_X = OH_X.drop("date", axis=1)

In [13]:
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_percentage_error(y_valid, preds)*100

OH_X_train, OH_X_valid, y_train, y_valid = train_test_split(OH_X, y, random_state = 0)

In [14]:
print("MAE:") 
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))

MAE:




9.624348722804866




## Pipeline

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
import datetime as dt

def days_since_beginning(df):
    df_copy = df.copy()
    df_copy["date"] = pd.to_datetime(df_copy["date"], format="%Y-%m-%d")
    df_copy["nb_of_days"] = (df_copy["date"] - dt.datetime(2015,1,1)).dt.days
    return df_copy.drop("date", axis=1)

categorical_cols = ["country", "store", "product"]
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

date_transformer = FunctionTransformer(days_since_beginning)

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('date', date_transformer, ["date"])
    ])

In [16]:
import numpy as np
from sklearn.metrics import make_scorer

def symmetric_mean_absolute_percentage_error(y_true, y_pred, sample_weight=None):
    epsilon = np.finfo(np.float64).eps
    smape = np.abs(y_pred - y_true) / np.maximum((np.abs(y_true) + np.abs(y_pred))/2 , epsilon)*100
    output_errors = np.average(smape, weights=sample_weight, axis=0)
    return np.average(output_errors)

my_score = make_scorer(symmetric_mean_absolute_percentage_error, greater_is_better=False)

model = RandomForestRegressor(n_estimators=100, random_state=0)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state = 0)

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = symmetric_mean_absolute_percentage_error(y_valid, preds)
print('MAPE:', score)

MAPE: 9.210250366247482


In [17]:
from sklearn import set_config
set_config(display='diagram')
my_pipeline

## Cross Validation

In [18]:
from sklearn.model_selection import cross_validate

scores = cross_validate(my_pipeline, X, y, cv=5, scoring=my_score)

In [19]:
scores

{'fit_time': array([3.11103868, 3.35402822, 3.56528831, 3.40948153, 3.31544876]),
 'score_time': array([0.06080842, 0.07280612, 0.06778884, 0.06778216, 0.04787278]),
 'test_score': array([-19.18837256, -18.9030358 , -16.18138641, -16.42497026,
        -16.74723853])}

## Grid search 

In [29]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


param_grid = [
    {
        'model__bootstrap': [True, False],
        'model__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        'model__max_features': ['auto', 'sqrt'],
        'model__min_samples_leaf': [1, 2, 4],
        'model__min_samples_split': [2, 5, 10],
        'model__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
    }
]

rf_random = RandomizedSearchCV(estimator = my_pipeline, score=my_score, param_distributions = param_grid, n_iter = 100, cv = 3, verbose=3, random_state=42)
rf_random.fit(X, y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV 1/3] END model__bootstrap=True, model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=10, model__n_estimators=2000;, score=0.766 total time=  22.3s


KeyboardInterrupt: 

In [28]:
[int(x) for x in np.logspace(1, 3, num=7)]

[10, 21, 46, 100, 215, 464, 1000]

In [24]:
grid_search.best_params_

{'model': RandomForestRegressor(n_estimators=10), 'model__n_estimators': 10}

In [None]:
from scipy.stats import loguniform

C = loguniform(1e-5, 100)

TypeError: 'rv_frozen' object is not iterable