In [1]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
ross_df = pd.read_csv('/kaggle/input/rossmann-store-sales/train.csv', low_memory=False)
store_df = pd.read_csv('/kaggle/input/rossmann-store-sales/store.csv')
test_df = pd.read_csv('/kaggle/input/rossmann-store-sales/test.csv')
submission_df = pd.read_csv('/kaggle/input/rossmann-store-sales/sample_submission.csv')

In [3]:
ross_df

In [4]:
!pip install numpy pandas matplotlib seaborn --quiet

In [5]:
!pip install  xgboost graphviz lightgbm scikit-learn xgboost lightgbm --upgrade --quiet

In [6]:
store_df

In [7]:
# Attach the per-store metadata to every train/test row. The left join keeps
# each sales row in its original order. validate='many_to_one' makes pandas
# raise MergeError if store_df ever contains a duplicated Store id, which would
# otherwise silently duplicate rows (and break the positional alignment between
# the test predictions and the submission file later on).
merged_df = ross_df.merge(store_df, how='left', on='Store', validate='many_to_one')
merged_test_df = test_df.merge(store_df, how='left', on='Store', validate='many_to_one')

In [8]:
merged_df

In [9]:
merged_df.info()

First, let's convert `Date` to a datetime column and extract the different parts of the date.

In [10]:
def split_date(df):
    """Parse ``Date`` and add calendar-part columns to ``df`` in place.

    The ``Date`` column is converted with ``pd.to_datetime`` and the columns
    ``Year``, ``Month``, ``Day`` and ``WeekOfYear`` (ISO week number) are
    written onto ``df``. Nothing is returned; the frame is mutated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column that ``pd.to_datetime`` can parse.
    """
    df['Date'] = pd.to_datetime(df['Date'])
    dates = df['Date'].dt
    df['Year'] = dates.year
    df['Month'] = dates.month
    df['Day'] = dates.day
    # isocalendar() returns the nullable UInt32 extension dtype. Convert it to a
    # plain NumPy dtype so later arithmetic with float columns behaves like the
    # other date parts (float with NaN when a date is missing, int otherwise).
    iso_week = dates.isocalendar().week
    df['WeekOfYear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

In [11]:
split_date(merged_df)
split_date(merged_test_df)

In [12]:
merged_df[merged_df.Open==0].Sales.value_counts()

Next, notice that the sales are zero whenever the store is closed.

In [13]:
merged_df = merged_df[merged_df.Open == 1].copy()

In [14]:
merged_df

### Competition

Next, we can use the `CompetitionOpenSince[Month/Year]` columns from `store_df` to compute the number of months for which a competitor has been open near the store.

In [15]:
def comp_months(df):
    """Add ``CompetitionOpen`` to ``df`` in place.

    The value is the number of months between the row's ``Year``/``Month`` and
    ``CompetitionOpenSinceYear``/``CompetitionOpenSinceMonth``. Negative
    differences and missing values both become 0.
    """
    years_gap = df.Year - df.CompetitionOpenSinceYear
    months_gap = df.Month - df.CompetitionOpenSinceMonth
    elapsed = 12 * years_gap + months_gap
    # Floor at zero first (NaN passes through clip), then fill the NaN rows.
    df['CompetitionOpen'] = elapsed.clip(lower=0).fillna(0)

In [16]:
comp_months(merged_df)
comp_months(merged_test_df)

In [17]:
merged_df.head(5)

In [18]:
merged_df[['Date', 'CompetitionDistance', 'CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth', 'CompetitionOpen']].sample(20)

In [19]:
merged_df[['Date', 'CompetitionDistance', 'CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth', 'CompetitionOpen']].sample(20)

In [20]:
# Month number -> abbreviation used when matching against PromoInterval.
# NOTE(review): 'Sept' (four letters) is kept from the original table; presumably
# it matches the spelling used in the dataset - verify against store.csv.
_MONTH_ABBREVIATIONS = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                        7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}


def check_promo_month(row):
    """Return 1 if the row's month is listed in its ``PromoInterval``, else 0.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide ``PromoInterval`` (comma-separated month abbreviations,
        or a non-string such as NaN/None when absent), ``Promo2Open`` and
        ``Month`` (1-12).

    Returns
    -------
    int
        1 when ``Promo2Open`` is truthy and the abbreviation for ``Month``
        appears in ``PromoInterval``; 0 otherwise, including when a key is
        missing, the interval is not a string, or the month is unknown.
    """
    try:
        interval = row['PromoInterval']
        promo_open = row['Promo2Open']
        month = row['Month']
    except KeyError:
        # Best-effort: a row without the expected fields is not a promo month.
        return 0
    # A missing interval arrives as NaN (a float) or None, not as a string.
    # Check this explicitly instead of relying on a swallowed AttributeError.
    if not isinstance(interval, str):
        return 0
    if not promo_open:
        return 0
    month_name = _MONTH_ABBREVIATIONS.get(month)
    return 1 if month_name in interval.split(',') else 0

def promo_cols(df):
    """Add the ``Promo2Open`` and ``IsPromo2Month`` columns to ``df`` in place.

    ``Promo2Open`` is the approximate number of months since Promo2 started:
    12 per elapsed year plus elapsed weeks converted with 7/30.5. Negative and
    missing values become 0, and the result is multiplied by ``Promo2``.
    ``IsPromo2Month`` is ``check_promo_month`` applied to every row, also
    multiplied by ``Promo2``.
    """
    # Months since Promo2 was open
    years_elapsed = df.Year - df.Promo2SinceYear
    weeks_elapsed = df.WeekOfYear - df.Promo2SinceWeek
    df['Promo2Open'] = 12 * years_elapsed + weeks_elapsed * 7 / 30.5
    floored = df['Promo2Open'].map(lambda months: 0 if months < 0 else months)
    df['Promo2Open'] = floored.fillna(0) * df['Promo2']
    # Whether a new round of promotions was started in the current month
    in_interval = df.apply(check_promo_month, axis=1)
    df['IsPromo2Month'] = in_interval * df['Promo2']

In [21]:
promo_cols(merged_df)
promo_cols(merged_test_df)

In [22]:
merged_df[['Date', 'Promo2', 'Promo2SinceYear', 'Promo2SinceWeek', 'PromoInterval', 'Promo2Open', 'IsPromo2Month']].sample(20)

In [23]:
merged_df.columns

In [24]:
input_cols = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday', 
              'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpen', 
              'Day', 'Month', 'Year', 'WeekOfYear',  'Promo2', 
              'Promo2Open', 'IsPromo2Month']
target_col = 'Sales'

In [25]:
inputs = merged_df[input_cols].copy()
targets = merged_df[target_col].copy()

In [26]:
inputs

In [27]:
test_inputs = merged_test_df[input_cols].copy()

In [28]:
test_inputs

In [29]:
test_inputs

In [30]:
numeric_cols = ['Store', 'Promo', 'SchoolHoliday', 
              'CompetitionDistance', 'CompetitionOpen', 'Promo2', 'Promo2Open', 'IsPromo2Month',
              'Day', 'Month', 'Year', 'WeekOfYear',  ]
categorical_cols = ['DayOfWeek', 'StateHoliday', 'StoreType', 'Assortment']

**Impute Missing Data**

In [31]:
inputs[numeric_cols].isna().sum()

In [32]:
test_inputs[numeric_cols].isna().sum()

**`CompetitionDistance` is the only numeric column with missing values, so we can simply fill it with the maximum observed distance (to indicate that the competition is very far away).**

In [33]:
max_distance = inputs.CompetitionDistance.max()

In [34]:
max_distance

In [35]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form is chained assignment, which newer pandas
# warns about and which does not modify the frame under Copy-on-Write.
inputs['CompetitionDistance'] = inputs['CompetitionDistance'].fillna(max_distance)
test_inputs['CompetitionDistance'] = test_inputs['CompetitionDistance'].fillna(max_distance)

### Scale Numeric Values

Let's scale numeric values to the 0 to 1 range.

In [36]:
from sklearn.preprocessing import MinMaxScaler

In [37]:
scaler = MinMaxScaler().fit(inputs[numeric_cols])

In [38]:
inputs[numeric_cols] = scaler.transform(inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

In [39]:
inputs[numeric_cols]

In [40]:
from sklearn.preprocessing import OneHotEncoder

In [41]:
# Fit the one-hot encoder on the training categories only; categories that
# appear only in the test set are encoded as all zeros (handle_unknown='ignore').
# `sparse_output` and `get_feature_names_out` are the current scikit-learn names
# for the older `sparse` argument and `get_feature_names` method.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(inputs[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

In [42]:
encoded_cols

In [43]:
inputs[encoded_cols] = encoder.transform(inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

In [44]:
X = inputs[numeric_cols + encoded_cols]
X_test = test_inputs[numeric_cols + encoded_cols]

In [45]:
X

In [46]:
X_test

### Training

To train a GBM, we can use the `XGBRegressor` class from the [`XGBoost`](https://xgboost.readthedocs.io/en/latest/) library.

In [47]:
from xgboost import XGBRegressor

In [48]:
model = XGBRegressor(random_state=42, n_jobs=-1, n_estimators=20, max_depth=4)

In [49]:
%%time
model.fit(X, targets)

In [50]:
preds = model.predict(X)

In [51]:
preds

### Evaluation

Let's evaluate the predictions using the root mean squared error (RMSE).

In [52]:
from sklearn.metrics import mean_squared_error

def rmse(a, b):
    """Return the root mean squared error between ``a`` and ``b``.

    Both arguments are passed to ``sklearn.metrics.mean_squared_error`` and
    the square root of its result is returned as a Python float. The root is
    taken explicitly because the ``squared=False`` shortcut is deprecated and
    no longer accepted by recent scikit-learn releases.
    """
    return float(np.sqrt(mean_squared_error(a, b)))

In [53]:

rmse(preds, targets)

In [54]:
import matplotlib.pyplot as plt
plt.hist(merged_df.Sales.sample(1000))

In [55]:
import matplotlib.pyplot as plt
from xgboost import plot_tree
from matplotlib.pylab import rcParams
%matplotlib inline

rcParams['figure.figsize'] = 30,30

In [56]:
plot_tree(model, rankdir='LR',num_trees=1);

In [57]:
plot_tree(model, rankdir='LR',num_trees=19);

In [58]:
trees = model.get_booster().get_dump()

In [59]:
len(trees)

In [60]:
print(trees[0])

### Feature importance

Just like decision trees and random forests, XGBoost also provides a feature importance score for each column in the input.

In [61]:
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

In [62]:
importance_df.head(10)

In [63]:
import seaborn as sns
plt.figure(figsize=(10,6))
plt.title('Feature Importance')
sns.barplot(data=importance_df.head(10), x='importance', y='feature');

**K -Fold**

In [64]:
from sklearn.model_selection import KFold

Let's define a helper function `train_and_evaluate` which trains a model with the given parameters and returns the trained model, the training error and the validation error.

In [65]:
def train_and_evaluate(X_train, train_targets, X_val, val_targets, **params):
    """Fit an ``XGBRegressor`` and score it on the training and validation data.

    ``params`` are forwarded to ``XGBRegressor`` together with
    ``random_state=42`` and ``n_jobs=-1``.

    Returns
    -------
    tuple
        ``(fitted_model, train_rmse, val_rmse)``.
    """
    regressor = XGBRegressor(random_state=42, n_jobs=-1, **params)
    regressor.fit(X_train, train_targets)
    fitted_preds = regressor.predict(X_train)
    held_out_preds = regressor.predict(X_val)
    return (
        regressor,
        rmse(fitted_preds, train_targets),
        rmse(held_out_preds, val_targets),
    )

In [66]:
kfold = KFold(n_splits=5)

In [67]:
models = []

for train_idxs, val_idxs in kfold.split(X):
    X_train, train_targets = X.iloc[train_idxs], targets.iloc[train_idxs]
    X_val, val_targets = X.iloc[val_idxs], targets.iloc[val_idxs]
    model, train_rmse, val_rmse = train_and_evaluate(X_train, 
                                                     train_targets, 
                                                     X_val, 
                                                     val_targets, 
                                                     max_depth=4, 
                                                     n_estimators=20)
    models.append(model)
    print('Train RMSE: {}, Validation RMSE: {}'.format(train_rmse, val_rmse))

In [68]:
import numpy as np

def predict_avg(models, inputs):
    """Average the predictions of several models element-wise.

    Each object in ``models`` must expose ``predict(inputs)``; the returned
    array is the mean of those predictions along the model axis.
    """
    per_model = []
    for fitted in models:
        per_model.append(fitted.predict(inputs))
    return np.mean(per_model, axis=0)

In [69]:
preds = predict_avg(models, X)

In [70]:
preds

## Hyperparameter Tuning and Regularization

Just like other machine learning models, XGBoost has several hyperparameters we can adjust to control the capacity of the model and reduce overfitting.

<img src="https://i.imgur.com/EJCrSZw.png" width="480">

Check out the following resources to learn more about the hyperparameters supported by XGBoost:

- https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBRegressor
- https://xgboost.readthedocs.io/en/latest/parameter.html

In [71]:
model

In [72]:
def test_params_kfold(n_splits, **params):
    """Cross-validate one hyperparameter setting with K-fold splits of ``X``.

    For each of the ``n_splits`` folds, ``train_and_evaluate`` is called on
    the corresponding slices of the global ``X`` and ``targets``. The mean
    training and validation RMSE over all folds is printed.

    Returns
    -------
    list
        The fitted models, one per fold.
    """
    fold_models, fold_train_scores, fold_val_scores = [], [], []
    for fit_rows, holdout_rows in KFold(n_splits).split(X):
        fitted, fit_score, holdout_score = train_and_evaluate(
            X.iloc[fit_rows],
            targets.iloc[fit_rows],
            X.iloc[holdout_rows],
            targets.iloc[holdout_rows],
            **params
        )
        fold_models.append(fitted)
        fold_train_scores.append(fit_score)
        fold_val_scores.append(holdout_score)
    mean_train = np.mean(fold_train_scores)
    mean_val = np.mean(fold_val_scores)
    print('Train RMSE: {}, Validation RMSE: {}'.format(mean_train, mean_val))
    return fold_models

In [73]:
from sklearn.model_selection import train_test_split

In [74]:
# Hold out 10% of the rows for validation. The split is seeded so that the
# hyperparameter comparisons below are reproducible across kernel restarts.
X_train, X_val, train_targets, val_targets = train_test_split(X, targets, test_size=0.1, random_state=42)

In [75]:
def test_params(**params):
    """Fit an ``XGBRegressor`` with ``params`` and print its train/validation RMSE.

    Uses the global hold-out split (``X_train``, ``train_targets``, ``X_val``,
    ``val_targets``). ``n_jobs=-1`` and ``random_state=42`` are always set.
    Returns nothing; the scores are only printed.
    """
    candidate = XGBRegressor(n_jobs=-1, random_state=42, **params)
    candidate.fit(X_train, train_targets)
    scores = {
        'train': rmse(candidate.predict(X_train), train_targets),
        'val': rmse(candidate.predict(X_val), val_targets),
    }
    print('Train RMSE: {}, Validation RMSE: {}'.format(scores['train'], scores['val']))

#### `n_estimators`

The number of trees to be created. More trees = greater capacity of the model.

In [76]:
test_params(n_estimators=10)

In [77]:
test_params(n_estimators=30)

In [78]:
test_params(n_estimators=100)

In [79]:
test_params(n_estimators=240)

In [80]:
test_params(max_depth=2,n_estimators=10)

In [81]:
test_params(max_depth=10,n_estimators=10)

In [82]:
test_params(max_depth=100,n_estimators=10)

#### `learning_rate`

The scaling factor to be applied to the prediction of each tree. A very high learning rate (close to 1) will lead to overfitting, and a low learning rate (close to 0) will lead to underfitting.

In [83]:
test_params(n_estimators=50, learning_rate=0.01)

In [84]:
test_params(n_estimators=50, learning_rate=0.1)

In [85]:
test_params(n_estimators=50, learning_rate=0.3)

In [86]:
test_params(n_estimators=50, learning_rate=0.9)

In [87]:
test_params(n_estimators=50, learning_rate=0.99)

#### `booster`

Instead of using Decision Trees, XGBoost can also train a linear model for each iteration. This can be configured using `booster`.

In [88]:
test_params(booster='gblinear')

## Putting it Together and Making Predictions

Let's train a final model on the entire training set with custom hyperparameters. 

In [89]:
model = XGBRegressor(n_jobs=-1, random_state=42, n_estimators=1000, 
                     learning_rate=0.2, max_depth=10, subsample=0.9, 
                     colsample_bytree=0.7)

In [90]:
%%time
model.fit(X, targets)

In [93]:
test_preds = model.predict(X_test)

In [94]:
test_df

In [95]:
submission_df

In [96]:
submission_df['Sales']=test_preds

In [97]:
submission_df

In [99]:
test_df.Open.isna().sum()

In [100]:
submission_df['Sales'] = submission_df['Sales'] * test_df.Open.fillna(1.)

In [102]:
submission_df.sample(20)

In [104]:
test_df[test_df['Id']==27538]

In [105]:
submission_df.to_csv('submission.csv', index=None)