In [33]:
from validation_step import *
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error

In [34]:
#Loading of datasets
data = pd.read_csv('data.csv')
categories = pd.read_csv('item_categories.csv')
items = pd.read_csv('items.csv')
shops = pd.read_csv('shops.csv')
test = pd.read_csv('./competitive-data-science-predict-future-sales/test.csv')

In [35]:
#Transformation of test set\n",
merge_params = [[items, 'item_id'], [categories, 'item_category_id'], [shops, 'shop_id']]

for df_merge, column in merge_params:
    test = test.merge(df_merge, on = f'{column}', how = 'left')

test['date_block_num'] = 34
test['year'] = 2015
test['month'] = 11
    
columns = ['item_category_id', 'main_category_id', 'minor_category_id',
    'date_block_num', 'item_id', 'month', 
    'year', 'shop_id', 'city_id']
test = test.loc[:, columns]

In [36]:
#According to the task conditions, True target values are clipped in the range [0,20]
data.item_cnt_month = data.item_cnt_month.clip(0,20)

In [37]:
#Merge Train and Test sets
data = pd.concat([data, test], ignore_index=True, sort=False, keys=columns)

  data = pd.concat([data, test], ignore_index=True, sort=False, keys=columns)


In [38]:
#Pipeline with feature enfineering, column trasformation, obtaining a complete data set, validation,
categorical_features = ['item_category_id', 'main_category_id', 'minor_category_id', 'shop_id']
target_log_transformation = ['item_cnt_month']

col_lags_dict = {'date_item_avg_item_cnt': [1,2,3,6,12], 'date_shop_avg_item_cnt': [1,2,3,6,12], 'date_shop_cat_avg_item_cnt': [1], 'date_cat_avg_item_cnt': [1],
    'date_minor_cat_avg_item_cnt': [1], 'date_main_cat_avg_item_cnt': [1], 'date_city_avg_item_cnt': [1], 'date_item_avg_item_price': [1,2,3,6], 'delta_revenue': [1]}

#Pipeline for feature engineering(revenue, shop_history, minor_catregory_history, lags)
pipeline_1 = pipeline_1(col_lags_dict)
pipeline_1.fit(data)
data = pipeline_1.transform(data)

#Pipeline for log transformation and encoding of categorical features
pipeline_2 = pipeline_2(target_log_transformation, categorical_features)
pipeline_2.fit(data)
transformed_data = pipeline_2.transform(data)

#Adding results from 2nd pipeline\n",
transformed_columns = ['item_cnt_month_log', 'item_category_id', 'main_category_id', 'minor_category_id', 'shop_id']
data[transformed_columns] = transformed_data

#Save in dataframe needed columns and removing features that can cause data/target leakage
columns = ['date_block_num', 'shop_id', 'item_id', 'item_category_id',
       'main_category_id', 'minor_category_id', 'month', 'year', 'city_id', 'shop_history',
           'minor_category_history', 'date_item_avg_item_cnt_lag_1',
           'date_item_avg_item_cnt_lag_2', 'date_item_avg_item_cnt_lag_3',
           'date_item_avg_item_cnt_lag_6', 'date_item_avg_item_cnt_lag_12',
           'date_shop_avg_item_cnt_lag_1', 'date_shop_avg_item_cnt_lag_2',
           'date_shop_avg_item_cnt_lag_3', 'date_shop_avg_item_cnt_lag_6',
           'date_shop_avg_item_cnt_lag_12', 'date_shop_cat_avg_item_cnt_lag_1',
           'date_cat_avg_item_cnt_lag_1', 'date_minor_cat_avg_item_cnt_lag_1',
           'date_main_cat_avg_item_cnt_lag_1', 'date_city_avg_item_cnt_lag_1',
           'date_item_avg_item_price_lag_1', 'date_item_avg_item_price_lag_2',
           'date_item_avg_item_price_lag_3', 'date_item_avg_item_price_lag_6',
           'delta_revenue_lag_1', 'item_cnt_month_log']
data = data.loc[:, columns].fillna(0)

In [39]:
#Data Validation
non_negative_columns = ['date_block_num', 'shop_id', 'item_id', 'item_category_id',
       'main_category_id', 'minor_category_id', 'month', 'year', 'city_id', 'shop_history',
           'minor_category_history', 'date_item_avg_item_cnt_lag_1',
           'date_item_avg_item_cnt_lag_2', 'date_item_avg_item_cnt_lag_3',
           'date_item_avg_item_cnt_lag_6', 'date_item_avg_item_cnt_lag_12',
           'date_shop_avg_item_cnt_lag_1', 'date_shop_avg_item_cnt_lag_2',
           'date_shop_avg_item_cnt_lag_3', 'date_shop_avg_item_cnt_lag_6',
           'date_shop_avg_item_cnt_lag_12', 'date_shop_cat_avg_item_cnt_lag_1',
           'date_cat_avg_item_cnt_lag_1', 'date_minor_cat_avg_item_cnt_lag_1',
           'date_main_cat_avg_item_cnt_lag_1', 'date_city_avg_item_cnt_lag_1',
           'date_item_avg_item_price_lag_1', 'date_item_avg_item_price_lag_2',
           'date_item_avg_item_price_lag_3', 'date_item_avg_item_price_lag_6',
           'item_cnt_month_log']

validation = Validator(non_negative_columns = non_negative_columns)

try:
    validated_data = validation.fit_transform(data)
    print("Validation successful, data is valid.")
except ValueError as e:
    print(f"Validation error: {e}")
except TypeError as e:
    print(f"Type error: {e}")

Validation successful, data is valid.


In [42]:
k = 3 
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [69]:
#Train/test split

X_test = data[data.date_block_num == 34].drop('item_cnt_month_log', axis = 1)

X = data[data.date_block_num != 34].drop('item_cnt_month_log', axis = 1)
y = data[data.date_block_num != 34]['item_cnt_month_log']

<strong>Linear Regression</strong>

In [45]:
n = 0
rmse = []
model = LinearRegression()

print('Linear Regression')

for train_idxs, val_idxs in kf.split(X):

    X_train, X_val = X.iloc[train_idxs], X.iloc[val_idxs]
    y_train, y_val = y.iloc[train_idxs], y.iloc[val_idxs]
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse.append(root_mean_squared_error(y_pred, y_val))
    print(f'RMSE for split {n+1}: {rmse[n]:.3f}')
    n += 1
print(f'Mean RMSE for all splits: {np.mean(rmse):.3f}')

Linear Regression
RMSE for split 1: 0.395
RMSE for split 2: 0.394
RMSE for split 3: 0.394
Mean RMSE for all splits: 0.394


In [46]:
model.fit(X, y)
y_pred_linregr = np.expm1(model.predict(X_test))

In [24]:
submission = pd.DataFrame({'ID': np.arange(len(y_pred_linregr)), 'item_cnt_month': y_pred_linregr})
submission.to_csv('submission.csv', index = False)

_**Submission result for Linear Regression: 1.6058**_

<strong>RandomForestRegressor</strong>

In [48]:
n = 0
rmse = []
model = RandomForestRegressor()

print('RandomForestRegressor')

for train_idxs, val_idxs in kf.split(X):

    X_train, X_val = X.iloc[train_idxs], X.iloc[val_idxs]
    y_train, y_val = y.iloc[train_idxs], y.iloc[val_idxs]
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse.append(root_mean_squared_error(y_pred, y_val))
    print(f'RMSE for split {n+1}: {rmse[n]:.3f}')
    n += 1
print(f'Mean RMSE for all splits: {np.mean(rmse):.3f}')

RandomForestRegressor
RMSE for split 1: 0.327
RMSE for split 2: 0.327
RMSE for split 3: 0.327
Mean RMSE for all splits: 0.327


In [49]:
model.fit(X, y)
y_pred_rfregr = np.expm1(model.predict(X_test))

In [50]:
submission['item_cnt_month'] = y_pred_rfregr
submission.to_csv('submission.csv', index = False)

_**Submission result for RFRegression: 1.83225**_

<strong>XGBRegressor</strong>

In [51]:
n = 0
rmse = []
model = XGBRegressor()

print('XGBRegressor')

for train_idxs, val_idxs in kf.split(X):

    X_train, X_val = X.iloc[train_idxs], X.iloc[val_idxs]
    y_train, y_val = y.iloc[train_idxs], y.iloc[val_idxs]
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse.append(root_mean_squared_error(y_pred, y_val))
    print(f'RMSE for split {n+1}: {rmse[n]:.3f}')
    n += 1
print(f'Mean RMSE for all splits: {np.mean(rmse):.3f}')

XGBRegressor
RMSE for split 1: 0.351
RMSE for split 2: 0.351
RMSE for split 3: 0.350
Mean RMSE for all splits: 0.350


In [52]:
model.fit(X, y)
y_pred_xgbregressor = np.expm1(model.predict(X_test))

In [53]:
submission['item_cnt_month'] = y_pred_xgbregressor
submission.to_csv('submission.csv', index = False)

_**Submission result for XGBRegression: 1.54999**_

In [74]:
n = 0
rmse = []

model = XGBRegressor(n_estimators=500,
                    eta = 0.1,
                    max_depth = 8,
                    reg_lambda = 2)

print('XGBRegressor')

for train_idxs, val_idxs in kf.split(X):

    X_train, X_val = X.iloc[train_idxs], X.iloc[val_idxs]
    y_train, y_val = y.iloc[train_idxs], y.iloc[val_idxs]
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse.append(root_mean_squared_error(y_pred, y_val))
    print(f'RMSE for split {n+1}: {rmse[n]:.3f}')
    n += 1
print(f'Mean RMSE for all splits: {np.mean(rmse):.3f}')

y_pred_xgbregressor = np.expm1(model.predict(X_test))

XGBRegressor
RMSE for split 1: 0.333
RMSE for split 2: 0.333
RMSE for split 3: 0.332
Mean RMSE for all splits: 0.333


In [75]:
submission['item_cnt_month'] = y_pred_xgbregressor
submission.to_csv('submission.csv', index = False)

_**Submission result for XGBRegression after some hyperpar.added: 1.55**_