In [12]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from pathlib import Path
from pandas import DataFrame

In [2]:
def rmse(a, b):
    return np.sqrt(mean_squared_error(a, b))

In [53]:
def load_df(folder: Path) -> DataFrame:
    return pd.read_csv(folder/"data.csv")

In [55]:
def load_data_for_model(original_data: DataFrame) -> DataFrame:
    columns_to_drop = original_data.columns[original_data.columns.str.contains("_name")]
    original_data = original_data.drop(columns=columns_to_drop)
    X = original_data.drop(columns=["item_cnt_month"])
    y = original_data["item_cnt_month"]
    return X, y

In [46]:
train_folder = Path("../data/prepared/train/")
val_folder = Path("../data/prepared/val/")

In [57]:
train_df = load_df(train_folder)
val_df = load_df(val_folder)

In [60]:
X_train, y_train = load_data_for_model(train_df)
X_val, y_val = load_data_for_model(val_df)

In [61]:
X_train

Unnamed: 0,shop_id,item_id,date_block_num,date_year,date_month,item_price,item_category_id
0,0,32,1,2013,1,221.0,40
1,1,32,1,2013,1,221.0,40
2,3,32,1,2013,1,349.0,40
3,4,32,1,2013,1,349.0,40
4,5,32,1,2013,1,349.0,40
...,...,...,...,...,...,...,...
1730110,55,15073,33,2015,9,299.0,31
1730111,55,16801,33,2015,9,790.0,78
1730112,55,19444,33,2015,9,186.0,31
1730113,56,18483,33,2015,9,349.0,57


In [49]:
X_val

Unnamed: 0,shop_id,item_id,date_block_num,date_year,date_month,item_price,item_category_id
0,2,31,34,2015,10,399.00,37
1,6,31,34,2015,10,324.00,37
2,18,31,34,2015,10,399.00,37
3,21,31,34,2015,10,391.48,37
4,22,31,34,2015,10,395.12,37
...,...,...,...,...,...,...,...
34051,59,5647,34,2015,10,998.00,2
34052,59,6563,34,2015,10,499.00,25
34053,59,9905,34,2015,10,1109.00,37
34054,59,14256,34,2015,10,99.00,37


In [41]:
gb_reg = GradientBoostingRegressor(n_estimators=100)

In [42]:
gb_reg.fit(X_train, y_train)

GradientBoostingRegressor()

In [50]:
prediction = gb_reg.predict(X_val).clip(0, 20)

In [67]:
prediction

array([1.17315575, 1.5106117 , 1.41374416, ..., 1.53671332, 1.13387436,
       1.38539056])

In [51]:
X_val

Unnamed: 0,shop_id,item_id,date_block_num,date_year,date_month,item_price,item_category_id
0,2,31,34,2015,10,399.00,37
1,6,31,34,2015,10,324.00,37
2,18,31,34,2015,10,399.00,37
3,21,31,34,2015,10,391.48,37
4,22,31,34,2015,10,395.12,37
...,...,...,...,...,...,...,...
34051,59,5647,34,2015,10,998.00,2
34052,59,6563,34,2015,10,499.00,25
34053,59,9905,34,2015,10,1109.00,37
34054,59,14256,34,2015,10,99.00,37


In [62]:
test_range = pd.read_csv("../data/src/test.csv", index_col=["shop_id", "item_id"])
target_df = test_range.join(val_df.set_index(["shop_id", "item_id"]).assign(prediction=prediction)).assign(
    date_block_num=(24 + 9), 
    item_cnt_month=lambda df: df.item_cnt_month.fillna(0),
    prediction=lambda df: df.prediction.fillna(0),
    date_year=2015, 
    date_month=9,
)

In [63]:
rmse(target_df.item_cnt_month, target_df.prediction)

0.7123520541586794

In [64]:
target_df

Unnamed: 0_level_0,Unnamed: 1_level_0,ID,date_block_num,item_cnt_month,date_year,date_month,item_price,item_name,item_category_id,item_category_name,prediction
shop_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5,5037,0,33,0.0,2015,9,,,,,0.00000
5,5320,1,33,0.0,2015,9,,,,,0.00000
5,5233,2,33,0.0,2015,9,,,,,0.00000
5,5232,3,33,0.0,2015,9,,,,,0.00000
5,5268,4,33,0.0,2015,9,,,,,0.00000
...,...,...,...,...,...,...,...,...,...,...,...
45,18454,214195,33,2.0,2015,9,149.0,СБ. Союз 55,55.0,Музыка - CD локального производства,1.13876
45,16188,214196,33,0.0,2015,9,,,,,0.00000
45,15757,214197,33,0.0,2015,9,,,,,0.00000
45,19648,214198,33,0.0,2015,9,,,,,0.00000


In [66]:
target_df.reset_index(drop=True)[["ID", "item_cnt_month"]].to_csv("../submission.csv", index=False)