In [1]:
import os
from pathlib import Path

import pandas as pd

In [2]:
# Root of the project's data folders. The default is the location the notebook
# was originally run from; set the PRODUCT_COST_DATA_ROOT environment variable
# to run it on another machine without editing any cell.
DATA_ROOT = Path(os.environ.get('PRODUCT_COST_DATA_ROOT',
                                '/home/icarus/workspace/product-cost-prediction'))

# Preprocessed train/test tables used for modelling.
train = pd.read_csv(DATA_ROOT / 'data_preprocessed' / 'train_preprocessed.csv')
test = pd.read_csv(DATA_ROOT / 'data_preprocessed' / 'test_preprocessed.csv')

# The joined test file is read only for its 'unnamed:_0' column, which is
# written out as the ID column of the final predictions file.
test_with_ids = pd.read_csv(DATA_ROOT / 'data_joined' / 'test_joined.csv')
test_ids = test_with_ids['unnamed:_0']

In [4]:
# Preview the first five rows of the preprocessed test table.
test.head(n=5)

Unnamed: 0,store_sales,store_cost,grocery_area,frozen_area,net_weight,bag_stuffer,best_price_savers,best_savings,big_time_discounts,big_time_saving,...,bc,ca,df,gu,ja,or,ve,wa,yu,za
0,11.76,4.704,1424.85,465.54,28.7811,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.16,0.6696,1735.17,505.07,27.0368,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.83,0.8235,2038.11,481.98,26.31,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,8.82,4.41,1871.16,593.93,25.0018,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4.32,1.9872,1320.15,523.32,20.641,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Correlation

In [5]:
# Disable row truncation so long Series/DataFrames are displayed in full.
# This is a global pandas setting and affects every later output.
pd.options.display.max_rows = None

In [6]:
# Pairwise Pearson correlation between all columns of the training table
# (Pearson is the pandas default, spelled out here for clarity).
corr_mat = train.corr(method='pearson')

In [7]:
# Correlation of every column with the target 'cost', largest value first.
cost_corr = corr_mat.loc[:, 'cost']
cost_corr.sort_values(ascending=False)

cost                        1.000000
dollar_days                 0.118692
shelf_emptiers              0.108421
budget_bargains             0.099773
discount_frenzy             0.098924
three_for_one               0.098549
mid-size                    0.088991
df                          0.088148
savings_galore              0.085739
price_cutters               0.085579
super_savers                0.080530
za                          0.071359
price_destroyers            0.070276
super_wallet_savers         0.070147
you_save_days               0.068388
fantastic_discounts         0.067894
one_day_sale                0.062491
sales_days                  0.059385
sale_winners                0.056926
green_light_special         0.045552
dimes_off                   0.042851
supermarket                 0.039310
gourmet                     0.036444
gld                         0.036102
full_free                   0.034130
price_smashers              0.031823
bag_stuffer                 0.030445
c

### Splitting the data

In [8]:
# Separate the target column from the predictors.
y = train['cost']
x = train.drop(columns='cost')
print(f'x shape = {x.shape}')

x shape = (38854, 181)


In [9]:
# Show test's (rows, columns) to compare with the x shape printed above.
test.shape

(19942, 181)

In [10]:
# Select the training feature columns from test, in training order, so that
# both tables line up column-for-column before scaling and prediction.
feature_order = x.columns
test = test.loc[:, feature_order]

### Scaling the data

In [11]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and standard deviation from the training features
# only, then apply that same transform to both the training and test features.
# NOTE: x and test are rebound to the scaler's output here, so the earlier
# cell that reads x.columns should not be re-run after this one.
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)
test = scaler.transform(test)

### Shuffling the data

In [12]:
from sklearn.utils import shuffle

# Permute features and target together (rows stay paired) with a fixed seed
# so the resulting order is reproducible.
x_shuffled, y_shuffled = shuffle(x, y, random_state=42)
x, y = x_shuffled, y_shuffled

#### Linear Regression

In [13]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline fitted on the full training set.
# NOTE(review): the recorded cross-validation output further down shows an
# RMSE of about 3e14 for this model, far worse than every other model —
# presumably caused by (near-)collinear indicator columns; a regularised
# linear model may be a better baseline. TODO confirm.
lin_reg = LinearRegression().fit(x, y)
lin_reg

#### Decision Tree Regressor

In [14]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree fitted on the full training set.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split, so ties between equally good splits can otherwise resolve
# differently from run to run. 42 matches the seed used for shuffling.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(x, y)

#### Random Forest Regressor

In [15]:
from sklearn.ensemble import RandomForestRegressor

# Random forest fitted on the full training set; it is one of the two models
# averaged into the final prediction.
# random_state fixes the bootstrap samples and per-tree randomness so that the
# cross-validation scores and the exported predictions are reproducible.
# n_jobs=-1 builds the trees on all available cores; it only affects speed,
# not the fitted model for a given random_state.
rnf_reg = RandomForestRegressor(random_state=42, n_jobs=-1)
rnf_reg.fit(x, y)

#### Gradient Boosting Regressor

In [16]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees fitted on the full training set.
# random_state seeds the individual trees built at each boosting iteration so
# repeated runs give the same model. 42 matches the seed used for shuffling.
gdb_reg = GradientBoostingRegressor(random_state=42)
gdb_reg.fit(x, y)

#### XGBoost Regressor

In [17]:
from xgboost import XGBRegressor

# XGBoost regressor with library defaults, fitted on the full training set;
# it is one of the two models averaged into the final prediction.
xgb_reg = XGBRegressor().fit(x, y)
xgb_reg

#### Cross-validating the models

In [18]:
from sklearn.model_selection import cross_val_score

# Cross-validation settings shared by every model below.
CV_FOLDS = 2
CV_SCORING = 'neg_root_mean_squared_error'


def _cv_scores(estimator):
    """Return the per-fold negative RMSE of `estimator` on the training data.

    cross_val_score fits clones of the estimator, so the already-fitted
    model objects passed in are not modified.
    """
    return cross_val_score(estimator, x, y, scoring=CV_SCORING, cv=CV_FOLDS)


scores_lin = _cv_scores(lin_reg)
scores_tree = _cv_scores(tree_reg)
scores_rnf = _cv_scores(rnf_reg)
scores_gdb = _cv_scores(gdb_reg)
scores_xgb = _cv_scores(xgb_reg)

In [19]:
# One summary line per model: the mean of its per-fold negative RMSE
# (values closer to zero mean a smaller error).
summary_lines = [
    f'Linear regression negative root mean squared error = {scores_lin.mean()}',
    f'Decision tree negative root mean squared error = {scores_tree.mean()}',
    f'Random forest negative root mean squared error = {scores_rnf.mean()}',
    f'Gradiantboost regressor negative root mean squared error = {scores_gdb.mean()}',
    f'XGB regressor negative root mean squared error = {scores_xgb.mean()}',
]
for summary_line in summary_lines:
    print(summary_line)

Linear regression negative root mean squared error = -302313873464074.5
Decision tree negative root mean squared error = -110.15398680364726
Random forest negative root mean squared error = -81.97302114561347
Gradiantboost regressor negative root mean squared error = -123.1223121649511
XGB regressor negative root mean squared error = -82.23866779686293


In [20]:
# Ensemble: equal-weight average of the random-forest and XGBoost predictions
# on the scaled test features.
rnf_test_pred = rnf_reg.predict(test)
xgb_test_pred = xgb_reg.predict(test)
final_prediction = (rnf_test_pred + xgb_test_pred) / 2

In [21]:
# Pair each test ID with its averaged prediction. The IDs are passed as a
# plain array so the frame gets a fresh 0..n-1 index.
submission_columns = {'ID': test_ids.to_numpy(), 'cost': final_prediction}
final = pd.DataFrame(submission_columns)

In [22]:
# Write the ID/cost table to the current working directory, without the
# DataFrame index column.
submission_path = 'Final predictions.csv'
final.to_csv(submission_path, index=False)