In [1]:
import os
from pathlib import Path

import pandas as pd

In [2]:
# Project root that holds the `data_preprocessed/` and `data_joined/` folders.
# Defaults to the original author's checkout; set PRODUCT_COST_DATA_DIR to run
# the notebook on another machine without editing any cell.
DATA_DIR = Path(os.environ.get(
    'PRODUCT_COST_DATA_DIR',
    '/home/icarus/Workspace/Product-Cost-Prediction',
))

# Preprocessed model inputs.
train = pd.read_csv(DATA_DIR / 'data_preprocessed' / 'train_preprocessed.csv')
test = pd.read_csv(DATA_DIR / 'data_preprocessed' / 'test_preprocessed.csv')

# The joined test file is loaded for its 'unnamed:_0' column, which is later
# written out as the 'ID' column of the predictions file.
test_with_ids = pd.read_csv(DATA_DIR / 'data_joined' / 'test_joined.csv')
test_ids = test_with_ids['unnamed:_0']

# Fail fast: one ID per test row is required to pair IDs with predictions, and
# it is cheaper to find a mismatch here than after every model has been trained.
assert len(test_ids) == len(test), (
    f'test_joined has {len(test_ids)} ids but test_preprocessed has {len(test)} rows'
)

In [3]:
# Preview the first five rows of the preprocessed test features.
test.head(n=5)

Unnamed: 0,store_sales,store_cost,grocery_area,frozen_area,net_weight,Bag Stuffer,Best Price Savers,Best Savings,Big Time Discounts,Big Time Saving,...,BC,CA,DF,GU,JA,OR,VE,WA,YU,ZA
0,11.76,4.704,1424.85,465.54,28.7811,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.16,0.6696,1735.17,505.07,27.0368,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.83,0.8235,2038.11,481.98,26.31,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,8.82,4.41,1871.16,593.93,25.0018,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4.32,1.9872,1320.15,523.32,20.641,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Correlation

In [4]:
# Remove pandas' cap on displayed rows so long Series/DataFrames render in full.
pd.options.display.max_rows = None

In [5]:
# Pairwise Pearson correlation (the pandas default) between the columns of `train`.
corr_mat = train.corr(method='pearson')

In [6]:
# Rank every column by its correlation with the target, largest value first.
cost_corr = corr_mat['cost']
cost_corr.sort_values(ascending=False)

cost                        1.000000
Dollar Days                 0.118692
Shelf Emptiers              0.108421
Budget Bargains             0.099773
Discount Frenzy             0.098924
Three for One               0.098549
DF                          0.088148
Mid-Size                    0.086087
Savings Galore              0.085739
Price Cutters               0.085579
Super Savers                0.080530
ZA                          0.071359
Price Destroyers            0.070276
Super Wallet Savers         0.070147
You Save Days               0.068388
Fantastic Discounts         0.067894
One Day Sale                0.062491
Sales Days                  0.059385
Sale Winners                0.056926
Green Light Special         0.045552
Dimes Off                   0.042851
Gourmet                     0.038592
GLD                         0.036102
Supermarket                 0.035495
Full Free                   0.034130
Price Smashers              0.031823
Bag Stuffer                 0.030445
C

### Splitting the data

In [7]:
# Target column on one side, every remaining column as predictors on the other.
y = train['cost']
x = train.drop(columns='cost')
print(f'x shape = {x.shape}')

x shape = (38854, 70)


### Scaling the data

In [8]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training features
# only, then apply that same transformation to both train and test so the two
# share one scale. Note that `x` and `test` are rebound to the scaled output.
scaler = StandardScaler().fit(x)
x = scaler.transform(x)
test = scaler.transform(test)

### Shuffling the data

In [9]:
from sklearn.utils import shuffle

# One seeded permutation is applied to features and target together, so each
# row of `x` stays paired with its entry in `y`.
shuffled_x, shuffled_y = shuffle(x, y, random_state=42)
x, y = shuffled_x, shuffled_y

#### Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression

# Least-squares linear baseline fitted on (x, y); `fit` returns the estimator,
# so the trailing expression still displays it as the cell output.
lin_reg = LinearRegression().fit(x, y)
lin_reg

#### Decision Tree Regressor

In [11]:
from sklearn.tree import DecisionTreeRegressor

# Single regression tree with default hyperparameters.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed, ties between equally good splits can resolve
# differently on each run, so the scores would not be reproducible.
# 42 matches the seed already used by the shuffle step.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(x, y)

#### Random Forest Regressor

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters.
# random_state is fixed because the forest draws a bootstrap sample per tree;
# unseeded, every Restart & Run All yields a different model and therefore
# different cross-validation scores and different final predictions.
# 42 matches the seed already used by the shuffle step.
rnf_reg = RandomForestRegressor(random_state=42)
rnf_reg.fit(x, y)

#### Gradient Boosting Regressor

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees with default hyperparameters.
# random_state seeds the tree built at each boosting iteration (including the
# feature permutation used when choosing splits), so repeated runs give the
# same model. 42 matches the seed already used by the shuffle step.
gdb_reg = GradientBoostingRegressor(random_state=42)
gdb_reg.fit(x, y)

#### XGBoost Regressor

In [14]:
from xgboost import XGBRegressor

# XGBoost regressor with library defaults, fitted on (x, y); `fit` returns the
# estimator, so the trailing expression still displays it as the cell output.
xgb_reg = XGBRegressor().fit(x, y)
xgb_reg

#### Evaluating the models with cross-validation

In [15]:
from sklearn.model_selection import cross_val_score

# Every model is scored the same way: 2-fold cross-validation on (x, y) using
# negated RMSE, so values closer to zero indicate a better fit.
# cross_val_score fits a fresh clone per fold, so the already-fitted estimators
# passed in are used only as templates.
# NOTE(review): `x` was standardized with statistics from all rows before this
# step, so each validation fold has influenced the scaling of its training
# fold. Wrapping scaler + model in a Pipeline would remove that leakage.
cv_options = {'scoring': 'neg_root_mean_squared_error', 'cv': 2}

scores_lin = cross_val_score(lin_reg, x, y, **cv_options)
scores_tree = cross_val_score(tree_reg, x, y, **cv_options)
scores_rnf = cross_val_score(rnf_reg, x, y, **cv_options)
scores_gdb = cross_val_score(gdb_reg, x, y, **cv_options)
scores_xgb = cross_val_score(xgb_reg, x, y, **cv_options)

In [16]:
# Report the mean cross-validated score of each model. The scores are negated
# RMSE values, so the model whose number is closest to zero performed best.
# One loop replaces five copy-pasted prints, and the gradient boosting label is
# now spelled correctly.
cv_summary = {
    'Linear regression': scores_lin,
    'Decision tree': scores_tree,
    'Random forest': scores_rnf,
    'Gradient boosting regressor': scores_gdb,
    'XGB regressor': scores_xgb,
}
for model_label, fold_scores in cv_summary.items():
    print(f'{model_label} negative root mean squared error = {fold_scores.mean()}')

Linear regression negative root mean squared error = -135.6560331670168
Decision tree negative root mean squared error = -109.82883287774924
Random forest negative root mean squared error = -80.94524619463357
Gradiantboost regressor negative root mean squared error = -123.06895413507544
XGB regressor negative root mean squared error = -80.11605177641006


In [17]:
# Equal-weight blend of the random forest and XGBoost predictions on the
# scaled test features.
rnf_test_pred = rnf_reg.predict(test)
xgb_test_pred = xgb_reg.predict(test)
final_prediction = (rnf_test_pred + xgb_test_pred) / 2

In [18]:
# Two-column table: one row per test ID with its blended cost prediction.
submission_columns = {'ID': test_ids.values, 'cost': final_prediction}
final = pd.DataFrame(submission_columns)

In [19]:
# Write the predictions next to the notebook; the DataFrame index is omitted so
# the file contains only the ID and cost columns.
output_name = 'Final predictions.csv'
final.to_csv(output_name, index=False)