## Linear Regression Model 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [5]:
import xgboost as xgb 
from sklearn.ensemble import RandomForestRegressor

In [4]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-1.4.2-py3-none-win_amd64.whl (97.8 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.4.2


In [9]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


In [15]:
train.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_price', 'item_cnt_month'], dtype='object')

In [70]:
X_train = train.where(train['date_block_num'] <32).dropna().reset_index()

In [72]:
X_train.drop('index', axis =1, inplace = True)


In [82]:
X_train = X_train.iloc[:,:4]
X_train

Unnamed: 0,date_block_num,shop_id,item_id,item_price
0,0.0,0.0,32.0,221.0
1,0.0,0.0,33.0,347.0
2,0.0,0.0,35.0,247.0
3,0.0,0.0,43.0,221.0
4,0.0,0.0,51.0,127.0
...,...,...,...,...
1547910,31.0,59.0,22088.0,119.0
1547911,31.0,59.0,22092.0,179.0
1547912,31.0,59.0,22102.0,1250.0
1547913,31.0,59.0,22105.0,199.0


In [31]:
train.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_price', 'item_cnt_month'], dtype='object')

In [83]:
y_train = train.where(train['date_block_num'] <32).dropna().reset_index()
y_train.drop('index', axis =1, inplace = True)
y_train = y_train.iloc[:,-1:]
y_train.columns

Index(['item_cnt_month'], dtype='object')

In [75]:
X_test = train.where(train['date_block_num'] >31).dropna().reset_index()

In [76]:
X_test.drop('index', axis=1 ,inplace =True)

In [85]:
X_test = X_test.iloc[:,:4]
X_test.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price
0,32.0,2.0,33.0,199.0
1,32.0,2.0,486.0,300.0
2,32.0,2.0,792.0,979.0
3,32.0,2.0,975.0,349.0
4,32.0,2.0,1090.0,499.0


In [105]:
y_test = train.where(train['date_block_num'] >31).dropna().reset_index()
y_test.drop('index', axis =1, inplace = True)
y_test = y_test.iloc[:,-1:]
y_test.columns

Index(['item_cnt_month'], dtype='object')

In [93]:
from sklearn.metrics import mean_squared_error

In [111]:
from sklearn.ensemble import RandomForestRegressor

In [106]:
# Call model frame work for linear regression

Linear_Regression = LinearRegression()
Linear_Regression.fit(X_train, y_train)

predictions = Linear_Regression.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("RMSE: %f" % (rmse))


RMSE: 16.548208


In [181]:
X_test.shape

(61209, 4)

In [110]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1547915, 4), (1547915, 1), (61209, 4), (61209, 1))

In [117]:
#target conversion in to flattened array
new_y_train = y_train.values.ravel()

# Call model frame work for random forest regressor 
RF_model = RandomForestRegressor(n_estimators=100,max_depth=20)
RF_model.fit(X_train, new_y_train)

pred_RF = RF_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred_RF ))
print("RMSE: %f" % (rmse))


RMSE: 14.968142


In [119]:
from xgboost import XGBRegressor

In [121]:
# Call model frame work for XGBoost

XG_model = XGBRegressor(n_estimators=100,learning_rate=0.2)
XG_model.fit(X_train,y_train)

preds = XG_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 15.368742


In [124]:
print(pred_RF)

[1.03424693 1.73501923 2.9836923  ... 2.63333333 2.39320174 2.14518218]


In [125]:
print(preds)

[1.2112683 2.8839986 0.9774871 ... 2.196866  1.6006378 3.1679978]


In [136]:
#test on new data for RF model
test = pd.read_csv('test.csv')

#to  shuffle dataframe to mix values (eliminate biasness)
#new_test = test.sample(frac=1).reset_index(drop=True)
new_pred_RF = RF_model.predict(new_test)

rmse = np.sqrt(mean_squared_error(y_test, new_pred_RF[0:61209]))
print("RMSE: %f" % (rmse))


RMSE: 17.285402


In [179]:
new_pred_RF.shape, y_test.shape

((214200,), (61209, 1))

In [191]:
predicts = Linear_Regression.predict(test)
rmse1 = np.sqrt(mean_squared_error(y_test, predicts[0:61209]))
rmse2 = np.sqrt(mean_squared_error(y_test, predicts[61210:122419]))
rmse3 = np.sqrt(mean_squared_error(y_test, predicts[122419:183628]))
rmse4 = np.sqrt(mean_squared_error(y_test[0:30572], predicts[183628:214200]))
print("RMSE1: %f" % (rmse1))
print("RMSE2: %f" % (rmse2))
print("RMSE3: %f" % (rmse3))
print("RMSE4: %f" % (rmse4))

RMSE1: 16.550371
RMSE2: 16.550038
RMSE3: 16.548764
RMSE4: 18.183192


In [202]:
predicts_2 = Linear_Regression.predict(test).clip(0,20)

In [205]:
sample_submission = pd.read_csv('sample_submission.csv')

In [211]:
sample_submission['item_cnt_month'] = Linear_Regression.predict(test.columns).clip(0, 20)
sample_submission.to_csv('submission_247ai.csv')

ValueError: could not convert string to float: 'date_block_num'

In [192]:
#linear regression doesnt change so be consider this
submission = pd.DataFrame(predicts, columns = ['item_cnt_month'])
submission.head()

Unnamed: 0,item_cnt_month
0,2.415752
1,2.264751
2,2.371228
3,2.37124
4,2.265375


In [193]:
submission.shape

(214200, 1)

In [194]:
submission.index.name = "ID"
submission.head()

Unnamed: 0_level_0,item_cnt_month
ID,Unnamed: 1_level_1
0,2.415752
1,2.264751
2,2.371228
3,2.37124
4,2.265375


In [195]:
#to make ID as a index to column
submission.reset_index(level=0, inplace=True)


In [196]:
submission.shape, submission.head()

((214200, 2),
    ID  item_cnt_month
 0   0        2.415752
 1   1        2.264751
 2   2        2.371228
 3   3        2.371240
 4   4        2.265375)

In [197]:
#saving as submission

submission.to_csv("submission.csv", index = False)

Here, we have first arranged the training data as item_cnt_day to item_cnt_month using group by .
then we have divided this data into two parts training and validation.(0 to 31 & 32 to33 :dat_block_num) 
then on test data which only has one feature so we have added other features by taking prediction on the required month i.e =34
then we fit our model on Linear Regression , RF and XG Boost and checked accuracy using RMSE.
after this we found min RMSE in Lin,\. Reg.


then we taken only no. of rows is equal to our test validation data (0:61209)
then we found RMSE min for linear regression i.e 16.55.
thus we found that linear regression is better in all case . hence we taken predicted values as our submission on kaggle.
We need rows in submssion.csv 214200 , so we do it in steps.

In [198]:
submission['item_cnt_month'].clip(0, 20)

0         2.415752
1         2.264751
2         2.371228
3         2.371240
4         2.265375
            ...   
214195    2.090122
214196    2.294431
214197    2.140749
214198    2.073016
214199    2.313780
Name: item_cnt_month, Length: 214200, dtype: float64

In [200]:
submission.shape

(214200, 2)