Applying various regression models such as logistic regression, decision trees, random forests and KNN

In [202]:
# importing required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [203]:
# Exctracting the data from the web scraped csv file
df = pd.read_csv("raw_data.csv")

In [204]:
#There is no requirement of title in the algorithm
df.drop("Title",axis=1,inplace=True)
df

Unnamed: 0,Brand,Reviews,Ratings,Availability,Price
0,RPS,30,3.3,In stock,170.0
1,Titan,193,4.3,In stock,10075.0
2,Hugo,273,4.5,In stock,32925.0
3,Casio,461,4.7,Only 2 left in stock,5295.0
4,LOUIS,1128,3.9,In stock,397.0
...,...,...,...,...,...
366,TIMEX,2,4.0,In stock,1502.0
367,Titan,31,4.3,Only 2 left in stock,5995.0
368,GUESS,3,4.6,In stock,5456.0
369,GUESS,18,4.3,In stock,9450.0


In [205]:
# extracting target values (price)
Y = np.array(df.pop("Price"))

# Since we want to use various Regression methods, then each of the feature must be numerical (continous) but in this case we have two categorical variables- Brand,Availability.
# Hence we must do some encoding for the same. One of the best options could be one-hot encoding. That means we are converting the data from categorical to binary.

In [206]:
# one-hot encoding for df.Availability
encoded_Availability = pd.get_dummies(df.Availability)
df_new = pd.concat([df,encoded_Availability],axis = 1)

# now we can remove 'In stock' column
df_new.drop('Availability',axis=1,inplace=True)

In [207]:
# one-hot encoding for df.Brand
encoded_brand = pd.get_dummies(df.Brand)
df_new = pd.concat([df_new,encoded_brand],axis=1)

# now we can remove 'Brand' column
df_new.drop('Brand',axis=1,inplace=True)

In [256]:
df_new

Unnamed: 0,Reviews,Ratings,In stock,Only 1 left in stock,Only 2 left in stock,Only 3 left in stock,Only 4 left in stock,Only 5 left in stock,Out of stock,Acnos,...,Shocknshop,Sonata,Sylvi,TIMEWEAR,TIMEX,Timex,Titan,Tommy,V2A,hamt
0,30,3.3,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,193,4.3,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,273,4.5,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,461,4.7,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1128,3.9,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366,2,4.0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
367,31,4.3,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
368,3,4.6,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
369,18,4.3,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Main data is now being stored as np.array format as-

In [229]:
X = df_new.values
X

array([[ 30. ,   3.3,   1. , ...,   0. ,   0. ,   0. ],
       [193. ,   4.3,   1. , ...,   0. ,   0. ,   0. ],
       [273. ,   4.5,   1. , ...,   0. ,   0. ,   0. ],
       ...,
       [  3. ,   4.6,   1. , ...,   0. ,   0. ,   0. ],
       [ 18. ,   4.3,   1. , ...,   0. ,   0. ,   0. ],
       [  1. ,   5. ,   0. , ...,   0. ,   0. ,   0. ]])

Splitting the data for training and testing.

In [230]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,random_state = 0)

# Applying Linear Regression

In [231]:

alg1 = LinearRegression()
alg1.fit(X_train,Y_train)

# getting the predictions
Y_pred_train = alg1.predict(X_train)
Y_pred_test = alg1.predict(X_test)

Testing the effeciency of Linear Regression

In [232]:
MSE_train = mean_squared_error(Y_train,Y_pred_train)
MSE_test = mean_squared_error(Y_test,Y_pred_test)

In [233]:
print(MSE_train)
print(MSE_test)

4337582.00414423
3709089.278891187


In [234]:
RMSE_train = np.sqrt(MSE_train)
RMSE_test = np.sqrt(MSE_test)
print(RMSE_train)
print(RMSE_test)

2082.6862471683608
1925.8996024952046


In [235]:
alg1.score(X_train,Y_train)

0.8119597985105549

In [236]:
alg1.score(X_test,Y_test)

0.7883121330251237

# Applying KNN

In [237]:
alg2 = KNeighborsRegressor()

# TO find the perfect K value we can apply Grid Search crorss validation
grid = {'n_neighbors':[i for i in range(1,51)]}

# initiating grid_search_object 
abc = GridSearchCV(alg2,grid)

#fitting the training data
abc.fit(X_train,Y_train)

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30, ...]})

Getting the best value of K

In [238]:
abc.best_estimator_

KNeighborsRegressor(n_neighbors=4)

In [239]:
# so using K = 4

In [240]:
alg2 = KNeighborsRegressor(n_neighbors=4)
alg2.fit(X_train,Y_train)

KNeighborsRegressor(n_neighbors=4)

Getting the scores as well as MSE,RMSE and MAE for both training and testing data

In [241]:
# getting the predictions
Y_pred_train = alg2.predict(X_train)
Y_pred_test = alg2.predict(X_test)

MSE_train = mean_squared_error(Y_train,Y_pred_train)
MSE_test = mean_squared_error(Y_test,Y_pred_test)
print(MSE_train)
print(MSE_test)

9839624.294514388
8290949.729838709


In [242]:
RMSE_train = np.sqrt(MSE_train)
RMSE_test = np.sqrt(MSE_test)
print(RMSE_train)
print(RMSE_test)

3136.817542432838
2879.400932457776


In [243]:
alg2.score(X_train,Y_train)

0.5734386270615388

In [244]:
alg2.score(X_test,Y_test)

0.5268128288273073

# Applying Decision Tree Regressor

In [245]:
alg3 = DecisionTreeRegressor()
alg3.fit(X_train,Y_train)

DecisionTreeRegressor()

In [246]:
alg3.score(X_train,Y_train)

1.0

From the perfect score on the training data we can see that the model is doing *Overfitted* this is because there are a lot of features for the Tree to split upon, hence it will be performing NOT that good on test data as- 

In [247]:
alg3.score(X_test,Y_test)

0.8707065599255916

We can actually try to figure out a best value of maximum depth so that the score can be improved.

In [248]:
grid = {"max_depth":[i for i in range(1,100)]}
abc = GridSearchCV(alg3,grid)
abc.fit(X_train,Y_train)

abc.best_estimator_

DecisionTreeRegressor(max_depth=89)

In [249]:
# using the max_depth to be 8
alg3 = DecisionTreeRegressor(max_depth = 8)
alg3.fit(X_train,Y_train)
print(alg3.score(X_train,Y_train))
print(alg3.score(X_test,Y_test))

0.9435612093643545
0.7849758302328572


This score is slightly better than the earlier one

In [250]:
# getting the predictions
Y_pred_train = alg3.predict(X_train)
Y_pred_test = alg3.predict(X_test)

MSE_train = mean_squared_error(Y_train,Y_pred_train)
MSE_test = mean_squared_error(Y_test,Y_pred_test)
print(MSE_train)
print(MSE_test)

1301891.1948495298
3767546.313272846


In [251]:
RMSE_train = np.sqrt(MSE_train)
RMSE_test = np.sqrt(MSE_test)
print(RMSE_train)
print(RMSE_test)

1141.0044674976211
1941.0168245723287


# Applying Random  Forest Regressor

In [252]:
alg4 = RandomForestRegressor(n_estimators = 10000,random_state=8)
alg4.fit(X_train,Y_train)
alg4.score(X_train,Y_train)

0.9472673918895557

In [253]:
alg4.score(X_test,Y_test)

0.8743353070319116

This is a significant improvement over the previous methods as we were using as a lot more number of estimators (individual trees) are used for the regression.

In [254]:
# getting the predictions
Y_pred_train = alg4.predict(X_train)
Y_pred_test = alg4.predict(X_test)

MSE_train = mean_squared_error(Y_train,Y_pred_train)
MSE_test = mean_squared_error(Y_test,Y_pred_test)
print(MSE_train)
print(MSE_test)

1216399.5260571574
2201834.106431841


In [255]:
RMSE_train = np.sqrt(MSE_train)
RMSE_test = np.sqrt(MSE_test)
print(RMSE_train)
print(RMSE_test)

1102.9050394558715
1483.857845762808


# Conclusions- The best predicitons are given with the help of a random forest regreesor as it is using several individual trees to increase its performance