In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error as mse
import matplotlib.pyplot as plt

## Importing Dataset

In [2]:
train = pd.read_csv("../input/eda-avocado-prices/Filtered_dataset.csv")

In [3]:
train.drop(['Unnamed: 0'],axis=1,inplace=True)
train

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region,month,week,price_types
0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany,2015-12-01,2015-12-24,low
1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany,2015-12-01,2015-12-17,low
2,2015-12-13,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,Albany,2015-12-01,2015-12-10,low
3,2015-12-06,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany,2015-12-01,2015-12-03,low
4,2015-11-29,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany,2015-11-01,2015-11-26,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11533,2018-02-04,1.63,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,organic,2018,WestTexNewMexico,2018-02-01,2018-02-01,mean
11534,2018-01-28,1.71,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,organic,2018,WestTexNewMexico,2018-01-01,2018-01-25,mean
11535,2018-01-21,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,organic,2018,WestTexNewMexico,2018-01-01,2018-01-18,high
11536,2018-01-14,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,organic,2018,WestTexNewMexico,2018-01-01,2018-01-11,expensive


In [4]:
x = train.drop(['Date','AveragePrice'],axis=1)
y = train['AveragePrice']

In [5]:
le = preprocessing.LabelEncoder()
for i in x.columns:
    if x[i].dtype == 'object':
        x[i] = le.fit_transform(x[i].astype(str))
x

Unnamed: 0,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region,month,week,price_types
0,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,0,2015,0,11,51,2
1,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,0,2015,0,11,50,2
2,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,0,2015,0,11,49,2
3,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,0,2015,0,11,48,2
4,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,0,2015,0,10,47,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11533,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,1,2018,53,37,161,3
11534,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,1,2018,53,36,160,3
11535,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,1,2018,53,36,159,1
11536,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,1,2018,53,36,158,0


In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [7]:
# splitting the dataset into train and test dataset with 4:1 ratio (80%-20%)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .2, random_state = 26)

## Training on different algorithms

### Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression

# Create instance of model
lreg = LinearRegression()
# Pass training data into model
lreg.fit(x_train, y_train)

LinearRegression()

In [9]:
# Getting prediciton on x_test
y_pred_lreg = lreg.predict(x_test)

In [10]:
def rmse(y,y_pred):
    return (np.sqrt(mse(y,y_pred)))

In [11]:
# Scoring our model
print('Linear Regression')
# Root mean square error of our model
print('--'*50)
linreg_error = rmse(y_test, y_pred_lreg)
print('RMSE = ', linreg_error)


Linear Regression
----------------------------------------------------------------------------------------------------
RMSE =  0.2544310852015447


### RBF SUPPORT VECTOR REGRESSOR

In [12]:
%%time
from sklearn.svm import SVR

svr = SVR(kernel = 'rbf')

# Fit the model on training data
svr.fit(x_train, y_train)

CPU times: user 2.77 s, sys: 138 ms, total: 2.91 s
Wall time: 2.91 s


SVR()

In [13]:
# Getting the predictions for x_test
y_pred_svr = svr.predict(x_test)

In [14]:
print('Support Vector Classifier')
# Root mean square error of our model
print('--'*50)
svr_error = rmse(y_test, y_pred_svr)
print('RMSE = ', svr_error)

Support Vector Classifier
----------------------------------------------------------------------------------------------------
RMSE =  0.12147807507384512


### Decision Tree - Regression

In [15]:
%%time
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
# Fit new DT on training data
dtr.fit(x_train, y_train)

CPU times: user 144 ms, sys: 8.8 ms, total: 153 ms
Wall time: 209 ms


DecisionTreeRegressor()

In [16]:
# Predict DTR
y_pred_dtr = dtr.predict(x_test)

In [17]:
print('Decision Tree')
# Root mean square error of our model
print('--'*50)
dtr_error = rmse(y_test, y_pred_dtr)
print('RMSE = ', dtr_error)

Decision Tree
----------------------------------------------------------------------------------------------------
RMSE =  0.12550815256582049


### RANDOM FOREST

In [18]:
from sklearn.ensemble import RandomForestRegressor

# Create model object
rfr = RandomForestRegressor(n_estimators = 250,n_jobs=-1)
# Fit model to training data
rfr.fit(x_train,y_train)
y_pred_rfr = rfr.predict(x_test)

In [19]:
print('Random Forest')
# Root mean square error of our model
print('--'*50)
rfr_error = rmse(y_test, y_pred_rfr)
print('RMSE = ', rfr_error)

Random Forest
----------------------------------------------------------------------------------------------------
RMSE =  0.08701408958999135


### XGBoost Regressor

In [20]:
from xgboost import XGBRegressor

# Create model object
xgb = XGBRegressor(n_jobs=-1)

# Fit model to training data
xgb.fit(x_train, y_train)
y_pred_xgb = xgb.predict(x_test)

In [21]:
print('XGBoost Classifer')
print('--'*50)
xgb_error = rmse(y_test, y_pred_xgb)
print('RMSE = ', xgb_error)

XGBoost Classifer
----------------------------------------------------------------------------------------------------
RMSE =  0.08990305033932103


In [22]:
models = pd.DataFrame({
     'Model': ['Linear Regression', 'RBF SVC', 
               'Decision Tree', 'Random Forest','XGBoost Regressor'],
    'Score': [linreg_error, svr_error, 
               dtr_error, rfr_error,xgb_error]})
models.sort_values(by='Score', ascending=True)

Unnamed: 0,Model,Score
3,Random Forest,0.087014
4,XGBoost Regressor,0.089903
1,RBF SVC,0.121478
2,Decision Tree,0.125508
0,Linear Regression,0.254431
