In [52]:
# Importing the libraries
import numpy as np # for array operations
import pandas as pd # for working with DataFrames
import requests, io # for HTTP requests and I/O commands
import matplotlib.pyplot as plt # for data visualization
%matplotlib inline
import datetime
import plotly.express as px

# scikit-learn modules
from sklearn.model_selection import train_test_split # for splitting the data
from sklearn.metrics import mean_squared_error # for calculating the cost function
# Model class, plus the tools needed to visualize a fitted tree (export_graphviz, pydot)
from sklearn.ensemble import RandomForestRegressor # for building the model
from sklearn.tree import export_graphviz
import pydot


In [53]:
# Read the avocado CSV (path is relative to the notebook's working directory).
dataset = pd.read_csv(filepath_or_buffer='avocado.csv')

In [54]:
# Preview the first five rows to check the column layout.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


In [55]:
def get_day(date):
    """Return the day of the month for a 'YYYY-MM-DD' date string.

    Args:
        date: Date text in the form year-month-day, e.g. '2015-12-27'.

    Returns:
        The day component as an int.

    Raises:
        ValueError: If the text does not match the '%Y-%m-%d' format.
    """
    parsed = datetime.datetime.strptime(date, '%Y-%m-%d')
    return parsed.day

In [56]:
def get_month(date):
    """Return the month number for a 'YYYY-MM-DD' date string.

    Args:
        date: Date text in the form year-month-day, e.g. '2015-12-20'.

    Returns:
        The month component as an int (1 for January through 12 for December).

    Raises:
        ValueError: If the text does not match the '%Y-%m-%d' format.
    """
    parsed = datetime.datetime.strptime(date, "%Y-%m-%d")
    return parsed.month

In [57]:
# Derive a numeric day-of-month feature from each row's date string.
dataset['Day'] = dataset['Date'].map(get_day)

In [58]:
# Derive a numeric month feature from each row's date string.
dataset['Month'] = dataset['Date'].map(get_month)

In [59]:
# Columns left out of the feature matrix: the target itself, the raw date string
# (Day and Month are derived from it), and the per-size, bag, type and region fields.
# NOTE(review): 'Unnamed: 0.1' is not in this list, so it remains in x as a feature;
# it looks like a saved row index - confirm whether it should be dropped as well.
excluded_columns = [
    'Unnamed: 0', 'Date', 'AveragePrice',
    '4046', '4225', '4770',
    'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags',
    'type', 'region',
]
x = dataset.drop(columns=excluded_columns)  # Features
y = dataset['AveragePrice']  # Target

In [60]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=28,
)

In [61]:
# Sanity check of the date format string: the month of 2015-12-20 should be 12.
parsed_sample = datetime.datetime.strptime("2015-12-20", "%Y-%m-%d")
parsed_sample.month

12

In [62]:
# Random forest regressor made of 10 decision trees, seeded so repeated runs
# build the same forest.
model = RandomForestRegressor(random_state=0, n_estimators=10)

# Train on the training split; fit() returns the estimator, which the cell displays.
model.fit(X=x_train, y=y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

In [63]:
# Score the held-out rows with the fitted forest
y_pred = model.predict(x_test)

# RMSE (Root Mean Square Error), kept to three decimal places
mse = mean_squared_error(y_test, y_pred)
rmse = float(f"{np.sqrt(mse):.3f}")
print("\nRMSE: ", rmse)


RMSE:  0.311


In [64]:
# Train a deliberately shallow forest (depth limited to 3 levels) so that a single
# tree is small enough to draw. The forest is seeded so the exported tree is the
# same on every run instead of changing with each execution of this cell.
rf_small = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=0)
rf_small.fit(x_train, y_train)
# Extract one of the ten small trees (index 5) to visualize
tree_small = rf_small.estimators_[5]
# Write the tree as Graphviz DOT, labelling split nodes with the training column
# names rather than the generic x[i] placeholders
export_graphviz(
    tree_small,
    out_file='small_tree.dot',
    feature_names=list(x_train.columns),
    rounded=True,
    precision=1,
)
# Convert the DOT file to a png image
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png');

In [65]:
# Put predictions, ground truth and a per-row error next to the test features so
# they can be plotted together.
# NOTE: this adds columns to x_test, so passing x_test back to model.predict()
# afterwards would hand the model extra columns; re-create the split first.
x_test = x_test.copy()  # own the frame so the column assignments below do not warn on a slice
x_test['Predicted Value'] = y_pred
x_test['Actual Value'] = y_test  # aligned to x_test by index
# Real datetimes rather than unpadded 'Y-M-D' strings, so a date axis is ordered
# chronologically. to_datetime assembles them from year/month/day columns.
x_test['Date'] = pd.to_datetime(
    x_test[['year', 'Month', 'Day']].rename(columns={'Month': 'month', 'Day': 'day'})
)
# Per-row root of the squared error (i.e. the absolute error), written with an
# explicit sqrt: `diff**2**1/2` would parse as `diff**(2**1) / 2`, half the squared
# error, because `**` binds tighter than `/` and groups right-to-left.
# The column keeps the name 'RMSE' because later plotting cells refer to it.
x_test['RMSE'] = np.sqrt((x_test['Predicted Value'] - x_test['Actual Value']) ** 2)

In [66]:
# Predicted value against actual value for every held-out row
fig = px.scatter(
    data_frame=x_test,
    x="Predicted Value",
    y="Actual Value",
)
fig.show()

In [67]:
# 3-D view of the predicted value over date and total volume, colored by the
# per-row 'RMSE' column
fig = px.scatter_3d(
    data_frame=x_test,
    x='Date',
    y='Total Volume',
    z='Predicted Value',
    color='RMSE',
)
fig.show()