In [None]:
# Based on the following TensorFlow tutorial (Keras basic regression):
# https://colab.research.google.com/github/tensorflow/docs/blob/r2.0rc/site/en/r2/tutorials/keras/basic_regression.ipynb#scrollTo=oRKO_x8gWKv-

# Use seaborn for pairplot
!pip install seaborn

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

In [None]:
# Read the avocado price CSV into a DataFrame and preview its last rows
dataset = pd.read_csv(filepath_or_buffer='/kaggle/input/avocado-prices/avocado.csv')
dataset.tail(n=5)

In [None]:
# Remove the columns that are not used (for now): the 'Unnamed: 0' column,
# the region and the date. All three are dropped in a single call.
dataset = dataset.drop(columns=['Unnamed: 0', 'region', 'Date'])
dataset.tail()

In [None]:
# Clean the data

# Check for unknown vals - there aren't any!
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but never displayed, so the per-column NaN counts would otherwise be lost.
print(dataset.isna().sum())

# One hot convert type: pop() removes the 'type' column from the frame and
# each comparison becomes a 0.0 / 1.0 indicator column.
atype = dataset.pop('type')
dataset['organic'] = (atype == "organic")*1.0
dataset['conventional'] = (atype == "conventional")*1.0

dataset.tail()

In [None]:
# Hold out data for testing: a seeded random 80% sample of the rows is used
# for training and every remaining row goes to the test set
train_dataset = dataset.sample(random_state=0, frac=0.8)
test_dataset = dataset.drop(index=train_dataset.index)

In [None]:
# Pairwise plots of a few training columns, with KDE curves on the diagonal
sns.pairplot(
    train_dataset.loc[:, ["year", "AveragePrice", "Total Volume", "organic"]],
    diag_kind="kde",
)

In [None]:
# Summary statistics of the training columns with the label column
# ('AveragePrice') excluded, transposed so each feature is a row
train_stats = train_dataset.describe().drop(columns="AveragePrice").transpose()
train_stats

In [None]:
# Separate the target column from the features; pop() removes 'AveragePrice'
# from each frame in place and returns it
train_labels, test_labels = train_dataset.pop('AveragePrice'), test_dataset.pop('AveragePrice')

In [None]:
# Normalize data
def norm(x):
    """Standardize ``x`` with the training-set statistics.

    Subtracts ``train_stats['mean']`` and divides by ``train_stats['std']``;
    both are read from the module-level ``train_stats`` frame.
    """
    centered = x - train_stats['mean']
    scaled = centered / train_stats['std']
    return scaled

# Scale both splits with the same (training-set) statistics
normed_train_data, normed_test_data = (
    norm(split) for split in (train_dataset, test_dataset)
)

In [None]:
# Build the model
def build_model():
    """Create and compile the regression network.

    The network is a Sequential stack of Dense(64, relu), Dense(32, relu) and
    a single-unit Dense output. The input width is the number of columns in
    the module-level ``train_dataset``. It is compiled with MSE loss, an
    RMSprop optimizer (0.001) and the 'mae' and 'mse' metrics.

    Returns:
        The compiled Keras model.
    """
    n_features = len(train_dataset.columns)

    network = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=[n_features]),
        layers.Dense(32, activation='relu'),
        layers.Dense(1),
    ])

    network.compile(
        loss='mse',
        optimizer=tf.keras.optimizers.RMSprop(0.001),
        metrics=['mae', 'mse'],
    )
    return network

In [None]:
model = build_model()
model.summary()

In [None]:
# Train the model
class PrintDot(keras.callbacks.Callback):
  """Progress callback: prints one dot per epoch, starting a new line every 100 epochs."""

  def on_epoch_end(self, epoch, logs=None):
    # `logs` defaults to None to match the base Callback.on_epoch_end
    # signature; its contents are not used here.
    if epoch % 100 == 0:
      print('')
    print('.', end='')

EPOCHS = 1000

# This fit has to run: the following "Visualize progress" cell reads
# `history`, and with the call commented out that name does not exist yet on
# a fresh kernel (NameError on Restart & Run All).
# No early stopping here, so all EPOCHS epochs are run; progress is shown as
# dots by PrintDot because verbose=0 silences the default logging.
history = model.fit(
    normed_train_data, train_labels,
    epochs=EPOCHS, validation_split=0.2, verbose=0,
    callbacks=[PrintDot()])

In [None]:
# Visualize progress: per-epoch metrics of the `history` object returned by
# model.fit, with the epoch number added as a column
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [None]:
def plot_history(history):
  """Plot training and validation MAE and MSE against the epoch number.

  Draws two figures (mean absolute error, mean squared error), each with a
  'Train Error' and a 'Val Error' curve, then calls ``plt.show()``.

  Keras records a metric under the short name ('mae', 'mse') or the long
  name ('mean_absolute_error', 'mean_squared_error') depending on the
  TensorFlow version, so both spellings are accepted.

  Args:
    history: object with a ``history`` mapping of metric name to per-epoch
      values and an ``epoch`` list, as returned by ``model.fit``.

  Raises:
    KeyError: if neither spelling of a required metric is present.
  """
  hist = pd.DataFrame(history.history)
  hist['epoch'] = history.epoch

  def _first_available(candidates):
    # Return the first recorded column among the alternative metric names.
    for column in candidates:
      if column in hist.columns:
        return hist[column]
    raise KeyError(
        'none of {} found in history; available: {}'.format(
            list(candidates), list(hist.columns)))

  panels = [
      ('Mean Abs Error [Price]', ('mae', 'mean_absolute_error')),
      ('Mean Square Error [$Price^2$]', ('mse', 'mean_squared_error')),
  ]

  for ylabel, metric_names in panels:
    val_names = ['val_' + metric for metric in metric_names]

    plt.figure()
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    plt.plot(hist['epoch'], _first_available(metric_names),
             label='Train Error')
    plt.plot(hist['epoch'], _first_available(val_names),
             label='Val Error')
    plt.legend()

  plt.show()

In [None]:
model = build_model()

# Monitor val_loss and stop training once it has gone 10 epochs (the
# patience) without improving
early_stop = keras.callbacks.EarlyStopping(patience=10, monitor='val_loss')

history = model.fit(
    x=normed_train_data,
    y=train_labels,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
    callbacks=[early_stop, PrintDot()],
)

plot_history(history)

In [None]:
# Score the trained model on the held-out test set; evaluate() returns the
# loss followed by the compiled metrics
loss, mae, mse = model.evaluate(
    x=normed_test_data,
    y=test_labels,
    verbose=0,
)

print("Testing Set Mean Abs Error: ${:5.2f}".format(mae))

In [None]:
# Predict on testing set data
test_predictions = model.predict(normed_test_data).flatten()

plt.scatter(test_labels, test_predictions)
# The labels name the quantity predicted in this notebook (AveragePrice);
# they previously read "[MPG]", which does not match the target used here.
plt.xlabel('True Values [Price]')
plt.ylabel('Predictions [Price]')
plt.axis('equal')
plt.axis('square')
# Start both axes at 0 while keeping the automatically chosen upper limits
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.ylim()[1]])
# Reference diagonal y = x: points on it are perfect predictions
_ = plt.plot([-100, 100], [-100, 100])

In [None]:
# Error Distribution: histogram of (prediction - true value) on the test set
error = test_predictions - test_labels
plt.hist(error, bins = 25)
# The label names the quantity predicted in this notebook (AveragePrice);
# it previously read "[MPG]", which does not match the target used here.
plt.xlabel("Prediction Error [Price]")
_ = plt.ylabel("Count")