Setup defaults and import libraries.

In [13]:
import numpy as np
import os
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn

# to make this notebook's output stable across runs
np.random.seed(17)

# change plot defaults
%matplotlib inline
mpl.rc('axes', labelsize=10)
mpl.rc('xtick', labelsize=8)
mpl.rc('ytick', labelsize=8)

Load the data

In [14]:
coffee_data = pd.read_csv("datasets//arabica_data_cleaned.csv", index_col=[0])

Create test and training datasets

In [15]:
from sklearn.model_selection import train_test_split

train_set, test_set =  train_test_split(coffee_data, test_size=0.2, random_state=17)

coffee = train_set.drop("Total.Cup.Points",axis=1)
coffee_labels = train_set["Total.Cup.Points"].copy()

Keep only the columns we are interested in

In [16]:
coffee = coffee[["Country.of.Origin", "Variety", "Processing.Method", "altitude_mean_meters"]]

Remove altitudes less than 200 meters and fill in with median values

In [17]:
coffee[coffee["altitude_mean_meters"] < 200] = None

altitude_median = coffee["altitude_mean_meters"].median()
coffee["altitude_mean_meters"].fillna(altitude_median, inplace=True)

Drop rows with missing data

In [18]:
coffee_rows_dropped =coffee.dropna()

dropped_row_indexes = coffee[~coffee.index.isin(coffee_rows_dropped.index)]
dropped_row_indexes = dropped_row_indexes.index.values.tolist()
coffee = coffee_rows_dropped.reset_index(drop=True)
coffee_labels = coffee_labels.drop(dropped_row_indexes)

Encode categories to numeric data

In [19]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder_country_of_origin = OrdinalEncoder()
country_of_origin_encoded = ordinal_encoder_country_of_origin.fit_transform(coffee[["Country.of.Origin"]])
encoded_country_of_origin = pd.DataFrame(data=country_of_origin_encoded, columns=["country_of_origin_encoded"])
coffee = coffee.merge(encoded_country_of_origin,left_index=True,right_index=True) 

ordinal_encoder_processing_method = OrdinalEncoder()
processing_method_encoded = ordinal_encoder_processing_method.fit_transform(coffee[["Processing.Method"]])
encoded_processing_method = pd.DataFrame(data=processing_method_encoded, columns=["processing_method_encoded"])
coffee = coffee.merge(encoded_processing_method,left_index=True,right_index=True) 

ordinal_encoder_variety = OrdinalEncoder()
variety_encoded = ordinal_encoder_variety.fit_transform(coffee[["Variety"]])
encoded_variety = pd.DataFrame(data=variety_encoded, columns=["variety_encoded"])
coffee = coffee.merge(encoded_variety,left_index=True,right_index=True) 

Scale the data and build prepared training dataset

In [20]:
from sklearn.preprocessing import StandardScaler

coffee_num = coffee[["altitude_mean_meters","processing_method_encoded","variety_encoded"]]

scaler = StandardScaler()
coffee_num_scaled = scaler.fit_transform(coffee_num)

coffee_prepared = np.concatenate([coffee_num_scaled,coffee[["country_of_origin_encoded"]].values], axis=1)

Train on the prepared data

In [24]:
from sklearn.linear_model import LinearRegression

linear_regression = LinearRegression()
linear_regression.fit(coffee_prepared, coffee_labels)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Test some of the trained data

In [26]:
sample_data = coffee_prepared[:10]
sample_labels = coffee_labels.iloc[:10]
print("Predictions:",linear_regression.predict(sample_data))
print("Labels:", list(sample_labels))

Predictions: [81.51293136 82.43912883 82.26316622 82.28568892 81.33942282 82.67052366
 81.93818105 81.43489021 82.24943436 82.59114624]
Labels: [81.75, 82.17, 83.75, 84.67, 81.92, 78.42, 79.0, 79.0, 86.0, 82.42]


Calculate the root mean squared errors for linear regression

In [28]:
from sklearn.metrics import mean_squared_error
coffee_predictions = linear_regression.predict(coffee_prepared)
linear_mse = mean_squared_error(coffee_labels,coffee_predictions)
linear_rmse = np.sqrt(linear_mse)
linear_rmse

2.600758224259066

Try a decision tree regression

In [29]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(coffee_prepared, coffee_labels)

coffee_predictions = tree_reg.predict(coffee_prepared)
tree_mse = mean_squared_error(coffee_labels,coffee_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

1.5703257562939459

K-fold cross validation:

In [31]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, coffee_prepared, coffee_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)

Scores: [3.30898163 2.09468036 2.58532742 2.2444926  3.05385241 3.03027305
 3.07314405 2.28117444 2.62030636 2.93115609]
Mean: 2.7223388414111245
Standard deviation: 0.39517646666621703


In [32]:
scores = cross_val_score(linear_regression, coffee_prepared, coffee_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)

Scores: [3.14457703 1.88255198 2.2647137  2.82786659 2.74007018 2.53324578
 3.28570511 2.1970685  2.71576122 2.19749747]
Mean: 2.5789057563632234
Standard deviation: 0.42464293689168886


In [33]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(coffee_prepared, coffee_labels)

coffee_predictions = forest_reg.predict(coffee_prepared)
forest_mse = mean_squared_error(coffee_labels,coffee_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

1.6777201028631585

In [34]:
scores = cross_val_score(forest_reg, coffee_prepared, coffee_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(forest_rmse_scores)

Scores: [2.8888984  2.01233259 2.49639533 2.29937942 2.93265467 2.59838345
 2.85962389 1.90371663 2.49853375 2.38031552]
Mean: 2.4870233643473663
Standard deviation: 0.33521906968735005


Perform a grid search for hyper parameter tuning

In [40]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30,70,100,150], 'max_features': [1,2,3,4]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10, 30,70,100,150], 'max_features': [1,2,3,4]},
  ]

forest_reg = RandomForestRegressor(random_state=17)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(coffee_prepared, coffee_labels)

grid_search.best_params_

{'max_features': 1, 'n_estimators': 70}

In [41]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

2.569480977152115 {'max_features': 1, 'n_estimators': 3}
2.490407331411389 {'max_features': 1, 'n_estimators': 10}
2.466321649750459 {'max_features': 1, 'n_estimators': 30}
2.448460584692986 {'max_features': 1, 'n_estimators': 70}
2.4534395217053464 {'max_features': 1, 'n_estimators': 100}
2.4513612104363163 {'max_features': 1, 'n_estimators': 150}
2.632720485157431 {'max_features': 2, 'n_estimators': 3}
2.498671403564893 {'max_features': 2, 'n_estimators': 10}
2.4815571864675956 {'max_features': 2, 'n_estimators': 30}
2.458334479092535 {'max_features': 2, 'n_estimators': 70}
2.4615362402031216 {'max_features': 2, 'n_estimators': 100}
2.456932294939917 {'max_features': 2, 'n_estimators': 150}
2.6018028444473758 {'max_features': 3, 'n_estimators': 3}
2.515790158251999 {'max_features': 3, 'n_estimators': 10}
2.485912987915155 {'max_features': 3, 'n_estimators': 30}
2.4674612833219247 {'max_features': 3, 'n_estimators': 70}
2.4707955443109997 {'max_features': 3, 'n_estimators': 100}
2.465

In [45]:
feature_importances = grid_search.best_estimator_.feature_importances_
#["altitude","processing","variety","country"]
feature_importances

array([0.47685044, 0.06747558, 0.18630188, 0.26937209])

The above feature importance indicates that altitude has the largest importance, followed by country of origin, variety, and finally processing method.

In [None]:
final_model = grid_search.best_estimator_

X_test = train_set.drop("Total.Cup.Points",axis=1)
y_test = train_set["Total.Cup.Points"].copy()

# need to solve this pipeline issue!!! put all transforms into functions?
X_test_prepared = #full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse