Set up defaults and import libraries.

In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn

# to make this notebook's output stable across runs
np.random.seed(17)

# change plot defaults
%matplotlib inline
mpl.rc('axes', labelsize=10)
mpl.rc('xtick', labelsize=8)
mpl.rc('ytick', labelsize=8)


Load the data

In [3]:
# Build the path from its components instead of a hand-typed string; the
# original literal contained a doubled separator ("datasets//...").
DATA_PATH = os.path.join("datasets", "arabica_data_cleaned.csv")

# Use the first CSV column as the DataFrame index.
coffee = pd.read_csv(DATA_PATH, index_col=[0])

In [4]:
# List every column with its non-null count and dtype to see where values are missing.
coffee.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1311 entries, 1 to 1312
Data columns (total 43 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Species                1311 non-null   object 
 1   Owner                  1304 non-null   object 
 2   Country.of.Origin      1310 non-null   object 
 3   Farm.Name              955 non-null    object 
 4   Lot.Number             270 non-null    object 
 5   Mill                   1001 non-null   object 
 6   ICO.Number             1165 non-null   object 
 7   Company                1102 non-null   object 
 8   Altitude               1088 non-null   object 
 9   Region                 1254 non-null   object 
 10  Producer               1081 non-null   object 
 11  Number.of.Bags         1311 non-null   int64  
 12  Bag.Weight             1311 non-null   object 
 13  In.Country.Partner     1311 non-null   object 
 14  Harvest.Year           1264 non-null   object 
 15  Grad

In [5]:
# Summary statistics for the numeric columns.
# NOTE(review): the altitude_* columns show a minimum of 1 and a maximum of
# 190164 in the output below — these look like data-entry errors; confirm
# before relying on altitude as a feature.
coffee.describe()

Unnamed: 0,Number.of.Bags,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean.Cup,Sweetness,Cupper.Points,Total.Cup.Points,Moisture,Category.One.Defects,Quakers,Category.Two.Defects,altitude_low_meters,altitude_high_meters,altitude_mean_meters
count,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1311.0,1310.0,1311.0,1084.0,1084.0,1084.0
mean,153.887872,7.563806,7.51807,7.397696,7.533112,7.517727,7.517506,9.833394,9.83312,9.903272,7.497864,82.115927,0.088863,0.426392,0.177099,3.591915,1759.548954,1808.843803,1784.196379
std,129.733734,0.378666,0.399979,0.405119,0.381599,0.359213,0.406316,0.559343,0.77135,0.530832,0.47461,3.515761,0.047957,1.832415,0.840583,5.350371,8767.847252,8767.187498,8767.016913
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,14.5,7.42,7.33,7.25,7.33,7.33,7.33,10.0,10.0,10.0,7.25,81.17,0.09,0.0,0.0,0.0,1100.0,1100.0,1100.0
50%,175.0,7.58,7.58,7.42,7.5,7.5,7.5,10.0,10.0,10.0,7.5,82.5,0.11,0.0,0.0,2.0,1310.64,1350.0,1310.64
75%,275.0,7.75,7.75,7.58,7.75,7.67,7.75,10.0,10.0,10.0,7.75,83.67,0.12,0.0,0.0,4.0,1600.0,1650.0,1600.0
max,1062.0,8.75,8.83,8.67,8.75,8.58,8.75,10.0,10.0,10.0,10.0,90.58,0.28,31.0,11.0,55.0,190164.0,190164.0,190164.0


Keep only the columns we need: the target (Total.Cup.Points) and the candidate feature columns.

In [6]:
# Columns carried forward: the regression target (Total.Cup.Points) first,
# followed by the candidate predictor columns.
slim_columns = [
    "Total.Cup.Points",
    "Country.of.Origin",
    "Variety",
    "Processing.Method",
    "altitude_mean_meters",
]
coffee_slim = coffee[slim_columns]

Split the data into training (80%) and test (20%) sets using a fixed random seed, then separate the training features from the target.

In [47]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(
    coffee_slim,
    random_state=17,
    test_size=0.2,
)

# Separate the target from the predictors for the training portion.
# NOTE(review): `coffee_slim` is rebound here from "all rows, target included"
# to "training rows, target removed". Re-running this cell without first
# re-running the column-selection cell will fail because the target column
# is no longer present.
coffee_slim_labels = train_set["Total.Cup.Points"].copy()
coffee_slim = train_set.drop(columns=["Total.Cup.Points"])

Time to clean up the data.

In [48]:
from sklearn.impute import SimpleImputer

# Numeric part of the training features: everything except the text columns.
text_columns = ["Country.of.Origin", "Variety", "Processing.Method"]
coffee_slim_num = coffee_slim.drop(columns=text_columns)

# Learn each numeric column's median and use it to fill the missing values.
imputer = SimpleImputer(strategy="median")
X = imputer.fit(coffee_slim_num).transform(coffee_slim_num)

# Wrap the imputed array back into a DataFrame with the original row/column labels.
coffee_tr = pd.DataFrame(
    data=X,
    index=coffee_slim.index,
    columns=coffee_slim_num.columns,
)

In [23]:
from sklearn.preprocessing import OrdinalEncoder

# Encode each country name as an integer code.
# Double brackets keep the selection two-dimensional (a one-column DataFrame).
country_column = ["Country.of.Origin"]
coffee_slim_country = coffee_slim[country_column]

ordinal_encoded = OrdinalEncoder()
coffee_slim_country_encoded = (
    ordinal_encoded.fit(coffee_slim_country).transform(coffee_slim_country)
)

# Show the categories the encoder learned; a value's position is its integer code.
ordinal_encoded.categories_

[array(['Brazil', 'Burundi', 'China', 'Colombia', 'Costa Rica',
        'Cote d?Ivoire', 'Ecuador', 'El Salvador', 'Ethiopia', 'Guatemala',
        'Haiti', 'Honduras', 'Indonesia', 'Japan', 'Kenya', 'Laos',
        'Malawi', 'Mauritius', 'Mexico', 'Myanmar', 'Nicaragua', 'Panama',
        'Papua New Guinea', 'Peru', 'Philippines', 'Taiwan',
        'Tanzania, United Republic Of', 'Thailand', 'Uganda',
        'United States', 'United States (Hawaii)',
        'United States (Puerto Rico)', 'Vietnam', 'Zambia'], dtype=object)]

In [49]:
# DISABLED EXPERIMENT — every line in this cell is commented out; nothing runs.
# It holds two attempts at discarding rows whose altitude is below 200: a custom
# transformer class (double-commented) and a FunctionTransformer wrapper.
# NOTE(review): if this is revived, `altitude_ix` must be defined (its only
# assignment is commented out) and re-checked — once the target column has been
# dropped, `coffee_slim` has four columns, which would put altitude_mean_meters
# at index 3 rather than 4. Filtering rows here also leaves `coffee_slim_labels`
# with more rows than the filtered features — confirm how labels should follow.
# #from sklearn.base import BaseEstimator, TransformerMixin
# # altitude_ix = 4

# # class CleanAltitudes(BaseEstimator, TransformerMixin):
# #     def __init__(self):
# #         None
# #     def fit(self, X, y=None):
# #         return self
# #     def transform(self, X, y=None):
# #         # Keep only entries that have altitudes >= 200
# #         X = X[X[:,altitude_ix] >= 200]
# #         return np.c_[X]

# # attr_cleaner = CleanAltitudes()
# # coffee_cleaned = attr_cleaner.transform(coffee_slim.values)

# from sklearn.preprocessing import FunctionTransformer
# def remove_bad_altitudes(X):
#     # Keep only entries that have altitudes >= 200
#     X = X[X[:,altitude_ix] >= 200]
#     return np.c_[X]

# altitude_cleaner = FunctionTransformer(remove_bad_altitudes, validate=False)
# coffee_slim_clean_altitudes = altitude_cleaner.fit_transform(coffee_slim.values)


In [50]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numeric branch: median-impute missing values (currently the only step).
numeric_steps = [('imputer', SimpleImputer(strategy="median"))]
num_pipeline = Pipeline(steps=numeric_steps)
coffee_slim_num_tr = num_pipeline.fit_transform(coffee_slim_num)

# Route columns by name: numeric columns go through the imputer, the country
# column through an ordinal encoder.
# NOTE(review): Variety and Processing.Method are not listed in either branch,
# so with ColumnTransformer's default remainder they do not reach the model —
# confirm this is intended.
num_attributes = coffee_slim_num.columns.tolist()
cat_attributes = ["Country.of.Origin"]

full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attributes),
        ("cat", OrdinalEncoder(), cat_attributes),
    ]
)

# Fit the combined preprocessing on the training features and transform them.
coffee_slim_prepared = full_pipeline.fit_transform(coffee_slim)

In [52]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the prepared training features.
# `fit` returns the estimator itself, so the assignment keeps the fitted model.
lin_reg = LinearRegression().fit(coffee_slim_prepared, coffee_slim_labels)

# Display the fitted estimator and its parameters.
lin_reg



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [62]:

from sklearn.metrics import mean_squared_error

# Evaluate the linear model on the same training data it was fitted on, so the
# score below is a training error, not an estimate of generalisation error.
coffee_slim_prepared = full_pipeline.transform(coffee_slim)
lin_predictions = lin_reg.predict(coffee_slim_prepared)

# Compare the first few predictions with the matching training labels.
# (The original printed `some_labels`, a name that is not defined anywhere in
# the notebook and only existed as leftover kernel state.)
print("Predictions:", lin_predictions[:5])
print("Labels:", list(coffee_slim_labels)[:5])

# Keep MSE and RMSE in separately named variables, matching the
# tree_mse / tree_rmse naming used for the decision-tree model.
lin_mse = mean_squared_error(coffee_slim_labels, lin_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

Predictions: [82.32944702 82.21431566 82.44504023 82.04131664 82.04129532]
Labels: [82.75, 82.0, 83.17, 80.67, 82.42]


2.6665321201817096

In [58]:
# mean_squared_error was used here without ever being imported, which raises
# NameError on a fresh kernel.
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

# Prepare the training features with the already-fitted preprocessing pipeline.
coffee_prepared = full_pipeline.transform(coffee_slim)

# Fit a decision tree; the fixed random_state keeps the fitted tree reproducible.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(coffee_prepared, coffee_slim_labels)

# Score on the same data the tree was trained on: this is a training RMSE and
# is therefore optimistic compared with performance on unseen data.
coffee_predictions = tree_reg.predict(coffee_prepared)

tree_mse = mean_squared_error(coffee_slim_labels, coffee_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

1.9685917491616163