Set up defaults and import libraries.

In [126]:
import numpy as np
import os
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn

# to make this notebook's output stable across runs
np.random.seed(17)

# change plot defaults
%matplotlib inline
mpl.rc('axes', labelsize=10)
mpl.rc('xtick', labelsize=8)
mpl.rc('ytick', labelsize=8)

Load the data

In [127]:
# Read arabica_data_cleaned.csv from the datasets folder; the first CSV column
# becomes the row index. The path is assembled with os.path.join so it contains
# exactly one separator between the folder and the file name.
coffee_data = pd.read_csv(
    os.path.join("datasets", "arabica_data_cleaned.csv"),
    index_col=[0],
)

Create test and training datasets

In [128]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
train_set, test_set = train_test_split(
    coffee_data,
    random_state=17,
    test_size=0.2,
)

# Split the training rows into the target column and the remaining columns.
coffee_labels = train_set["Total.Cup.Points"].copy()
coffee = train_set.drop(columns="Total.Cup.Points")

Keep only the columns we are interested in

In [129]:
# Columns retained for the rest of the notebook: three categorical
# descriptors and the mean altitude.
feature_columns = [
    "Country.of.Origin",
    "Variety",
    "Processing.Method",
    "altitude_mean_meters",
]
coffee = coffee[feature_columns]

Treat altitudes below 200 meters as missing, then fill missing altitudes with the median altitude

In [130]:
# Altitudes below this threshold are treated as invalid readings.
MIN_VALID_ALTITUDE_METERS = 200

# Work on an independent frame: `coffee` was derived from train_set, and
# writing into such a derived frame triggers pandas' chained-assignment
# warnings (or is silently ignored under Copy-on-Write).
coffee = coffee.copy()

# Blank out ONLY the altitude value of the offending rows. Assigning through
# `coffee[mask] = None` would wipe every column of those rows, and the
# subsequent dropna() would then discard them instead of imputing the altitude.
implausible_altitude = coffee["altitude_mean_meters"] < MIN_VALID_ALTITUDE_METERS
coffee.loc[implausible_altitude, "altitude_mean_meters"] = np.nan

# Impute all missing altitudes (originally absent or blanked just above) with
# the median of the remaining values. The result is assigned back rather than
# using fillna(inplace=True) on a column selection, which may not update the frame.
altitude_median = coffee["altitude_mean_meters"].median()
coffee["altitude_mean_meters"] = coffee["altitude_mean_meters"].fillna(altitude_median)

Drop rows with missing data

In [131]:
# Keep only rows in which every column is populated (same rows dropna() keeps).
# coffee_labels was taken from the same train_set rows in the same order, so
# the identical positional mask is applied to it; otherwise the features and
# their target values would no longer line up once the index is renumbered.
complete_rows = coffee.notna().all(axis=1).to_numpy()
coffee = coffee.loc[complete_rows].reset_index(drop=True)
coffee_labels = coffee_labels.loc[complete_rows].reset_index(drop=True)

Encode categories to numeric data

In [132]:
from sklearn.preprocessing import OrdinalEncoder


def _fit_ordinal_column(frame, source_column, target_column):
    """Fit a new OrdinalEncoder on a single column of ``frame``.

    Returns a tuple ``(encoder, codes, codes_frame)``: the fitted encoder, the
    array produced by ``fit_transform``, and that array wrapped in a
    one-column DataFrame named ``target_column``.
    """
    encoder = OrdinalEncoder()
    codes = encoder.fit_transform(frame[[source_column]])
    codes_frame = pd.DataFrame(data=codes, columns=[target_column])
    return encoder, codes, codes_frame


# Country of origin -> country_of_origin_encoded
(
    ordinal_encoder_country_of_origin,
    country_of_origin_encoded,
    encoded_country_of_origin,
) = _fit_ordinal_column(coffee, "Country.of.Origin", "country_of_origin_encoded")
coffee = pd.merge(coffee, encoded_country_of_origin, left_index=True, right_index=True)

# Processing method -> processing_method_encoded
(
    ordinal_encoder_processing_method,
    processing_method_encoded,
    encoded_processing_method,
) = _fit_ordinal_column(coffee, "Processing.Method", "processing_method_encoded")
coffee = pd.merge(coffee, encoded_processing_method, left_index=True, right_index=True)

# Variety -> variety_encoded
(
    ordinal_encoder_variety,
    variety_encoded,
    encoded_variety,
) = _fit_ordinal_column(coffee, "Variety", "variety_encoded")
coffee = pd.merge(coffee, encoded_variety, left_index=True, right_index=True)