Set up defaults and import libraries.

In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn

# Seed NumPy's global random generator so re-running the notebook reproduces the same output.
np.random.seed(seed=17)

Load the data

In [2]:
# Read the cleaned arabica CSV, using its first column as the row index.
coffee_data = pd.read_csv(
    "datasets//arabica_data_cleaned.csv",
    index_col=[0],
)

Create test and training datasets

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
train_set, test_set = train_test_split(
    coffee_data,
    test_size=0.2,
    random_state=17,
)

# Separate the predictors from the target column ("Total.Cup.Points") of the training rows.
coffee_labels = train_set["Total.Cup.Points"].copy()
coffee = train_set.drop(columns="Total.Cup.Points")

Write all of our data transformation steps

In [41]:
from sklearn.preprocessing import FunctionTransformer

def null_altitude_outliers(X, lower=200, upper=5000):
    """Replace out-of-range values with NaN and return the result as a new object.

    Values strictly below ``lower`` or strictly above ``upper`` are treated as
    outliers and replaced with NaN so that a downstream imputer can fill them.
    Values equal to a bound, and values that are already NaN, are kept as-is.
    (The bounds are presumably in meters, given that the pipeline applies this
    function to the ``altitude_mean_meters`` column.)

    Unlike assigning through a boolean mask, this never modifies ``X`` itself,
    so the caller's data stays intact and re-running the cell is idempotent.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series or array-like of numbers
        Data to clean.
    lower : number, default 200
        Smallest value that is kept.
    upper : number, default 5000
        Largest value that is kept.

    Returns
    -------
    Same container kind as ``X`` for pandas input, otherwise a float
    ``numpy.ndarray``; outliers are replaced by NaN.
    """
    if isinstance(X, (pd.DataFrame, pd.Series)):
        # .mask() builds a new object and leaves X untouched.
        return X.mask((X < lower) | (X > upper))

    # np.array(...) copies, so the caller's array is not modified; float dtype
    # is required to be able to hold NaN.
    values = np.array(X, dtype=float)
    values[(values < lower) | (values > upper)] = np.nan
    return values

Build our pipeline

In [44]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler

# The three categorical columns all get the same treatment: fill missing
# entries with SimpleImputer's constant placeholder, then ordinal-encode.
# The altitude column has its outliers blanked, is median-imputed and then
# rescaled (note: the step is named 'std_scaler' but it is a MinMaxScaler).
pipeline = ColumnTransformer([
    *[
        (
            branch_name,
            Pipeline([
                ('imputer', SimpleImputer(strategy="constant")),
                ('encoder', OrdinalEncoder()),
            ]),
            [column_name],
        )
        for branch_name, column_name in (
            ('country_encoding', "Country.of.Origin"),
            ('variety_encoding', "Variety"),
            ('processing_encoding', "Processing.Method"),
        )
    ],
    (
        'numerical_data',
        Pipeline([
            ('null_outliers', FunctionTransformer(null_altitude_outliers, validate=False)),
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', MinMaxScaler()),
        ]),
        ["altitude_mean_meters"],
    ),
])

# Fit every branch on the training predictors and produce the transformed feature matrix.
X = pipeline.fit_transform(coffee)