In [30]:
# evaluate mean imputation and random forest for the horse colic dataset
import numpy as np
import pandas as pd
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
# load dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'
dataframe = read_csv(url, header=None, na_values='?')
dataframe.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,1.0,9,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,2.0,1,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2
5,2.0,1,528355,,,,2.0,1.0,3.0,1.0,...,,,,,1.0,2,0,0,0,2
6,1.0,1,526802,37.9,48.0,16.0,1.0,1.0,1.0,1.0,...,37.0,7.0,,,1.0,1,3124,0,0,2
7,1.0,1,529607,,60.0,,3.0,,,1.0,...,44.0,8.3,,,2.0,1,2208,0,0,2
8,2.0,1,530051,,80.0,36.0,3.0,4.0,3.0,1.0,...,38.0,6.2,,,3.0,1,3205,0,0,2
9,2.0,9,5299629,38.3,90.0,,1.0,,1.0,1.0,...,40.0,6.2,1.0,2.2,1.0,2,0,0,0,1


In [31]:
print('Missing: %d' % dataframe.isnull().sum().sum())

Missing: 1605


In [32]:
# split into input and output elements
data = dataframe.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]
imputer = SimpleImputer(strategy='mean')
# fit on the dataset
imputer.fit(X)
# transform the dataset
Xtrans = imputer.transform(X)
print('Missing: %d' % np.count_nonzero(np.isnan(Xtrans)))

Missing: 0


In [33]:

# define modeling pipeline
model = RandomForestClassifier()
imputer = SimpleImputer(strategy='mean')
pipeline = Pipeline(steps=[('i', imputer), ('m', model)])
# define model evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
scores
print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Mean Accuracy: 0.861 (0.052)


In [34]:
scores.shape

(30,)

In [35]:
results = list()
strategies = ['mean', 'median', 'most_frequent', 'constant']
for s in strategies:
	# create the modeling pipeline
	pipeline = Pipeline(steps=[('i', SimpleImputer(strategy=s)), ('m', RandomForestClassifier())])
	# evaluate the model
	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
	scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
	# store results
	results.append(scores)
	print('>%s %.3f (%.3f)' % (s, mean(scores), std(scores)))

>mean 0.857 (0.049)
>median 0.868 (0.055)
>most_frequent 0.870 (0.055)
>constant 0.871 (0.051)


In [42]:
pipeline = Pipeline(steps=[('i', SimpleImputer(strategy='constant')), ('m', RandomForestClassifier())])
# fit the model
pipeline.fit(X, y)
# define new data
row = [2, 1, 530101, 38.50, 66, 28, 3, 3, np.nan, 2, 5, 4, 4, np.nan, np.nan, np.nan, 3, 5, 45.00, 8.40, np.nan, np.nan, 2, 11300, 00000, 00000, 2]
# make a prediction
yhat = pipeline.predict([row])
# summarize prediction
print('Predicted Class: %d' % yhat[0])

Predicted Class: 2


In [43]:
row

[2,
 1,
 530101,
 38.5,
 66,
 28,
 3,
 3,
 nan,
 2,
 5,
 4,
 4,
 nan,
 nan,
 nan,
 3,
 5,
 45.0,
 8.4,
 nan,
 nan,
 2,
 11300,
 0,
 0,
 2]