In [13]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global random number generator so the random draw used for
# the train/test split is repeatable from a fresh kernel.
np.random.seed(0)

# Load the iris dataset and wrap its measurement matrix in a DataFrame with
# one column per feature name.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

print(df.head(5))

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [14]:
# Label every row with its species name, stored as a categorical whose
# categories are iris.target_names.
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

# One uniform draw in [0, 1) per row; rows whose draw is at most 0.65 are
# flagged for training, the rest are held out for testing.
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .65

# 'is_train' is already boolean, so it can be used as the row mask directly.
train = df.loc[df['is_train']]
test = df.loc[~df['is_train']]

print("observations in train data:", len(train))
print("observations in test data:", len(test))

observations in train data: 97
observations in test data: 53


In [15]:
# The first four columns of df are the measurement columns; they are the
# model inputs.
features = df.columns[:4]

# Integer training labels taken from the categorical's own codes. These
# follow the category order of the 'species' column regardless of the order
# in which species happen to appear in `train`, whereas pd.factorize numbers
# classes by first appearance and would silently permute the labels if the
# training rows were ever shuffled or filtered differently.
num_species = train['species'].cat.codes.to_numpy()

# random_state is fixed so the fitted forest is repeatable across runs.
model = RandomForestClassifier(n_jobs = 2, random_state = 0)
model.fit(train[features], num_species)

# Per-class probabilities for the first ten held-out rows; columns are in
# the order of model.classes_.
print(model.predict_proba(test[features])[0:10])



[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [16]:
# Predict an integer class for every held-out row, then use those integers
# to look up the species names.
# NOTE(review): this lookup assumes the integer labels the model was trained
# on follow the order of iris.target_names -- confirm against the cell that
# builds the training labels.
predicted_codes = model.predict(test[features])
names = iris.target_names[predicted_codes]

# Show the first few predictions next to the true species for comparison.
print(names[:5])
print(test['species'].head())

['setosa' 'setosa' 'setosa' 'setosa' 'setosa']
1     setosa
7     setosa
8     setosa
10    setosa
13    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]
