In [None]:
#Load the library with the iris dataset
from sklearn.datasets import load_iris

# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

# Seed NumPy's global random generator so the random draws made later
# in this notebook (the train/test assignment) are repeatable.
np.random.seed(seed=0)


In [None]:
# Load the iris dataset; the returned object carries the feature matrix
# (iris.data) and the matching column labels (iris.feature_names).
iris = load_iris()

# Wrap the feature matrix in a DataFrame, one named column per feature
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Preview the first five rows (head() defaults to 5)
print(df.head())


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [None]:
# Attach the prediction target: turn the integer class codes in iris.target
# into a categorical column whose categories are the species names.
df['species'] = pd.Categorical.from_codes(
    codes=iris.target,
    categories=iris.target_names,
)

# Preview the first five rows, now including the species column
print(df.head())


   sepal length (cm)  sepal width (cm)  ...  petal width (cm)  species
0                5.1               3.5  ...               0.2   setosa
1                4.9               3.0  ...               0.2   setosa
2                4.7               3.2  ...               0.2   setosa
3                4.6               3.1  ...               0.2   setosa
4                5.0               3.6  ...               0.2   setosa

[5 rows x 5 columns]


In [None]:
# Threshold for the random split: a row is used for training when its random
# draw is at most this value, so roughly this fraction of rows is training data.
TRAIN_FRACTION = .75

# Draw one uniform random number between 0 and 1 per row and store True in
# 'is_train' when the draw is <= TRAIN_FRACTION, False otherwise. This is a
# quick and dirty way of randomly assigning some rows to the training data and
# the rest to the test data; the exact split sizes depend on the draws.
df['is_train'] = np.random.uniform(0, 1, len(df)) <= TRAIN_FRACTION

# View the top 2 rows
print(df.head(2))

   sepal length (cm)  sepal width (cm)  ...  species  is_train
0                5.1               3.5  ...   setosa      True
1                4.9               3.0  ...   setosa      True

[2 rows x 6 columns]


In [None]:
# Split df into two dataframes using the boolean 'is_train' column as a mask:
# rows flagged True go to train, rows flagged False go to test.
train = df[df['is_train']]
test = df[~df['is_train']]

# Report how many observations ended up in each dataframe
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:', len(test))



Number of observations in the training data: 118
Number of observations in the test data: 32


In [None]:
# Select the labels of the first four columns of df (the measurement columns
# created from iris.feature_names); the result is a pandas Index, not a list.
features = df.columns[:4]

# Show the selected feature labels
print('Features:', features)


Features: Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')


In [None]:
# Encode the training species as integers for the classifier.
# Use the categorical column's own codes rather than pd.factorize: factorize
# numbers classes by order of first appearance in `train`, whereas the codes
# index into the categories the column was built with (iris.target_names).
# That keeps label k equal to iris.target_names[k] no matter how the training
# rows are ordered, which is what decoding predictions by name relies on.
y = train['species'].cat.codes.to_numpy(dtype=np.intp)

print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2]


In [None]:


# Create a random forest classifier. By convention, clf means 'classifier'.
# random_state is pinned so repeated runs build the same forest.
clf = RandomForestClassifier(n_jobs=2, random_state=0, n_estimators=100)

# Train the classifier to take the training features and learn how they relate
# to the training y (the integer-encoded species).
# NOTE: a pasted copy of the estimator's printed repr used to follow this call.
# It ran as code, built a second classifier that was thrown away, and passed
# arguments (min_impurity_split, max_features='auto') that current
# scikit-learn releases no longer accept, so it has been removed.
clf.fit(train[features], y)

# Apply the classifier we trained to the test data (rows where is_train is
# False, which were not used for fitting)
print('Prediction of test samples:', clf.predict(test[features]))

# To view the predicted class probabilities of the first 10 test observations:
# print('clf.predict_proba(test[features]):', clf.predict_proba(test[features])[0:10])


Prediction of test samples: [0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2]


In [None]:
# Translate each predicted class code into its English species name by
# indexing iris.target_names with the predictions
preds = iris.target_names[clf.predict(test[features])]

# Optional spot checks of predicted vs. actual species for the first five
# test observations (kept as comments so the cell only prints the matrix):
# print(preds[0:5])
# print(test['species'].head())

# Create the confusion matrix: rows are the actual species, columns are the
# predicted species, and each cell counts the test observations in that pair
ct = pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames=['Predicted Species'])
print(ct)

# To view a list of the features and their importance scores:
# list(zip(train[features], clf.feature_importances_))

Predicted Species  setosa  versicolor  virginica
Actual Species                                  
setosa                 13           0          0
versicolor              0           5          2
virginica               0           0         12


'\n# View a list of the features and their importance scores\nlist(zip(train[features], clf.feature_importances_))\n'