In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris

# Loading the dataset

In [3]:
iris = load_iris()

df = pd.DataFrame(iris.data, columns=iris.feature_names)

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


# Inserting new column for targeted values

In [4]:
# Adding the new column for the species name of flowers

df['Species'] = pd.Categorical.from_codes(iris.target,iris.target_names)

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [12]:
df['is_train'] = np.random.uniform(0,1,len(df)) <= 0.75

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,False
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [13]:
train, test = df[df['is_train']==True], df[df['is_train']==False]
    
print(len(train))
print(len(test))

114
36


# Creating a list of the feature column's names

In [14]:
features = df.columns[:4]
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [15]:
#Converting species name into digits

y = pd.factorize(train['Species'])[0]

# See target values
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2], dtype=int64)

# Creating a Random forest Classifier

In [19]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_jobs=2,random_state=0)

model.fit(train[features],y)

RandomForestClassifier(n_jobs=2, random_state=0)

In [20]:
 model.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

# Let's see how different decision tree provides what output

In [28]:
# Checking for test dataset from 10th to 20th sample data
model.predict_proba(test[features])[0:10]

array([[1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.93, 0.07, 0.  ],
       [1.  , 0.  , 0.  ],
       [0.  , 0.94, 0.06]])

In [26]:
# Mapping names for the plants to each predicted class.

preds = iris.target_names[model.predict(test[features])]

preds[0:15]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor'],
      dtype='<U10')

  # Creating Confusion Matrix

In [27]:
pd.crosstab(test['Species'],preds,rownames=['Actual output'],colnames=['Predicted Species'])

Predicted Species,setosa,versicolor,virginica
Actual output,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,9,0,0
versicolor,0,16,0
virginica,0,0,11
