In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [2]:
import os

# Folder that contains Iris.csv. The default is the original author's location;
# set the IRIS_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = os.environ.get('IRIS_DATA_DIR', r'C:\Users\Vish\Documents\Data\Dataset')

# Read through an explicit path instead of os.chdir(), so the kernel's working
# directory is not changed as a side effect of loading the data.
dataset = pd.read_csv(os.path.join(DATA_DIR, 'Iris.csv'))

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
dataset.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Remove the row-identifier column; it carries no predictive information.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: running it again after 'Id' is already gone is a no-op, not a KeyError.
dataset = dataset.drop(columns=['Id'], errors='ignore')

In [5]:
# Replace the raw CSV headers with readable names. The assignment is positional:
# the four measurements first, then the class label.
dataset.columns = [
    'sepal length in cm',
    'sepal width in cm',
    'petal length in cm',
    'petal width in cm',
    'species',
]

In [6]:
# Encode the species labels as integer codes for the classifier.
# pd.factorize returns a (codes, uniques) pair; the position of a label in `uniques`
# is the integer code assigned to it.
# NOTE(review): this overwrites the 'species' column, so running the cell a second
# time would factorize the integer codes and `definitions` would no longer hold the
# species names.
factor = pd.factorize(dataset['species'])
definitions = factor[1]
dataset['species'] = factor[0]

# Show the class balance and the code -> label lookup.
print(dataset['species'].value_counts())
print(definitions)

2    50
1    50
0    50
Name: species, dtype: int64
Index(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype='object')


In [7]:
# Separate the predictors from the target as NumPy arrays:
# the first four columns are the measurements, the fifth is the encoded species.
X = dataset.iloc[:, :4].to_numpy()
y = dataset.iloc[:, 4].to_numpy()

In [8]:
# Hold out 25% of the rows as a test set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=21,
)

In [9]:
#Model building
# Random forest classifier. Each argument is annotated with its accepted type,
# its library default, and what it controls.
model = RandomForestClassifier(
    n_estimators=1000,     # int, default=100 -- number of trees to build
    criterion='entropy',   # {"gini", "entropy"}, default="gini" -- split-quality measure:
                           #   "gini" is the Gini impurity, "entropy" is the information gain
    max_depth=10,          # {int, None}, default=None -- maximum depth of each tree
    min_samples_split=2,   # {int, float}, default=2 -- minimum samples required to split an internal node
    min_samples_leaf=1,    # {int, float}, default=1 -- minimum samples required at a leaf node
    max_leaf_nodes=2,      # {int, None}, default=None -- maximum number of leaf nodes per tree
                           #   NOTE(review): a tree with at most 2 leaves makes a single split, so the
                           #   max_depth=10 limit above is never reached -- confirm this is intended.
    bootstrap=True,        # bool, default=True -- draw a bootstrap sample for each tree;
                           #   if False, the whole dataset is used to build each tree
    oob_score=True,        # bool, default=False -- use out-of-bag samples to estimate the generalization score
    n_jobs=-1,             # {int, None}, default=None -- number of jobs to run in parallel (-1 uses all processors)
    random_state=1,        # {int, RandomState, None}, default=None -- seed for reproducible results
)
# Parameters deliberately left at their defaults:
#   max_features: {int, float, "sqrt", "log2"} -- number of features to consider when looking
#     for the best split (the default is "sqrt"; older releases spelled it "auto")
#   min_impurity_decrease: float, default=0.0 -- a node is split only if the split reduces the
#     weighted impurity by at least this amount; otherwise it stays a leaf
model.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=10, max_leaf_nodes=2,
                       n_estimators=1000, n_jobs=-1, oob_score=True,
                       random_state=1)

In [10]:
#Model building
#model = RandomForestRegressor(n_estimators = 1000, criterion='mse',max_depth=10,min_samples_split=2,min_samples_leaf=1,max_leaf_nodes=2,bootstrap=True,oob_score=True,n_jobs=-1,random_state = 1)
#n_estimators:int, default=100--The number of trees to be built
#criterion:{“mse”, “mae”}, default=“mse”--The function to measure the quality of a split 
# Supported criteria are “mse” for the mean squared error, which is equal to variance reduction as feature selection criterion and “mae” for the mean absolute error.
#max_depth:int, None, default=None--The maximum depth of the tree
#min_samples_split:int, float, default=2--The minimum number of samples required to split an internal node
#min_samples_leaf:int, float, default=1--The minimum number of samples required to be at a leaf node
#max_features:{int, float, “auto”, “sqrt”, “log2”}, default=auto--The number of features to consider when looking for the best split
#max_leaf_nodes:int, None, default=None--The maximum number of leaf nodes to consider
#min_impurity_decrease:float, default=0.0--Threshold for early stopping in tree growth. A node will be split only if the split reduces the weighted impurity by at least this value; otherwise it is a leaf.
#bootstrap:boolean, default=True--Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.
#oob_score:boolean, default=False--Whether to use out-of-bag samples to estimate the generalization score
#n_jobs:int, None, default=None--The number of jobs to run in parallel
#random_state:int, RandomState, None, default=None--Can be used for generating reproducible results
#model.fit(X_train, y_train)

In [11]:
#Predictions
# For a classifier, predict() returns one class code per row: the class with the
# highest probability estimate averaged over the trees. No fixed 0.5 cut-off is
# applied; that only makes sense for a binary problem.
y_pred = model.predict(X=X_test)

In [12]:
# Out-of-bag score of the fitted forest; the attribute exists because the model
# was constructed with oob_score=True.
oob_accuracy = model.oob_score_
print(oob_accuracy)

0.8303571428571429


In [13]:
# Using our own threshold
# decisions = (model.predict_proba(X_test) >= 0.6).astype(int)
# y_pred=decisions[:,1]
# print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
# print("Precision: ", metrics.precision_score(y_test,y_pred))
# print("F1 score: ", metrics.f1_score(y_test,y_pred))

In [14]:
# Translate the integer class codes back into species names for readable reports.
# `definitions` holds the unique labels returned by pd.factorize, where the position
# of a label is its code, so enumerate() rebuilds the code -> label mapping without
# hard-coding the number of classes.
reversefactor = dict(enumerate(definitions))

# Strict lookup: an unmapped value raises KeyError instead of silently becoming None
# (which is what dict.get would do, e.g. if this cell is re-run on already-decoded labels).
y_test = np.array([reversefactor[code] for code in y_test])
y_pred = np.array([reversefactor[code] for code in y_pred])

# Rows are the true species, columns the predicted species.
print(pd.crosstab(y_test, y_pred, rownames=['Actual Species'], colnames=['Predicted Species']))

Predicted Species  Iris-setosa  Iris-versicolor  Iris-virginica
Actual Species                                                 
Iris-setosa                 13                0               0
Iris-versicolor              0                5              10
Iris-virginica               0                0              10


In [15]:
#Evaluating model
# Confusion matrix: row i / column j counts samples whose true class is i and
# predicted class is j, with classes in sorted label order.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test samples classified correctly (displayed as the cell's result).
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

[[13  0  0]
 [ 0  5 10]
 [ 0  0 10]]


0.7368421052631579

In [16]:
# Pair each predictor name (the first four columns) with the importance the fitted
# forest assigned to it.
feature_names = dataset.columns[:4]
print(list(zip(feature_names, model.feature_importances_)))

[('sepal length in cm', 0.156), ('sepal width in cm', 0.0), ('petal length in cm', 0.429), ('petal width in cm', 0.415)]
