# Other Popular Machine Learning Methods
## Ensemble methods with random forest

### This is a classification problem in which we estimate the species label of iris flowers.

In [1]:
import numpy as np
import pandas as pd

import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split 
from sklearn import metrics

In [2]:
# access RFclassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# load the iris dataset that ships with scikit-learn
iris = datasets.load_iris()

# feature matrix: one named column per measurement
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# target variable: a single-column frame called 'labels'
y = pd.DataFrame(iris.target, columns=['labels'])

# preview the first rows of the features and of the target
print(df.head())
y.head(5)

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


Unnamed: 0,labels
0,0
1,0
2,0
3,0
4,0


The data set (indicators) contains information on the:
- sepal length (cm)
- sepal width (cm)  
- petal length (cm)  
- petal width (cm)
- species type

In [5]:
# element-wise mask: True wherever a feature value is missing
# NOTE(review): this displays the mask for the whole frame; the per-column
# summary in the null-check cell that follows is easier to read
df.isnull()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
145,False,False,False,False
146,False,False,False,False
147,False,False,False,False
148,False,False,False,False


In [4]:
# per-column flag: True if the column holds at least one null value
# (every flag is False here, i.e. there are no null values)
df.isnull().any()

sepal length (cm)    False
sepal width (cm)     False
petal length (cm)    False
petal width (cm)     False
dtype: bool

In [7]:
# number of distinct species labels (3 types of flowers)
print(y['labels'].nunique())

3


In [6]:
# how many flowers fall into each class:
# there are 50 data points in each species category
print(y['labels'].value_counts())

0    50
1    50
2    50
Name: labels, dtype: int64


# Preparing the data for training the model

In [8]:
# hold out 20% of the rows for testing (80-20 split);
# the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=17,
)

# Build a Random Forest model

In [9]:
# flatten the single-column target frame into the 1-D array the model requires
y_train_array = y_train.to_numpy().ravel()

# instantiate the classifier; n_estimators=200 is the number of trees in the forest
classifier = RandomForestClassifier(random_state=0, n_estimators=200)

# train the forest on the training split
classifier.fit(X_train, y_train_array)

# predict the species label for every row of the test data
y_pred = classifier.predict(X_test)

# Evaluating the model on the test data

In [10]:
# per-class precision, recall and F1 of the predictions against the held-out labels
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.92      1.00      0.96        11
           2       1.00      0.92      0.96        12

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



In [None]:
# According to the report above, the model performs well on the test set: overall accuracy is 0.97 (the macro and weighted average F1-scores are also 0.97)

# look at the test labels and the predictions side by side to compare them visually

In [11]:
# flatten the test labels into a 1-D array so they print like y_pred
y_test_array = y_test.to_numpy().ravel()
print(y_test_array)

[0 1 2 1 2 2 1 2 1 2 2 0 1 0 2 0 0 2 2 2 2 0 2 1 1 1 1 1 0 1]


In [12]:
# predicted class labels for the test rows, in the same order as y_test_array
print(y_pred)

[0 1 2 1 2 2 1 2 1 2 2 0 1 0 2 0 0 2 2 2 1 0 2 1 1 1 1 1 0 1]


In [None]:
# This is what 97% accuracy looks like: only one of the 30 test records is misclassified.
# Find the disagreement by comparing the two arrays rather than assigning into them,
# so neither the true labels nor the predictions are ever modified.
mismatch_idx = np.flatnonzero(y_test_array != y_pred)
for idx in mismatch_idx:
    print(f"index {idx}: actual={y_test_array[idx]}, predicted={y_pred[idx]}")

# overall, the classifier is performing very well