# Train Random Forest ML model with Iris Species dataset

## Importing Libraries

In [1]:
# data manipulation and analysis
import pandas as pd

# used to split the dataset into training and testing sets
from sklearn.model_selection import train_test_split 

# Import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier 

#  Evaluate the accuracy of a classification model.
from sklearn.metrics import accuracy_score 

# For serializing and deserializing Python objects. It is often used to save machine learning models.
import pickle 

## Load the Iris Species dataset

In [2]:
# Read the raw data into a DataFrame. 'Iris.csv' is a relative path, so the
# file must be in the directory the notebook kernel is running from.
iris_species_df = pd.read_csv('Iris.csv')
# Bare last expression: the notebook renders the frame as a table.
iris_species_df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


## Drop the Id column 

In [3]:
# Remove the Id column (it looks like a plain row counter in the preview above,
# so it should not be used as a feature).
# Rebinding the name instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution is a
# no-op instead of raising KeyError because 'Id' is already gone.
iris_species_df = iris_species_df.drop(columns='Id', errors='ignore')
iris_species_df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


## Explore the data 

In [4]:
# Show the dtype of every column to see which ones are numeric.
iris_species_df.dtypes

SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
iris_species_df.describe()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


## Look at the Species column

In [6]:
# Number of non-null entries in the Species column.
iris_species_df['Species'].count()

150

In [7]:
# Distinct class labels present in the target column.
iris_species_df['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [8]:
# Rows per class label; used to check whether the classes are balanced.
iris_species_df['Species'].value_counts()

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

## Encode the target column as integers to aid training of the model

In [9]:
# Integer code assigned to each species label.
SPECIES_TO_CODE = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}

# Only encode while the column still holds the text labels. Without this guard,
# re-running the cell would look up the already-encoded 0/1/2 in the
# string-keyed mapping and silently replace the whole target with NaN.
if not pd.api.types.is_numeric_dtype(iris_species_df['Species']):
    species_codes = iris_species_df['Species'].map(SPECIES_TO_CODE)

    # .map() yields NaN for any label missing from the mapping; fail loudly
    # rather than train on a partially-missing target.
    if species_codes.isna().any():
        unknown_labels = iris_species_df.loc[species_codes.isna(), 'Species'].unique()
        raise ValueError(f"Unmapped Species labels: {list(unknown_labels)}")

    iris_species_df['Species'] = species_codes

iris_species_df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [10]:
# Summary statistics again: Species is numeric now, so it is included as well.
iris_species_df.describe()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667,1.0
std,0.828066,0.433594,1.76442,0.763161,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


## Split the data into features column (X) and the target column (y) 

In [11]:
# selects all rows and all columns except the last one
# Name of the label column; every other column is treated as a feature.
# Selecting by name instead of by position (iloc[:, -1]) means a change in
# column order can never silently swap a feature in as the target.
TARGET_COLUMN = 'Species'

# features: all rows, every column except the target
X = iris_species_df.drop(columns=TARGET_COLUMN)

# target: all rows, only the label column
y = iris_species_df[TARGET_COLUMN]

In [12]:
# Display the feature matrix to verify that only the measurement columns remain.
X

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [13]:
# Display the target vector to verify that it holds the encoded Species values.
y

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: Species, Length: 150, dtype: int64

## Split data into training and testing data 

In [14]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [15]:
# Shapes of the training features and training labels; row counts must match.
X_train.shape, y_train.shape

((105, 4), (105,))

In [16]:
# Shapes of the test features and test labels; row counts must match.
X_test.shape, y_test.shape

((45, 4), (45,))

## Initialize a RandomForestClassifier

In [17]:
# A random forest is stochastic (bootstrap samples and random feature subsets),
# so fix the seed: Restart & Run All then rebuilds the same model and
# reproduces the same accuracy. Other hyperparameters stay at their defaults.
# NOTE: the accuracy recorded further down came from an unseeded run and may
# differ slightly after re-executing with this seed.
classifier = RandomForestClassifier(random_state=0)
classifier

## Fit the RF model  

In [18]:
# Train on the training split only; the test split is kept back for evaluation.
classifier.fit(X_train, y_train)

## Predicting on the test dataset

In [19]:
# Predict a class label for every row of the held-out test features.
y_pred = classifier.predict(X_test)

## Finding out the accuracy 

In [20]:
# Fraction of test rows whose predicted label equals the true label.
score = accuracy_score(y_test, y_pred)
score

0.9777777777777777

## Pickling the model 

In [21]:
# Serialize the trained model and save it as classifier.pkl.
# Opening the file and dumping happen in one cell inside a context manager, so
# the handle is always closed and flushed, even if pickle.dump() raises or the
# notebook is only partly executed. Previously the file was opened in one cell
# and closed in another, which could leave it open or truncated.
# "wb" = binary write mode, required because pickle produces a byte stream.
# NOTE: only unpickle files from trusted sources; pickle.load can execute
# arbitrary code.
with open("classifier.pkl", "wb") as pickle_out:
    pickle.dump(classifier, pickle_out)