**A random forest is built from multiple decision trees, and the final prediction is obtained by a majority vote of those trees.**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load seaborn's 'penguins' example dataset into a DataFrame and preview the first rows.
df = sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


### EDA

In [3]:
# (rows, columns) of the frame before any cleaning.
df.shape

(344, 7)

In [4]:
# Column dtypes and non-null counts; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [5]:
# Number of missing values in each column.
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [6]:
# Drop every row that contains at least one missing value.
# `df` is rebound instead of using inplace=True: the result is the same, but the
# step stays chainable and follows the pandas recommendation to avoid inplace.
df = df.dropna()

# Guard: the rest of the notebook assumes a fully populated frame.
assert not df.isna().any().any(), "null values remain after dropna()"

df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

## Feature Engineering

### Transforming categorical data into numeric form with one-hot encoding

In [7]:
# Distinct categories present in the `sex` column.
df['sex'].unique()

array(['Male', 'Female'], dtype=object)

In [8]:
# One-hot encode `sex`. drop_first=True drops the first category, leaving a
# single 0/1 'Male' indicator column.
sex = (
    pd.get_dummies(df['sex'], drop_first=True)
    .astype(int)
)
sex.head()

Unnamed: 0,Male
0,1
1,0
2,0
4,0
5,1


In [9]:
# Distinct categories in `island`, inspected before one-hot encoding it.
df['island'].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [10]:
# One-hot encode `island`. drop_first=True drops the first category, leaving
# 0/1 indicator columns for 'Dream' and 'Torgersen'.
island = (
    pd.get_dummies(df['island'], drop_first=True)
    .astype(int)
)
island.head()

Unnamed: 0,Dream,Torgersen
0,0,1
1,0,1
2,0,1
4,0,1
5,0,1


### Concatenate the encoded dataframes with the original dataframe

In [11]:
# Place the indicator columns next to the original columns (aligned on the row index).
newdf = pd.concat((df, sex, island), axis='columns')
newdf.head(n=2)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Male,Dream,Torgersen
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,1,0,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,0,0,1


### Drop the repeated Columns

In [12]:
# Remove the original string columns now that their one-hot encodings exist.
# errors='ignore' keeps the cell idempotent: re-running it after the columns are
# already gone is a no-op instead of a KeyError (which the in-place drop raised).
newdf = newdf.drop(columns=['island', 'sex'], errors='ignore')
newdf.head(2)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Male,Dream,Torgersen
0,Adelie,39.1,18.7,181.0,3750.0,1,0,1
1,Adelie,39.5,17.4,186.0,3800.0,0,0,1


### Creating a Separate Target Variable

In [13]:
# Target column: the species label of each penguin.
y = newdf['species']

# Show a preview of the labels and then the distinct classes.
for summary in (y.head(), y.unique()):
    print(summary)

0    Adelie
1    Adelie
2    Adelie
4    Adelie
5    Adelie
Name: species, dtype: object
['Adelie' 'Chinstrap' 'Gentoo']


In [14]:
# Encode the species labels as integer class ids.
# The labels are read from `df['species']` (same rows and index as `newdf`)
# rather than re-mapping `y` itself: `y = y.map(...)` is self-referential, so
# running the cell twice would look up the integer codes in a string-keyed
# dict and silently turn the whole target into NaN.
species_codes = {'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2}
y = df['species'].map(species_codes)

# Series.map yields NaN for any label missing from the dict; fail loudly instead.
assert y.notna().all(), "unmapped species label found"

y.head()

0    0
1    0
2    0
4    0
5    0
Name: species, dtype: int64

### Dropping the Target Variable

In [15]:
# Remove the target from the feature frame so it cannot leak into X.
# errors='ignore' keeps the cell idempotent: re-running it after 'species' is
# already gone is a no-op instead of a KeyError (which the in-place drop raised).
newdf = newdf.drop(columns=['species'], errors='ignore')
newdf.head(2)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Male,Dream,Torgersen
0,39.1,18.7,181.0,3750.0,1,0,1
1,39.5,17.4,186.0,3800.0,0,0,1


In [16]:
# Feature matrix: all remaining columns of newdf (X refers to the same object, not a copy).
X = newdf

## Split the Data into Train and Test Sets

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each partition.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(label, part.shape)

X_train (266, 7)
X_test (67, 7)
y_train (266,)
y_test (67,)


## Training the Random Forest Classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier

# n_estimators is the number of decision trees in the forest.
rfc = RandomForestClassifier(
    criterion='entropy',
    n_estimators=5,
    random_state=42,
)
rfc.fit(X_train, y_train)

## Predicting Test Results

In [19]:
# Predict class ids for the held-out rows and show the first ten.
predictions = rfc.predict(X_test)
predictions[:10]

array([1, 2, 0, 1, 0, 2, 2, 1, 1, 1], dtype=int64)

In [20]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Per-class report first, then the confusion matrix, each followed by blank lines.
for report in (
    classification_report(y_test, predictions),
    confusion_matrix(y_test, predictions),
):
    print(report)
    print('\n')

# Overall fraction of test rows predicted correctly.
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98        31
           1       0.93      1.00      0.96        13
           2       1.00      1.00      1.00        23

    accuracy                           0.99        67
   macro avg       0.98      0.99      0.98        67
weighted avg       0.99      0.99      0.99        67



[[30  1  0]
 [ 0 13  0]
 [ 0  0 23]]


0.9850746268656716


## Trying a Different Number of Trees and the Gini Criterion

In [21]:
# Second configuration: 10 trees split on Gini impurity, same seed as before.
rfc = RandomForestClassifier(
    criterion='gini',
    n_estimators=10,
    random_state=42,
)
rfc.fit(X_train, y_train)

In [22]:
# Re-predict the test rows with the refitted model (overwrites the earlier `predictions`).
predictions = rfc.predict(X_test)

In [23]:
# Per-class report first, then the confusion matrix, each followed by blank lines.
for report in (
    classification_report(y_test, predictions),
    confusion_matrix(y_test, predictions),
):
    print(report)
    print('\n')

# Overall fraction of test rows predicted correctly.
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        31
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        23

    accuracy                           1.00        67
   macro avg       1.00      1.00      1.00        67
weighted avg       1.00      1.00      1.00        67



[[31  0  0]
 [ 0 13  0]
 [ 0  0 23]]


1.0


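***With 10 trees and the Gini criterion, the model reaches 100% accuracy on this 67-sample test split. Because both the number of trees and the split criterion changed, and the difference is a single test sample (66 vs. 67 correct), this run alone does not show that more trees caused the improvement.***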