<a href="https://colab.research.google.com/github/anandababugudipudi/OCR-using-RandomForestClassifier/blob/main/OCR_using_RandomForestClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Import necessary packages**

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, GridSearchCV
import pickle

### **Load the Digits dataset**

In [None]:
# Load scikit-learn's bundled handwritten-digits dataset (all ten digit classes).
digits_data = load_digits(n_class = 10)

### **Separate Features and Labels**

In [None]:
# Flatten every image into a single feature row; the labels are used as-is.
n_samples = len(digits_data.images)
X = digits_data.images.reshape(n_samples, -1)
y = digits_data.target

# Show the dimensions of the feature matrix and of the label vector.
for arr in (X, y):
    print(arr.shape)

(1797, 64)
(1797,)


### **Split the data into Train and Test Data**

In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 122,
)

### **Build the RFC model**

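To train on all available processor cores, pass `n_jobs = -1` to the classifier. `n_estimators` is the number of trees in the forest (it must be a positive integer) and is tuned by the grid search below.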

In [None]:
# Base estimator for the grid search.
# `n_jobs = -1` trains the trees on all available CPU cores. The number of trees
# (`n_estimators`) must be a positive integer and is supplied by the
# hyperparameter grid, so it is not set here.
# A fixed `random_state` makes the fitted forest reproducible between runs.
model = RandomForestClassifier(n_jobs = -1, max_features = 'sqrt', random_state = 122)

### **Create Dictionary for Hyperparameter Tuning**

In [None]:
# Candidate values for each hyperparameter; the grid search tries every combination.
param_grid = dict(
    n_estimators = [10, 100, 500, 1000],             # number of trees in the forest
    max_depth = [1, 5, 10, 15],                      # maximum depth of each tree
    min_samples_leaf = [1, 2, 4, 10, 15, 30, 50],    # minimum samples required at a leaf
)

### **Create a `GridSearchCV` Cross Validator**

In [None]:
# Exhaustive search over `param_grid`, scoring each candidate with 10-fold cross-validation.
grid_search = GridSearchCV(model, param_grid, cv = 10)

### **Train the model on Training data**

Train the model on Training data and print the best parameters.

In [None]:
# `fit` returns the fitted search object, so the winning parameters can be read straight off it.
print(grid_search.fit(X_train, y_train).best_params_)

{'max_depth': 15, 'min_samples_leaf': 1, 'n_estimators': 500}


In [None]:
# Pull the three tuned values out of the winning parameter set.
best_params = grid_search.best_params_
optimal_estimators = best_params.get("n_estimators")
optimal_depth = best_params.get("max_depth")
optimal_leaf = best_params.get("min_samples_leaf")

# Report them, one per line.
print(
    f"Optimal n_estimators : {optimal_estimators}",
    f"Optimal optimal_depth : {optimal_depth}",
    f"Optimal optimal_leaf : {optimal_leaf}",
    sep = "\n",
)

Optimal n_estimators : 500
Optimal optimal_depth : 15
Optimal optimal_leaf : 1


### **Making predictions with the trained model**

In [None]:
# Predict labels for the held-out test samples using the fitted grid search.
grid_predictions = grid_search.predict(X = X_test)

### **Measuring the performance of the model**

In [None]:
# Confusion matrix: rows are the true digits, columns are the predicted digits.
print(confusion_matrix(y_test, grid_predictions))

# Test accuracy as a percentage string rounded to two decimals (reused in the saved file name).
accuracy_pct = round(accuracy_score(y_test, grid_predictions) * 100, 2)
acc = f"{accuracy_pct}%"
print(acc)

[[65  0  0  0  0  0  0  0  0  0]
 [ 0 47  0  0  0  0  0  0  0  0]
 [ 1  0 54  0  0  0  0  0  0  0]
 [ 0  0  0 56  0  1  0  0  0  1]
 [ 0  0  0  0 57  0  0  1  0  0]
 [ 0  0  0  0  1 56  1  0  0  0]
 [ 1  0  0  0  0  1 57  0  0  0]
 [ 0  0  0  0  0  0  0 48  0  0]
 [ 0  2  0  0  1  0  0  0 41  0]
 [ 0  0  0  0  0  0  0  0  1 47]]
97.78%


In [None]:
# Save the trained model to disk.
# `model` is only the unfitted template handed to GridSearchCV. The forest that was
# refitted on the training data with the best parameters lives in
# `grid_search.best_estimator_`, so that is the object to persist.
file_name = f"OCR_RFC_max_depth-{optimal_depth}_min_samples_leaf-{optimal_leaf}_n_estimators-{optimal_estimators}_acc-{acc}.sav"
# The context manager closes the file handle even if pickling fails.
with open(file_name, 'wb') as model_file:
    pickle.dump(grid_search.best_estimator_, model_file)