# OCR System for Arabic Handwritten Word Recognition

# Data Processing 

In [None]:
# Import the libraries needed to load, process and pre-process the data
import numpy as np
from PIL import Image
import cv2
import os

# Load every .bmp image from `image_dir`, resize it to a fixed size, convert it
# to 8-bit grayscale and store a .png copy in `output_dir` ("output_OCR").
image_dir = "bmp"
output_dir = "output_OCR"

# Module-level lists: later cells read `new_images` for the train/test split.
loaded_images = []  # resized grayscale PIL images
new_images = []     # the images that were written to `output_dir`

# Fixed width and height every image is resized to
new_width = 224
new_height = 224

# Image.save() does not create missing directories, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort the names so the image
# order is reproducible and follows file-name order.
image_filenames = sorted(
    name for name in os.listdir(image_dir) if name.endswith(".bmp")
)

for filename in image_filenames:
    file_path = os.path.join(image_dir, filename)

    # convert() forces the pixel data to be read, so the file can be closed
    # as soon as the RGB copy exists.
    with Image.open(file_path) as image:
        rgb_image = image.convert("RGB")

    resized_image = rgb_image.resize((new_width, new_height))
    grayscale_image = resized_image.convert("L")  # 8-bit grayscale
    loaded_images.append(grayscale_image)

    # The previous version divided the pixels by 255 and immediately multiplied
    # them by 255 again before casting to uint8. That round-trip is a no-op at
    # best (and the truncating cast can drop a gray level through float
    # rounding), so the grayscale image is saved as-is. Scaling to [0, 1] only
    # makes sense on the float array that is fed to a model.
    #
    # The output name is derived from the image currently being processed.
    # Previously the save happened in a second loop that reused the stale
    # `filename` of the last .bmp file, so every image overwrote the same .png.
    output_name = os.path.splitext(filename)[0] + ".png"
    output_path = os.path.join(output_dir, output_name)
    grayscale_image.save(output_path)
    new_images.append(grayscale_image)


#for image in loaded_images:
    #cv2.imshow(image)

## Extracting the labels 

In [None]:
# All the labels live in the ".tru" files: we read each file and take the
# label from its 7th line (index 6).

# Directory containing the label files
label_dir = "tru"
labels = []  # The labels are stored in this list, one per ".tru" file

# Index of the line that holds the label inside a ".tru" file
label_line_index = 6

# os.listdir() returns entries in arbitrary order. Labels are paired with the
# images purely by position (see the train/test split), so iterate in sorted
# file-name order to make that order reproducible.
for filename in sorted(os.listdir(label_dir)):
    if not filename.endswith(".tru"):
        continue

    file_path = os.path.join(label_dir, filename)

    # Read the contents of the label file
    with open(file_path, "r") as file:
        lines = file.readlines()

    # Fail with a message naming the offending file instead of a bare IndexError.
    if len(lines) <= label_line_index:
        raise ValueError(
            f"{file_path}: expected at least {label_line_index + 1} lines, "
            f"found {len(lines)}"
        )

    # Extract the label from the desired line and add it to the list
    labels.append(lines[label_line_index].strip())

#for label in labels:
   # print(label)

# Splitting Data

In [None]:
from sklearn.model_selection import train_test_split
import os

# Split images and labels into an 80 % training part and a 20 % test part.
# `random_state` is fixed so that the same split is produced on every run.
split_parts = train_test_split(
    new_images,
    labels,
    test_size=0.2,
    random_state=42,
)
train_paths, test_paths, train_labels, test_labels = split_parts

# Report how many samples ended up in each set
for caption, subset in (
    ("Training set size:", train_paths),
    ("Test set size:", test_paths),
):
    print(caption, len(subset))

# Building the CNN Model 

In [None]:
# ------- We build the CNN model with the TensorFlow/Keras libraries; this model is used -------
# ------- for the spatial analysis of the images -------

import tensorflow as tf
from tensorflow.keras.models import save_model
from tensorflow import keras
from tensorflow.keras import layers

# --- Prepare the training data ------------------------------------------------
# Stack the training images into one float32 array, scale the 8-bit pixel
# values to [0, 1] and append the channel axis Conv2D expects:
# (samples, height, width, 1). The previous version converted the array back
# into nested Python lists, which carry no channel axis and are very slow to feed.
x_train = np.stack([np.asarray(img, dtype=np.float32) for img in train_paths])
x_train = (x_train / 255.0)[..., np.newaxis]

# Make sure the labels are represented as strings
train_labels = [str(label) for label in train_labels]

# A classifier cannot be trained on raw strings: map every distinct label to an
# integer class id. The vocabulary also covers the test labels so that the same
# ids can be reused when the model is evaluated.
class_names = sorted(set(train_labels) | {str(label) for label in test_labels})
class_to_index = {name: index for index, name in enumerate(class_names)}
y_train = np.array([class_to_index[label] for label in train_labels], dtype=np.int32)
num_classes = len(class_names)

# --- Define the CNN model -----------------------------------------------------
model = keras.Sequential([
    # Convolutional layers
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(new_height, new_width, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    # Flatten the 2D feature maps to 1D
    layers.Flatten(),
    # Fully connected layers. The output layer needs one unit per class and a
    # softmax activation to produce class probabilities; the previous
    # Dense(4, relu) head could neither represent the classes nor be trained
    # with a cross-entropy loss.
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes, activation='softmax'),
])

# Compile the model. The "sparse" variant of the loss takes integer class ids
# directly, so no one-hot encoding is required.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Print the model summary
model.summary()

# Fit the model. A slice of the training data is held out for validation;
# validating on the very samples used for training (as before) only reports the
# training metrics a second time.
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Save the model
model.save('cnn_model.h5')


# Building the RNN-Model 

In [None]:
# ---------- The RNN model is used to capture the temporal dependencies in the sequences, to improve the generated results ----------

from tensorflow import keras
from tensorflow.keras.models import save_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense


def _images_to_sequences(images):
    """Return the images as a float32 array of shape (samples, width, height).

    Every image is read as a 2-D array of 8-bit gray levels, scaled to [0, 1],
    and transposed so that each pixel column becomes one time step of the
    sequence fed to the RNN.
    """
    stacked = np.stack([np.asarray(img, dtype=np.float32) for img in images])
    return np.transpose(stacked / 255.0, (0, 2, 1))


# --- Prepare inputs and targets -----------------------------------------------
# The previous model started with an Embedding layer, which expects sequences
# of 30 integer token ids, yet it was trained on images. The images are fed
# directly instead, as a sequence of pixel columns.
x_train_seq = _images_to_sequences(train_paths)
x_test_seq = _images_to_sequences(test_paths)

# Map the string labels to integer class ids (shared by train and test).
label_vocab = sorted({str(label) for label in train_labels}
                     | {str(label) for label in test_labels})
label_to_id = {name: index for index, name in enumerate(label_vocab)}
y_train_ids = np.array([label_to_id[str(label)] for label in train_labels], dtype=np.int32)
y_test_ids = np.array([label_to_id[str(label)] for label in test_labels], dtype=np.int32)

# --- Define the RNN model architecture ----------------------------------------
model1 = Sequential([
    # (time steps, features per step) taken from the prepared data
    keras.Input(shape=x_train_seq.shape[1:]),
    SimpleRNN(units=100),
    # One output unit per class instead of a hard-coded 55
    Dense(len(label_vocab), activation='softmax'),
])

# Compile the model; the sparse loss accepts the integer class ids as targets
model1.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model. The previous call passed (train_labels, train_labels) as
# validation data, i.e. the labels were used as validation *inputs*; hold out a
# slice of the training data instead.
model1.fit(x_train_seq, y_train_ids, epochs=10, batch_size=32, validation_split=0.2)

# Saving the model
save_model(model1, 'rnn_model.h5')

# Evaluate the model on the held-out test set
loss, accuracy = model1.evaluate(x_test_seq, y_test_ids)

print('Test loss:', loss)
print('Test accuracy:', accuracy)

# Connecting the CNN-model with RNN

In [None]:
from tensorflow.keras.models import load_model

# Chain the two saved models: the test data is pre-processed, passed through
# the CNN, and the CNN output is pre-processed again and passed to the RNN.

# Load both models from the .h5 files
cnn_model = load_model('cnn_model.h5')
rnn_model = load_model('rnn_model.h5')

# Prepare the input data for the CNN model.
# NOTE(review): `preprocess_input` is not defined or imported anywhere in the
# visible part of this file - confirm where it comes from, otherwise this line
# raises NameError.
preprocessed_input = preprocess_input(test_paths)  # Preprocess the input data for the CNN model

# Pass the data through the CNN model
cnn_features = cnn_model.predict(preprocessed_input)  

# Pre-process the CNN output so that it is suitable as input for the RNN model.
# NOTE(review): `preprocess_features` is likewise not defined in the visible
# part of this file - confirm. Also verify that its result matches the input
# shape `rnn_model` was built with.
preprocessed_features = preprocess_features(cnn_features)  # In this step we are just making sure that the cnn output is suitable to
#to be passed through the RNN-Model

# Finally, pass the pre-processed features into the RNN model and make the prediction
predictions = rnn_model.predict(preprocessed_features)  