In [1]:
# Load the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image as im
import pickle
import cv2
from tqdm import tqdm
from scipy.ndimage import rotate
import time
import joblib
from sklearn.metrics import accuracy_score,f1_score,classification_report
import os
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
labels = [ 'Scheherazade New' , 'Marhey' , 'Lemonada' , 'IBM Plex Sans Arabic']
image_size = 600
def show_images(images,titles=None):
    #This function is used to show image(s) with titles by sending an array of images and an array of associated titles.
    # images[0] will be drawn with the title titles[0] if exists
    # You aren't required to understand this function, use it as-is.
    n_ims = len(images)
    if titles is None: titles = ['(%d)' % i for i in range(1,n_ims + 1)]
    fig = plt.figure()
    n = 1
    for image,title in zip(images,titles):
        a = fig.add_subplot(1,n_ims,n)
        if image.ndim == 2: 
            plt.gray()
        plt.imshow(image)
        a.set_title(title)
        plt.axis('off')
        n += 1
    fig.set_size_inches(np.array(fig.get_size_inches()) * n_ims)
    plt.show() 

In [3]:
# Load the images from fonts-dataset folder
def load_images():
    # Load the images from the fonts-dataset folder
    images_train = []
    labels_train = []
    filenames = []
    empty_images_filenames = ["360.jpeg","627.jpeg","853.jpeg"] 
    # Use tqdm to show a progress bar
    for i in tqdm(labels):
        for filename in os.listdir(f'fonts-dataset/{i}'):
            img = cv2.imread(f'fonts-dataset/{i}/{filename}', cv2.IMREAD_GRAYSCALE)
            # img = cv2.resize(img, (image_size, image_size))
            if i == "Lemonada" and filename in empty_images_filenames:
                print(filename)
                print("empty image")
                continue
            images_train.append(img)
            labels_train.append(i)
            filenames.append(filename)
    return images_train, labels_train,filenames



In [4]:
# Load the images
X_train, y_train_org, filenames = load_images()
# Change the y_train to numbers
y_train_org = [labels.index(i) for i in y_train_org]

 50%|█████     | 2/4 [00:25<00:25, 12.78s/it]

360.jpeg
empty image
627.jpeg
empty image
853.jpeg
empty image


100%|██████████| 4/4 [00:48<00:00, 12.12s/it]


In [5]:
len(X_train), len(y_train_org), len(filenames)

(3997, 3997, 3997)

In [6]:
def find_score(arr, angle):
    """
    Find the score of the skew angle to be used in deskewing the image
    
    Args:
    arr: the image array
    angle: the angle to rotate the image by
    
    Returns:
    hist: the histogram of the image
    score: the score of the skew angle
    """
    
    # mode{‘reflect’, ‘grid-mirror’, ‘constant’, ‘grid-constant’, ‘nearest’, ‘mirror’, ‘grid-wrap’, ‘wrap’}
    data = rotate(arr, angle, reshape=False, order=0, mode='constant', cval=0, prefilter=False)
    hist = np.sum(data, axis=1)
    score = np.sum((hist[1:] - hist[:-1]) ** 2)
    return hist, score

def rotate_image(image, angle):
    """
    Rotates an image by a given angle and fills the remaining pixels with white color.

    Args:
        image: A NumPy array representing the input image.
        angle: The rotation angle in degrees.

    Returns:
        A new NumPy array representing the rotated image.
    """
    # Get image height and width
    height, width = image.shape[:2]

    # Compute the rotation matrix
    rotation_matrix = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1)

    # Perform the rotation and fill the remaining pixels with white color
    rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_CONSTANT, borderValue=(1, 1, 1))

    return rotated_image

def deskew(binary_img):
    """
    Deskew the image
    
    Args:
    binary_img: the binary image
    
    Returns:
    pix: the deskewed image
    """
    bin_img = (binary_img // 255.0)
    # angles to check for skew angle = 45 degrees and 90 degrees and 180
    angles = np.array ([0 , 45 , 90 , 135 , 180 , 225 , 270 , 315])
    scores = []
    for angle in angles:
        hist, score = find_score(bin_img, angle)
        scores.append(score)

    best_score = max(scores)
    best_angle = angles[scores.index(best_score)]
    # print('Best angle: {}'.format(best_angle))

    # correct skew
    # data = rotate(bin_img, best_angle, reshape=False, order=0)
    data = rotate_image(bin_img, best_angle)
    img = im.fromarray((255 * data).astype("uint8"))

    pix = np.array(img)
    return pix

In [7]:
def preprocess(img):
    """
    Preprocess the image
    
    Args:
    img: the image
    
    Returns:
    img: the preprocessed image
    """
    sharpen_kernel = np.array([[0,-1, 0], [-1,5,-1], [0,-1,0]])
    img = cv2.medianBlur(img, 3) # To remove Salt and Pepper noise
    img = cv2.filter2D(img, -1, sharpen_kernel)  # Sharpen the image
    img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1] # Convert the image to binary
    deskewed_img = deskew(img) # Deskew the image
    final_img = cv2.bitwise_not(deskewed_img) if np.mean(deskewed_img) > 127 else deskewed_img # Invert the image if the mean is less than 127 
    return final_img

In [8]:
# # Preprocess the images
# X_train_preprocess = [preprocess(i) for i in tqdm(X_train)]
    
# # Dump the preprocessed images to a file
# with open('preprocessed_images.pkl', 'wb') as f:
#     pickle.dump(X_train_preprocess, f)

In [9]:
# Load the preprocessed images
with open('preprocessed_images.pkl', 'rb') as f:
    X_train_preprocess = pickle.load(f)

In [10]:
import numpy as np
import cv2 as cv


def save_image(img, folder, title):
    cv.imwrite(f'./{folder}/{title}.png', img)

def projection(gray_img, axis:str='horizontal'):
    """ Compute the horizontal or the vertical projection of a gray image """

    if axis == 'horizontal':
        projection_bins = np.sum(gray_img, 1).astype('int32')
    elif axis == 'vertical':
        projection_bins = np.sum(gray_img, 0).astype('int32')

    return projection_bins

In [11]:
# def preprocess(image):

#     # Maybe we end up using only gray level image.
#     # gray_img = cv.bitwise_not(image) # Invert the image
#     binary_img = cv.threshold(image, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)[1]
#     deskewed_img = deskew(binary_img)
#     return deskewed_img


def projection_segmentation(clean_img, axis, cut=15, min_width=20, min_height=30):
    """Segment the image based on the projection profile

    Args:
        clean_img : Preprocessed image
        axis (str): 'horizontal' or 'vertical'
        cut (int, optional): Gap between the segments. Defaults to 3.
        min_width (int, optional): Width of the segment. Defaults to 5.
        min_height (int, optional): Height of the segment. Defaults to 5.

    Returns:
        _type_: _description_
    """
    segments = []
    start = -1
    cnt = 0

    projection_bins = projection(clean_img, axis)
    for idx, projection_bin in enumerate(projection_bins):

        if projection_bin != 0:
            cnt = 0
        if projection_bin != 0 and start == -1:
            start = idx
        if projection_bin == 0 and start != -1:
            cnt += 1
            if cnt >= cut:
                if axis == 'horizontal':
                    # Line segmentation
                    segment = clean_img[max(start-1, 0):idx, :]
                    # if segment.shape[0] >= min_height:                    
                    segments.append(segment)
                elif axis == 'vertical':
                    # Word segmentation
                    segment = clean_img[:, max(start-1, 0):idx]
                    # if segment.shape[1] >= min_width:
                    segments.append(segment)
                cnt = 0
                start = -1
    
    return segments


# Line Segmentation
#----------------------------------------------------------------------------------------
def line_horizontal_projection(image, cut=3):

    # Segmentation    
    lines = projection_segmentation(image, axis='horizontal', cut=cut)
    return lines


# Word Segmentation
#----------------------------------------------------------------------------------------
def word_vertical_projection(line_image, cut=3):
    
    line_words = projection_segmentation(line_image, axis='vertical', cut=cut)
    line_words.reverse()
    return line_words


def extract_words(img, visual=0):

    lines = line_horizontal_projection(img)
    words = []
    
    for idx, line in enumerate(lines):
        
        if visual:
            # Check for the size of the line to be greater than 30
            # if line.shape[0] > 30:
            save_image(line, 'lines', f'line{idx}')

        line_words = word_vertical_projection(line)
        for w in line_words:
            # if len(words) == 585:
            #     print(idx)
            words.append((w, line))
        # words.extend(line_words)

    # breakpoint()
    if visual:
        for idx, word in enumerate(words):
            # check for the size of the word to be greater than 30
            # print (word[0].shape)
            # if word[0].shape[0] < 100 and word[0].shape[1] > 20 :
            save_image(word[0], 'words', f'word{idx}')
    return words

# # Try to extract the words from the preprocessed images
# x = 31
# p = preprocess(X_train[x])

# # show_images([X_train[999],p])

# # Extract the words from the preprocessed images
# words = extract_words(p, visual=1)
# show_images([X_train[x], p])



In [12]:
# Resize the images to 600x600
X_train_resized = []
for i in tqdm(X_train_preprocess):
    img = cv2.resize(i, (image_size, image_size))
    X_train_resized.append(img)
X_train_resized = np.array(X_train_resized)
    

100%|██████████| 4000/4000 [00:01<00:00, 2929.30it/s]


In [13]:
# Remove from X_train_resized with index = [2291, 2587, 2838]
X_train_resized = np.delete(X_train_resized, [2291, 2587, 2838], 0)

In [14]:
# Apply hog for the images
from skimage.feature import hog

X_train_hog = []
for i in tqdm(X_train_resized):
    X_train_hog.append(hog(i, orientations= 16, pixels_per_cell=(32, 32), cells_per_block=(4, 4), block_norm='L2-Hys'))
    

X_train_hog = np.array(X_train_hog)
print(X_train_hog.shape)
# # Try hog on one image
# hog_image = hog(X_train_resized[0], orientations= 16, pixels_per_cell=(32, 32), cells_per_block=(4,4), block_norm='L2-Hys')
# print(hog_image.shape)

100%|██████████| 3997/3997 [02:34<00:00, 25.86it/s]


(3997, 57600)


In [15]:
# Apply SIFT for the images
sift = cv2.SIFT_create()

X_train_sift = []
for i in tqdm(X_train_resized):
    kp, des = sift.detectAndCompute(i, None)
    if des is None:
        # Add a row of zeros to the SIFT descriptors
        des = np.zeros((1, 128))
    des = des.flatten()
    X_train_sift.append(des)


  0%|          | 0/3997 [00:00<?, ?it/s]

100%|██████████| 3997/3997 [03:40<00:00, 18.09it/s]


In [16]:
# Get the maximum keypoint length of the SIFT descriptors
max_kp = max(len(kp)/128 for kp in X_train_sift)

# Get the average keypoint length of the SIFT descriptors
avg_kp = np.mean([len(kp)/128 for kp in X_train_sift])

# Get the minimum keypoint length of the SIFT descriptors
min_kp = min(len(kp)/128 for kp in X_train_sift)

# Print the maximum keypoint length
print(max_kp)

# Print the average keypoint length
print(avg_kp)

# Print the minimum keypoint length
print(min_kp)


11506.0
1265.8418814110582
23.0


In [17]:
# Get the index of the image with keypoint <= 10
idx = [i for i in range(len(X_train_sift)) if len(X_train_sift[i])/128 <= 50]
# Print idx 
print(idx)

# Found images which has no text 

[1783, 3780]


In [19]:
# Pad the SIFT descriptors to the maximum length
fixed_len = 128 * 350  # Adjust this as needed

# Create a generator that yields each padded descriptor on-the-fly
padded_descriptors = (np.pad(des, (0, max(0, fixed_len - des.shape[0])))[:fixed_len] for des in X_train_sift)

# Convert the generator to a numpy array
X_train_sift_np = np.array(list(padded_descriptors))

# Print the shape of the numpy array
print(X_train_sift_np.shape)

(3997, 44800)


In [20]:
# # Concatenate the hog and sift features
X_train_features = np.concatenate((X_train_hog, X_train_sift_np), axis=1)
print(X_train_features.shape)


(3997, 102400)


In [21]:
# Split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train_features, y_train_org, test_size=0.2, random_state=42, stratify=y_train_org)

In [22]:
# Make a pipeline for the model which consist of StandardScaler , PCA and the model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Create a pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.99)),
])

# Fit the pipeline
pipe.fit(X_train)

# Dump the pipeline to a file
joblib.dump(pipe, 'pipeline.pkl')

['pipeline.pkl']

In [23]:
# Load the pipeline
pipe = joblib.load('pipeline.pkl')

In [25]:
# Transform the data
X_train_transformed = pipe.transform(X_train)
X_test_transformed = pipe.transform(X_test)

# Print the shape of the transformed data
print(X_train_transformed.shape)

(3197, 2982)


In [27]:
# Build a shallow neural network model of 1 hidden layer with 128 neurons and relu activation function
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Create a model of 1 hidden layer with 128 neurons and relu activation function and adam solver and softmax output layer
model = MLPClassifier(hidden_layer_sizes=(256,), activation='relu', solver='adam', max_iter=1000)

# Fit the model
model.fit(X_train_transformed, y_train)

# Predict the training data
y_train_pred = model.predict(X_train_transformed)

# Predict the testing data
y_test_pred = model.predict(X_test_transformed)

# Print the accuracy of the model
print('Train accuracy: ', accuracy_score(y_train, y_train_pred)*100)

print('Test accuracy: ', accuracy_score(y_test, y_test_pred)*100)

# Print the classification report
print('Train classification report: ', classification_report(y_train, y_train_pred, target_names=labels))

print('Test classification report: ', classification_report(y_test, y_test_pred, target_names=labels))

# Accuracy: 92.75%

Train accuracy:  100.0
Test accuracy:  90.375
Train classification report:                        precision    recall  f1-score   support

    Scheherazade New       1.00      1.00      1.00       800
              Marhey       1.00      1.00      1.00       800
            Lemonada       1.00      1.00      1.00       797
IBM Plex Sans Arabic       1.00      1.00      1.00       800

            accuracy                           1.00      3197
           macro avg       1.00      1.00      1.00      3197
        weighted avg       1.00      1.00      1.00      3197

Test classification report:                        precision    recall  f1-score   support

    Scheherazade New       0.91      0.86      0.88       200
              Marhey       0.87      0.86      0.86       200
            Lemonada       0.97      0.97      0.97       200
IBM Plex Sans Arabic       0.87      0.92      0.90       200

            accuracy                           0.90       800
           macro avg  

In [33]:
# lin_model = LogisticRegression()

# # Define the hyperparameters
# param_grid = {
#     'C': np.logspace(-4, 4, 30),
#     'penalty': ['l2'],
#     'solver': ['liblinear', 'saga', 'lbfgs'],
#     'warm_start': [True, False]
# }

# # Initialize the RandomizedSearchCV
# random_search = RandomizedSearchCV(lin_model, param_distributions=param_grid, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

# # Fit the model
# random_search.fit(X_train_transformed, y_train)
# # cv_results_df = pd.DataFrame(clf_searched.cv_results_)[relevant_columns].round(decimals=3).sort_values(by='rank_test_score')
# # cv_results_df.head(10)
# relevant_columns = ['param_C', 'param_penalty', 'param_solver', 'param_warm_start', 'mean_test_score', 'std_test_score', 'rank_test_score']
# cv_results_df = pd.DataFrame(random_search.cv_results_)[relevant_columns].round(decimals=3).sort_values(by='rank_test_score')
# cv_results_df.head(10)

# # Print the best parameters
# print(random_search.best_params_)

In [106]:
# Initialize the model and fit the data

# Initialize the model
# {'warm_start': True, 'solver': 'saga', 'penalty': 'l2', 'C': 0.001}

# {'warm_start': True, 'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.0006723357536499335}
model = LogisticRegression(warm_start=True, solver='saga', penalty='l2', C=0.8, random_state=42)
# model = LogisticRegression()

# Fit the model
model.fit(X_train_transformed, y_train)

# Predict the test data
y_pred = model.predict(X_test_transformed)

# Predict the train data
y_pred_train = model.predict(X_train_transformed)

# Print the training accuracy
print(f"Training Accuracy: {accuracy_score(y_train, y_pred_train)*100}")

# Print the testing accuracy
print(f"Testing Accuracy: {accuracy_score(y_test, y_pred)*100}")

# Print the f1 score of the training data
print(f"Training F1 Score: {f1_score(y_train, y_pred_train, average='weighted')}")

# Print the f1 score of the testing data
print (f"Testing F1 Score: {f1_score(y_test, y_pred, average='weighted')}")

# Print the classification report of the training data
print(classification_report(y_train, y_pred_train, target_names=labels))

# Print the classification report of the testing data
print(classification_report(y_test, y_pred, target_names=labels))

# Save the model
joblib.dump(model, 'logistic_model.pkl')

#ACCUARCY: 95.5%

Training Accuracy: 100.0
Testing Accuracy: 95.5
Training F1 Score: 1.0
Testing F1 Score: 0.9551498172846553
                      precision    recall  f1-score   support

    Scheherazade New       1.00      1.00      1.00       800
              Marhey       1.00      1.00      1.00       800
            Lemonada       1.00      1.00      1.00       797
IBM Plex Sans Arabic       1.00      1.00      1.00       800

            accuracy                           1.00      3197
           macro avg       1.00      1.00      1.00      3197
        weighted avg       1.00      1.00      1.00      3197

                      precision    recall  f1-score   support

    Scheherazade New       0.95      0.94      0.95       200
              Marhey       0.91      0.94      0.93       200
            Lemonada       0.99      0.99      0.99       200
IBM Plex Sans Arabic       0.96      0.94      0.95       200

            accuracy                           0.95       800
           macro av



['logistic_model.pkl']

In [None]:
# svm = SVC()

# param_dist = {
#     'C': np.logspace(-3, 3, 15), 
#     'kernel': ['poly', 'rbf'], 
#     'degree': [2, 3],
#     'gamma': ['scale', 'auto']
# }

# clf_searched = RandomizedSearchCV(svm, param_dist, n_iter=100, cv=5, random_state=42, n_jobs=-1, verbose=1)

# clf_searched.fit(X_train_transformed, y_train)
# relevant_columns = ['param_C', 'param_kernel', 'param_gamma', 'mean_test_score', 'std_test_score', 'rank_test_score']
# cv_results_df = pd.DataFrame(clf_searched.cv_results_)[relevant_columns].round(decimals=3).sort_values(by='rank_test_score')
# cv_results_df.head(10)

# Best parameters of the SVM
# {C : 7.196857 , kernel : 'rbf', gamma : 'scale'}

Fitting 5 folds for each of 100 candidates, totalling 500 fits


Unnamed: 0,param_C,param_kernel,param_gamma,mean_test_score,std_test_score,rank_test_score
73,7.196857,rbf,scale,0.933,0.01,1
45,19.306977,rbf,scale,0.933,0.01,1
95,372.759372,rbf,scale,0.933,0.01,1
84,51.794747,rbf,scale,0.933,0.01,1
80,19.306977,rbf,scale,0.933,0.01,1
6,7.196857,rbf,scale,0.933,0.01,1
71,1000.0,rbf,scale,0.933,0.01,1
16,372.759372,rbf,scale,0.933,0.01,1
88,138.949549,rbf,scale,0.933,0.01,1
29,138.949549,rbf,scale,0.933,0.01,1


In [38]:
# Initialize the SVM model
svm = SVC(C=7.196857 , kernel='rbf', gamma='scale', random_state=42)

# Fit the SVM on the training data
svm.fit(X_train_transformed, y_train)

# Predict the labels of the test set
y_pred = svm.predict(X_test_transformed)
# Predict the labels of the training set
y_pred_train = svm.predict(X_train_transformed)

# Print the accuracy of the SVM model on the training set
print(f"Accuracy of SVM model on the training set: {accuracy_score(y_train, y_pred_train)*100}")

# Print the accuracy of the SVM model on the test set
print(f"Accuracy of SVM model: {accuracy_score(y_test, y_pred)*100}")

# Print the classification report of the SVM model
print(classification_report(y_train, y_pred_train, target_names=labels))

# Print the classification report of the SVM model
print(classification_report(y_test, y_pred, target_names=labels))

# Save the SVM model as h5 file
joblib.dump(svm, 'svm_model.pkl')

#ACCURACY: 91.25%

Accuracy of SVM model on the training set: 100.0
Accuracy of SVM model: 90.125
                      precision    recall  f1-score   support

    Scheherazade New       1.00      1.00      1.00       800
              Marhey       1.00      1.00      1.00       800
            Lemonada       1.00      1.00      1.00       797
IBM Plex Sans Arabic       1.00      1.00      1.00       800

            accuracy                           1.00      3197
           macro avg       1.00      1.00      1.00      3197
        weighted avg       1.00      1.00      1.00      3197

                      precision    recall  f1-score   support

    Scheherazade New       0.99      0.66      0.79       200
              Marhey       0.87      0.96      0.91       200
            Lemonada       1.00      1.00      1.00       200
IBM Plex Sans Arabic       0.80      0.99      0.88       200

            accuracy                           0.90       800
           macro avg       0.92      0.90      0

['svm_model.pkl']

In [None]:
# # Random Forest Classifier

# random_forest_clf = RandomForestClassifier(random_state=42)

# # Define the hyperparameters distribution (use same ranges as before)
# param_dist = {
#     'n_estimators': np.arange(200, 500), 
#     'min_samples_split':  range(2,51),                    # 2 to 50
#     'min_samples_leaf': range(1,21),                      # 1 to 20
#     'max_depth': range(5, 51, 5),                              # 5 to 50 with step of 5
#     'min_impurity_decrease': np.linspace(0.0, 0.2, 20),                   # Decide a reasonable range here (with 20 values)
# }

# # Initialize RandomizedSearchCV
# clf_searched = RandomizedSearchCV(random_forest_clf, param_dist, n_iter=100, cv=5, random_state=42, n_jobs=-1, verbose=1)

# clf_searched.fit(X_train_transformed, y_train)
# relevant_columns = ['param_n_estimators', 'param_min_samples_split', 'param_min_samples_leaf', 'param_max_depth', 'param_min_impurity_decrease', 
#                     'mean_test_score', 'std_test_score', 'rank_test_score']
# cv_results_df = pd.DataFrame(clf_searched.cv_results_)[relevant_columns].round(decimals=3).sort_values(by='rank_test_score')
# cv_results_df.head(10)

# # Best parameters of the Random Forest Classifier
# # {'n_estimators': 371, 'min_samples_split': 48, 'min_samples_leaf': 4, 'min_impurity_decrease': 0.0, 'max_depth': 30}
# # Best score: 0.776

In [None]:
random_forest_clf = RandomForestClassifier(n_estimators=371, min_samples_split=48, min_samples_leaf=4, min_impurity_decrease=0., max_depth=30, random_state=42)

# Fit the Random Forest Classifier
random_forest_clf.fit(X_train_transformed, y_train)

# Predict the labels of the test set
y_pred = random_forest_clf.predict(X_test_transformed)

# Predict the labels of the training set
y_pred_train = random_forest_clf.predict(X_train_transformed)

# Print the accuracy of the Random Forest Classifier model on the training set
print(f"Accuracy of Random Forest Classifier model on the training set: {accuracy_score(y_train, y_pred_train)*100}")

# Print the accuracy of the Random Forest Classifier model on the test set
print(f"Accuracy of Random Forest Classifier model: {accuracy_score(y_test, y_pred)*100}")

# Print the classification report of the Random Forest Classifier model
print(classification_report(y_train, y_pred_train, target_names=labels))

# Print the classification report of the Random Forest Classifier model
print(classification_report(y_test, y_pred, target_names=labels))

# Save the Random Forest Classifier model as h5 file
joblib.dump(random_forest_clf, 'random_forest_clf.pkl')

#ACCURACY: 83.25%

Accuracy of Random Forest Classifier model on the training set: 99.96875
Accuracy of Random Forest Classifier model: 83.25
                      precision    recall  f1-score   support

    Scheherazade New       1.00      1.00      1.00       800
              Marhey       1.00      1.00      1.00       800
            Lemonada       1.00      1.00      1.00       800
IBM Plex Sans Arabic       1.00      1.00      1.00       800

            accuracy                           1.00      3200
           macro avg       1.00      1.00      1.00      3200
        weighted avg       1.00      1.00      1.00      3200

                      precision    recall  f1-score   support

    Scheherazade New       0.92      0.71      0.81       200
              Marhey       0.67      0.95      0.79       200
            Lemonada       0.96      1.00      0.98       200
IBM Plex Sans Arabic       0.86      0.67      0.75       200

            accuracy                           0.83       800
    

['random_forest_clf.pkl']

In [110]:
# Build a neural networ consist of input layer of size 400, hidden layer of size 256 , hidden layer of size 128 and output layer of size 4 pytorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Convert the data to tensors
X_train_tensor = torch.tensor(X_train_transformed, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_transformed, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Print the shape of the tensors
print(X_train_tensor.shape, y_train_tensor.shape, X_test_tensor.shape, y_test_tensor.shape)

# Create a dataset
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create a dataloader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define the neural network
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(2982, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 4)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
# Create the model
model = Net()

# Define the loss function
criterion = nn.CrossEntropyLoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0002)

best_score = 0.0

# Train the model
for epoch in range(30):
    model.train()
    running_loss = 0.0
    pbar = tqdm(enumerate(train_loader), total=len(train_loader))
    for i, (X, y) in pbar:
        optimizer.zero_grad()
        output = model(X)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        # update running loss value
        running_loss += loss.item()

        # update the progress bar with the average loss
        pbar.set_description(f"Epoch {epoch+1} Loss: {running_loss/(i+1):.4f}")

    # reset running loss for next epoch
    running_loss = 0.0

    # Evaluate the model
    model.eval()

    # Get the predictions
    y_pred = []
    y_true = []
    for X, y in test_loader:
        output = model(X)
        y_pred.extend(torch.argmax(output, 1).tolist())
        y_true.extend(y.tolist())

    # Calculate the accuracy
    accuracy = accuracy_score(y_true, y_pred)*100
    print('Accuracy: ', accuracy)

    # If the accuracy is better than the best score so far, save the model
    if accuracy > best_score:
        best_score = accuracy
        torch.save(model.state_dict(), 'best_model.pth')

# Print the classification report
print(classification_report(y_true, y_pred, target_names=labels))
# Accuracy:  96.75%

torch.Size([3197, 2982]) torch.Size([3197]) torch.Size([800, 2982]) torch.Size([800])


Epoch 1 Loss: 0.7761: 100%|██████████| 200/200 [00:02<00:00, 96.11it/s] 


Accuracy:  94.875


Epoch 2 Loss: 0.0249: 100%|██████████| 200/200 [00:02<00:00, 99.03it/s] 


Accuracy:  95.25


Epoch 3 Loss: 0.0105: 100%|██████████| 200/200 [00:01<00:00, 102.62it/s]


Accuracy:  95.625


Epoch 4 Loss: 0.0025: 100%|██████████| 200/200 [00:02<00:00, 99.61it/s] 


Accuracy:  96.0


Epoch 5 Loss: 0.0012: 100%|██████████| 200/200 [00:02<00:00, 94.65it/s] 


Accuracy:  96.125


Epoch 6 Loss: 0.0008: 100%|██████████| 200/200 [00:02<00:00, 98.86it/s] 


Accuracy:  96.125


Epoch 7 Loss: 0.0006: 100%|██████████| 200/200 [00:01<00:00, 100.82it/s]


Accuracy:  96.25


Epoch 8 Loss: 0.0004: 100%|██████████| 200/200 [00:02<00:00, 99.90it/s] 


Accuracy:  96.25


Epoch 9 Loss: 0.0003: 100%|██████████| 200/200 [00:02<00:00, 99.95it/s] 


Accuracy:  96.25


Epoch 10 Loss: 0.0003: 100%|██████████| 200/200 [00:01<00:00, 101.16it/s]


Accuracy:  96.25


Epoch 11 Loss: 0.0002: 100%|██████████| 200/200 [00:01<00:00, 100.65it/s]


Accuracy:  96.25


Epoch 12 Loss: 0.0002: 100%|██████████| 200/200 [00:02<00:00, 98.96it/s] 


Accuracy:  96.25


Epoch 13 Loss: 0.0001: 100%|██████████| 200/200 [00:01<00:00, 100.37it/s]


Accuracy:  96.25


Epoch 14 Loss: 0.0001: 100%|██████████| 200/200 [00:01<00:00, 102.97it/s]


Accuracy:  96.25


Epoch 15 Loss: 0.0001: 100%|██████████| 200/200 [00:02<00:00, 97.66it/s] 


Accuracy:  96.25


Epoch 16 Loss: 0.0001: 100%|██████████| 200/200 [00:01<00:00, 100.15it/s]


Accuracy:  96.25


Epoch 17 Loss: 0.0001: 100%|██████████| 200/200 [00:01<00:00, 103.50it/s]


Accuracy:  96.375


Epoch 18 Loss: 0.0001: 100%|██████████| 200/200 [00:02<00:00, 99.67it/s] 


Accuracy:  96.375


Epoch 19 Loss: 0.0001: 100%|██████████| 200/200 [00:02<00:00, 100.00it/s]


Accuracy:  96.375


Epoch 20 Loss: 0.0000: 100%|██████████| 200/200 [00:01<00:00, 101.95it/s]


Accuracy:  96.375


Epoch 21 Loss: 0.0000: 100%|██████████| 200/200 [00:02<00:00, 99.50it/s] 


Accuracy:  96.375


Epoch 22 Loss: 0.0000: 100%|██████████| 200/200 [00:02<00:00, 99.95it/s] 


Accuracy:  96.375


Epoch 23 Loss: 0.0000: 100%|██████████| 200/200 [00:01<00:00, 103.01it/s]


Accuracy:  96.375


Epoch 24 Loss: 0.0000: 100%|██████████| 200/200 [00:01<00:00, 100.01it/s]


Accuracy:  96.375


Epoch 25 Loss: 0.0000: 100%|██████████| 200/200 [00:02<00:00, 98.34it/s] 


Accuracy:  96.375


Epoch 26 Loss: 0.0000: 100%|██████████| 200/200 [00:01<00:00, 102.56it/s]


Accuracy:  96.375


Epoch 27 Loss: 0.0000: 100%|██████████| 200/200 [00:02<00:00, 99.35it/s] 


Accuracy:  96.5


Epoch 28 Loss: 0.0000: 100%|██████████| 200/200 [00:02<00:00, 98.77it/s] 


Accuracy:  96.5


Epoch 29 Loss: 0.0000: 100%|██████████| 200/200 [00:01<00:00, 100.12it/s]


Accuracy:  96.5


Epoch 30 Loss: 0.0000: 100%|██████████| 200/200 [00:01<00:00, 103.41it/s]


Accuracy:  96.5
                      precision    recall  f1-score   support

    Scheherazade New       0.95      0.95      0.95       200
              Marhey       0.94      0.94      0.94       200
            Lemonada       1.00      0.99      1.00       200
IBM Plex Sans Arabic       0.97      0.96      0.97       200

            accuracy                           0.96       800
           macro avg       0.97      0.96      0.97       800
        weighted avg       0.97      0.96      0.97       800



In [115]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import torch

class PyTorchClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model, criterion, optimizer, epochs):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.epochs = epochs
        self.label_encoder = LabelEncoder()

    def fit(self, X, y):
        # Convert data to tensors
        X_tensor = torch.tensor(X, dtype=torch.float32)
        y = self.label_encoder.fit_transform(y)
        y_tensor = torch.tensor(y, dtype=torch.long)

        # Set the classes_ attribute
        self.classes_ = self.label_encoder.classes_

        best_loss = float('inf')

        # Train the model
        for epoch in tqdm(range(self.epochs), desc='Training', unit='epoch'):
            self.model.train()
            self.optimizer.zero_grad()
            output = self.model(X_tensor)
            loss = self.criterion(output, y_tensor)
            loss.backward()
            self.optimizer.step()

            # Save the model with the lowest loss
            if loss.item() < best_loss:
                best_loss = loss.item()
                torch.save(self.model.state_dict(), 'best_model.pth')

        return self

    def predict(self, X):
        # Convert data to tensor
        X_tensor = torch.tensor(X, dtype=torch.float32)

        # Get the model's predictions
        self.model.eval()
        predictions = []
        with torch.no_grad():
            output = self.model(X_tensor)
            predictions = torch.argmax(output, dim=1)

        return self.label_encoder.inverse_transform(predictions)

    def predict_proba(self, X):
        # Convert data to tensor
        X_tensor = torch.tensor(X, dtype=torch.float32)

        # Get the model's predictions
        self.model.eval()
        with torch.no_grad():
            output = self.model(X_tensor)
            probabilities = torch.softmax(output, dim=1)

        return probabilities.numpy()

In [118]:
# Stacking the models together to get the best model
from sklearn.ensemble import StackingClassifier

# Define the base models
# Stacking the models together to get the best model

# Define the base models
estimators = [
    ('logistic', LogisticRegression(warm_start=True, solver='saga', penalty='l2', C=0.8, random_state=42)),
    ('neural_net', PyTorchClassifier(Net(), criterion, optimizer, 10))
]
meta_model = SVC()

# Initialize the stacking classifier
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=meta_model, cv=5)

# Fit the stacking classifier
stacking_clf.fit(X_train_transformed, y_train)

# Predict the labels of the test set
y_pred = stacking_clf.predict(X_test_transformed)

# Predict the labels of the training set
y_pred_train = stacking_clf.predict(X_train_transformed)

# Print the accuracy of the Stacking Classifier model on the training set
print(f"Accuracy of Stacking Classifier model on the training set: {accuracy_score(y_train, y_pred_train)*100}")

# Print the accuracy of the Stacking Classifier model on the test set
print(f"Accuracy of Stacking Classifier model: {accuracy_score(y_test, y_pred)*100}")

# Print the classification report of the Stacking Classifier model
print(classification_report(y_train, y_pred_train, target_names=labels))

# Print the classification report of the Stacking Classifier model
print(classification_report(y_test, y_pred, target_names=labels))

# ACCURACY: 96%

Training: 100%|██████████| 10/10 [00:00<00:00, 15.08epoch/s]
Training: 100%|██████████| 10/10 [00:00<00:00, 19.49epoch/s]
Training: 100%|██████████| 10/10 [00:00<00:00, 17.76epoch/s]
Training: 100%|██████████| 10/10 [00:00<00:00, 18.94epoch/s]
Training: 100%|██████████| 10/10 [00:00<00:00, 18.59epoch/s]
Training: 100%|██████████| 10/10 [00:00<00:00, 18.55epoch/s]


Accuracy of Stacking Classifier model on the training set: 100.0
Accuracy of Stacking Classifier model: 96.0
                      precision    recall  f1-score   support

    Scheherazade New       1.00      1.00      1.00       800
              Marhey       1.00      1.00      1.00       800
            Lemonada       1.00      1.00      1.00       797
IBM Plex Sans Arabic       1.00      1.00      1.00       800

            accuracy                           1.00      3197
           macro avg       1.00      1.00      1.00      3197
        weighted avg       1.00      1.00      1.00      3197

                      precision    recall  f1-score   support

    Scheherazade New       0.96      0.95      0.95       200
              Marhey       0.95      0.93      0.94       200
            Lemonada       0.99      0.99      0.99       200
IBM Plex Sans Arabic       0.93      0.97      0.95       200

            accuracy                           0.96       800
           macro a