In [5]:
# Load the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image as im
import pickle
import cv2
from tqdm import tqdm
from scipy.ndimage import rotate
import time
import joblib
from sklearn.metrics import accuracy_score,f1_score,classification_report
import os
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [6]:
labels = [ 'Scheherazade New' , 'Marhey' , 'Lemonada' , 'IBM Plex Sans Arabic']
image_size = 600
def show_images(images,titles=None):
    #This function is used to show image(s) with titles by sending an array of images and an array of associated titles.
    # images[0] will be drawn with the title titles[0] if exists
    # You aren't required to understand this function, use it as-is.
    n_ims = len(images)
    if titles is None: titles = ['(%d)' % i for i in range(1,n_ims + 1)]
    fig = plt.figure()
    n = 1
    for image,title in zip(images,titles):
        a = fig.add_subplot(1,n_ims,n)
        if image.ndim == 2: 
            plt.gray()
        plt.imshow(image)
        a.set_title(title)
        plt.axis('off')
        n += 1
    fig.set_size_inches(np.array(fig.get_size_inches()) * n_ims)
    plt.show() 

In [7]:
# Load the images from fonts-dataset folder
def load_images():
    # Load the images from the fonts-dataset folder
    images_train = []
    labels_train = []
    filenames = []
    empty_images_filenames = ["360.jpeg","627.jpeg","853.jpeg"] 
    # Use tqdm to show a progress bar
    for i in tqdm(labels):
        for filename in os.listdir(f'fonts-dataset/{i}'):
            img = cv2.imread(f'fonts-dataset/{i}/{filename}', cv2.IMREAD_GRAYSCALE)
            # img = cv2.resize(img, (image_size, image_size))
            if i == "Lemonada" and filename in empty_images_filenames:
                print(filename)
                print("empty image")
                continue
            images_train.append(img)
            labels_train.append(i)
            filenames.append(filename)
    return images_train, labels_train,filenames



In [8]:
# Load the images
X_train, y_train_org, filenames = load_images()
# Change the y_train to numbers
y_train_org = [labels.index(i) for i in y_train_org]

 50%|█████     | 2/4 [00:14<00:14,  7.45s/it]

360.jpeg
empty image
627.jpeg
empty image
853.jpeg
empty image


100%|██████████| 4/4 [00:28<00:00,  7.24s/it]


In [9]:
len(X_train), len(y_train_org), len(filenames)

(3997, 3997, 3997)

In [10]:
def find_score(arr, angle):
    """
    Find the score of the skew angle to be used in deskewing the image
    
    Args:
    arr: the image array
    angle: the angle to rotate the image by
    
    Returns:
    hist: the histogram of the image
    score: the score of the skew angle
    """
    
    # mode{‘reflect’, ‘grid-mirror’, ‘constant’, ‘grid-constant’, ‘nearest’, ‘mirror’, ‘grid-wrap’, ‘wrap’}
    data = rotate(arr, angle, reshape=False, order=0, mode='constant', cval=0, prefilter=False)
    hist = np.sum(data, axis=1)
    score = np.sum((hist[1:] - hist[:-1]) ** 2)
    return hist, score

def rotate_image(image, angle):
    """
    Rotates an image by a given angle and fills the remaining pixels with white color.

    Args:
        image: A NumPy array representing the input image.
        angle: The rotation angle in degrees.

    Returns:
        A new NumPy array representing the rotated image.
    """
    # Get image height and width
    height, width = image.shape[:2]

    # Compute the rotation matrix
    rotation_matrix = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1)

    # Perform the rotation and fill the remaining pixels with white color
    rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_CONSTANT, borderValue=(1, 1, 1))

    return rotated_image

def deskew(binary_img):
    """
    Deskew the image
    
    Args:
    binary_img: the binary image
    
    Returns:
    pix: the deskewed image
    """
    bin_img = (binary_img // 255.0)
    # angles to check for skew angle = 45 degrees and 90 degrees and 180
    angles = np.array ([0 , 45 , 90 , 135 , 180 , 225 , 270 , 315])
    scores = []
    for angle in angles:
        hist, score = find_score(bin_img, angle)
        scores.append(score)

    best_score = max(scores)
    best_angle = angles[scores.index(best_score)]
    # print('Best angle: {}'.format(best_angle))

    # correct skew
    # data = rotate(bin_img, best_angle, reshape=False, order=0)
    data = rotate_image(bin_img, best_angle)
    img = im.fromarray((255 * data).astype("uint8"))

    pix = np.array(img)
    return pix

In [11]:
def preprocess(img):
    """
    Preprocess the image
    
    Args:
    img: the image
    
    Returns:
    img: the preprocessed image
    """
    sharpen_kernel = np.array([[0,-1, 0], [-1,5,-1], [0,-1,0]])
    img = cv2.medianBlur(img, 3) # To remove Salt and Pepper noise
    img = cv2.filter2D(img, -1, sharpen_kernel)  # Sharpen the image
    img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1] # Convert the image to binary
    deskewed_img = deskew(img) # Deskew the image
    final_img = cv2.bitwise_not(deskewed_img) if np.mean(deskewed_img) > 127 else deskewed_img # Invert the image if the mean is less than 127 
    return final_img

In [12]:
# # Preprocess the images
# X_train_preprocess = [preprocess(i) for i in tqdm(X_train)]
    
# # Dump the preprocessed images to a file
# with open('preprocessed_images.pkl', 'wb') as f:
#     pickle.dump(X_train_preprocess, f)

In [13]:
# Load the preprocessed images
with open('preprocessed_images.pkl', 'rb') as f:
    X_train_preprocess = pickle.load(f)

In [14]:
import numpy as np
import cv2 as cv


def save_image(img, folder, title):
    cv.imwrite(f'./{folder}/{title}.png', img)

def projection(gray_img, axis:str='horizontal'):
    """ Compute the horizontal or the vertical projection of a gray image """

    if axis == 'horizontal':
        projection_bins = np.sum(gray_img, 1).astype('int32')
    elif axis == 'vertical':
        projection_bins = np.sum(gray_img, 0).astype('int32')

    return projection_bins

In [15]:
# def preprocess(image):

#     # Maybe we end up using only gray level image.
#     # gray_img = cv.bitwise_not(image) # Invert the image
#     binary_img = cv.threshold(image, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)[1]
#     deskewed_img = deskew(binary_img)
#     return deskewed_img


def projection_segmentation(clean_img, axis, cut=15, min_width=20, min_height=30):
    """Segment the image based on the projection profile

    Args:
        clean_img : Preprocessed image
        axis (str): 'horizontal' or 'vertical'
        cut (int, optional): Gap between the segments. Defaults to 3.
        min_width (int, optional): Width of the segment. Defaults to 5.
        min_height (int, optional): Height of the segment. Defaults to 5.

    Returns:
        _type_: _description_
    """
    segments = []
    start = -1
    cnt = 0

    projection_bins = projection(clean_img, axis)
    for idx, projection_bin in enumerate(projection_bins):

        if projection_bin != 0:
            cnt = 0
        if projection_bin != 0 and start == -1:
            start = idx
        if projection_bin == 0 and start != -1:
            cnt += 1
            if cnt >= cut:
                if axis == 'horizontal':
                    # Line segmentation
                    segment = clean_img[max(start-1, 0):idx, :]
                    # if segment.shape[0] >= min_height:                    
                    segments.append(segment)
                elif axis == 'vertical':
                    # Word segmentation
                    segment = clean_img[:, max(start-1, 0):idx]
                    # if segment.shape[1] >= min_width:
                    segments.append(segment)
                cnt = 0
                start = -1
    
    return segments


# Line Segmentation
#----------------------------------------------------------------------------------------
def line_horizontal_projection(image, cut=3):

    # Segmentation    
    lines = projection_segmentation(image, axis='horizontal', cut=cut)
    return lines


# Word Segmentation
#----------------------------------------------------------------------------------------
def word_vertical_projection(line_image, cut=3):
    
    line_words = projection_segmentation(line_image, axis='vertical', cut=cut)
    line_words.reverse()
    return line_words


def extract_words(img, visual=0):

    lines = line_horizontal_projection(img)
    words = []
    
    for idx, line in enumerate(lines):
        
        if visual:
            # Check for the size of the line to be greater than 30
            # if line.shape[0] > 30:
            save_image(line, 'lines', f'line{idx}')

        line_words = word_vertical_projection(line)
        for w in line_words:
            # if len(words) == 585:
            #     print(idx)
            words.append((w, line))
        # words.extend(line_words)

    # breakpoint()
    if visual:
        for idx, word in enumerate(words):
            # check for the size of the word to be greater than 30
            # print (word[0].shape)
            # if word[0].shape[0] < 100 and word[0].shape[1] > 20 :
            save_image(word[0], 'words', f'word{idx}')
    return words

# # Try to extract the words from the preprocessed images
# x = 31
# p = preprocess(X_train[x])

# # show_images([X_train[999],p])

# # Extract the words from the preprocessed images
# words = extract_words(p, visual=1)
# show_images([X_train[x], p])



In [16]:
# Resize the images to 600x600
X_train_resized = []
for i in tqdm(X_train_preprocess):
    img = cv2.resize(i, (image_size, image_size))
    X_train_resized.append(img)
X_train_resized = np.array(X_train_resized)
    

100%|██████████| 4000/4000 [00:01<00:00, 3910.07it/s]


In [20]:
# Remove from X_train_resized with index = [2291, 2587, 2838]
X_train_resized = np.delete(X_train_resized, [2291, 2587, 2838], 0)

In [21]:
# Apply hog for the images
from skimage.feature import hog

X_train_hog = []
for i in tqdm(X_train_resized):
    X_train_hog.append(hog(i, orientations= 16, pixels_per_cell=(32, 32), cells_per_block=(4, 4), block_norm='L2-Hys'))
    

X_train_hog = np.array(X_train_hog)
print(X_train_hog.shape)
# # Try hog on one image
# hog_image = hog(X_train_resized[0], orientations= 16, pixels_per_cell=(32, 32), cells_per_block=(4,4), block_norm='L2-Hys')
# print(hog_image.shape)

100%|██████████| 3997/3997 [02:32<00:00, 26.26it/s]


(3997, 57600)


In [22]:
# Apply SIFT for the images
sift = cv2.SIFT_create()

X_train_sift = []
c=0
for i in tqdm(X_train_resized):
    kp, des = sift.detectAndCompute(i, None)
    if des is None:
        # Add a row of zeros to the SIFT descriptors
        des = np.zeros((1, 128))
    des = des.flatten()
    X_train_sift.append(des)
    c+=1



100%|██████████| 3997/3997 [03:37<00:00, 18.40it/s]


In [25]:
# Get the maximum keypoint length of the SIFT descriptors
max_kp = max(len(kp)/128 for kp in X_train_sift)

# Get the average keypoint length of the SIFT descriptors
avg_kp = np.mean([len(kp)/128 for kp in X_train_sift])

# Get the minimum keypoint length of the SIFT descriptors
min_kp = min(len(kp)/128 for kp in X_train_sift)

# Print the maximum keypoint length
print(max_kp)

# Print the average keypoint length
print(avg_kp)

# Print the minimum keypoint length
print(min_kp)


11506.0
1265.8418814110582
23.0


In [26]:
# Get the index of the image with keypoint <= 10
idx = [i for i in range(len(X_train_sift)) if len(X_train_sift[i])/128 <= 50]
# Print idx 
print(idx)

# Found images which has no text 

[1783, 3780]


In [27]:
# Pad the SIFT descriptors to the maximum length
fixed_len = 128 * 300  # Adjust this as needed

# Create a generator that yields each padded descriptor on-the-fly
padded_descriptors = (np.pad(des, (0, max(0, fixed_len - des.shape[0])))[:fixed_len] for des in X_train_sift)

# Convert the generator to a numpy array
X_train_sift_np = np.array(list(padded_descriptors))

# Print the shape of the numpy array
print(X_train_sift_np.shape)

(3997, 38400)


In [28]:
# # Concatenate the hog and sift features
X_train_features = np.concatenate((X_train_hog, X_train_sift_np), axis=1)
print(X_train_features.shape)


(3997, 96000)


In [47]:
# Split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train_features, y_train_org, test_size=0.2, random_state=42, stratify=y_train_org)

In [30]:
# Make a pipeline for the model which consist of StandardScaler , PCA and the model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Create a pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.99)),
])

# Fit the pipeline
pipe.fit(X_train)

# Dump the pipeline to a file
joblib.dump(pipe, 'pipeline.pkl')

['pipeline.pkl']

In [None]:
# Load the pipeline
pipe = joblib.load('pipeline.pkl')

In [31]:
# Transform the data
X_train_transformed = pipe.transform(X_train)
X_test_transformed = pipe.transform(X_test)

# Print the shape of the transformed data
print(X_train_transformed.shape)

(3197, 2984)


In [37]:
# Build a shallow neural network model of 1 hidden layer with 128 neurons and relu activation function
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Create a model
model = MLPClassifier(hidden_layer_sizes=(256,), max_iter=1000, activation='relu', solver='adam', 
                      random_state=42)

# Fit the model
model.fit(X_train_transformed, y_train)

# Predict the training data
y_train_pred = model.predict(X_train_transformed)

# Predict the testing data
y_test_pred = model.predict(X_test_transformed)

# Print the accuracy of the model
print('Train accuracy: ', accuracy_score(y_train, y_train_pred)*100)

print('Test accuracy: ', accuracy_score(y_test, y_test_pred)*100)

# Print the classification report
print('Train classification report: ', classification_report(y_train, y_train_pred, target_names=labels))

print('Test classification report: ', classification_report(y_test, y_test_pred, target_names=labels))

# Accuracy: 92.75%

Train accuracy:  100.0
Test accuracy:  92.75
Train classification report:                        precision    recall  f1-score   support

    Scheherazade New       1.00      1.00      1.00       800
              Marhey       1.00      1.00      1.00       800
            Lemonada       1.00      1.00      1.00       797
IBM Plex Sans Arabic       1.00      1.00      1.00       800

            accuracy                           1.00      3197
           macro avg       1.00      1.00      1.00      3197
        weighted avg       1.00      1.00      1.00      3197

Test classification report:                        precision    recall  f1-score   support

    Scheherazade New       0.92      0.91      0.91       200
              Marhey       0.92      0.93      0.93       200
            Lemonada       0.98      0.98      0.98       200
IBM Plex Sans Arabic       0.89      0.89      0.89       200

            accuracy                           0.93       800
           macro avg   

In [33]:
# lin_model = LogisticRegression()

# # Define the hyperparameters
# param_grid = {
#     'C': np.logspace(-4, 4, 30),
#     'penalty': ['l2'],
#     'solver': ['liblinear', 'saga', 'lbfgs'],
#     'warm_start': [True, False]
# }

# # Initialize the RandomizedSearchCV
# random_search = RandomizedSearchCV(lin_model, param_distributions=param_grid, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

# # Fit the model
# random_search.fit(X_train_transformed, y_train)
# # cv_results_df = pd.DataFrame(clf_searched.cv_results_)[relevant_columns].round(decimals=3).sort_values(by='rank_test_score')
# # cv_results_df.head(10)
# relevant_columns = ['param_C', 'param_penalty', 'param_solver', 'param_warm_start', 'mean_test_score', 'std_test_score', 'rank_test_score']
# cv_results_df = pd.DataFrame(random_search.cv_results_)[relevant_columns].round(decimals=3).sort_values(by='rank_test_score')
# cv_results_df.head(10)

# # Print the best parameters
# print(random_search.best_params_)

In [34]:
# Initialize the model and fit the data

# Initialize the model
# {'warm_start': True, 'solver': 'saga', 'penalty': 'l2', 'C': 0.001}

# {'warm_start': True, 'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.0006723357536499335}
model = LogisticRegression(warm_start=True, solver='saga', penalty='l2', C=0.8, random_state=42)
# model = LogisticRegression()

# Fit the model
model.fit(X_train_transformed, y_train)

# Predict the test data
y_pred = model.predict(X_test_transformed)

# Predict the train data
y_pred_train = model.predict(X_train_transformed)

# Print the training accuracy
print(f"Training Accuracy: {accuracy_score(y_train, y_pred_train)*100}")

# Print the testing accuracy
print(f"Testing Accuracy: {accuracy_score(y_test, y_pred)*100}")

# Print the f1 score of the training data
print(f"Training F1 Score: {f1_score(y_train, y_pred_train, average='weighted')}")

# Print the f1 score of the testing data
print (f"Testing F1 Score: {f1_score(y_test, y_pred, average='weighted')}")

# Print the classification report of the training data
print(classification_report(y_train, y_pred_train, target_names=labels))

# Print the classification report of the testing data
print(classification_report(y_test, y_pred, target_names=labels))

# Save the model
joblib.dump(model, 'logistic_model.pkl')

#ACCUARCY: 95.375%

Training Accuracy: 100.0
Testing Accuracy: 95.375
Training F1 Score: 1.0
Testing F1 Score: 0.9538616748349624
                      precision    recall  f1-score   support

    Scheherazade New       1.00      1.00      1.00       800
              Marhey       1.00      1.00      1.00       800
            Lemonada       1.00      1.00      1.00       797
IBM Plex Sans Arabic       1.00      1.00      1.00       800

            accuracy                           1.00      3197
           macro avg       1.00      1.00      1.00      3197
        weighted avg       1.00      1.00      1.00      3197

                      precision    recall  f1-score   support

    Scheherazade New       0.96      0.94      0.95       200
              Marhey       0.91      0.94      0.93       200
            Lemonada       0.99      0.99      0.99       200
IBM Plex Sans Arabic       0.95      0.94      0.94       200

            accuracy                           0.95       800
           macro 



['logistic_model.pkl']

In [None]:
# svm = SVC()

# param_dist = {
#     'C': np.logspace(-3, 3, 15), 
#     'kernel': ['poly', 'rbf'], 
#     'degree': [2, 3],
#     'gamma': ['scale', 'auto']
# }

# clf_searched = RandomizedSearchCV(svm, param_dist, n_iter=100, cv=5, random_state=42, n_jobs=-1, verbose=1)

# clf_searched.fit(X_train_transformed, y_train)
# relevant_columns = ['param_C', 'param_kernel', 'param_gamma', 'mean_test_score', 'std_test_score', 'rank_test_score']
# cv_results_df = pd.DataFrame(clf_searched.cv_results_)[relevant_columns].round(decimals=3).sort_values(by='rank_test_score')
# cv_results_df.head(10)

# Best parameters of the SVM
# {C : 7.196857 , kernel : 'rbf', gamma : 'scale'}

Fitting 5 folds for each of 100 candidates, totalling 500 fits


Unnamed: 0,param_C,param_kernel,param_gamma,mean_test_score,std_test_score,rank_test_score
73,7.196857,rbf,scale,0.933,0.01,1
45,19.306977,rbf,scale,0.933,0.01,1
95,372.759372,rbf,scale,0.933,0.01,1
84,51.794747,rbf,scale,0.933,0.01,1
80,19.306977,rbf,scale,0.933,0.01,1
6,7.196857,rbf,scale,0.933,0.01,1
71,1000.0,rbf,scale,0.933,0.01,1
16,372.759372,rbf,scale,0.933,0.01,1
88,138.949549,rbf,scale,0.933,0.01,1
29,138.949549,rbf,scale,0.933,0.01,1


In [38]:
# Initialize the SVM model
svm = SVC(C=7.196857 , kernel='rbf', gamma='scale', random_state=42)

# Fit the SVM on the training data
svm.fit(X_train_transformed, y_train)

# Predict the labels of the test set
y_pred = svm.predict(X_test_transformed)
# Predict the labels of the training set
y_pred_train = svm.predict(X_train_transformed)

# Print the accuracy of the SVM model on the training set
print(f"Accuracy of SVM model on the training set: {accuracy_score(y_train, y_pred_train)*100}")

# Print the accuracy of the SVM model on the test set
print(f"Accuracy of SVM model: {accuracy_score(y_test, y_pred)*100}")

# Print the classification report of the SVM model
print(classification_report(y_train, y_pred_train, target_names=labels))

# Print the classification report of the SVM model
print(classification_report(y_test, y_pred, target_names=labels))

# Save the SVM model as h5 file
joblib.dump(svm, 'svm_model.pkl')

#ACCURACY: 91.25%

Accuracy of SVM model on the training set: 100.0
Accuracy of SVM model: 90.125
                      precision    recall  f1-score   support

    Scheherazade New       1.00      1.00      1.00       800
              Marhey       1.00      1.00      1.00       800
            Lemonada       1.00      1.00      1.00       797
IBM Plex Sans Arabic       1.00      1.00      1.00       800

            accuracy                           1.00      3197
           macro avg       1.00      1.00      1.00      3197
        weighted avg       1.00      1.00      1.00      3197

                      precision    recall  f1-score   support

    Scheherazade New       0.99      0.66      0.79       200
              Marhey       0.87      0.96      0.91       200
            Lemonada       1.00      1.00      1.00       200
IBM Plex Sans Arabic       0.80      0.99      0.88       200

            accuracy                           0.90       800
           macro avg       0.92      0.90      0

['svm_model.pkl']

In [None]:
# # Random Forest Classifier

# random_forest_clf = RandomForestClassifier(random_state=42)

# # Define the hyperparameters distribution (use same ranges as before)
# param_dist = {
#     'n_estimators': np.arange(200, 500), 
#     'min_samples_split':  range(2,51),                    # 2 to 50
#     'min_samples_leaf': range(1,21),                      # 1 to 20
#     'max_depth': range(5, 51, 5),                              # 5 to 50 with step of 5
#     'min_impurity_decrease': np.linspace(0.0, 0.2, 20),                   # Decide a reasonable range here (with 20 values)
# }

# # Initialize RandomizedSearchCV
# clf_searched = RandomizedSearchCV(random_forest_clf, param_dist, n_iter=100, cv=5, random_state=42, n_jobs=-1, verbose=1)

# clf_searched.fit(X_train_transformed, y_train)
# relevant_columns = ['param_n_estimators', 'param_min_samples_split', 'param_min_samples_leaf', 'param_max_depth', 'param_min_impurity_decrease', 
#                     'mean_test_score', 'std_test_score', 'rank_test_score']
# cv_results_df = pd.DataFrame(clf_searched.cv_results_)[relevant_columns].round(decimals=3).sort_values(by='rank_test_score')
# cv_results_df.head(10)

# # Best parameters of the Random Forest Classifier
# # {'n_estimators': 371, 'min_samples_split': 48, 'min_samples_leaf': 4, 'min_impurity_decrease': 0.0, 'max_depth': 30}
# # Best score: 0.776

In [None]:
random_forest_clf = RandomForestClassifier(n_estimators=371, min_samples_split=48, min_samples_leaf=4, min_impurity_decrease=0., max_depth=30, random_state=42)

# Fit the Random Forest Classifier
random_forest_clf.fit(X_train_transformed, y_train)

# Predict the labels of the test set
y_pred = random_forest_clf.predict(X_test_transformed)

# Predict the labels of the training set
y_pred_train = random_forest_clf.predict(X_train_transformed)

# Print the accuracy of the Random Forest Classifier model on the training set
print(f"Accuracy of Random Forest Classifier model on the training set: {accuracy_score(y_train, y_pred_train)*100}")

# Print the accuracy of the Random Forest Classifier model on the test set
print(f"Accuracy of Random Forest Classifier model: {accuracy_score(y_test, y_pred)*100}")

# Print the classification report of the Random Forest Classifier model
print(classification_report(y_train, y_pred_train, target_names=labels))

# Print the classification report of the Random Forest Classifier model
print(classification_report(y_test, y_pred, target_names=labels))

# Save the Random Forest Classifier model as h5 file
joblib.dump(random_forest_clf, 'random_forest_clf.pkl')

#ACCURACY: 83.25%

Accuracy of Random Forest Classifier model on the training set: 99.96875
Accuracy of Random Forest Classifier model: 83.25
                      precision    recall  f1-score   support

    Scheherazade New       1.00      1.00      1.00       800
              Marhey       1.00      1.00      1.00       800
            Lemonada       1.00      1.00      1.00       800
IBM Plex Sans Arabic       1.00      1.00      1.00       800

            accuracy                           1.00      3200
           macro avg       1.00      1.00      1.00      3200
        weighted avg       1.00      1.00      1.00      3200

                      precision    recall  f1-score   support

    Scheherazade New       0.92      0.71      0.81       200
              Marhey       0.67      0.95      0.79       200
            Lemonada       0.96      1.00      0.98       200
IBM Plex Sans Arabic       0.86      0.67      0.75       200

            accuracy                           0.83       800
    

['random_forest_clf.pkl']

In [56]:
# Build a neural networ consist of input layer of size 400, hidden layer of size 256 , hidden layer of size 128 and output layer of size 4 pytorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Convert the data to tensors
X_train_tensor = torch.tensor(X_train_transformed, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_transformed, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Print the shape of the tensors
print(X_train_tensor.shape, y_train_tensor.shape, X_test_tensor.shape, y_test_tensor.shape)

# Create a dataset
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create a dataloader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the neural network
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(2984, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 4)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
# Create the model
model = Net()

# Define the loss function
criterion = nn.CrossEntropyLoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0002)

# Train the model
for epoch in tqdm(range(100)):
    model.train()
    for X, y in train_loader:
        optimizer.zero_grad()
        output = model(X)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
    
# Evaluate the model
model.eval()

# Get the predictions
y_pred = []
y_true = []
for X, y in test_loader:
    output = model(X)
    y_pred.extend(torch.argmax(output, 1).tolist())
    y_true.extend(y.tolist())
    
# Print the accuracy
print('Accuracy: ', accuracy_score(y_true, y_pred)*100)

# Print the classification report
print(classification_report(y_true, y_pred, target_names=labels))
# Accuracy:  96.375%

torch.Size([3197, 2984]) torch.Size([3197]) torch.Size([800, 2984]) torch.Size([800])


  1%|          | 1/100 [00:00<01:38,  1.00it/s]

Epoch 1, Loss: 0.3966382145881653


  2%|▏         | 2/100 [00:01<01:28,  1.10it/s]

Epoch 2, Loss: 0.04180854186415672


  3%|▎         | 3/100 [00:02<01:26,  1.12it/s]

Epoch 3, Loss: 0.005950699094682932


  4%|▍         | 4/100 [00:03<01:27,  1.10it/s]

Epoch 4, Loss: 0.0030725402757525444


  5%|▌         | 5/100 [00:04<01:27,  1.09it/s]

Epoch 5, Loss: 0.001876615104265511


  6%|▌         | 6/100 [00:05<01:24,  1.11it/s]

Epoch 6, Loss: 0.0016626014839857817


  7%|▋         | 7/100 [00:06<01:22,  1.13it/s]

Epoch 7, Loss: 0.0014224061742424965


  8%|▊         | 8/100 [00:07<01:20,  1.14it/s]

Epoch 8, Loss: 0.0007364930352196097


  9%|▉         | 9/100 [00:08<01:18,  1.15it/s]

Epoch 9, Loss: 0.0005590920336544514


 10%|█         | 10/100 [00:08<01:17,  1.16it/s]

Epoch 10, Loss: 0.0003658164059743285


 11%|█         | 11/100 [00:09<01:16,  1.16it/s]

Epoch 11, Loss: 0.0004937460180372


 12%|█▏        | 12/100 [00:10<01:17,  1.14it/s]

Epoch 12, Loss: 0.0003923581098206341


 13%|█▎        | 13/100 [00:11<01:15,  1.15it/s]

Epoch 13, Loss: 0.0003619663475546986


 14%|█▍        | 14/100 [00:12<01:14,  1.16it/s]

Epoch 14, Loss: 0.00028472140547819436


 15%|█▌        | 15/100 [00:13<01:12,  1.17it/s]

Epoch 15, Loss: 0.00018523944891057909


 16%|█▌        | 16/100 [00:14<01:11,  1.17it/s]

Epoch 16, Loss: 0.00020199721620883793


 17%|█▋        | 17/100 [00:14<01:10,  1.17it/s]

Epoch 17, Loss: 0.0001640623522689566


 18%|█▊        | 18/100 [00:15<01:09,  1.18it/s]

Epoch 18, Loss: 0.00012937464634887874


 19%|█▉        | 19/100 [00:16<01:09,  1.17it/s]

Epoch 19, Loss: 0.00013986659178044647


 20%|██        | 20/100 [00:17<01:11,  1.12it/s]

Epoch 20, Loss: 0.00011749611439881846


 21%|██        | 21/100 [00:18<01:08,  1.15it/s]

Epoch 21, Loss: 0.0001647258031880483


 22%|██▏       | 22/100 [00:19<01:06,  1.17it/s]

Epoch 22, Loss: 9.03828622540459e-05


 23%|██▎       | 23/100 [00:20<01:04,  1.19it/s]

Epoch 23, Loss: 7.5092728366144e-05


 24%|██▍       | 24/100 [00:20<01:02,  1.21it/s]

Epoch 24, Loss: 8.201754098990932e-05


 25%|██▌       | 25/100 [00:21<01:01,  1.22it/s]

Epoch 25, Loss: 6.974231655476615e-05


 26%|██▌       | 26/100 [00:22<01:00,  1.22it/s]

Epoch 26, Loss: 6.125826621428132e-05


 27%|██▋       | 27/100 [00:23<00:59,  1.22it/s]

Epoch 27, Loss: 6.422504520742223e-05


 28%|██▊       | 28/100 [00:24<01:00,  1.19it/s]

Epoch 28, Loss: 4.5232231059344485e-05


 29%|██▉       | 29/100 [00:24<00:58,  1.21it/s]

Epoch 29, Loss: 4.566370262182318e-05


 30%|███       | 30/100 [00:25<00:57,  1.21it/s]

Epoch 30, Loss: 3.7397399864858016e-05


 31%|███       | 31/100 [00:26<00:56,  1.22it/s]

Epoch 31, Loss: 4.101844024262391e-05


 32%|███▏      | 32/100 [00:27<00:55,  1.22it/s]

Epoch 32, Loss: 3.9551501686219126e-05


 33%|███▎      | 33/100 [00:28<00:54,  1.23it/s]

Epoch 33, Loss: 3.0541552405338734e-05


 34%|███▍      | 34/100 [00:28<00:53,  1.23it/s]

Epoch 34, Loss: 3.0488105039694346e-05


 35%|███▌      | 35/100 [00:29<00:52,  1.23it/s]

Epoch 35, Loss: 3.23294589179568e-05


 36%|███▌      | 36/100 [00:30<00:53,  1.20it/s]

Epoch 36, Loss: 2.5543027732055634e-05


 37%|███▋      | 37/100 [00:31<00:51,  1.21it/s]

Epoch 37, Loss: 2.3323520508711226e-05


 38%|███▊      | 38/100 [00:32<00:50,  1.22it/s]

Epoch 38, Loss: 2.132991903636139e-05


 39%|███▉      | 39/100 [00:33<00:49,  1.22it/s]

Epoch 39, Loss: 2.3594502636115067e-05


 40%|████      | 40/100 [00:33<00:48,  1.23it/s]

Epoch 40, Loss: 2.1469579223776236e-05


 41%|████      | 41/100 [00:34<00:47,  1.24it/s]

Epoch 41, Loss: 1.5131189684325363e-05


 42%|████▏     | 42/100 [00:35<00:46,  1.24it/s]

Epoch 42, Loss: 1.542301833978854e-05


 43%|████▎     | 43/100 [00:36<00:45,  1.24it/s]

Epoch 43, Loss: 1.9858189261867665e-05


 44%|████▍     | 44/100 [00:37<00:46,  1.21it/s]

Epoch 44, Loss: 1.495855667599244e-05


 45%|████▌     | 45/100 [00:37<00:44,  1.22it/s]

Epoch 45, Loss: 1.1415191693231463e-05


 46%|████▌     | 46/100 [00:38<00:44,  1.22it/s]

Epoch 46, Loss: 1.726037953631021e-05


 47%|████▋     | 47/100 [00:39<00:43,  1.22it/s]

Epoch 47, Loss: 9.89018280961318e-06


 48%|████▊     | 48/100 [00:40<00:42,  1.23it/s]

Epoch 48, Loss: 1.219199566548923e-05


 49%|████▉     | 49/100 [00:41<00:41,  1.23it/s]

Epoch 49, Loss: 1.3515703358280007e-05


 50%|█████     | 50/100 [00:42<00:40,  1.23it/s]

Epoch 50, Loss: 7.333389476116281e-06


 51%|█████     | 51/100 [00:42<00:39,  1.23it/s]

Epoch 51, Loss: 9.49966397456592e-06


 52%|█████▏    | 52/100 [00:43<00:39,  1.21it/s]

Epoch 52, Loss: 1.2163343853899278e-05


 53%|█████▎    | 53/100 [00:44<00:38,  1.22it/s]

Epoch 53, Loss: 8.01574788056314e-06


 54%|█████▍    | 54/100 [00:45<00:37,  1.22it/s]

Epoch 54, Loss: 8.023955160751939e-06


 55%|█████▌    | 55/100 [00:46<00:36,  1.23it/s]

Epoch 55, Loss: 7.904753147158772e-06


 56%|█████▌    | 56/100 [00:46<00:35,  1.23it/s]

Epoch 56, Loss: 8.311706551467068e-06


 57%|█████▋    | 57/100 [00:47<00:35,  1.23it/s]

Epoch 57, Loss: 5.4466099754790775e-06


 58%|█████▊    | 58/100 [00:48<00:34,  1.23it/s]

Epoch 58, Loss: 5.541152859223075e-06


 59%|█████▉    | 59/100 [00:49<00:33,  1.23it/s]

Epoch 59, Loss: 8.14728900877526e-06


 60%|██████    | 60/100 [00:50<00:33,  1.20it/s]

Epoch 60, Loss: 5.454804067994701e-06


 61%|██████    | 61/100 [00:51<00:32,  1.21it/s]

Epoch 61, Loss: 4.842345333599951e-06


 62%|██████▏   | 62/100 [00:51<00:31,  1.22it/s]

Epoch 62, Loss: 3.6297060432843864e-06


 63%|██████▎   | 63/100 [00:52<00:30,  1.23it/s]

Epoch 63, Loss: 4.028440343972761e-06


 64%|██████▍   | 64/100 [00:53<00:29,  1.23it/s]

Epoch 64, Loss: 4.603925845003687e-06


 65%|██████▌   | 65/100 [00:54<00:28,  1.23it/s]

Epoch 65, Loss: 3.707810037667514e-06


 66%|██████▌   | 66/100 [00:55<00:27,  1.23it/s]

Epoch 66, Loss: 5.138291271578055e-06


 67%|██████▋   | 67/100 [00:56<00:27,  1.20it/s]

Epoch 67, Loss: 4.464163794182241e-06


 68%|██████▊   | 68/100 [00:56<00:26,  1.21it/s]

Epoch 68, Loss: 3.7859115309402114e-06


 69%|██████▉   | 69/100 [00:57<00:25,  1.21it/s]

Epoch 69, Loss: 4.20108653997886e-06


 70%|███████   | 70/100 [00:58<00:24,  1.22it/s]

Epoch 70, Loss: 3.0295527722046245e-06


 71%|███████   | 71/100 [00:59<00:23,  1.22it/s]

Epoch 71, Loss: 3.0172200240485836e-06


 72%|███████▏  | 72/100 [01:00<00:23,  1.22it/s]

Epoch 72, Loss: 2.5609399472159566e-06


 73%|███████▎  | 73/100 [01:00<00:22,  1.19it/s]

Epoch 73, Loss: 2.96377993436181e-06


 74%|███████▍  | 74/100 [01:01<00:21,  1.19it/s]

Epoch 74, Loss: 1.882681999632041e-06


 75%|███████▌  | 75/100 [01:02<00:20,  1.20it/s]

Epoch 75, Loss: 2.301967924722703e-06


 76%|███████▌  | 76/100 [01:03<00:20,  1.18it/s]

Epoch 76, Loss: 2.0717714050988434e-06


 77%|███████▋  | 77/100 [01:04<00:19,  1.19it/s]

Epoch 77, Loss: 2.116988298439537e-06


 78%|███████▊  | 78/100 [01:05<00:18,  1.20it/s]

Epoch 78, Loss: 2.1991993435221957e-06


 79%|███████▉  | 79/100 [01:05<00:17,  1.21it/s]

Epoch 79, Loss: 1.8991242995980429e-06


 80%|████████  | 80/100 [01:06<00:16,  1.22it/s]

Epoch 80, Loss: 1.648374336582492e-06


 81%|████████  | 81/100 [01:07<00:15,  1.22it/s]

Epoch 81, Loss: 1.311300820816541e-06


 82%|████████▏ | 82/100 [01:08<00:14,  1.23it/s]

Epoch 82, Loss: 1.4880590697430307e-06


 83%|████████▎ | 83/100 [01:09<00:14,  1.20it/s]

Epoch 83, Loss: 1.4346209127324983e-06


 84%|████████▍ | 84/100 [01:10<00:13,  1.17it/s]

Epoch 84, Loss: 1.483948608438368e-06


 85%|████████▌ | 85/100 [01:11<00:13,  1.14it/s]

Epoch 85, Loss: 1.122210505855037e-06


 86%|████████▌ | 86/100 [01:11<00:12,  1.16it/s]

Epoch 86, Loss: 1.1263209671596996e-06


 87%|████████▋ | 87/100 [01:12<00:11,  1.14it/s]

Epoch 87, Loss: 9.577843229635619e-07


 88%|████████▊ | 88/100 [01:13<00:10,  1.16it/s]

Epoch 88, Loss: 1.1468740694908774e-06


 89%|████████▉ | 89/100 [01:14<00:09,  1.18it/s]

Epoch 89, Loss: 1.0030013299910934e-06


 90%|█████████ | 90/100 [01:15<00:08,  1.20it/s]

Epoch 90, Loss: 9.043458248925162e-07


 91%|█████████ | 91/100 [01:16<00:07,  1.18it/s]

Epoch 91, Loss: 6.24820700068085e-07


 92%|█████████▏| 92/100 [01:16<00:06,  1.20it/s]

Epoch 92, Loss: 7.44030103305704e-07


 93%|█████████▎| 93/100 [01:17<00:05,  1.21it/s]

Epoch 93, Loss: 9.701155931907124e-07


 94%|█████████▍| 94/100 [01:18<00:04,  1.22it/s]

Epoch 94, Loss: 6.65927530008048e-07


 95%|█████████▌| 95/100 [01:19<00:04,  1.23it/s]

Epoch 95, Loss: 5.960462772236497e-07


 96%|█████████▌| 96/100 [01:20<00:03,  1.23it/s]

Epoch 96, Loss: 7.070339620440791e-07


 97%|█████████▋| 97/100 [01:20<00:02,  1.23it/s]

Epoch 97, Loss: 6.207101819200034e-07


 98%|█████████▊| 98/100 [01:21<00:01,  1.20it/s]

Epoch 98, Loss: 7.933579126984114e-07


 99%|█████████▉| 99/100 [01:22<00:00,  1.21it/s]

Epoch 99, Loss: 5.015010060560599e-07


100%|██████████| 100/100 [01:23<00:00,  1.20it/s]

Epoch 100, Loss: 5.878245019630413e-07
Accuracy:  96.375
                      precision    recall  f1-score   support

    Scheherazade New       0.96      0.96      0.96       200
              Marhey       0.95      0.94      0.94       200
            Lemonada       0.99      0.99      0.99       200
IBM Plex Sans Arabic       0.96      0.96      0.96       200

            accuracy                           0.96       800
           macro avg       0.96      0.96      0.96       800
        weighted avg       0.96      0.96      0.96       800






In [None]:
# Stacking the models together to get the best model
from sklearn.ensemble import StackingClassifier

# Define the base models
estimators = [
    ('logistic', LogisticRegression(warm_start=True, solver='saga', penalty='l2', C=0.8, random_state=42)),
    ('svm', SVC(C=7.196857 , kernel='rbf', gamma='scale', random_state=42)),
    # ('mlp', MLPClassifier(hidden_layer_sizes=(256,), max_iter=1000, activation='relu', solver='adam', 
    #                 random_state=42))
]
meta_model = LogisticRegression()

# Initialize the stacking classifier
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=meta_model, cv=5)

# Fit the stacking classifier
stacking_clf.fit(X_train_transformed, y_train)

# Predict the labels of the test set
y_pred = stacking_clf.predict(X_test_transformed)

# Predict the labels of the training set
y_pred_train = stacking_clf.predict(X_train_transformed)

# Print the accuracy of the Stacking Classifier model on the training set
print(f"Accuracy of Stacking Classifier model on the training set: {accuracy_score(y_train, y_pred_train)*100}")

# Print the accuracy of the Stacking Classifier model on the test set
print(f"Accuracy of Stacking Classifier model: {accuracy_score(y_test, y_pred)*100}")

# Print the classification report of the Stacking Classifier model
print(classification_report(y_train, y_pred_train, target_names=labels))

# Print the classification report of the Stacking Classifier model
print(classification_report(y_test, y_pred, target_names=labels))

# ACCURACY: 96%

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy of Stacking Classifier model on the training set: 100.0
Accuracy of Stacking Classifier model: 96.0
                      precision    recall  f1-score   support

    Scheherazade New       1.00      1.00      1.00       800
              Marhey       1.00      1.00      1.00       800
            Lemonada       1.00      1.00      1.00       800
IBM Plex Sans Arabic       1.00      1.00      1.00       800

            accuracy                           1.00      3200
           macro avg       1.00      1.00      1.00      3200
        weighted avg       1.00      1.00      1.00      3200

                      precision    recall  f1-score   support

    Scheherazade New       0.97      0.92      0.95       200
              Marhey       0.95      0.95      0.95       200
            Lemonada       0.99      1.00      0.99       200
IBM Plex Sans Arabic       0.93      0.97      0.95       200

            accuracy                           0.96       800
           macro a