In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the current directory tree lazily and print the path of every file
# found, in the order os.walk yields them.
all_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk("./")
    for name in names
)
for file_path in all_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Frame the problem
Using the customer description, define the problem you're trying to solve in your own words (remember, this is not technical but must be specific enough that the customer understands the project).

The goal of this project is to develop a system that can look at images of handwritten cursive letters and accurately identify which letter each image represents. The customer wants a tool that can process many different handwritten samples, including different handwriting styles, and automatically recognize each letter the image is showing.

# 2. Get the Data 
Define how you received the data (provided, gathered, ...).

The dataset was provided as a zip file containing around 30 samples of the cursive letters a through z, with the folders separated by the student who wrote each letter and the file names signifying the letter. Because the dataset consists of handwritten images, recognition will be more challenging due to the natural variations in handwriting style.

In [1]:
import zipfile, os
from pillow_heif import register_heif_opener
from PIL import Image

# Locations used by this cell: the source archive, the folder it is unpacked
# into, and the folder that receives the converted JPEG copies.
zip_path = "images.zip"
extract_path = "extracted_images"
converted_path = "converted_images"


# Unpack the whole archive, keeping its internal folder layout.
os.makedirs(extract_path, exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
print("All images extracted.")


# Register the HEIF plugin so that Image.open can read .heic/.heif files.
register_heif_opener()
os.makedirs(converted_path, exist_ok=True)

for root, _, files in os.walk(extract_path):
    for f in files:
        # Accept both extensions handled by the HEIF opener.
        if f.lower().endswith((".heic", ".heif")):
            heic_path = os.path.join(root, f)
            # Mirror the sub-folder layout of the extracted tree under
            # converted_path.
            rel_dir = os.path.relpath(root, extract_path)
            target_dir = os.path.join(converted_path, rel_dir)
            os.makedirs(target_dir, exist_ok=True)

            # splitext drops only the final extension.
            stem = os.path.splitext(f)[0]
            jpg_path = os.path.join(target_dir, stem + ".jpg")
            with Image.open(heic_path) as img:
                # Grayscale ("L"), fixed 64x64 size, saved as a JPEG.
                img.convert("L").resize((64, 64)).save(jpg_path, "JPEG", quality=85, optimize=True)

# Plain ASCII hyphen: the previous en dash had been mangled into mojibake.
print("All HEIC images converted to JPG (A-Z structure preserved).")


All HEIC images converted to JPG and saved in: converted_images


# 3. Explore the Data
Gain insights into the data you have from step 2, making sure to identify any bias

After extracting the zip file, I explored the dataset and verified the number of samples per letter. The number of samples varied from letter to letter because some letters had duplicates, and the folders were organized inconsistently, so I will most likely have to flatten all subfolders into one big folder with each image named after the letter it shows. I will also have to crop some images down to just the paper the letter is written on, because some photos are very zoomed out, which will make it difficult to recognize the letter.

In [None]:
import cv2
import numpy as np

# Load every converted image as a 64x64 grayscale array, together with a
# label taken from the name of the folder that contains it.
data = []
labels = []
unreadable = []  # paths that cv2 could not decode

for root, dirs, files in os.walk("converted_images"):
    # Sorting dirs in place fixes the traversal order, and sorting files
    # fixes the order within a folder, so the load order is reproducible.
    dirs.sort()
    for file in sorted(files):
        if file.lower().endswith((".jpg", ".jpeg", ".png")):
            img_path = os.path.join(root, file)
            # NOTE(review): the label is the parent folder name, so this
            # assumes one folder per letter - confirm against the dataset layout.
            label = os.path.basename(root).lower()
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            # cv2.imread returns None instead of raising when a file cannot
            # be decoded; skip it here rather than crash inside cv2.resize.
            if img is None:
                unreadable.append(img_path)
                continue
            img = cv2.resize(img, (64, 64))
            data.append(img)
            labels.append(label)

if unreadable:
    print(f"Skipped {len(unreadable)} unreadable image(s): {unreadable}")

# Add a channel axis and scale pixel values into [0, 1].
data = np.array(data).reshape(-1, 64, 64, 1) / 255.0
labels = np.array(labels)

print(f"Loaded {len(data)} images from {len(np.unique(labels))} letter folders.")

# Per-label sample counts, to expose any class imbalance.
for label_name, label_count in zip(*np.unique(labels, return_counts=True)):
    print(f"  {label_name}: {label_count}")


# 4. Prepare the Data


Apply any data transformations and explain what and why


In this step, I prepared the handwritten cursive letter images for modeling. I will separate them into subfolders representing each letter. I loaded every image, resized it to a consistent 64x64, and converted it to grayscale so that the model will focus more on shape than color. The pixel values were normalized to a 0-1 scale to ensure consistency. The folder names are used as labels, which are encoded into numerical values and then one-hot encoded so they can be used for classification later. Then the data was split into 80% training and 20% testing.

In [None]:
import os
import cv2
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

data_dir = "converted_images"

data = []
labels = []
unreadable = []  # paths that cv2 could not decode

for root, dirs, files in os.walk(data_dir):
    # os.walk order depends on the filesystem; sorting makes the sample
    # order, and therefore the seeded split below, reproducible.
    dirs.sort()
    for f in sorted(files):
        if f.lower().endswith((".jpg", ".jpeg", ".png")):
            img_path = os.path.join(root, f)
            # NOTE(review): the label is the parent folder name, so this
            # assumes one folder per letter - confirm against the dataset layout.
            label = os.path.basename(root).lower()

            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            # cv2.imread returns None instead of raising when a file cannot
            # be decoded; skip it here rather than crash inside cv2.resize.
            if img is None:
                unreadable.append(img_path)
                continue
            img = cv2.resize(img, (64, 64))
            data.append(img)
            labels.append(label)

if unreadable:
    print(f"Skipped {len(unreadable)} unreadable image(s): {unreadable}")

# Fail early with a clear message instead of an opaque error further down.
if not data:
    raise FileNotFoundError(
        f"No .jpg/.jpeg/.png images found under '{data_dir}'; run the conversion cell first."
    )

# Add a channel axis: (n_samples, 64, 64, 1).
data = np.array(data).reshape(-1, 64, 64, 1)
labels = np.array(labels)

# Scale pixel values into [0, 1] as float32.
data = data.astype("float32") / 255.0

# Folder-name labels -> integer class ids -> one-hot vectors.
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(labels)
y_categorical = to_categorical(y_encoded)

# 80/20 split, stratified on the 1-D integer class ids so each class keeps
# the same proportion in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    data, y_categorical, test_size=0.2, random_state=42, stratify=y_encoded
)

print(f"Total images: {len(data)}")
print(f"Letters (classes): {len(np.unique(labels))}")
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")


# 5. Model the data
Using selected ML models, experiment with your choices and describe your findings. Finish by selecting a model to continue with.


# 6. Fine Tune the Model

With the selected model, describe the steps taken to achieve the best results possible.


# 7. Present
In a customer-facing document, provide a summary of findings and detail the approach taken.


# 8. Launch the Model System
Define your production run code. This should be self-sufficient and require only your model parameters.
