## CLIP 336x336


In [None]:
import os

import pandas as pd
import torch
from google.colab import drive
from PIL import Image, ImageOps
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import CLIPModel, CLIPProcessor


# Mount Google Drive and resolve the dataset locations used by the cells below.
drive.mount('/content/drive')
data_path = '/content/drive/My Drive/capstone_dataset/'
labels_path = os.path.join(data_path, 'image_labels.xlsx')
images_path = os.path.join(data_path, 'images')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Read and process labels: letter grades become integer class ids; any value outside the mapping
# becomes NaN and the row is dropped.
df = pd.read_excel(labels_path)
label_mapping = {'H': 2, 'M': 1, 'L': 0}
# The later cells of this notebook read the same spreadsheet through a lower-case 'label' header,
# so the column is resolved case-insensitively instead of hard-coding 'Label'.
label_column = next(
    (col for col in df.columns if str(col).strip().lower() == 'label'),
    'Label',  # fall back to the original name so a missing column still raises KeyError
)
df[label_column] = df[label_column].map(label_mapping)
original_count = len(df)
df = df.dropna(subset=[label_column])
print(f"Removed {original_count - len(df)} rows with NaN labels.")

train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)


# Data transformation and loading
# Per-channel statistics that the CLIP image preprocessor applies by default; the pretrained
# vision tower expects inputs standardised this way rather than raw [0, 1] pixels.
CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
transform = transforms.Compose([
    transforms.Resize((336, 336)),  # input resolution of the "-336" checkpoint loaded below
    transforms.ToTensor(),
    transforms.Normalize(mean=CLIP_MEAN, std=CLIP_STD),
])

class HandwritingDataset(Dataset):
    """Map-style dataset that yields one ``(image, label)`` pair per annotated row.

    Args:
        annotations_file: An already-loaded table (indexed with ``.iloc``) whose first column is
            the image file name relative to ``img_dir`` and whose second column is the label.
        img_dir: Directory containing the image files.
        transform: Optional callable applied to the RGB PIL image before it is returned.
    """

    def __init__(self, annotations_file, img_dir, transform=None):
        self.transform = transform
        self.img_dir = img_dir
        self.img_labels = annotations_file

    def __len__(self):
        # One sample per annotation row.
        return len(self.img_labels)

    def __getitem__(self, idx):
        table = self.img_labels
        # Column 0 is the file name; force RGB so every image has three channels.
        image = Image.open(os.path.join(self.img_dir, table.iloc[idx, 0])).convert("RGB")
        # Column 1 is the label, returned exactly as stored in the table.
        label = table.iloc[idx, 1]
        return (self.transform(image) if self.transform else image), label

train_dataset = HandwritingDataset(train_df, images_path, transform=transform)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)


# Model setup
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14-336")
model.to(device)  # Move model to GPU

# Linear head mapping CLIP's projected image embedding to one logit per class. Both sizes are
# derived (embedding width from the model config, class count from the label mapping) instead of
# being hard-coded.
projection_layer = nn.Linear(model.config.projection_dim, len(label_mapping))
projection_layer.to(device)

# The backbone and the head are optimised jointly.
# NOTE(review): the run recorded in this notebook predicted a single class for every test image;
# lr=5e-5 on the whole ViT-L backbone may be too aggressive - consider freezing the backbone or
# using a much smaller backbone learning rate. TODO confirm experimentally.
optimizer = Adam(list(model.parameters()) + list(projection_layer.parameters()), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()
num_epochs = 15

# Training loop
for epoch in range(num_epochs):
    # Re-enter training mode every epoch (as the patch-based cell does), so re-running this cell
    # after an evaluation cell has called .eval() still trains in the right mode.
    model.train()
    projection_layer.train()
    pbar = tqdm(train_dataloader, total=len(train_dataloader), desc=f"Epoch {epoch+1}/{num_epochs}")
    for images, labels in pbar:
        # CrossEntropyLoss needs integer (long) class indices.
        images, labels = images.to(device), labels.to(device).long()

        optimizer.zero_grad()
        features = model.get_image_features(pixel_values=images)
        output = projection_layer(features)

        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()

        pbar.set_description(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

# Backbone checkpoint: same path and format as before.
torch.save(model.state_dict(), '/content/drive/My Drive/capstone_dataset/model_weights.pth')
# The classification head is a separate module; without its weights the backbone checkpoint alone
# cannot reproduce the classifier's predictions, so it is stored next to it.
torch.save(projection_layer.state_dict(), os.path.join(data_path, 'projection_layer_weights.pth'))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Removed 0 rows with NaN labels.
Using cuda for training.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Epoch 1/15:   0%|          | 0/133 [00:00<?, ?it/s]

Epoch 2/15:   0%|          | 0/133 [00:00<?, ?it/s]

Epoch 3/15:   0%|          | 0/133 [00:00<?, ?it/s]

Epoch 4/15:   0%|          | 0/133 [00:00<?, ?it/s]

Epoch 5/15:   0%|          | 0/133 [00:00<?, ?it/s]

Epoch 6/15:   0%|          | 0/133 [00:00<?, ?it/s]

Epoch 7/15:   0%|          | 0/133 [00:00<?, ?it/s]

Epoch 8/15:   0%|          | 0/133 [00:00<?, ?it/s]

Epoch 9/15:   0%|          | 0/133 [00:00<?, ?it/s]

Epoch 10/15:   0%|          | 0/133 [00:00<?, ?it/s]

Epoch 11/15:   0%|          | 0/133 [00:00<?, ?it/s]

Epoch 12/15:   0%|          | 0/133 [00:00<?, ?it/s]

Epoch 13/15:   0%|          | 0/133 [00:00<?, ?it/s]

Epoch 14/15:   0%|          | 0/133 [00:00<?, ?it/s]

Epoch 15/15:   0%|          | 0/133 [00:00<?, ?it/s]

In [None]:
# Held-out loader; shuffle is off, so batches follow the row order of test_df.
test_dataset = HandwritingDataset(test_df, images_path, transform=transform)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=4)

# Switch both the CLIP backbone and the classification head to inference mode.
model.eval()
projection_layer.eval()

all_labels, all_preds = [], []

# No gradients are needed for scoring.
with torch.no_grad():
    for images, labels in test_dataloader:
        images = images.to(device)
        labels = labels.to(device)

        # Image embedding -> class logits -> index of the largest logit per sample.
        features = model.get_image_features(images)
        outputs = projection_layer(features)
        preds = torch.max(outputs, dim=1).indices

        # Accumulate on the CPU as numpy scalars for scikit-learn.
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())


accuracy = accuracy_score(all_labels, all_preds)
conf_matrix = confusion_matrix(all_labels, all_preds)

print(f"Accuracy: {accuracy:.4f}", "Confusion Matrix:", conf_matrix, sep="\n")


Accuracy: 0.2881
Confusion Matrix:
[[ 0 17  0]
 [ 0 17  0]
 [ 0 25  0]]


## Patch-based CLIP 224x224

In [None]:
# Run configuration for the patch-based experiment.
num_epochs = 5
patch_size = (224, 224)
batch_size = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Mount Google Drive
drive.mount('/content/drive')
data_path = '/content/drive/My Drive/capstone_dataset/'
labels_path = os.path.join(data_path, 'image_labels.xlsx')
images_path = os.path.join(data_path, 'images')

# Letter grades become integer class ids; values outside the mapping become NaN and are dropped.
df = pd.read_excel(labels_path)
label_mapping = {'H': 2, 'M': 1, 'L': 0}
df['label'] = df['label'].map(label_mapping)
original_count = len(df)
df = df.dropna(subset=['label'])
print(f"Removed {original_count - len(df)} rows with NaN labels.")

train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Data transformation and loading
# Per-channel statistics that the CLIP image preprocessor applies by default; the pretrained
# vision tower expects inputs standardised this way rather than raw [0, 1] pixels.
CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=CLIP_MEAN, std=CLIP_STD),
])

class PatchedHandwritingDataset(Dataset):
    """Dataset of fixed-size patches cut from every annotated image.

    All images are opened and tiled eagerly in ``__init__``; every patch inherits the label of the
    image it came from, and the patches are kept in memory as PIL images.

    Args:
        annotations_file: Already-loaded table with ``image_name`` and ``label`` columns.
        img_dir: Directory containing the image files.
        transform: Optional callable applied to a patch each time it is fetched.
        patch_size: ``(width, height)`` of each patch in pixels.
    """

    def __init__(self, annotations_file, img_dir, transform=None, patch_size=(224, 224)):
        self.img_labels = annotations_file
        self.img_dir = img_dir
        self.transform = transform
        self.patch_size = patch_size
        self.samples = []  # list of (PIL patch, int label)
        self.prepare_patches()

    def prepare_patches(self):
        """Open every annotated image and append its patches to ``self.samples``."""
        for _, row in self.img_labels.iterrows():
            img_path = os.path.join(self.img_dir, row['image_name'])
            image = Image.open(img_path).convert("RGB")
            # Store a plain int so batches collate to integer class indices even when the label
            # column was loaded as float.
            label = int(row['label'])
            for patch in self.extract_patches(image):
                self.samples.append((patch, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        patch, label = self.samples[idx]
        if self.transform:
            patch = self.transform(patch)
        return patch, label

    def extract_patches(self, image):
        """Tile ``image`` into non-overlapping patches of ``self.patch_size``.

        The image is extended with black pixels on its right and bottom edges up to the next
        multiple of the patch size and then cut row by row, left to right. ``ImageOps.expand`` is
        used because it only adds a border; ``ImageOps.pad`` would first resize the image to fit
        the target size, so patches would no longer be at the original resolution.

        Returns:
            list of PIL images, each exactly ``patch_size`` in size.
        """
        iw, ih = image.size
        pw, ph = self.patch_size
        # Round each dimension up to a whole number of patches.
        npw = ((iw - 1) // pw + 1) * pw
        nph = ((ih - 1) // ph + 1) * ph
        # border = (left, top, right, bottom)
        padded_image = ImageOps.expand(image, border=(0, 0, npw - iw, nph - ih), fill='black')
        return [
            padded_image.crop((left, top, left + pw, top + ph))
            for top in range(0, nph, ph)
            for left in range(0, npw, pw)
        ]

# Forward the configured patch size so changing `patch_size` above actually takes effect.
train_dataset = PatchedHandwritingDataset(
    train_df, images_path, transform=transform, patch_size=patch_size
)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Model setup
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
model.to(device)

# Linear head mapping CLIP's projected image embedding to one logit per class; both sizes are
# derived from the model config and the label mapping instead of being hard-coded.
projection_layer = nn.Linear(model.config.projection_dim, len(label_mapping))
projection_layer.to(device)

# The backbone and the head are optimised jointly.
# NOTE(review): the recorded losses stay near 1.09 and the recorded evaluation predicts one class
# for every patch; lr=5e-5 on the whole backbone may be too aggressive. TODO confirm.
optimizer = Adam(list(model.parameters()) + list(projection_layer.parameters()), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()

# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    model.train()
    projection_layer.train()
    for images, labels in tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{num_epochs}'):
        # CrossEntropyLoss needs integer (long) class indices; the cast matches the 336px cell.
        images, labels = images.to(device), labels.to(device).long()
        optimizer.zero_grad()
        features = model.get_image_features(pixel_values=images)
        output = projection_layer(features)
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch+1}, Average Loss: {total_loss / len(train_dataloader)}')

# Save the trained model (backbone checkpoint: same path and format as before)
torch.save(model.state_dict(), '/content/drive/My Drive/capstone_dataset/new_model_weights.pth')
# The classification head is a separate module; store it too so predictions can be reproduced.
torch.save(projection_layer.state_dict(), os.path.join(data_path, 'new_projection_layer_weights.pth'))


Mounted at /content/drive
Removed 0 rows with NaN labels.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

Epoch 1/5:   0%|          | 0/6112 [00:00<?, ?it/s]

Epoch 1, Average Loss: 1.1051149813468075


Epoch 2/5:   0%|          | 0/6112 [00:00<?, ?it/s]

Epoch 2, Average Loss: 1.096079938191706


Epoch 3/5:   0%|          | 0/6112 [00:00<?, ?it/s]

Epoch 3, Average Loss: 1.0944428306697083


Epoch 4/5:   0%|          | 0/6112 [00:00<?, ?it/s]

Epoch 4, Average Loss: 1.0939926988986461


Epoch 5/5:   0%|          | 0/6112 [00:00<?, ?it/s]

Epoch 5, Average Loss: 1.092739153750904


In [None]:
# Held-out patches. The configured patch size is forwarded so the test tiles always match the
# `patch_size` setting instead of silently using the class default.
test_dataset = PatchedHandwritingDataset(
    test_df, images_path, transform=transform, patch_size=patch_size
)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

def evaluate_model(model, dataloader, head=None):
    """Score a CLIP backbone plus linear head on every batch of ``dataloader``.

    Args:
        model: Module exposing ``get_image_features``; switched to eval mode and left there.
        dataloader: Iterable of ``(images, labels)`` batches.
        head: Module mapping image features to class logits. Defaults to the notebook-level
            ``projection_layer`` so existing two-argument calls keep working.

    Returns:
        ``(accuracy, conf_matrix)`` as computed by scikit-learn's ``accuracy_score`` and
        ``confusion_matrix`` over all samples.
    """
    if head is None:
        head = projection_layer
    # Use the device the model's weights live on rather than a notebook-level global.
    run_device = next(model.parameters()).device

    model.eval()
    head.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for images, labels in dataloader:
            features = model.get_image_features(pixel_values=images.to(run_device))
            logits = head(features)
            # Predicted class = index of the largest logit in each row.
            all_preds.extend(logits.argmax(dim=1).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    conf_matrix = confusion_matrix(all_labels, all_preds)
    return accuracy, conf_matrix

# Score the fine-tuned model on the held-out patches and print both metrics.
accuracy, conf_matrix = evaluate_model(model=model, dataloader=test_dataloader)

print(f"Accuracy: {accuracy}", "Confusion Matrix:", conf_matrix, sep="\n")


Accuracy: 0.38461538461538464
Confusion Matrix:
[[   0    0  880]
 [   0    0  784]
 [   0    0 1040]]


## Hybrid Model with ResNet50 and CodeBERT

In [None]:
import torch
import pandas as pd
from torch import nn, optim
from google.colab import drive
from torchvision import models, transforms
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from PIL import Image


# Run on the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def parse_code_file(filepath):
    """Parse a transcription file into one ``(image_name, code)`` row per record.

    A record starts on a header line of the form ``<image name>: <code>`` and continues over the
    following lines until the next header. Two defects of the previous parser are addressed:

    * Header detection. Source code routinely contains ``:`` (``def f():``, ``if x:``, ...), so
      treating every line with a colon as a header cut records short and created bogus ones. A
      line is now a header only when the text before its first colon ends in a common image
      extension. If no line in the file looks like that, the parser falls back to the old
      "any colon" rule so files keyed by extension-less names still parse as before.
    * Line joining. Lines of a record are joined with ``"\n"`` instead of being concatenated
      directly, which used to fuse the last token of one line with the first token of the next.

    Every line is stripped of surrounding whitespace, blank lines are skipped, and records whose
    name is empty are dropped.

    Args:
        filepath: Path to the UTF-8 text file.

    Returns:
        pandas.DataFrame with columns ``image_name`` and ``code``.
    """
    import re  # function-scope import keeps this definition self-contained

    image_header = re.compile(
        r'^\s*([^:]*\.(?:png|jpe?g|bmp|gif|tiff?|webp))\s*:(.*)$',
        re.IGNORECASE,
    )

    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    def split_image_header(line):
        # (name, rest) when the line opens a record named after an image file, else None.
        found = image_header.match(line)
        return found.groups() if found else None

    def split_any_colon(line):
        # Legacy rule: any line containing a colon opens a record.
        return line.split(':', 1) if ':' in line else None

    if any(image_header.match(line) for line in lines):
        split_header = split_image_header
    else:
        split_header = split_any_colon

    def as_record(image_name, code_lines):
        return [image_name, '\n'.join(part for part in code_lines if part)]

    data = []
    current_image = None
    current_code = []
    for line in lines:
        header = split_header(line)
        if header is None:
            # Continuation of the current record (ignored later if no record is open yet).
            current_code.append(line.strip())
            continue
        if current_image:
            data.append(as_record(current_image, current_code))
        current_image = header[0].strip()
        current_code = [header[1].strip()]
    if current_image and current_code:
        data.append(as_record(current_image, current_code))
    return pd.DataFrame(data, columns=['image_name', 'code'])


# Attach Google Drive and locate the dataset.
drive.mount('/content/drive')
data_folder = '/content/drive/My Drive/capstone_dataset/'
images_folder = data_folder + 'images/'

# Pair every labelled image with its transcribed code; the default inner join keeps only image
# names present in both tables.
df_labels = pd.read_excel(data_folder + 'image_labels.xlsx')
df_text = parse_code_file(data_folder + 'full_texts.txt')
df = pd.merge(df_labels, df_text, on='image_name')

# Fixed seed keeps the 90/10 split reproducible.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.1)


class CodeHandwritingDataset(Dataset):
    """Dataset yielding ``(image, token_ids, label)`` triples for the hybrid model.

    Each row of ``dataframe`` provides ``image_name`` (appended directly to ``image_folder``, so
    the folder string must end with a path separator), ``code`` (text to tokenise) and ``label``
    ('H' -> 2, 'M' -> 1, anything else -> 0).

    All three returned tensors are already moved to the notebook-level ``device``.
    """

    def __init__(self, dataframe, image_folder, tokenizer, transform=None):
        self.transform = transform
        self.tokenizer = tokenizer
        self.image_folder = image_folder
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        record = self.dataframe.iloc[idx]

        picture = Image.open(self.image_folder + record['image_name']).convert('RGB')
        if self.transform:
            picture = self.transform(picture)

        # Pad / truncate every snippet to exactly 512 token ids so samples stack into a batch.
        encoding = self.tokenizer(
            record['code'],
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding='max_length',
        )
        token_ids = encoding.input_ids.squeeze(0)  # drop the batch axis added by return_tensors

        grade = record['label']
        label = torch.tensor(2 * int(grade == 'H') + int(grade == 'M'))
        return picture.to(device), token_ids.to(device), label.to(device)


class Attention(nn.Module):
    """Additive attention pooling over dimension 1 of the input.

    A two-layer scorer (Linear -> Tanh -> Linear) assigns one scalar score to every position along
    dim 1; the scores are softmax-normalised over that dimension and used to take a weighted sum
    of the input features.
    """

    def __init__(self, feature_dim):
        super().__init__()
        scorer = [nn.Linear(feature_dim, 128), nn.Tanh(), nn.Linear(128, 1)]
        self.attention = nn.Sequential(*scorer)

    def forward(self, features):
        # One score per position, normalised across dim 1; the trailing size-1 axis broadcasts
        # against the feature axis in the product below.
        weights = self.attention(features).softmax(dim=1)
        return (weights * features).sum(dim=1)


class EnhancedHybridModel(nn.Module):
    """Image + code classifier: ResNet50 and CodeBERT features fused by a small MLP.

    The ResNet50 head is replaced by a 256-wide linear layer; the text branch uses the
    transformer's ``pooler_output``. Both vectors pass through an ``Attention`` module, are
    concatenated and mapped to 3 class logits.

    NOTE(review): each branch hands ``Attention`` a single vector expanded to a length-1 sequence
    (``unsqueeze(1)``). A softmax over a length-1 axis is always 1, so both attention modules
    currently return their input unchanged. The modules are kept so existing checkpoints still
    load; pooling over token or spatial features would be needed for them to have an effect.
    """

    def __init__(self):
        super(EnhancedHybridModel, self).__init__()
        # `weights=` replaces the deprecated `pretrained=True`; IMAGENET1K_V1 is the checkpoint
        # that flag used to select.
        self.cnn = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        self.cnn.fc = nn.Linear(self.cnn.fc.in_features, 256)
        self.transformer = AutoModel.from_pretrained("microsoft/codebert-base")
        self.img_attention = Attention(256)
        self.text_attention = Attention(self.transformer.config.hidden_size)
        self.fc1 = nn.Linear(256 + self.transformer.config.hidden_size, 128)
        self.relu = nn.ReLU()
        self.classifier = nn.Linear(128, 3)

    def forward(self, images, text_inputs, attention_mask=None):
        """Return class logits for a batch.

        Args:
            images: Image batch fed to the ResNet50 branch.
            text_inputs: Token-id batch fed to the transformer branch.
            attention_mask: Optional mask for ``text_inputs``. When omitted it is rebuilt from the
                tokenizer padding id so the encoder does not attend to padding positions (the
                dataset in this notebook pads every snippet to 512 tokens and returns only ids).
        """
        if attention_mask is None:
            pad_id = self.transformer.config.pad_token_id
            if pad_id is not None:
                attention_mask = (text_inputs != pad_id).long()

        img_features = self.cnn(images)
        text_features = self.transformer(
            input_ids=text_inputs, attention_mask=attention_mask
        ).pooler_output
        img_features = self.img_attention(img_features.unsqueeze(1))
        text_features = self.text_attention(text_features.unsqueeze(1))
        combined_features = torch.cat((img_features, text_features), dim=1)
        x = self.relu(self.fc1(combined_features))
        outputs = self.classifier(x)
        return outputs

# Model setup
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = EnhancedHybridModel().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-5)
# Learning rate is divided by 10 after every 10 epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
criterion = nn.CrossEntropyLoss()

# Data loading and augmentation
# The ImageNet-pretrained ResNet50 expects inputs standardised with these channel statistics.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

# Training pipeline: random augmentation.
# NOTE(review): ColorJitter() with its default arguments (all zero) leaves the image unchanged;
# pass explicit strengths if colour augmentation is wanted.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomRotation(15),
    transforms.ColorJitter(),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Evaluation pipeline: same resize/normalisation but no random augmentation, so the reported
# metrics are deterministic and measured on unperturbed images.
eval_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])


train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_dataset = CodeHandwritingDataset(train_df, images_folder, tokenizer, transform)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataset = CodeHandwritingDataset(test_df, images_folder, tokenizer, eval_transform)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Training and evaluation loop
for epoch in range(20):
    model.train()
    total_loss = 0
    for images, texts, labels in train_loader:
        # The dataset already returns tensors on `device`; these calls are then no-ops and only
        # make the loop independent of that detail.
        images, texts, labels = images.to(device), texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images, texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    scheduler.step()
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}')


# Evaluation
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for images, texts, labels in test_loader:
        outputs = model(images.to(device), texts.to(device))
        # Predicted class = index of the largest logit in each row.
        predicted = outputs.argmax(dim=1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
conf_mat = confusion_matrix(all_labels, all_preds)
print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_mat}')

# Save model weights
torch.save(model.state_dict(), data_folder + 'enhanced_hybrid_model_weights.pth')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




Epoch 1, Loss: 1.0987195788007793
Epoch 2, Loss: 1.081331651319157
Epoch 3, Loss: 1.0466744177269214
Epoch 4, Loss: 0.9447934537222891
Epoch 5, Loss: 0.8610879492127534
Epoch 6, Loss: 0.7968803650953553
Epoch 7, Loss: 0.693595656165571
Epoch 8, Loss: 0.5348867246824683
Epoch 9, Loss: 0.512658827232592
Epoch 10, Loss: 0.43110059602468304
Epoch 11, Loss: 0.38820430797270755
Epoch 12, Loss: 0.3594847261962114
Epoch 13, Loss: 0.3243543444709344
Epoch 14, Loss: 0.32895606251038384
Epoch 15, Loss: 0.31208122880056954
Epoch 16, Loss: 0.2717104255024231
Epoch 17, Loss: 0.2771539438628789
Epoch 18, Loss: 0.24862110140648755
Epoch 19, Loss: 0.2356075823702144
Epoch 20, Loss: 0.23582182500059856
Accuracy: 0.7796610169491526
Confusion Matrix:
[[ 5  3  0]
 [ 2 13  6]
 [ 1  1 28]]
