In [None]:
# # ML imports
# import pandas as pd #circular import?
# import torch
# from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
# from torchvision import transforms
# from PIL import Image
# import timm
# from sklearn.model_selection import train_test_split
# from prodigyopt import Prodigy
# from torchsampler import ImbalancedDatasetSampler
#import lightning as L


# #eval imports
# from sklearn.metrics import classification_report
# from concurrent.futures import ThreadPoolExecutor

# #Logging imports
# import csv
# import os
# import datetime
# from pathlib import Path

# #Nice to have imports
# from pathlib import Path
# #from tqdm import tqdm #only for .py world
# from mac_alerts import alerts
# from tqdm.notebook import tqdm, trange
# import wandb

# wandb.init(project='BA1', entity='alexandermittet')

# print(torch.__version__,'hello')

In [65]:
# PARAMS
single_shot = False
learning_rate = 0.001  # Used by the Adam optimizer below; only irrelevant if Prodigy is switched in
epochs = 50  # Upper limit on the number of training epochs
batch_size = 64

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# timm model 'vit_base_patch16_224' with pretrained weights and 5 output classes.
model = timm.create_model('vit_base_patch16_224', pretrained=True, num_classes=5)
# Move the model to its device BEFORE building the optimizer, so the optimizer
# is constructed from the parameters exactly as they will be trained.
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Alternative optimizer (disabled): Prodigy estimates its own step size.
#weight_decay = 0.01  # Adjust based on your problem, 0 by default
#d_coef = 0.5  # Adjust to force a smaller estimate of the learning rate, default is 1.0
#optimizer = Prodigy(model.parameters(), lr=1., weight_decay=weight_decay, d_coef=d_coef)
#optimizer = Prodigy(model.parameters())

# Record the run configuration in W&B. Learning rate and optimizer name are read
# back from the optimizer object so the log always matches what is actually used.
# Requires an active run (wandb.init must have been called earlier).
wandb.config.update({
    "single_shot": single_shot,
    "epochs": epochs,
    "batch_size": batch_size,
    "learning_rate": optimizer.param_groups[0]['lr'],
    "optimizer": optimizer.__class__.__name__,
    "model_architecture": str(model),
    # Include any other dynamic parameters here
    })

In [105]:
#DATA LABELS
# Imports come first: the original called pd.read_csv before `import pandas as pd`,
# which raises NameError on a fresh kernel.
import os
import pandas as pd

# Label table; the 'img' column holds the image file name of each sample.
df = pd.read_csv('img_labels_ALL.csv')

image_folder = 'img'  # The folder where the images are stored

# Boolean mask: True for rows whose image file actually exists on disk.
image_exists = df['img'].apply(lambda x: os.path.isfile(os.path.join(image_folder, x)))

# Keep only the rows that have an image.
filtered_df = df[image_exists]

print(f"Original DataFrame size: {len(df)}, Filtered DataFrame size: {len(filtered_df)}")

# From here on 'df' refers to the filtered table (rows keep their original index labels).
df = filtered_df



class CustomDataset(Dataset):
    """Image-classification dataset backed by a DataFrame of file names and labels.

    Columns are read by position, not by name: column 0 must hold the image
    file name and column 2 the class label.

    Args:
        df: DataFrame describing the samples.
        transform: Optional callable applied to each loaded RGB PIL image.
        image_dir: Folder the file names are relative to. Defaults to 'img',
            the location that was previously hard-coded.
    """

    def __init__(self, df, transform=None, image_dir='img'):
        self.df = df
        self.transform = transform
        self.image_dir = image_dir

    def __len__(self):
        """Return the number of samples (rows of the DataFrame)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``.

        The image is converted to RGB and passed through ``transform`` when one
        was given; the label is a scalar ``torch.long`` tensor.
        """
        img_name = self.df.iloc[idx, 0]
        img_path = os.path.join(self.image_dir, img_name)
        # The context manager releases the file handle as soon as the pixels
        # have been copied into the converted image.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        label = torch.tensor(self.df.iloc[idx, 2], dtype=torch.long)
        if self.transform is not None:
            image = self.transform(image)
        return image, label

    def get_labels(self):
        """Return all labels (column 2) as a 1-D ``torch.long`` tensor.

        Called below by ImbalancedDatasetSampler, which is given this dataset.
        """
        return torch.tensor(self.df.iloc[:, 2].tolist(), dtype=torch.long)

# Preprocessing: the transform is derived from the model's pretrained config via timm.
# The hand-written pipeline below is kept for reference only.
# transform = transforms.Compose([
#     transforms.Resize((224,   224)),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485,   0.456,   0.406], std=[0.229,   0.224,   0.225]),
# ])
data_config = timm.data.resolve_data_config(model.pretrained_cfg)
transform = timm.data.create_transform(**data_config)

# Class-distribution check (disabled). The string literal below appears to be
# its saved output (counts, then percentages, per score).
# score_counts = df['score'].value_counts()
# percentage = score_counts / len(df) * 100
# print(score_counts, percentage)

'''
score
4.0    823
0.0    822
1.0    452
2.0    351
3.0    113
Name: count, dtype: int64 score
4.0    32.135884
0.0    32.096837
1.0    17.649356
2.0    13.705584
3.0     4.412339
Name: count, dtype: float64
'''

# Splits: hold out 20% of all rows for testing, then 20% of the remainder for
# validation. Both splits use a fixed seed so they are repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Earlier WeightedRandomSampler attempt (disabled).
# Original note, translated: "I can't get this to work. It keeps sampling the
# same 4 images... and I'm not allowed to pass replacement=False either."
# NOTE: `weights` must contain one weight per SAMPLE; the attempt below passes
# one value per class, so only the first few dataset indices could be drawn.
# train_class_counts = train_df['score'].value_counts().sort_index()
# print(f'train_class_counts: {train_class_counts}')
# train_class_counts = torch.tensor(train_class_counts.values, dtype=torch.float)
#sampler = WeightedRandomSampler(weights = train_class_counts, num_samples = len(train_df), replacement=True)

# Datasets: every split shares the same preprocessing transform.
train_data = CustomDataset(train_df, transform)
val_data = CustomDataset(val_df, transform)
test_data = CustomDataset(test_df, transform)

# Loaders: training batches are drawn through ImbalancedDatasetSampler;
# validation and test batches are read in order.
train_sampler = ImbalancedDatasetSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

# Loss used for both training and validation.
criterion = torch.nn.CrossEntropyLoss()

Original DataFrame size: 2561, Filtered DataFrame size: 2556


In [108]:
# #DEBUGGING TRAIN LOADER / SAMPLER
# # Assume you have a DataLoader called 'train_loader'
# iterator = iter(train_loader)
# # Show the first images in the batch
# import matplotlib.pyplot as plt
# # Fetch the next batch of data
# data, labels = next(iterator)
# # Display the first  10 images in the batch
# for i in range(10):  # Adjust the range to display more or fewer images
#     plt.figure(figsize=(2,  2))  # Set the figure size to 2x2 inches
#     plt.imshow(data[i].permute(1,  2,  0))  # Permute to change the dimensions to HWC
#     plt.title(f"Label: {labels[i]}")
#     plt.show()

In [7]:
# SETUP

# Define a function to evaluate the model on the validation set
def evaluate_on_validation_set(model, val_loader, loss_fn=None, target_device=None):
    """Return the average per-batch loss of ``model`` over ``val_loader``.

    The model is switched to evaluation mode (and left in that mode) and
    gradients are disabled while the batches are processed.

    Args:
        model: Module to evaluate.
        val_loader: Iterable yielding ``(inputs, labels)`` batches. It does not
            need to support ``len()``; batches are counted while iterating.
        loss_fn: Loss callable ``loss_fn(outputs, labels)``. Defaults to the
            notebook-level ``criterion``.
        target_device: Device the batches are moved to. Defaults to the
            notebook-level ``device``.

    Returns:
        float: Sum of the batch losses divided by the number of batches, or
        ``nan`` when ``val_loader`` yields no batches (the original raised
        ZeroDivisionError in that case).
    """
    # Fall back to the notebook-level objects when no override is supplied.
    loss_fn = criterion if loss_fn is None else loss_fn
    target_device = device if target_device is None else target_device

    model.eval()  # Set the model to evaluation mode
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(target_device), labels.to(target_device)
            outputs = model(inputs)
            total_loss += loss_fn(outputs, labels.long()).item()
            num_batches += 1

    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches


# Checkpoint locations: a "models" folder under the directory the notebook runs from.
cwd = os.getcwd()
model_dir = os.path.join(cwd, "models")
best_model_path, last_model_path = (
    os.path.join(model_dir, checkpoint_name)
    for checkpoint_name in ("best_model.pt", "last_model.pt")
)

# Best-so-far trackers used for model selection.
best_val_loss = float("inf")  # Any real validation loss will be lower than this
best_val_accuracy = 0.0  # Alternative selection metric
# NOTE(review): the training loop saves to this relative path; best_model_path and
# last_model_path above are not referenced by the cells visible here - confirm which is intended.
model_save_path = "best_model.pth"

In [None]:
#TRAINING LOOP
# One outer progress bar over epochs, one inner bar per phase.
for epoch in trange(epochs, desc="Training Epochs"):
    # Training phase
    model.train()  # Set model to training mode
    train_loss = 0.0
    # Iterate the SAME bar that receives the postfix. The original created a
    # second tqdm for the loop, so the batch loss was written to a bar that
    # never advanced and was never closed.
    pbar = tqdm(train_loader, desc=f"Epoch {epoch + 1}", leave=False)
    for inputs, labels in pbar:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Show the current batch's loss on the progress bar
        batch_loss = loss.item()
        pbar.set_postfix({'Batch Loss': batch_loss})
        train_loss += batch_loss
    # Average of the per-batch losses for the epoch
    train_loss /= len(train_loader)
    tqdm.write(f"Epoch {epoch + 1}: Avg. Loss: {train_loss:.4f}")

    # Validation phase
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():  # No gradients needed for validation
        for inputs, labels in tqdm(val_loader, desc=f"Epoch {epoch + 1}/Validation", leave=False):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Predicted class = index of the largest output along the class axis
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_loss /= len(val_loader)
    val_accuracy = correct / total  # Fraction in [0, 1]

    # Use tqdm.write to avoid interference with the progress bars.
    # The '%' format type multiplies by 100, so the fraction is shown as a real percentage.
    tqdm.write(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2%}")

    # Wandb log epoch metrics (val_accuracy is logged as a fraction, as before)
    wandb.log({"epoch": epoch, "train_loss": train_loss, "val_loss": val_loss, "val_accuracy": val_accuracy})

    # Save model if validation loss decreased
    if val_loss < best_val_loss:
        tqdm.write(f"Validation loss decreased ({best_val_loss:.4f} --> {val_loss:.4f}). Saving model...")
        best_val_loss = val_loss
        torch.save(model.state_dict(), model_save_path)

# Close the W&B run, then signal completion (sound alert, spoken message, terminal bells).
wandb.finish()
alerts.play_success()
os.system('say "Træning færdig"')
print('\a\a\a')

## parallel evaluations of test set

In [None]:
from sklearn.metrics import classification_report
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def evaluate_single_sample(i):
    """Classify test sample ``i`` and return ``(true_label, predicted_class)``.

    Reads the notebook-level ``test_data``, ``model`` and ``device``. Both
    returned values are plain Python scalars obtained with ``.item()``.
    """
    image, target = test_data[i]

    # Inference only: move the sample to the model's device, add a batch axis,
    # and run the forward pass without tracking gradients.
    with torch.no_grad():
        scores = model(image.to(device).unsqueeze(0))

    # Index of the largest score along the class axis, brought back to the CPU.
    predicted_index = scores.argmax(dim=1).cpu()
    return target.item(), predicted_index.item()

# Put the model in evaluation mode before scoring the test set.
# NOTE(review): this scores the in-memory model as training left it; the checkpoint
# written to model_save_path is not reloaded in this cell - confirm that is intended.
model.eval()

# Score every test sample on a thread pool. executor.map yields results in
# index order, so results[k] belongs to test sample k.
with ThreadPoolExecutor() as executor:
    sample_indices = range(len(test_data))
    progress = tqdm(executor.map(evaluate_single_sample, sample_indices), total=len(test_data), desc='Evaluating')
    results = list(progress)

# Split the (true_label, predicted_class) pairs into two parallel lists.
true_labels_list = [truth for truth, _ in results]
predicted_classes_list = [guess for _, guess in results]

# Per-class precision / recall / F1 summary.
report = classification_report(true_labels_list, predicted_classes_list)
print(report)

# Signal completion: sound alert, spoken message, terminal bells.
from mac_alerts import alerts
import os
alerts.play_success()
os.system('say "Evaluering færdig"')
print('\a\a\a')