In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from torchvision import transforms, datasets

In [None]:
import zipfile

# Unpack the dataset archive into the current working directory.
with zipfile.ZipFile('/content/five_class.zip', mode='r') as archive:
  archive.extractall(path='.')

In [None]:
# Preprocessing pipeline: resize to 128x128, collapse to a single
# grayscale channel, then convert the PIL image to a tensor.
_gray_steps = [
    transforms.Resize(size=(128, 128)),
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
]
transform_gray = transforms.Compose(_gray_steps)

In [None]:
# Build the train and test datasets from their folders; both splits share
# the same grayscale preprocessing.
train_data, test_data = (
    datasets.ImageFolder(root=f'/content/five_class/{split}', transform=transform_gray)
    for split in ('train', 'test')
)

In [None]:
# Class names exposed by the training dataset; used later to label plots.
classes = train_data.classes
classes  # bare last expression so the notebook displays the list

In [None]:
# Batches of 16: the training loader reshuffles every epoch, the test
# loader keeps the dataset order.
train = DataLoader(dataset=train_data, batch_size=16, shuffle=True)
test = DataLoader(dataset=test_data, batch_size=16, shuffle=False)

In [None]:
# Preview the first ten images of one training batch with their class names.
images, labels = next(iter(train))
fig = plt.figure(figsize=(20, 20))
for i in range(10):
  ax = fig.add_subplot(5, 5, i + 1)
  # Drop the leading channel dimension so imshow receives a 2-D array.
  ax.imshow(images[i].squeeze(0), cmap='gray')
  ax.set_title(f'Class:{classes[labels[i]]}')
  ax.axis('off')
plt.show()

In [None]:
class CheckImageGray(nn.Module):
    """Small CNN classifier: three conv/ReLU/max-pool stages and a two-layer head.

    With the default arguments the architecture (and therefore the
    ``state_dict`` keys and tensor shapes) is identical to the original
    hard-coded version: 1 input channel, 128x128 images, 5 output classes.

    Args:
        num_classes: Number of output logits.
        in_channels: Number of channels in the input images.
        image_size: Height/width of the (square) input images. Each of the
            three ``MaxPool2d(2)`` layers halves the spatial size (rounding
            down), so the flattened feature map has side ``image_size // 8``.

    Raises:
        ValueError: If ``image_size`` is smaller than 8, which would leave
            an empty feature map after the three pooling stages.
    """

    def __init__(self, num_classes=5, in_channels=1, image_size=128):
        super().__init__()
        if image_size < 8:
            raise ValueError(
                f"image_size must be at least 8, got {image_size}"
            )

        # Feature extractor: channels in_channels -> 16 -> 32 -> 64, each
        # stage followed by a 2x2 max-pool.
        self.first = nn.Sequential(
            nn.Conv2d(in_channels, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        # Spatial side length after three halvings (16 for 128x128 input).
        feature_side = image_size // 8

        # Classifier head on the flattened feature map.
        self.second = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * feature_side * feature_side, 512),
            nn.ReLU(),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Return raw class logits of shape ``(batch, num_classes)``."""
        x = self.first(x)
        x = self.second(x)
        return x

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [None]:
# Instantiate the network with its default arguments and move its
# parameters to the selected device.
model = CheckImageGray().to(device)

In [None]:
# Adam over all model parameters, paired with a cross-entropy objective
# that is applied to the model's raw outputs.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Train for 15 epochs; after each epoch print the sum of the per-batch losses.
for epoch in range(15):
  model.train()
  total_loss = 0
  for inputs, targets in train:
    inputs = inputs.to(device)
    targets = targets.to(device)

    # Clear gradients left over from the previous step before the new pass.
    optimizer.zero_grad()
    logits = model(inputs)
    batch_loss = loss_fn(logits, targets)
    batch_loss.backward()
    optimizer.step()

    total_loss = total_loss + batch_loss.item()
  print(f"Эпоха {epoch + 1}, Потерии: {round(total_loss, 2)}")

In [None]:
# Evaluate on the test loader: count predictions that match the labels.
model.eval()
correct = 0
total = 0

# Gradients are not needed for evaluation.
with torch.no_grad():
    for inputs, targets in test:
        inputs = inputs.to(device)
        targets = targets.to(device)
        # Predicted class = index of the largest logit per sample.
        predicted = model(inputs).argmax(dim=1)
        correct += torch.eq(predicted, targets).sum().item()
        total += targets.shape[0]

accuracy = 100 * correct / total
print(f'Точность модели на тестовых данных: {round(accuracy, 2)}%')

In [None]:
# Persist only the learned parameters (state dict), not the module object.
torch.save(obj=model.state_dict(), f='model_gray.pth')

# Task
The user wants to understand and fix an error occurring during the evaluation of a machine learning model. The error is likely related to loading or processing images in the test dataset. The plan is to identify and exclude problematic images from the dataset, create a new dataloader with the filtered data, and then re-run the evaluation to see if the error is resolved and report the final accuracy.

## Identify problematic images

### Subtask:
Iterate through the test dataset paths and attempt to open each image file using PIL.


**Reasoning**:
The goal is to identify problematic images in the test dataset by attempting to open each image file using PIL and catching the `UnidentifiedImageError`. This involves iterating through the test dataset paths, using a try-except block to handle potential errors during image opening, and storing the paths of problematic images in a list.



In [None]:
from PIL import Image

# First attempt at locating test images that PIL cannot open.
# NOTE(review): only `Image` is imported in this cell, so the name
# `UnidentifiedImageError` in the `except` clause below is undefined. As soon
# as `Image.open` raises, evaluating that clause raises NameError instead of
# recording the file.
problematic_images = []
for image_path, _ in test_data.samples:
    try:
        img = Image.open(image_path)
    except UnidentifiedImageError:
        problematic_images.append(image_path)
        print(f"Problematic image file: {image_path}")

print(f"Total problematic images found: {len(problematic_images)}")

**Reasoning**:
The previous command failed because `UnidentifiedImageError` was never imported; only `Image` had been imported from `PIL`. This command imports the missing exception class from `PIL` as well and re-runs the code to identify problematic images.



In [None]:
from PIL import Image, UnidentifiedImageError

# Collect every test image that cannot be read.
#
# `Image.open` is lazy: it only identifies the file and keeps it open without
# decoding the pixel data. Decoding is forced with `load()` so that truncated
# or corrupt files, which only fail when the pixels are actually read, are
# detected too. The `with` block closes each file handle instead of leaving
# hundreds of them open.
problematic_images = []
for image_path, _ in test_data.samples:
    try:
        with Image.open(image_path) as img:
            img.load()
    except (UnidentifiedImageError, OSError):
        # UnidentifiedImageError: the file is not a recognisable image.
        # OSError: the file is unreadable or its image data is truncated.
        problematic_images.append(image_path)
        print(f"Problematic image file: {image_path}")

print(f"Total problematic images found: {len(problematic_images)}")

## Filter dataset

### Subtask:
Create a new list of image paths and labels, excluding the problematic images.


**Reasoning**:
Create filtered lists of image paths and labels by excluding problematic images.



In [None]:
# Keep only the (path, label) pairs whose path was not flagged as
# problematic, then split them back into two parallel lists.
kept_samples = [
    (path, target)
    for path, target in test_data.samples
    if path not in problematic_images
]
filtered_image_paths = [path for path, _ in kept_samples]
filtered_labels = [target for _, target in kept_samples]

print(f"Number of filtered images: {len(filtered_image_paths)}")
print(f"Number of filtered labels: {len(filtered_labels)}")

## Create new dataloader

### Subtask:
Create a new test DataLoader using the filtered data.


**Reasoning**:
Implement a custom dataset class and create a new DataLoader using the filtered image paths and labels.



In [None]:
from torch.utils.data import Dataset

class FilteredImageDataset(Dataset):
    """Dataset over an explicit list of image paths and their labels.

    Args:
        image_paths: Sequence of image file paths.
        labels: Sequence of labels aligned index-by-index with ``image_paths``.
        transform: Optional callable applied to the loaded RGB PIL image.

    Raises:
        ValueError: If ``image_paths`` and ``labels`` differ in length.
    """

    def __init__(self, image_paths, labels, transform=None):
        # A length mismatch would otherwise only surface as an IndexError
        # (or silently unused samples) in the middle of iteration.
        if len(image_paths) != len(labels):
            raise ValueError(
                f"image_paths and labels must have the same length, "
                f"got {len(image_paths)} and {len(labels)}"
            )
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``."""
        image_path = self.image_paths[idx]
        label = self.labels[idx]

        # `convert` decodes the pixels and returns an independent image, so
        # the underlying file can be closed right away instead of staying
        # open until garbage collection.
        with Image.open(image_path) as source:
            image = source.convert("RGB")

        # Compare against None explicitly: a callable that defines `__len__`
        # or `__bool__` may be falsy and would otherwise be skipped.
        if self.transform is not None:
            image = self.transform(image)

        return image, label

# Use the same grayscale pipeline the model was trained with. The original
# referenced `transform_rgb`, which is never defined in this notebook, and the
# network's first convolution expects a single input channel.
filtered_test_data = FilteredImageDataset(filtered_image_paths, filtered_labels, transform_gray)
test_filtered = DataLoader(filtered_test_data, batch_size=16)

## Re-run evaluation

### Subtask:
Run the evaluation loop with the new test DataLoader.


**Reasoning**:
Run the evaluation loop using the new filtered test DataLoader to evaluate the model's performance on the clean data.



In [None]:
# Re-run the accuracy computation on the filtered test loader.
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch_inputs, batch_targets in test_filtered:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        logits = model(batch_inputs)
        # Predicted class = index of the largest logit per sample.
        predicted = logits.argmax(dim=1)
        hits = torch.eq(predicted, batch_targets)
        total += batch_targets.shape[0]
        correct += hits.sum().item()

accuracy = 100 * correct / total
print(f'Accuracy of the model on filtered test data: {round(accuracy, 2)}%')

## Summary:

### Data Analysis Key Findings

*   Three problematic images were identified in the original test dataset that caused errors during loading.
*   A filtered test dataset was created by excluding the identified problematic images, resulting in 497 images and labels.
*   A new test DataLoader was successfully created using the filtered data.
*   Evaluating the model with the filtered test data resulted in an accuracy of 50.70%.

### Insights or Next Steps

*   The issue of problematic images causing errors during model evaluation was resolved by filtering the dataset.
*   Further investigation could be done to understand why the three images were problematic (e.g., corrupted files, incorrect format) and if this issue exists in other parts of the dataset or during training.
