## Step 2: Classifier

Develop a classifier for two categories. Create the necessary folders for the test and train datasets. Either create your own model or transfer a pretrained model and revise it. Make sure you incorporate regularization, callbacks, etc., and use data augmentation. Since the images may not be clearly distinct with respect to their categories, you may not get the same level of performance you achieved in your assignments.


In [1]:
import numpy as np
import pandas as pd 
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
from PIL import Image
from collections import deque
from pathlib import Path
import logging, os, glob
from sklearn.model_selection import train_test_split
from _logging import set_logging
from _metrics import display_metrics
from _pckle import save_pickle_object, load_pickle_object
from _utility import gl, get_perc, get_dictionaries_from_list
from _dataset import dataset

# Configure the logging module through the project's shared helper.
set_logging(logging)

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [7]:
def get_images(classes, image_root="Images", pattern="*.jpg"):
    """Collect image file paths and integer class labels into a DataFrame.

    Each entry of ``classes`` names a sub-folder of ``image_root``; every file
    in that folder matching ``pattern`` is labelled with the entry's position
    in ``classes``.

    Args:
        classes: Sequence of class (folder) names. The index of a name is the
            label given to the images found in its folder.
        image_root: Directory containing one sub-folder per class.
        pattern: Glob pattern selecting the image files inside a class folder.

    Returns:
        A DataFrame with the columns ``gl.image_path`` (path string) and
        ``gl.is_business`` (integer label), one row per matching file. When no
        file matches, the frame is empty but still has both columns.
    """
    records = []
    for label, topic in enumerate(classes):
        search = os.path.join(image_root, topic, pattern)
        # glob returns files in arbitrary, OS-dependent order; sort so the row
        # order (and any seeded split done downstream) is reproducible.
        records.extend((path, label) for path in sorted(glob.glob(search)))

    # Build the frame straight from the records: going through np.array would
    # stringify the labels and, when nothing is found, produce a 1-D array
    # that cannot be given two columns.
    df_image_paths = pd.DataFrame(records, columns=[gl.image_path, gl.is_business])
    df_image_paths[gl.is_business] = df_image_paths[gl.is_business].astype("int")
    return df_image_paths
    


In [8]:
# Class names double as sub-folder names under "Images"; a name's position in
# this list is the integer label get_images assigns (Other=0, Business=1).
classes = ["Other", "Business"]
# NOTE(review): presumably name->index and index->name lookups — confirm in _utility.
dict_classes, dict_classes_rev = get_dictionaries_from_list(classes)
# One row per .jpg file found: its path and its integer class label.
df_image_paths = get_images(classes)
# Bare expression so the notebook renders the DataFrame as the cell output.
df_image_paths

Unnamed: 0,ImagePath,IsBusiness
0,Images\Other\10.jpg,0
1,Images\Other\100.jpg,0
2,Images\Other\10000.jpg,0
3,Images\Other\10001.jpg,0
4,Images\Other\10002.jpg,0
...,...,...
45765,Images\Business\987.jpg,1
45766,Images\Business\988.jpg,1
45767,Images\Business\994.jpg,1
45768,Images\Business\998.jpg,1


In [9]:
# Report the class balance. Labels are 0 ("Other") or 1 ("Business"), so the
# sum of the label column is the number of business images.
tot_images = df_image_paths.shape[0]
tot_business = df_image_paths.loc[:, gl.is_business].sum()
perc_business = get_perc(tot_business, tot_images)
logging.info(f"There are {tot_images} images of which {tot_business} ({perc_business}%) are for business")

2023-02-01 17:06:13,598 | INFO : There are 45770 images of which 7232 (15.8%) are for business


In [None]:
# Image dimensions in pixels; both sides are equal, so images are square.
IMAGE_WIDTH, IMAGE_HEIGHT = 256, 256
# (width, height) pair handed to the dataset wrapper.
IMAGE_SIZE = (IMAGE_WIDTH, IMAGE_HEIGHT)
# Number of samples per batch produced by the DataLoader.
batch_size = 1

In [None]:

# Wrap the held-out samples in the project's dataset class.
# NOTE(review): X_test is not defined in the cells visible here; presumably it
# comes from a train/test split performed in an earlier cell — confirm.
test_data = dataset(X_test, dict_classes, IMAGE_SIZE, logging)

# Evaluation loader: shuffling is disabled so samples are served in dataset order.
test_loader = DataLoader(
    test_data,
    batch_size=batch_size,
    shuffle=False,
)