## Step 2: Classifier

Develop a classifier for two categories. Create the necessary folders for the test and train datasets. Either create your own model or transfer a model and revise it. Make sure you incorporate regularization, callbacks, etc., and use data augmentation. Since the images may not be clearly distinct with respect to their categories, you may not get the same kind of performance you had in your assignments.


In [1]:
import numpy as np
import pandas as pd 
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
from torchvision.models import ResNet18_Weights
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler, random_split
import matplotlib.pyplot as plt
from PIL import Image
from collections import deque
from pathlib import Path
import logging, os, glob, sys
from _logging import set_logging
from _metrics import display_metrics
from _pckle import save_pickle_object, load_pickle_object
from _utility import gl, get_perc, get_dictionaries_from_list
from _model import train_model

# Hand the stdlib logging module to the project's logging helper.
set_logging(logging)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
# The two target categories for the classifier.
classes = ["Business", "Other"]
# Project helper from _utility; presumably returns a name->index mapping and
# its reverse (index->name) -- TODO confirm against _utility.
dict_classes, dict_classes_rev = get_dictionaries_from_list(classes)


In [3]:
# Images are resized to a square of this many pixels per side.
IMAGE_WIDTH = IMAGE_HEIGHT = 256
# (width, height) pair built from the two values above.
IMAGE_SIZE = (IMAGE_WIDTH, IMAGE_HEIGHT)

# Batch size for the train / validation / test loaders.
batch_size = 32
# Smaller batch size for the loader that is only used to display images.
batch_size_for_display = 4

In [5]:
def _labels_without_loading(dataset):
    """Return the integer labels of *dataset* without reading any sample.

    Supports datasets exposing a ``targets`` sequence (as ``ImageFolder``
    does) and ``Subset``-like wrappers exposing ``dataset`` and ``indices``
    (as returned by ``random_split``). Returns ``None`` when the labels
    cannot be obtained this way, so the caller can fall back to iterating.
    """
    targets = getattr(dataset, "targets", None)
    # A target_transform may change the label returned by __getitem__, so the
    # raw ``targets`` list is only trusted when no such transform is set.
    if targets is not None and getattr(dataset, "target_transform", None) is None:
        return [int(t) for t in targets]

    base = getattr(dataset, "dataset", None)
    indices = getattr(dataset, "indices", None)
    if base is not None and indices is not None:
        base_labels = _labels_without_loading(base)
        if base_labels is not None:
            return [base_labels[i] for i in indices]
    return None


def get_loader(dataset, batch_size, root_dir):
    """Build a DataLoader that upsamples the under-represented classes.

    Every sample is given a weight equal to the inverse of the number of
    files found for its class under ``root_dir``; a ``WeightedRandomSampler``
    (with replacement) then draws ``len(dataset)`` samples per epoch.

    Args:
        dataset: Dataset yielding ``(image, label)`` pairs with integer class
            labels, e.g. an ``ImageFolder`` or a subset produced by
            ``random_split``.
        batch_size: Number of samples per batch.
        root_dir: Image root directory holding one sub-directory per class.

    Returns:
        A ``DataLoader`` over ``dataset`` driven by the weighted sampler.

    Raises:
        IndexError: If a label has no matching class directory.
    """
    # Upsampling for imbalanced dataset.
    # Class directories are visited in sorted-name order because that is how
    # ImageFolder assigns class indices; the weight list must be indexed the
    # same way. Plain os.walk order is arbitrary and could swap the weights.
    class_names = sorted(
        entry.name for entry in os.scandir(root_dir) if entry.is_dir()
    )
    class_weights = []
    for class_name in class_names:
        class_dir = os.path.join(root_dir, class_name)
        n_files = sum(len(files) for _, _, files in os.walk(class_dir))
        # We want more weighting for classes with a smaller number of images.
        # To achieve this, take the inverse of the number of files for that
        # class. An empty class keeps its slot (weight 0) so the indices of
        # the following classes stay aligned.
        class_weights.append(1 / n_files if n_files else 0.0)

    # Reading labels from metadata avoids decoding and transforming every
    # image just to learn its class; iterate only as a last resort.
    labels = _labels_without_loading(dataset)
    if labels is None:
        labels = [label for _, label in dataset]

    sample_weights = [class_weights[label] for label in labels]
    sampler = WeightedRandomSampler(
        sample_weights, num_samples=len(sample_weights), replacement=True
    )
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler)
    

In [6]:

# See S3_TwitterProject_mean_and_std for calculations of mean and standard deviation
_mean = [0.5117, 0.4919, 0.4784]
_std = [0.3312, 0.3193, 0.3272]
_transform = transforms.Compose(
    [
        # Resize expects (height, width) when given a sequence.
        transforms.Resize([IMAGE_HEIGHT, IMAGE_WIDTH]),
        transforms.ToTensor(),
        transforms.Normalize(torch.Tensor(_mean), torch.Tensor(_std)),
    ]
)

# One sub-folder per class under "Images"; labels come from the folder names.
root_dir = "Images"
_dataset = datasets.ImageFolder(root=root_dir, transform=_transform)

# Hold out 5% for testing, then split the remainder 80/20 into train/validation.
# The last part of each split takes the remainder so the three sizes always
# sum to the dataset length, as random_split requires.
size = len(_dataset)
test_size = int(size * 0.05)
new_size = size - test_size
train_size = int(new_size * 0.8)
val_size = new_size - train_size
train_dataset, val_dataset, test_dataset = random_split(_dataset, lengths=[train_size, val_size, test_size])

# Unweighted, shuffled loader over the full dataset, used only for display.
display_loader = DataLoader(dataset=_dataset, batch_size=batch_size_for_display, shuffle=True)

# Class-balanced (upsampled) loaders for each split.
# NOTE(review): the validation and test loaders are also resampled with
# replacement, so evaluation sees duplicated/omitted images -- confirm this
# is intended rather than using plain loaders for evaluation.
train_loader = get_loader(train_dataset, batch_size, root_dir)
val_loader = get_loader(val_dataset, batch_size, root_dir)
test_loader = get_loader(test_dataset, batch_size, root_dir)

# Persist the loaders so the training notebook can reuse them.
save_pickle_object(train_loader, gl.pkl_train_loader)
save_pickle_object(val_loader, gl.pkl_val_loader)
save_pickle_object(test_loader, gl.pkl_test_loader)

# Stop execution here; training continues in a separate notebook.
sys.exit()



2023-02-04 07:16:51,333 | INFO : Saving pickle file from: pickle\pkle_train_loader.pkl
2023-02-04 07:16:52,136 | INFO : Saving pickle file from: pickle\pkle_val_loader.pkl
2023-02-04 07:16:59,576 | INFO : Saving pickle file from: pickle\pkle_test_loader.pkl


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


The upsampling of the data loaders takes a long time, so the next part for the model training can be found in <br>
S4_TwitterProject.ipynb