In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import torchvision
import torchvision.transforms as ttf

import csv
import math

import os
import os.path as osp

from tqdm import tqdm
from PIL import Image
from sklearn.metrics import roc_auc_score
import numpy as np

In [None]:
# Write the Kaggle CLI credentials file from environment variables.
# Secrets must never be hardcoded in a notebook: set KAGGLE_USERNAME and
# KAGGLE_KEY in the environment before running this cell.
# NOTE(review): any API key that was ever committed in this notebook should be
# treated as leaked and regenerated from the Kaggle account page.
import json

kaggle_username = os.environ.get("KAGGLE_USERNAME")
kaggle_key = os.environ.get("KAGGLE_KEY")
if not kaggle_username or not kaggle_key:
    raise RuntimeError(
        "Set the KAGGLE_USERNAME and KAGGLE_KEY environment variables before running this cell."
    )

kaggle_config_path = "/home/ubuntu/.kaggle/kaggle.json"
# Create ~/.kaggle if it does not exist yet; open() alone would fail on a fresh machine.
os.makedirs(osp.dirname(kaggle_config_path), exist_ok=True)
with open(kaggle_config_path, "w") as f:
    # json.dump takes care of quoting/escaping the values correctly.
    json.dump({"username": kaggle_username, "key": kaggle_key}, f)
# Restrict the credentials file to the current user.
os.chmod(kaggle_config_path, 0o600)

In [None]:
# Download the classification and verification competition archives with the Kaggle CLI.
!kaggle competitions download -c 11-785-s22-hw2p2-classification
!kaggle competitions download -c 11-785-s22-hw2p2-verification

# Extract both archives into the working directory (-o: overwrite existing files without prompting).
!unzip -o 11-785-s22-hw2p2-classification.zip
!unzip -o 11-785-s22-hw2p2-verification.zip


In [None]:
"""
The well-accepted SGD batch_size & lr combination for CNN classification is 256 batch size for 0.1 learning rate.
When changing batch size for SGD, follow the linear scaling rule - halving batch size -> halve learning rate, etc.
This is less theoretically supported for Adam, but in my experience, it's a decent ballpark estimate.
"""
#batch_size = 256
#lr = 0.1
#batch_size=128
#lr=.05
batch_size=64
#lr = 0.025
lr = 0.0185
#batch_size = 32
#lr = 0.012
epochs = 20 # Just for the early submission. We'd want you to train like 50 epochs for your main submissions.

In [None]:
class InvertedResidualBlock(nn.Module):
    """
    Residual block that alternates "spatial mixing" and "feature mixing".

    Despite the name (kept from the MobileNetV2 template this started from),
    the layers are arranged ConvNeXt-style and applied in this order:

    1. spatial_mixing      - 7x7 depthwise convolution (groups == in_channels)
                             followed by BatchNorm. This is where neighbouring
                             pixels "talk to each other"; it is also the only
                             layer that applies `stride`.
    2. feature_mixing      - 1x1 convolution that expands in_channels to
                             in_channels * expand_ratio, followed by GELU.
                             Each pixel mixes its own features.
    3. bottleneck_channels - 1x1 convolution that projects the expanded
                             features back down to out_channels.

    Args:
        in_channels: number of channels of the input feature map.
        out_channels: number of channels of the output feature map.
        stride: stride of the depthwise convolution (2 downsamples).
        expand_ratio: width multiplier of the hidden (expanded) representation.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 expand_ratio):
        super().__init__()

        # The skip connection is only valid when the block changes neither the
        # spatial resolution nor the number of channels.
        self.do_identity = (stride == 1 and in_channels == out_channels)

        # Width of the expanded representation between the two 1x1 convolutions.
        hidden_dim = in_channels * expand_ratio

        # 1x1 expansion + non-linearity. A 1x1 kernel needs no padding.
        pointwise_expand = nn.Conv2d(in_channels, hidden_dim,
                                     kernel_size=1, stride=1, padding=0, bias=True)
        self.feature_mixing = nn.Sequential(pointwise_expand, nn.GELU())

        # 7x7 depthwise convolution. padding = kernel_size // 2 keeps the
        # resolution unchanged when stride == 1. No conv bias because the
        # BatchNorm that follows has its own shift term.
        depthwise = nn.Conv2d(in_channels, in_channels,
                              kernel_size=7, stride=stride, padding=3,
                              groups=in_channels, bias=False)
        self.spatial_mixing = nn.Sequential(depthwise, nn.BatchNorm2d(in_channels))

        # 1x1 projection from the wide hidden space down to out_channels, which
        # keeps the cost of the block low and makes the residual sum possible
        # when out_channels == in_channels.
        pointwise_project = nn.Conv2d(hidden_dim, out_channels,
                                      kernel_size=1, stride=1, padding=0, bias=True)
        self.bottleneck_channels = nn.Sequential(pointwise_project)

    def forward(self, x):
        """Apply spatial mixing, expansion and projection, adding the skip connection when allowed."""
        mixed = self.bottleneck_channels(self.feature_mixing(self.spatial_mixing(x)))
        return x + mixed if self.do_identity else mixed

class Convnext(nn.Module):
    """
    Image classifier built from a convolutional stem, a stack of
    InvertedResidualBlocks, a 1x1 "final feature mixing" block and an MLP
    classification head.

    Args:
        num_classes: number of output classes of the classification head.
    """

    def __init__(self, num_classes= 7000):
        super().__init__()

        self.num_classes = num_classes

        # Stem: 3x3 stride-2 convolution from RGB to 32 channels.
        stem_channels = 32
        self.stem = nn.Sequential(
            nn.Conv2d(3, stem_channels, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(stem_channels),
            nn.GELU(),
        )

        # One row per stage:
        #   [expand_ratio, channels, number of blocks, stride of the first block]
        # Only the first block of a stage downsamples; the remaining blocks of
        # the stage use stride 1. Every stage here halves the resolution.
        self.stage_cfgs = [
            [6, 96, 3, 2],
            [8, 192, 3, 2],
            [8, 384, 7, 2],
            [8, 512, 3, 2],
        ]

        # The stem outputs 32 channels; this is updated as blocks are stacked.
        in_channels = stem_channels

        blocks = []
        for expand_ratio, num_channels, num_blocks, first_stride in self.stage_cfgs:
            for block_idx in range(num_blocks):
                blocks.append(InvertedResidualBlock(
                    in_channels=in_channels,
                    out_channels=num_channels,
                    stride=first_stride if block_idx == 0 else 1,
                    expand_ratio=expand_ratio,
                ))
                # The next block consumes what this one produced.
                in_channels = num_channels
        self.layers = nn.Sequential(*blocks)

        # Final 1x1 feature mixing up to 1280 channels.
        self.final_block = nn.Sequential(
            nn.Conv2d(in_channels, 1280, kernel_size=1, padding=0, stride=1, bias=False),
            nn.BatchNorm2d(1280),
            nn.GELU(),
        )

        # Classification head: global average pool to (1, 1), flatten, then a
        # two-layer MLP that projects to the number of classes.
        self.cls_layer = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(1280, 512),
            nn.BatchNorm1d(512),
            nn.ReLU6(),
            nn.Linear(512, num_classes),
        )

        self._initialize_weights()

    def _initialize_weights(self):
        """
        Apply the MobileNetV2-style initialisation: normal(0, sqrt(2 / fan_out))
        for convolutions, ones/zeros for BatchNorm2d, normal(0, 0.01) for
        linear layers, and zero biases. Other module types keep the PyTorch
        defaults.
        """
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size
                fan_out = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan_out))
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Linear):
                module.weight.data.normal_(0, 0.01)
                module.bias.data.zero_()

    def forward(self, x, return_feats=False):
        """
        Run the network on a batch of images.

        Args:
            x: input batch fed to the stem.
            return_feats: if True, return the output of final_block (before
                pooling and the classification head) instead of class scores.

        Returns:
            The final_block feature map when return_feats is True, otherwise
            the output of the classification head.
        """
        feats = self.final_block(self.layers(self.stem(x)))
        if return_feats:
            return feats
        return self.cls_layer(feats)


In [None]:
"""
Transforms (data augmentation) is quite important for this task.
Go explore https://pytorch.org/vision/stable/transforms.html for more details
"""
DATA_DIR = "/home/ubuntu"
#TRAIN_DIR = osp.join(DATA_DIR, "train_subset/train_subset") # This is a smaller subset of the data. Should change this to classification/classification/train
TRAIN_DIR = osp.join(DATA_DIR, "classification/classification/train")
VAL_DIR = osp.join(DATA_DIR, "classification/classification/dev")
TEST_DIR = osp.join(DATA_DIR, "classification/classification/test")

#brightnes and contast: .6 - 1.4
train_transforms = [ttf.RandomAffine(5),
                    ttf.RandomApply(nn.ModuleList([ttf.ColorJitter(brightness=(.6,1.4), contrast=(.6,1.4))]), p=0.5), 
                    ttf.RandomHorizontalFlip(p=0.5), ttf.RandomAdjustSharpness(0, p=0.5),ttf.RandomAutocontrast(p=0.5),
                   ttf.ToTensor()]
val_transforms = [ttf.ToTensor()]

train_dataset = torchvision.datasets.ImageFolder(TRAIN_DIR,
                                                 transform=ttf.Compose(train_transforms))
val_dataset = torchvision.datasets.ImageFolder(VAL_DIR,
                                               transform=ttf.Compose(val_transforms))


train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          shuffle=True, drop_last=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                        drop_last=True, num_workers=4)

In [None]:
# Build the network and move it to the GPU.
model = Convnext()
model.cuda()

# The homework caps models at 35 million parameters; report the count so the
# limit can be checked.
num_trainable_parameters = sum(p.numel() for p in model.parameters())
print("Number of Params: {}".format(num_trainable_parameters))

# Cross-entropy with label smoothing for the multi-class classification task.
criterion = nn.CrossEntropyLoss(label_smoothing=.1)
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4)

# The scheduler is stepped once per batch, so T_max (the number of step() calls
# until the learning rate reaches 0) is the total number of training batches.
total_train_steps = len(train_loader) * epochs
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_train_steps)

# Gradient scaler for FP16 mixed-precision training, see
# https://effectivemachinelearning.com/PyTorch/8._Faster_training_with_mixed_precision
scaler = torch.cuda.amp.GradScaler()

In [None]:
# Train with mixed precision, validating after every epoch.
# Training stops early once validation accuracy exceeds EARLY_STOP_VAL_ACC.
best_val = 0
CHECKPOINT_PATH = '/home/ubuntu/model_state.pth'
EARLY_STOP_VAL_ACC = .86
# To resume from a checkpoint, uncomment:
#model.load_state_dict(torch.load(CHECKPOINT_PATH))

for epoch in range(epochs):
    # Re-enable training behaviour (BatchNorm batch statistics) every epoch;
    # the validation pass below, or a later cell, switches the model to eval mode.
    model.train()

    # Quality of life tip: leave=False and position=0 are needed to make tqdm usable in jupyter
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    num_correct = 0
    num_seen = 0
    total_loss = 0

    for i, (x, y) in enumerate(train_loader):
        optimizer.zero_grad()

        x = x.cuda()
        y = y.cuda()

        # Forward pass and loss are wrapped in autocast for FP16.
        with torch.cuda.amp.autocast():
            outputs = model(x)
            loss = criterion(outputs, y)

        # Running statistics; counting samples keeps the accuracy exact for any batch size.
        num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
        num_seen += len(y)
        total_loss += float(loss)

        batch_bar.set_postfix(
            acc="{:.04f}%".format(100 * num_correct / num_seen),
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            num_correct=num_correct,
            lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))

        # FP16 replacements for loss.backward() and optimizer.step().
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Stepped per batch: T_max was set to len(train_loader) * epochs.
        scheduler.step()

        batch_bar.update()
    batch_bar.close()

    # Periodic checkpoint.
    if epoch % 5 == 0:
        torch.save(model.state_dict(), CHECKPOINT_PATH)

    # Validation must run in eval mode and without gradients: otherwise
    # BatchNorm normalises with (and updates its running statistics from)
    # validation batches, and autograd graphs are built for nothing.
    model.eval()
    val_num_correct = 0
    val_num_seen = 0
    with torch.no_grad():
        for x, y in val_loader:
            x = x.cuda()
            y = y.cuda()

            with torch.cuda.amp.autocast():
                val_outputs = model(x)
            val_num_correct += int((torch.argmax(val_outputs, axis=1) == y).sum())
            val_num_seen += len(y)

    val_acc = val_num_correct / max(val_num_seen, 1)
    best_val = max(best_val, val_acc)

    # Report the epoch before deciding whether to stop, so the final epoch is logged too.
    print("Epoch {}/{}: Train Acc {:.04f}%, Train Loss {:.04f}, Learning Rate {:.04f}, Val Acc {:.04f}%".format(
        epoch + 1,
        epochs,
        100 * num_correct / max(num_seen, 1),
        float(total_loss / len(train_loader)),
        float(optimizer.param_groups[0]['lr']),
        100 * val_acc))

    if val_acc > EARLY_STOP_VAL_ACC:
        break

# Always persist the final weights (after early stopping or the last epoch);
# the periodic checkpoint alone would miss them.
torch.save(model.state_dict(), CHECKPOINT_PATH)

In [None]:
# Evaluate classification accuracy on the validation loader.
# To evaluate a saved checkpoint instead of the in-memory model, uncomment:
#model.load_state_dict(torch.load('/home/ubuntu/model_state.pth'))
model.eval()
batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')
num_correct = 0
num_seen = 0
for i, (x, y) in enumerate(val_loader):

    x = x.cuda()
    y = y.cuda()

    with torch.no_grad():
        outputs = model(x)

    num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
    # Count the samples actually evaluated: the loader may drop a partial last
    # batch, so len(val_dataset) is not a reliable denominator.
    num_seen += len(y)
    batch_bar.set_postfix(acc="{:.04f}%".format(100 * num_correct / num_seen))

    batch_bar.update()

batch_bar.close()
print("Validation: {:.04f}%".format(100 * num_correct / max(num_seen, 1)))

In [None]:
class ClassificationTestSet(Dataset):
    """
    Unlabelled image dataset for the classification test set.

    Every entry of `data_dir` is treated as an image; entries are visited in
    sorted file-name order, so index i is the i-th file name alphabetically.

    Args:
        data_dir: directory that directly contains the test images.
        transforms: callable applied to each PIL image before it is returned.
    """

    def __init__(self, data_dir, transforms):
        self.data_dir = data_dir
        self.transforms = transforms

        # Sorted list of full paths to each file in data_dir.
        self.img_paths = [osp.join(self.data_dir, fname)
                          for fname in sorted(os.listdir(self.data_dir))]

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.img_paths)

    def __getitem__(self, idx):
        """Load image `idx`, convert it to RGB and return the transformed result."""
        # The context manager closes the underlying file once the pixels have
        # been read. convert("RGB") guarantees 3 channels (the model's stem
        # expects 3) and matches what torchvision's ImageFolder loader does
        # for the training/validation images.
        with Image.open(self.img_paths[idx]) as img:
            return self.transforms(img.convert("RGB"))

In [None]:
# The test images get the same preprocessing as the validation images and are
# read in order (no shuffling, no dropped batch) so predictions line up with
# the sorted file list.
test_dataset = ClassificationTestSet(data_dir=TEST_DIR,
                                     transforms=ttf.Compose(val_transforms))
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=1,
)

In [None]:
# Predict a class for every test image and write submission.csv.
model.eval()
batch_bar = tqdm(total=len(test_loader), dynamic_ncols=True, position=0, leave=False, desc='Test')

res = []
# Inference only: no gradients are needed, so no autograd graph is built.
with torch.no_grad():
    for x in test_loader:
        x = x.cuda()

        with torch.cuda.amp.autocast():
            outputs = model(x)

        pred_y = torch.argmax(outputs, axis=1)
        res.extend(pred_y.cpu().numpy())

        batch_bar.update()

batch_bar.close()

# newline="" is what the csv module requires to avoid blank lines on some platforms.
with open("submission.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "label"])
    # test_loader is not shuffled, so res[i] belongs to test_dataset.img_paths[i].
    # Using the real file name avoids guessing the zero-padding width of the ids.
    for img_path, label in zip(test_dataset.img_paths, res):
        writer.writerow([osp.basename(img_path), label])

In [None]:
# Write the early-submission CSV: one "<6-digit index>.jpg,<predicted label>"
# row per test image, in dataset order.
with open("classification_early_submission.csv", "w+") as f:
    f.write("id,label\n")
    for sample_idx in range(len(test_dataset)):
        image_id = str(sample_idx).zfill(6) + ".jpg"
        f.write("{},{}\n".format(image_id, res[sample_idx]))

In [None]:
# Upload classification_early_submission.csv with the Kaggle CLI.
# NOTE(review): the competition slug used here (face-recognition-slack) is not the one the data was downloaded from — confirm it is the intended target.
!kaggle competitions submit -c face-recognition-slack -f classification_early_submission.csv -m "Initial submission"

In [None]:
# Upload submission.csv with the Kaggle CLI.
# NOTE(review): the competition slug used here (face-recognition-slack) is not the one the data was downloaded from — confirm it is the intended target.
!kaggle competitions submit -c face-recognition-slack -f submission.csv -m "Initial submission"

In [None]:
# Sanity check: count the entries in the verification dev directory and the lines in the dev pairs CSV.
!ls verification/verification/dev | wc -l
!cat verification/verification/verification_dev.csv | wc -l

In [None]:
class VerificationDataset(Dataset):
    """
    Image dataset for face verification that also reports each image's name.

    Every entry of `data_dir` is treated as an image; entries are visited in
    sorted file-name order.

    Args:
        data_dir: directory that directly contains the images.
        transforms: callable applied to each PIL image before it is returned.
    """

    def __init__(self, data_dir, transforms):
        self.data_dir = data_dir
        self.transforms = transforms

        # Sorted list of full paths to each file in data_dir.
        self.img_paths = [osp.join(self.data_dir, fname)
                          for fname in sorted(os.listdir(self.data_dir))]

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.img_paths)

    def __getitem__(self, idx):
        """Return (transformed RGB image, path of the image relative to data_dir)."""
        img_path = self.img_paths[idx]
        # The context manager closes the underlying file once the pixels have
        # been read. convert("RGB") guarantees 3 channels (the model's stem
        # expects 3) and matches what torchvision's ImageFolder loader does
        # for the training/validation images.
        with Image.open(img_path) as img:
            image = self.transforms(img.convert("RGB"))
        return image, osp.relpath(img_path, self.data_dir)

In [None]:
# Verification dev images, preprocessed like the classification validation set
# and read in a fixed (unshuffled) order.
val_veri_dir = osp.join(DATA_DIR, "verification/verification/dev")
val_veri_dataset = VerificationDataset(val_veri_dir, ttf.Compose(val_transforms))
val_ver_loader = torch.utils.data.DataLoader(
    val_veri_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=1,
)

In [None]:
# Compute one embedding per verification dev image and index it by file name.
model.eval()

# Maps image path (relative to the dev directory) -> embedding tensor.
feats_dict = dict()
with torch.no_grad():
    for imgs, path_names in tqdm(val_ver_loader, total=len(val_ver_loader), position=0, leave=False):
        imgs = imgs.cuda()

        # The embedding used here is the classifier output (return_feats=False)
        # passed through GELU, not the intermediate final_block features.
        feats = F.gelu(model(imgs, return_feats=False))

        # Row i of feats belongs to path_names[i].
        for path_name, feat in zip(path_names, feats):
            feats_dict[path_name] = feat

In [None]:
# Score every dev pair with the cosine similarity of its two embeddings and
# report the ROC AUC against the ground-truth match labels.
# Each embedding is a 1-D vector, so the similarity is taken along dim 0.
similarity_metric = nn.CosineSimilarity(dim=0)

val_veri_csv = osp.join(DATA_DIR, "verification/verification/verification_dev.csv")

# Read the pair list up front so the file is closed before the loop starts.
with open(val_veri_csv) as f:
    pair_lines = f.read().splitlines()[1:]  # skip header

pred_similarities = []
gt_similarities = []
for line in tqdm(pair_lines, position=0, leave=False):
    img_path1, img_path2, gt = line.split(",")
    # The CSV paths carry a leading directory component; feats_dict is keyed
    # by the part after it (the path relative to the dev directory).
    img_path1 = img_path1.split('/')[1]
    img_path2 = img_path2.split('/')[1]

    similarity = similarity_metric(feats_dict[img_path1], feats_dict[img_path2])

    gt_similarities.append(int(gt))
    # Store plain Python floats: sklearn expects array-like numbers, not a
    # list of 0-dim tensors.
    pred_similarities.append(float(similarity))

print("AUC:", roc_auc_score(gt_similarities, pred_similarities))

In [None]:
# Verification test images, preprocessed like the validation data and read in
# a fixed (unshuffled) order.
test_veri_dir = osp.join(DATA_DIR, "verification/verification/test")
test_veri_dataset = VerificationDataset(test_veri_dir, ttf.Compose(val_transforms))
test_ver_loader = torch.utils.data.DataLoader(
    test_veri_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=1,
)

In [None]:
# Compute one embedding per verification test image and index it by file name.
# Note that this rebinds feats_dict, replacing any embeddings stored earlier.
model.eval()

feats_dict = dict()
for imgs, path_names in tqdm(test_ver_loader, total=len(test_ver_loader), position=0, leave=False):
    imgs = imgs.cuda()

    with torch.no_grad():
        # The embedding is the classifier output (return_feats=False) passed
        # through GELU.
        feats = F.gelu(model(imgs, return_feats=False))

        # Row i of feats belongs to path_names[i].
        for path_name, feat in zip(path_names, feats):
            feats_dict[path_name] = feat

In [None]:
# Score every test pair with the cosine similarity of its two embeddings.
# Each embedding is a 1-D vector, so the similarity is taken along dim 0.
similarity_metric = nn.CosineSimilarity(dim=0)
# Kept under its own name so the dev CSV path (val_veri_csv) is not overwritten.
test_veri_csv = osp.join(DATA_DIR, "verification/verification/verification_test.csv")

# Read the pair list up front so the file is closed before the loop starts.
with open(test_veri_csv) as f:
    pair_lines = f.read().splitlines()[1:]  # skip header

pred_similarities = []
for line in tqdm(pair_lines, position=0, leave=False):
    # Test rows have no ground-truth column, only the two image paths.
    img_path1, img_path2 = line.split(",")
    # The CSV paths carry a leading directory component; feats_dict is keyed
    # by the part after it (the path relative to the test directory).
    img_path1 = img_path1.split('/')[1]
    img_path2 = img_path2.split('/')[1]

    similarity = similarity_metric(feats_dict[img_path1], feats_dict[img_path2])

    # Store plain Python floats rather than 0-dim tensors.
    pred_similarities.append(float(similarity))


In [None]:
# Write the verification submission: one "<pair index>,<similarity score>" row
# per test pair, in CSV order.
with open("verification_early_submission.csv", "w+") as f:
    f.write("id,match\n")
    for pair_idx, score in enumerate(pred_similarities):
        f.write("{},{}\n".format(pair_idx, score))

In [None]:
# Upload verification_early_submission.csv with the Kaggle CLI.
# NOTE(review): the competition slug used here (face-verification-slack) is not the one the data was downloaded from — confirm it is the intended target.
!kaggle competitions submit -c face-verification-slack -f verification_early_submission.csv -m "first_submission"