In [0]:
import os
import shutil
from zipfile import ZipFile
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np

import torch
import torchvision
from torch.utils.data import DataLoader, Dataset, random_split
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets
import torchvision.transforms as transforms
from tqdm import tqdm_notebook
import copy
from collections import Counter
# True when torch reports that CUDA is available in this runtime.
# NOTE(review): not referenced by any of the cells visible here — confirm it is still needed.
train_on_gpu = torch.cuda.is_available()

In [0]:
# Fetch and unpack the competition data once; later runs reuse ./dataset.
if 'dataset' not in os.listdir():
    # Credentials are intentionally NOT stored in the notebook. The Kaggle CLI
    # picks them up from the KAGGLE_USERNAME / KAGGLE_KEY environment variables
    # or from ~/.kaggle/kaggle.json. Any key that was ever saved inside a
    # notebook should be regenerated on the Kaggle account page.
    status = os.system('kaggle competitions download -c padhai-hindi-vowel-consonant-classification')
    if status != 0:
        # Fail here with a clear message instead of a confusing missing-zip
        # error a few lines further down.
        raise RuntimeError(
            "kaggle download failed (exit status %s); check that the Kaggle CLI is "
            "installed and that KAGGLE_USERNAME/KAGGLE_KEY or ~/.kaggle/kaggle.json "
            "are configured" % status
        )
    # Same extraction order as before: test archive first, then train.
    for archive in ('test.zip', 'train.zip'):
        with ZipFile(archive, 'r') as zipObj:
            zipObj.extractall()
    os.mkdir('dataset')
    shutil.move('train','dataset/')
    shutil.move('test','dataset/')
    # The archives are no longer needed once their contents are in ./dataset.
    os.remove('train.zip')
    os.remove('test.zip')
    print(os.listdir("dataset"))
else:
    print("Dataset already present!",os.listdir("dataset"))

In [0]:
#For converting the dataset to torchvision dataset format
class VowelConsonantDataset(Dataset):
    """Image dataset whose labels are encoded in the file names.

    File names are split on "_"; the first two tokens name the two classes of
    the image (e.g. ``V3_C5_...`` -> first label 3, second label 5). In training
    mode every item is ``(image, label)`` where ``label`` is a ``(2, NUM_CLASSES)``
    tensor holding one one-hot row per token. Otherwise every item is
    ``(image, file_name)``.

    Args:
        file_path: directory that is walked recursively for image files.
        train: when True, labels are derived from the file names.
        transform: optional callable applied to the loaded PIL image.
    """

    # Width of each one-hot row.
    NUM_CLASSES = 10

    def __init__(self, file_path,train=True,transform=None):
        self.transform = transform
        self.file_path=file_path
        self.train=train
        # os.walk descends into sub-folders, so remember the real location of
        # every file instead of assuming it sits directly under file_path.
        # Sorting makes the index -> file assignment independent of the
        # filesystem's listing order.
        entries = sorted(
            (name, os.path.join(root, name))
            for root, _, files in os.walk(self.file_path)
            for name in files
        )
        self.file_names = [name for name, _ in entries]
        self.file_paths = [path for _, path in entries]
        self.len = len(self.file_names)
        if self.train:
            # Kept for inspection; labels themselves come from the class number
            # embedded in each token (see _class_index).
            self.classes_mapping=self.get_classes()

    def __len__(self):
        """Number of files found under ``file_path``."""
        return len(self.file_names)

    def __getitem__(self, index):
        """Load item ``index``; see the class docstring for the return shape."""
        file_name=self.file_names[index]
        image_data=self.pil_loader(self.file_paths[index])
        if self.transform:
            image_data = self.transform(image_data)
        if not self.train:
            return image_data, file_name

        first_token, second_token = file_name.split("_")[:2]
        label = torch.zeros(2, self.NUM_CLASSES)
        label[0, self._class_index(first_token)] = 1
        label[1, self._class_index(second_token)] = 1
        return image_data, label

    def _class_index(self, token):
        """Return the class number embedded in a token such as ``"V3"``.

        Reading the number from the token keeps labels stable even when some
        classes are absent from the folder; an offset into the sorted mapping
        would silently shift in that case.

        Raises:
            ValueError: if the number is missing or outside ``[0, NUM_CLASSES)``.
        """
        index = int(token[1:])
        if not 0 <= index < self.NUM_CLASSES:
            raise ValueError(
                "class token %r is outside the supported range 0..%d"
                % (token, self.NUM_CLASSES - 1)
            )
        return index

    def pil_loader(self,path):
        """Open ``path`` and return it as an RGB PIL image."""
        # Convert while the file handle is still open.
        with open(path, 'rb') as f:
            img = Image.open(f)
            return img.convert('RGB')

    def get_classes(self):
        """Map every class token seen in the file names to its sorted position."""
        tokens = set()
        for name in self.file_names:
            parts = name.split("_")
            tokens.update((parts[0], parts[1]))
        return {token: position for position, token in enumerate(sorted(tokens))}

In [0]:
# Preprocessing for the datasets: the only step is converting each PIL image
# to a tensor. (A leftover bare path string that merely echoed as cell output
# has been dropped.)
transform1 = transforms.Compose([transforms.ToTensor()])

In [0]:
# Labelled images live in the ./dataset folder created by the download cell,
# so the paths are relative to the working directory rather than to a parent
# directory that happens to be called "content".
full_data=VowelConsonantDataset("dataset/train",train=True,transform=transform1)

# 90% of the labelled images are used for training, the rest for validation.
train_size = int(0.9 * len(full_data))
test_size = len(full_data) - train_size  # size of the validation hold-out, despite the name

train_data, validation_data = random_split(full_data, [train_size, test_size])

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
# Accuracy does not depend on batch order, so the validation set is not shuffled.
validation_loader = DataLoader(validation_data, batch_size=64, shuffle=False)

# The unlabelled test images need the same tensor conversion as the training
# images: without a transform the dataset yields PIL images, which the default
# collate function cannot assemble into a batch.
test_data=VowelConsonantDataset("dataset/test",train=False,transform=transform1)
test_loader = DataLoader(test_data, batch_size=64,shuffle=False)

In [0]:
# Sizes of the training split, the validation split and the full labelled set.
for split in (train_data, validation_data, full_data):
    print(len(split))

# Token -> index mapping derived from the file names.
class_map = full_data.get_classes()
print(class_map)

# Separate the class numbers by token prefix: 'V...' tokens go to vclass,
# everything else to cclass. Only the character right after the prefix is read.
vclass = [int(token[1]) for token in class_map if token[0]=='V']
cclass = [int(token[1]) for token in class_map if token[0]!='V']
print(vclass,cclass)

In [0]:
# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [0]:
# Pull one batch from the training loader and preview its first images.
data_iter = iter(train_loader)
images, labels = next(data_iter)
print(images[0].shape,images[0].size(0))

fig = plt.figure(figsize=(25, 4))
n_preview = min(20, images.size(0))  # a short batch must not raise an IndexError
for idx in range(n_preview):
    # add_subplot needs integer grid dimensions, hence // rather than /.
    ax = fig.add_subplot(2, 20//2, idx+1, xticks=[], yticks=[])
    # Tensors are channel-first (C, H, W) while imshow expects (H, W, C).
    # Moving only the channel axis keeps rows and columns where they belong;
    # a full axis reversal would draw every image transposed.
    img = images[idx].permute(1, 2, 0).numpy()
    ax.imshow(img)

# Index 0 along dim 1 is the label row built from the first file-name token.
print("\n\n\n",torch.max(labels[:,0,:],1))
print(labels.shape)

In [0]:
# Tally how often each class index occurs in the training split, one counter
# per label row (row 0 -> vowelCounter, row 1 -> consonantCounter).
vowelCounter, consonantCounter = Counter(), Counter()
for _batch_images, batch_labels in train_loader:
    # torch.max over dim 1 returns (values, indices); the indices are the
    # class ids encoded by the one-hot rows.
    vowel_ids = torch.max(batch_labels[:, 0, :], 1)[1]
    consonant_ids = torch.max(batch_labels[:, 1, :], 1)[1]
    vowelCounter.update(vowel_ids.tolist())
    consonantCounter.update(consonant_ids.tolist())

In [0]:
# Show the raw tallies, then line the counts up with the class ids for plotting.
print(vowelCounter)
print(consonantCounter)
vvalues = [vowelCounter[class_id] for class_id in vclass]
cvalues = [consonantCounter[class_id] for class_id in cclass]
print(vclass,vvalues)
print(cclass,cvalues)

# Side-by-side bar charts: left panel for vowelCounter, right for consonantCounter.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
ax1.bar(vclass, vvalues)
ax2.bar(cclass, cvalues)
ax1.set_title('Vowel class distribution')
ax2.set_title('Consonant class distribution')

In [0]:
# Two independent pretrained ResNet-50 networks: v_model and c_model.
v_model, c_model = (torchvision.models.resnet50(pretrained=True) for _ in range(2))

# Replace each final fully connected layer with a fresh 10-way output layer.
for backbone in (v_model, c_model):
    backbone.fc = nn.Linear(2048, 10, bias=True)

In [0]:
# Each network gets its own Adam optimizer (default settings) and its own
# cross-entropy criterion.
opt_v, opt_c = (optim.Adam(net.parameters()) for net in (v_model, c_model))
loss_fnv, loss_fnc = nn.CrossEntropyLoss(), nn.CrossEntropyLoss()

In [0]:
def evaluation(dataloader,vmodel,cmodel):
    """Return the accuracy (in percent) of both models over ``dataloader``.

    Each batch is ``(inputs, labels)`` where ``labels[:, 0, :]`` and
    ``labels[:, 1, :]`` are the one-hot targets scored against ``vmodel`` and
    ``cmodel`` respectively.

    The models are switched to eval mode and gradients are disabled for the
    duration of the call, so evaluation neither updates normalisation
    statistics nor builds autograd graphs. Each model's previous
    training/eval mode is restored before returning.

    Args:
        dataloader: iterable of ``(inputs, labels)`` batches.
        vmodel: model scored against label row 0.
        cmodel: model scored against label row 1.

    Returns:
        ``(vacc, cacc)`` as percentages; ``(0.0, 0.0)`` when the loader yields
        no samples.
    """
    # Run on the device the models already live on instead of relying on a
    # global; fall back to the CPU for parameter-free models.
    first_param = next(vmodel.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    previous_modes = (vmodel.training, cmodel.training)
    vmodel.eval()
    cmodel.eval()

    total,correctv,correctc=0,0,0
    try:
        with torch.no_grad():
            for inputs, labels in dataloader:
                inputs,labels=inputs.to(target_device),labels.to(target_device)
                outv = vmodel(inputs).argmax(dim=1)
                outc = cmodel(inputs).argmax(dim=1)
                # One-hot rows -> class indices.
                labelsv = labels[:,0,:].argmax(dim=1)
                labelsc = labels[:,1,:].argmax(dim=1)
                total+=labels.size(0)
                correctv+=(outv==labelsv).sum().item()
                correctc+=(outc==labelsc).sum().item()
    finally:
        # Leave the models in the mode the caller had them in.
        vmodel.train(previous_modes[0])
        cmodel.train(previous_modes[1])

    if total == 0:
        return 0.0, 0.0
    return 100*correctv/total, 100*correctc/total

In [0]:
def Graphing(loss_arr,loss_epoch_arr,trainv_acc,trainc_acc,valv_acc,valc_acc):
    """Draw a 2x2 grid summarising a training run.

    Top row: ``loss_arr`` and ``loss_epoch_arr``. Bottom row: the two training
    accuracy series and the two validation accuracy series, each pair sharing
    one panel with a Vowel/Consonant legend.
    """
    _, grid = plt.subplots(2, 2, figsize=(20, 5))

    # Top row: one loss curve per panel.
    loss_panels = (
        (grid[0, 0], loss_arr, 'Overall Loss'),
        (grid[0, 1], loss_epoch_arr, 'Loss per Epoch'),
    )
    for panel, series, heading in loss_panels:
        panel.plot(series)
        panel.set_title(heading)

    # Bottom row: two accuracy curves per panel.
    accuracy_panels = (
        (grid[1, 0], trainv_acc, trainc_acc, 'Train Accuracy'),
        (grid[1, 1], valv_acc, valc_acc, 'Validation Accuracy'),
    )
    for panel, vowel_series, consonant_series, heading in accuracy_panels:
        panel.plot(vowel_series)
        panel.plot(consonant_series)
        panel.legend(['Vowel','Consonant'])
        panel.set_title(heading)

In [0]:
# Training run: both networks are optimised together on the same batches,
# each with its own optimizer and loss; accuracies are recorded after every epoch.
loss_epoch_arr = []  # mean training loss of each epoch
loss_arr = []        # training loss of every batch
batch_size = 64
# Batches per epoch, read from the loader instead of a hard-coded sample count,
# so the progress message stays correct for any dataset size.
n_iters = len(train_loader)
epochs = 45
min_loss = 100000
v_model.to(device)
c_model.to(device)
trainv_acc = []
trainc_acc = []
valv_acc = []
valc_acc = []
for epoch in tqdm_notebook(range(epochs)):
    # Training mode is set once per epoch; nothing inside the batch loop changes it.
    v_model.train()
    c_model.train()
    epoch_loss_sum = 0.0
    for i,data in enumerate(tqdm_notebook(train_loader),0):
        image,label = data
        image, label = image.to(device), label.to(device)
        opt_v.zero_grad()
        opt_c.zero_grad()
        out_v = v_model(image)
        out_c = c_model(image)
        # One-hot label rows -> class indices expected by CrossEntropyLoss.
        _,ind_v = torch.max(label[:,0,:],1)
        _,ind_c = torch.max(label[:,1,:],1)
        loss_v = loss_fnv(out_v,ind_v)
        loss_c = loss_fnc(out_c,ind_c)
        # The models share no parameters, so backpropagating the sum gives
        # each model exactly the gradient of its own loss.
        loss = torch.add(loss_v,loss_c)
        loss.backward()
        opt_v.step()
        opt_c.step()
        loss_value = loss.item()  # read the scalar once and reuse it below
        if min_loss > loss_value:
            # Snapshot both models whenever a batch reaches a new lowest loss.
            # NOTE(review): a single-batch loss is a noisy selection criterion;
            # consider selecting on validation accuracy instead.
            min_loss = loss_value
            best_modelv = copy.deepcopy(v_model.state_dict())
            best_modelc = copy.deepcopy(c_model.state_dict())
            print('Min loss %0.2f' % min_loss)
        if i % 100 == 0:
            print('Iteration: %d/%d, Loss: %0.2f' % (i, n_iters, loss_value))
        del image, label, out_v,out_c
        torch.cuda.empty_cache()
        loss_arr.append(loss_value)
        epoch_loss_sum += loss_value
    # Record the epoch's mean batch loss rather than whatever the last batch produced.
    loss_epoch_arr.append(epoch_loss_sum / max(n_iters, 1))
    print("Epoch: {}".format(epoch))
    tempv,tempc = evaluation(train_loader,v_model,c_model)
    trainv_acc.append(tempv)
    trainc_acc.append(tempc)
    print("Train Accuarcy: Vowel: {} Consonants: {}".format(tempv,tempc))
    tempv,tempc = evaluation(validation_loader,v_model,c_model)
    valv_acc.append(tempv)
    valc_acc.append(tempc)
    print("Validation Accuarcy: Vowel: {} Consonants: {}".format(tempv,tempc))
Graphing(loss_arr,loss_epoch_arr,trainv_acc,trainc_acc,valv_acc,valc_acc)