In [None]:
from warnings import filterwarnings
import sys
filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2 as cv
import tensorflow as tf
import scipy as sc

from tensorflow.keras import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from gc import collect
from time import time,sleep
from os import path,system
from json import dumps
from tqdm.notebook import tqdm

In [None]:
# Load the training label table. Later cells read its `image_id`,
# `grapheme_root`, `vowel_diacritic` and `consonant_diacritic` columns.
labels = pd.read_csv("./train.csv")

In [None]:
# Remove the `grapheme` column, which no later cell reads. The membership
# check keeps this cell re-runnable: a bare `pop` raises KeyError the second
# time the cell is executed on the same kernel.
if "grapheme" in labels.columns:
    labels.pop("grapheme")

# Number of distinct values per remaining column (shown as the cell output).
labels.nunique()

In [None]:
# One dense uint16 one-hot encoder per target column, each fitted on the
# matching single-column frame taken from `labels`. `fit` returns the encoder
# itself, so the three fitted encoders can be unpacked directly.
# NOTE(review): newer scikit-learn releases spell the `sparse` keyword
# `sparse_output` -- confirm against the installed version.
grapheme_root_ohe, vowel_diacritic_ohe, consonant_diacritic_ohe = (
    OneHotEncoder(dtype=np.uint16, sparse=False).fit(labels[[target]])
    for target in ('grapheme_root', 'vowel_diacritic', 'consonant_diacritic')
)

# Keep the fitted encoder repr as the cell output.
consonant_diacritic_ohe

In [None]:
# Shared convolutional trunk on 48x64 single-channel images, followed by
# three softmax heads (168 / 11 / 7 classes). Head names match the keys of
# the label dict yielded by the training generator.
inputs = Input(shape=(48, 64, 1), name="inputs")

# Stage 1: two 32-filter convolutions, then 2x2 max-pooling.
net = Conv2D(32, (10, 10), padding='SAME', activation='relu', input_shape=(48, 64, 1))(inputs)
net = Conv2D(32, (8, 8), padding='SAME', activation='relu')(net)
net = MaxPool2D(pool_size=(2, 2))(net)

# Stage 2: three 64-filter convolutions, then 2x2 max-pooling.
net = Conv2D(64, (6, 6), padding='SAME', activation='relu')(net)
net = Conv2D(64, (5, 5), padding='SAME', activation='relu')(net)
net = Conv2D(64, (5, 5), padding='SAME', activation='relu')(net)
net = MaxPool2D(pool_size=(2, 2))(net)

# Stage 3: two 128-filter convolutions, batch normalisation and dropout.
net = Conv2D(128, (3, 3), padding='SAME', activation='relu')(net)
net = Conv2D(128, (3, 3), padding='SAME', activation='relu')(net)
net = BatchNormalization(momentum=0.15)(net)
net = Dropout(rate=0.3)(net)

# Fully connected part shared by all heads.
net = Flatten()(net)
net = Dense(4096, activation="relu")(net)
net = Dropout(rate=0.3)(net)
dense = Dense(2048, activation="relu")(net)

# One softmax classifier per target.
head_root = Dense(168, activation='softmax', name="grapheme_root")(dense)
head_vowel = Dense(11, activation='softmax', name='vowel_diacritic')(dense)
head_consonant = Dense(7, activation='softmax', name='consonant_diacritic')(dense)

outputs = [head_root, head_vowel, head_consonant]

model = Model(inputs=inputs, outputs=outputs)

In [None]:
# Adam optimiser; the loss and the metric are each specified once for the
# whole three-output model.
model.compile(
    loss='categorical_crossentropy',
    optimizer="adam",
    metrics=['accuracy'],
)

In [None]:
def crop(img,pad=True):
    """Crop a 2-D image to its high-variance region and optionally pad it.

    A column (row) counts as "active" when the standard deviation of its
    pixels exceeds ``W_THRESH`` (``H_THRESH``). The image is cut to the span
    from the first to the last active column/row, both inclusive.

    Args:
        img: 2-D numpy array.
        pad: when true, surround the crop with a 3-pixel border of value 253;
            when false, no border is added.

    Returns:
        The cropped (and possibly padded) array. If an axis has no active
        line at all (e.g. a uniform image), that axis is left uncropped
        instead of raising ``IndexError``.
    """
    W_THRESH = 15
    H_THRESH = 15
    PAD = 3 if pad else 0

    def _span(profile, thresh):
        # Half-open [start, stop) range covering every line above `thresh`;
        # falls back to the full axis when nothing exceeds it.
        active = np.flatnonzero(profile > thresh)
        if active.size == 0:
            return 0, profile.shape[0]
        # +1 so the last active line is kept (slice ends are exclusive).
        return active[0], active[-1] + 1

    W_MIN,W_MAX = _span(img.std(axis=0), W_THRESH)
    H_MIN,H_MAX = _span(img.std(axis=1), H_THRESH)

    return np.pad(img[H_MIN:H_MAX,W_MIN:W_MAX],PAD,constant_values=253)

def resize(img):
    """Turn one flattened 137x236 image into a binarised 48x64x1 uint8 array.

    Steps: reshape to 137 rows x 236 columns, crop with ``crop``, apply an
    inverse binary threshold at 110 (values above 110 become 0, the rest 255),
    then scale to 48 rows x 64 columns.

    Args:
        img: 1-D array holding 137*236 pixel values.

    Returns:
        ``uint8`` array of shape ``(48, 64, 1)``.
    """
    img = crop(img.reshape(137,236).astype(np.uint8))
    ret,img = cv.threshold(img,110,255,cv.THRESH_BINARY_INV)
    # cv.resize expects dsize as (width, height). Passing (48, 64) produced a
    # 64x48 array that the final reshape silently scrambled; (64, 48) yields
    # the 48-row x 64-column image the model input expects.
    return cv.resize(img,(64,48)).astype(np.uint8).reshape(48,64,1)
        
def get_train(file_id):
    """Load one training parquet file and return images plus one-hot labels.

    Reads ``./train_image_data_{file_id}.parquet``, joins it with the
    module-level ``labels`` frame on ``image_id``, runs every pixel row
    through ``resize`` and encodes the three targets with the module-level
    fitted encoders.

    Args:
        file_id: value substituted into the parquet file name.

    Returns:
        ``(images, (grapheme_root, vowel_diacritic, consonant_diacritic))``
        where ``images`` is a ``uint8`` array of shape ``(n, 48*64)`` and the
        three label arrays are the encoder outputs for the same ``n`` rows.

    Raises:
        ValueError: if no row of the parquet file has a matching label.
    """
    columns=['image_id','grapheme_root','vowel_diacritic','consonant_diacritic']
    df = pd.read_parquet(f"./train_image_data_{file_id}.parquet")
    df = pd.merge(
        df,
        labels,
        on='image_id'
    )
    if df.empty:
        raise ValueError(
            f"train_image_data_{file_id}.parquet has no rows with a matching label"
        )

    label = df[columns]

    # Keep only pixel columns: drop everything that came from `labels`, so a
    # leftover non-numeric label column can never leak into the image data.
    df = df.drop(columns=[name for name in labels.columns if name in df.columns])

    # Convert in slices of `batch_size` rows and join once at the end. This
    # also works for files with fewer rows than `batch_size`, keeps the uint8
    # dtype produced by `resize`, and avoids re-copying the growing array on
    # every iteration.
    batch_size = 2000
    batches = [
        np.apply_along_axis(resize,1,df.iloc[start:start+batch_size].values)
        for start in range(0,df.shape[0],batch_size)
    ]
    imgs = np.concatenate(batches,axis=0)

    grapheme_root = grapheme_root_ohe.transform(label[['grapheme_root']])
    vowel_diacritic = vowel_diacritic_ohe.transform(label[['vowel_diacritic']])
    consonant_diacritic = consonant_diacritic_ohe.transform(label[['consonant_diacritic']])

    del df, batches
    collect()

    return imgs.reshape(-1,48*64),(grapheme_root,vowel_diacritic,consonant_diacritic)

In [None]:
# Load the four training files one at a time. Each file's image matrix is
# converted to a sparse CSR block straight away, so only one dense file is
# held in memory at any moment.
image_parts = []
label_parts = []

for i in range(4):
    X_, Y_ = get_train(i)
    image_parts.append(sc.sparse.csr_matrix(X_))
    label_parts.append(Y_)

    del X_,Y_
    print (collect())

# CSR rather than CSC: the training generator slices X by rows, which is the
# access pattern CSR is laid out for.
X = sc.sparse.vstack(image_parts, format="csr")

# Y[0], Y[1], Y[2]: grapheme_root / vowel_diacritic / consonant_diacritic
# one-hot arrays, concatenated over all files in file order.
Y = [np.concatenate(parts,axis=0) for parts in zip(*label_parts)]

del image_parts, label_parts

collect()
sleep(10)

In [None]:
def input_flow(x,y,batch_size=200,epochs=20):
    """Yield ``(inputs, targets)`` training batches for ``epochs`` passes over ``x``.

    Args:
        x: row-sliceable matrix whose slices provide ``.toarray()`` (a SciPy
            sparse matrix in this notebook); each row holds 48*64 pixel values.
        y: sequence of three arrays aligned with the rows of ``x``, in the
            order grapheme_root, vowel_diacritic, consonant_diacritic.
        batch_size: maximum number of rows per batch.
        epochs: number of full passes over ``x``.

    Yields:
        A pair of dicts: ``{"inputs": array of shape (b, 48, 64, 1) divided by
        255}`` and the three label slices keyed by output name. Within a pass
        the rows are visited in order; the last batch holds whatever rows
        remain, so inputs with ``batch_size`` rows or fewer give one batch
        per pass.
    """
    n_rows = x.shape[0]
    for _ in range(epochs):
        # Stepping from 0 covers the trailing partial batch too and needs no
        # loop variable after the loop (the old form crashed for short inputs).
        for start in range(0,n_rows,batch_size):
            stop = start + batch_size
            yield (
                    {
                        "inputs":x[start:stop].toarray().reshape(-1,48,64,1)/255
                    },
                    {
                        "grapheme_root":y[0][start:stop],
                        'vowel_diacritic':y[1][start:stop],
                        'consonant_diacritic':y[2][start:stop]
                    }
                )

In [None]:
EPOCHS = 25
BATCH_SIZE = 300

# Ceiling division: the generator also yields the final partial batch, and
# floor division would stop one step early and never train on those rows.
STEPS_PER_EPOCH = -(-X.shape[0] // BATCH_SIZE)

# One fit call per epoch with a fresh single-pass generator, followed by a
# garbage collection and a short pause.
# NOTE(review): batches are produced in the same row order every epoch; no
# shuffling happens anywhere in this loop.
for epoch in range(EPOCHS):
    print (f"Epoch : {epoch}")
    gen = input_flow(X,Y,batch_size=BATCH_SIZE,epochs=1)
    model.fit_generator(gen,steps_per_epoch=STEPS_PER_EPOCH,epochs=1,verbose = 1)
    print (collect())
    sleep(10)

In [None]:
# Training is done: drop the training features and labels and force a
# garbage collection before the test files are loaded.
del X,Y
collect()
# NOTE(review): 30 s pause; its purpose is not evident from the code --
# presumably it gives memory time to be released. Confirm it is needed.
sleep(30)

In [None]:
def get_test(file_id):
    """Load one test parquet file and return preprocessed images and their ids.

    The first column of the file is taken as the image id and the remaining
    columns as pixel values, which are passed row by row through ``resize``.

    Args:
        file_id: value substituted into the parquet file name.

    Returns:
        ``(images, image_ids)`` where ``images`` is a ``uint8`` array of shape
        ``(n, 48*64)`` -- the same layout ``resize`` and the model use -- and
        ``image_ids`` is a list with the ``n`` ids in row order.
    """
    # NOTE(review): absolute Kaggle path, while the training files are read
    # from "./" -- confirm which location is intended.
    df = pd.read_parquet(f"/kaggle/input/bengaliai-cv19/test_image_data_{file_id}.parquet")
    imageId = df.iloc[:,0].tolist()

    # Convert in slices of `batch_size` rows and join once. Selecting the
    # pixel columns before `.values` keeps the id strings out of the array.
    batch_size = 2000
    batches = [
        np.apply_along_axis(resize,1,df.iloc[start:start+batch_size,1:].values)
        for start in range(0,df.shape[0],batch_size)
    ]

    del df
    collect()

    if batches:
        imgs = np.concatenate(batches,axis=0)
    else:
        # Empty file: keep the documented shape with zero rows.
        imgs = np.zeros((0,48,64,1),dtype=np.uint8)

    # `resize` emits 48x64 images; the previous 48x48 buffers could not be
    # concatenated with them.
    return imgs.reshape(-1,48*64),imageId

def input_flow_test(x,batch_size=200):
    """Yield inference batches of shape ``(b, 48, 64, 1)`` scaled by 1/255.

    Args:
        x: array whose rows each hold 48*64 pixel values.
        batch_size: maximum number of rows per batch.

    Yields:
        Consecutive row slices of ``x`` reshaped to ``(b, 48, 64, 1)`` and
        divided by 255; the last batch holds whatever rows remain.
    """
    for start in range(0,x.shape[0],batch_size):
        yield x[start:start+batch_size].reshape(-1,48,64,1)/255

In [None]:
row_ids = []
targets = []

# Predict the four test files one at a time and collect, per image, three
# submission rows in the order consonant_diacritic, grapheme_root,
# vowel_diacritic.
for file_id in range(4):
    test_images, image_ids = get_test(file_id)
    if not image_ids:
        # Nothing to predict for an empty file.
        continue

    gen = input_flow_test(test_images,batch_size=700)
    # Tell Keras how many batches the plain generator yields (ceiling
    # division) instead of relying on it to detect exhaustion.
    n_batches = -(-test_images.shape[0] // 700)
    preds = model.predict_generator(gen,steps=n_batches)

    # `preds` holds one probability array per head, in model output order:
    # grapheme_root, vowel_diacritic, consonant_diacritic.
    root_ids, vowel_ids, consonant_ids = (head.argmax(axis=1) for head in preds)

    # Distinct loop names: the old inner loop reused the outer index `i`.
    for image_id,g,v,c in zip(image_ids,root_ids,vowel_ids,consonant_ids):
        row_ids.append(f"{image_id}_consonant_diacritic")
        targets.append(c)

        row_ids.append(f"{image_id}_grapheme_root")
        targets.append(g)

        row_ids.append(f"{image_id}_vowel_diacritic")
        targets.append(v)

    del test_images, image_ids
    print (collect())

In [None]:
# Two-column submission table: one row per (image, target component) pair
# collected by the prediction loop.
submit = pd.DataFrame(dict(row_id=row_ids, target=targets))

In [None]:
# Show the submission table as the cell output for a visual sanity check.
submit

In [None]:
# Write the submission next to the notebook, without the DataFrame index.
submit.to_csv("./submission.csv",index=False)