In [None]:
import os
import re
import sys
import cv2
import tqdm
import time
import spacy 
import random
import scipy.io
import itertools
import numpy as np
from math import ceil
import pandas as pd
from itertools import chain
import matplotlib.pyplot as plt
from skimage.io import imread
from scipy.ndimage.filters import gaussian_filter
from sklearn.model_selection import train_test_split

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence 
import torchvision.transforms as transforms

import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

In [None]:
# pattern_symbol matches ONE character per hit: a punctuation character from the
# bracketed set at the very start of the string, one at the very end (``$`` also
# matches just before a trailing newline), or a '/' anywhere in the string.
# Callers apply it repeatedly to peel off runs of leading/trailing punctuation.
# Note: the class contains '[' and an escaped ']', but not the backslash itself.
pattern_symbol = re.compile('^[!"#$%&\\\'()*+,-./:;<=>?@[\\]^_`{|}~]|[!"#$%&\\\'()*+,-./:;<=>?@[\\]^_`{|}~]$|/')
# pattern_replace matches U+200C (ZERO WIDTH NON-JOINER).
pattern_replace = re.compile('\u200c')

In [None]:
# Corpus locations. The commented-out variants point at a mounted ``gdrive``
# directory; the active ones are relative to the notebook's working directory.
# training_data_path_en = './gdrive/MyDrive/3/Data/AFEC-merged-all/AFEC-merged.en'
# training_data_path_fa = './gdrive/MyDrive/3/Data/AFEC-merged-all/AFEC-merged.fa'
# test_data_path = './gdrive/MyDrive/3/Data/Test/test.en'
training_data_path_en = '../../Data/AFEC-merged-all/AFEC-merged.en'  # English side of the training corpus
training_data_path_fa = '../../Data/AFEC-merged-all/AFEC-merged.fa'  # Persian side of the training corpus
test_data_path = '../../Data/Test/test.en'  # English test sentences

In [None]:
import pyonmttok

# Learn one SentencePiece subword model per language from the raw training files.
# Each learner is built without a pre-tokenizer, ingests its corpus file, and the
# object returned by ``learn`` is what the rest of the notebook uses as the
# language's tokenizer (``eng_tokenizer`` / ``per_tokenizer``).
# The progress messages still say "BPE" and the model files keep their "BPE_*"
# names so downstream references stay valid.
print('BPE English ...')
eng_learner = pyonmttok.SentencePieceLearner(vocab_size=20000, character_coverage=0.98)
eng_learner.ingest_file(training_data_path_en)
eng_tokenizer = eng_learner.learn("./BPE_ENG")

print('BPE Persian ...')
per_learner = pyonmttok.SentencePieceLearner(vocab_size=15000, character_coverage=0.98)
per_learner.ingest_file(training_data_path_fa)
per_tokenizer = per_learner.learn("./BPE_PER")

In [None]:
def create_vocabulary(sentences, min_word_freq = 7):
        vocabulary = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        rev_vocabulary = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        min_word_frequency = min_word_freq
        
        frequencies = {}
        idx = 4
        
        counter, count = 1, len(sentences)
        for sentence in sentences:
            for word in sentence:
                if word not in frequencies:
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1

                if frequencies[word] == min_word_frequency:
                    rev_vocabulary[word] = idx
                    vocabulary[idx] = word
                    idx += 1
        counter += 1
        print(str(round(counter/count*100, 2))+'%', end="\r")
        print('---done!---')
        return vocabulary, rev_vocabulary

In [None]:
def numericalize_sentence(text, rev_vocabulary):
    """Convert a token sequence into a list of vocabulary ids.

    The result starts with the ``<SOS>`` id and ends with the ``<EOS>`` id.
    Tokens that are not keys of ``rev_vocabulary`` are replaced by the
    ``<UNK>`` id.

    Args:
        text: iterable of tokens.
        rev_vocabulary: mapping token -> id containing the special tokens.

    Returns:
        List of ids, ``len(text) + 2`` entries long.
    """
    ids = [rev_vocabulary["<SOS>"]]
    for token in text:
        if token in rev_vocabulary:
            ids.append(rev_vocabulary[token])
        else:
            ids.append(rev_vocabulary["<UNK>"])
    ids.append(rev_vocabulary["<EOS>"])
    return ids

def reverse_numericalize(sentence, vocab):
    """Turn a sequence of ids back into a list of tokens.

    Each element of ``sentence`` must expose ``.item()`` (e.g. the elements
    obtained by iterating a 1-D tensor). Ids missing from ``vocab`` decode to
    ``vocab[3]``. Decoding stops right after the first ``<EOS>``. From the
    decoded tokens the first ``<SOS>``, the ``<EOS>`` and every ``<PAD>`` are
    dropped.

    Args:
        sentence: iterable of scalar-like ids.
        vocab: mapping id -> token.

    Returns:
        List of token strings.
    """
    decoded = []
    for element in sentence:
        index = int(element.item())
        word = vocab[index] if index in vocab else vocab[3]
        decoded.append(word)
        if word == '<EOS>':
            break

    # Only the first start marker is stripped; later ones are kept as-is.
    if '<SOS>' in decoded:
        decoded.remove('<SOS>')
    if '<EOS>' in decoded:
        decoded.remove('<EOS>')

    return [word for word in decoded if word != '<PAD>']

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Index-aligned pairs of numericalized English and Persian sentences."""

    def __init__(self, eng, per):
        """Keep the English sequences as ``X`` and the Persian sequences as ``y``."""
        self.X = eng
        self.y = per

    def __len__(self):
        """Number of sentence pairs, taken from the English side."""
        return len(self.X)

    def __getitem__(self, index):
        """Return ``(english_tensor, persian_tensor)`` for the pair at ``index``."""
        target = self.y[index]
        source = self.X[index]
        source_tensor = torch.tensor(source)
        target_tensor = torch.tensor(target)
        return source_tensor, target_tensor

def PadSequence(batch):
    """Collate ``(english, persian)`` tensor pairs into two padded batches.

    Each side is padded independently to the longest sequence of that side
    using ``pad_sequence``'s default padding value (0) and ``batch_first=True``.

    Args:
        batch: list of items whose element 0 is the English tensor and
            element 1 the Persian tensor.

    Returns:
        ``(eng_sequences_padded, per_sequences_padded)``, each with the batch
        dimension first.
    """
    eng_sequences = [pair[0] for pair in batch]
    per_sequences = [pair[1] for pair in batch]
    eng_sequences_padded = torch.nn.utils.rnn.pad_sequence(eng_sequences, batch_first=True)
    per_sequences_padded = torch.nn.utils.rnn.pad_sequence(per_sequences, batch_first=True)
    return eng_sequences_padded, per_sequences_padded

In [None]:
def get_test_data(dp):
    """Tokenize and numericalize the English test file.

    Every line of ``test_data_path`` is tokenized with ``eng_tokenizer`` and
    mapped to ids through ``dp.eng_rev_vocab``.

    Args:
        dp: object exposing ``eng_rev_vocab`` (token -> id mapping).

    Returns:
        List with one id list per line of the test file.
    """
    # The file is closed deterministically and decoded as UTF-8 regardless of
    # the platform's default locale (assumes the corpus is UTF-8 — TODO confirm).
    with open(test_data_path, encoding="utf-8") as test_file:
        english = [eng_tokenizer.tokenize(line)[0] for line in test_file]

    return [numericalize_sentence(tokens, dp.eng_rev_vocab) for tokens in english]

In [None]:
class Data_prep():
    """Loads the parallel training corpus and turns it into id sequences.

    Attributes set by ``__init__``:
        eng_vocab / eng_rev_vocab: id -> token and token -> id for English.
        per_vocab / per_rev_vocab: id -> token and token -> id for Persian.
        num_english / num_persian: numericalized sentences, index-aligned.
    """

    def __init__(self):
        """Read both corpus files, build the vocabularies and numericalize."""
        english, persian = [], []
        # Both files are closed on exit and decoded as UTF-8 independent of the
        # platform default (assumes UTF-8 corpus files — TODO confirm).
        with open(training_data_path_en, encoding="utf-8") as eng_file, \
                open(training_data_path_fa, encoding="utf-8") as per_file:
            # zip stops at the shorter file, so extra trailing lines in the
            # longer one are ignored.
            for eng, per in zip(eng_file, per_file):
                english.append(eng_tokenizer.tokenize(eng)[0])
                persian.append(per_tokenizer.tokenize(per)[0])

        print('creating vocabulary.')
        self.eng_vocab, self.eng_rev_vocab = create_vocabulary(english, min_word_freq = 8)
        self.per_vocab, self.per_rev_vocab = create_vocabulary(persian, min_word_freq = 6)

        self.num_english = [
            numericalize_sentence(tokens, self.eng_rev_vocab) for tokens in english
        ]
        self.num_persian = [
            numericalize_sentence(tokens, self.per_rev_vocab) for tokens in persian
        ]

    def get_data(self):
        """Split the numericalized pairs 90/10 with a fixed seed (42).

        Returns the result of ``train_test_split`` called on
        ``(num_english, num_persian)``.
        """
        return train_test_split(self.num_english, self.num_persian, test_size = 0.1, random_state=42)

In [None]:
def _strip_edge_symbols(text):
    """Apply ``pattern_symbol`` repeatedly until it no longer matches ``text``."""
    while pattern_symbol.search(text):
        text = pattern_symbol.sub('', text)
    return text


def _clean_reference(text):
    """Strip symbols from a Persian reference line, then replace each U+200C with a space."""
    return pattern_replace.sub(' ', _strip_edge_symbols(text))


def final_test_data(dp):
    """Load the English test set and its four Persian reference translations.

    The English file and the four reference files are read line by line in
    lockstep. English lines have symbols stripped and are split on whitespace,
    then mapped to ids through ``dp.eng_rev_vocab``. All four reference lines
    are cleaned the same way (symbols stripped, U+200C replaced by a space)
    and split on whitespace.

    Args:
        dp: object exposing ``eng_rev_vocab`` (token -> id mapping).

    Returns:
        ``(num_english, persian)`` where ``num_english`` is a list of id lists
        and ``persian`` holds, per sentence, a list of four token lists.
    """
    from contextlib import ExitStack

    per_paths = [
        '../../Data/Test/test.fa0',
        '../../Data/Test/test.fa1',
        '../../Data/Test/test.fa2',
        '../../Data/Test/test.fa3',
    ]
    eng_path = '../../Data/Test/test.en'

    persian = []
    english = []
    # ExitStack closes all five files even if reading fails part-way through.
    # Files are decoded as UTF-8 (assumes UTF-8 test data — TODO confirm).
    with ExitStack() as stack:
        per_files = [
            stack.enter_context(open(path, encoding="utf-8")) for path in per_paths
        ]
        eng_file = stack.enter_context(open(eng_path, encoding="utf-8"))

        # zip stops at the shortest file.
        for *references, eng in zip(*per_files, eng_file):
            # NOTE(review): English is split on whitespace here, whereas
            # Data_prep builds eng_rev_vocab from eng_tokenizer output; if dp is
            # a Data_prep instance many words may map to <UNK> — confirm intent.
            english.append(_strip_edge_symbols(eng).split())
            persian.append([_clean_reference(ref).split() for ref in references])

    num_english = [
        numericalize_sentence(tokens, dp.eng_rev_vocab) for tokens in english
    ]

    return num_english, persian