In [2]:
from queue import Queue
import pandas as pd
from indicnlp.tokenize.indic_tokenize import trivial_tokenize_indic
import random
from data_preprocessing import *
import unicodedata

In [22]:
class devnagri_preprocessing:
        
        # --------------------------------------- Constructor --------------------------------------- 
        
        def __init__(self, vowels = True, trivial_split = False):
                
                self.vowels = vowels
                self.trivial_split = trivial_split
                


        # --------------------------------------- Clean Text ---------------------------------------  	
        
        def clean_devnagri(self,text: str) -> str:
            try:
                if not(isinstance(text, str)): text = str(text)

                #Removing unprintable characters
                text = ''.join(x for x in text if x.isprintable())

                # Cleaning the urls
                text = re.sub(r'https?://\S+|www\.\S+', '', text)

                # Cleaning the html elements
                text = re.sub(r'<.*?>', '', text)

                # Removing the punctuations
                text = re.sub('[!#?,.:";-@#$%^&*_~<>()/\-]', '', text)


                # Removing stop words
                text = ' '.join([word for word in text.split() if word not in self.stopword_list])

                # Expanding noisy concatenations (Eg: algorithmआणि  -> algorithm आणि ) 
                text = ' '.join([self.expand_concatenations(word) for word in text.split()])

            return text
        
             except ValueError as ve:
                print('Error processing:\t',text)
                return ''
                
                
                
        # --------------------------------------- Word Splitter --------------------------------------- #
    
        def split_devanagari_word(self,word: str) -> str:
            try:
                q = Queue()
                l_index = 0
                
                if not(isinstance(word, str)): word = str(word)
                tokens = []
                
                if self.trivial_split == True:
                    tokens = [char for char in word]
                    return tokens
                
                for char in word:

                    if not 'devanagari' in unicodedata.name(char).lower():
                        tokens.append(char)
                        continue

                    if not 'sign' in unicodedata.name(char).lower():
                        if q.empty():
                            tokens.append(char)
                        else:
                            while not q.empty():
                                tokens[len(tokens)-1] += q.get() 
                            tokens.append(char)   
                    else:
                        if self.vowels == True:
                            q.put(char)

                for i, char in reversed(list(enumerate(tokens.copy()))):
                    if('devanagari' in unicodedata.name(char).lower()):
                        l_index = i
                        break

                while not q.empty():
                        tokens[l_index] += q.get() 

                return tokens

            except Exception as e:
                return ''


        # --------------------------- String to character-sequence converter --------------------------------------- #

        def text2characters(self,text:str)->str:
            try:
                if not(isinstance(text, str)): text = str(text)
                char_sequence = ""
                char_list = []
                
                for word in text.split():
                    seq = ' '.join([char for char in self.split_devanagari_word(word)])                
                    char_sequence = char_sequence + seq + ' '
                    
                return char_sequence
            
            except ValueError as ve:
                print('Error processing:\t',text)
                return ''
            
            
    # ------------------------------------- Tokenize a document --------------------------------------------- #
        
        """
            This function builds a vocabulary of each unique token (character) from the given document.
            Each token from the vocabulary is assigned a unique integer id.
            
            Working of this funcion is similar to keras.preprocess.tokenizer.fit_to_text()
        """
        
        def tokenize_characters(self, document):
            vocab = set()
            cnt = 0
            token_dict = {}
            
            if isinstance(document, list):
                for text in document:
                    char_sequence = self.text2characters(text)
                    tokens_indic = pd.Series(trivial_tokenize_indic(char_sequence))
                    word_counts = tokens_indic.value_counts()
                    
                    vocab = vocab.union(set(word_counts.keys()))

                print('Total Unique Tokens (Characters): {}'.format(len(vocab)))

                for char in vocab:
                    cnt += 1
                    token_dict[char] = cnt
            
            else:
                char_sequence = self.text2characters(document)
                tokens_indic = pd.Series(trivial_tokenize_indic(char_sequence))
                word_counts = tokens_indic.value_counts()  
                vocab = vocab.union(set(word_counts.keys()))

                print('Total Unique Tokens (Characters): {}'.format(len(vocab)))

                for char in vocab:
                    cnt += 1
                    token_dict[char] = cnt
                
            return token_dict

        
    # -------------------------Text-to-sequence converter --------------------------------------- #
        """
            This function converts the input sentence into sequence of integers. 
            Each integer corresponds to the unique token (or character) id in the vocabulary.

            Working of this funcion is similar to keras.preprocess.tokenizer.text_to_sequence()
        """

        def text_to_sequence(self,document,token_dict, pad_len=300, padding_type='post', truncating_type='post'):
            
            sequence_doc = []
            if isinstance(document, list):
                cnt = 0
                for text in document:
                    try:
                        char_array = self.text2characters(text).split()
                        text_sequence = [token_dict[x] for x in char_array]
                        sequence_doc.append(text_sequence)
                        cnt+=1
                    except:
                        print(text)
                        
                print('Records converted: ',cnt)
                
            else:
                char_array = self.text2characters(document).split()
                text_sequence = [token_dict[x] for x in char_array]
                sequence_doc.append(text_sequence)
            
            sequence_doc = np.asarray(pad_sequences(sequence_doc, 
            					    padding=padding_type, 
                                            	    truncating=truncating_type, 
                                                    maxlen=pad_len))    
            return sequence_doc

SyntaxError: invalid syntax (<ipython-input-22-9940ebc3efc6>, line 37)

# Trial Run

In [5]:
df = pd.read_csv('../dataset/original-dataset/marathi-training-data.csv')
x_train = df['text'].apply(lambda x : clean_text(x)).tolist()
len(x_train)

41997

In [6]:
dev1 = Devanagari()
dev2 = Devanagari(vowels = False)
dev3 = Devanagari(trivial_split = True)

In [7]:
# Trial on a single word
sample_word =  'संदर्भामुळे'

tokens1 = dev1.split_devanagari_word(sample_word)
print(tokens1)

['सं', 'द', 'र्', 'भा', 'मु', 'ळे']


In [8]:
tokens2 = dev2.split_devanagari_word(sample_word)
tokens3 = dev3.split_devanagari_word(sample_word)

print('Word: \t',sample_word,'\n\nSplits:','\n\nType1\t',tokens1,'\n\nType2\t',tokens2,'\n\nType3\t',tokens3)

Word: 	 संदर्भामुळे 

Splits: 

Type1	 ['सं', 'द', 'र्', 'भा', 'मु', 'ळे'] 

Type2	 ['स', 'द', 'र', 'भ', 'म', 'ळ'] 

Type3	 ['स', 'ं', 'द', 'र', '्', 'भ', 'ा', 'म', 'ु', 'ळ', 'े']


In [10]:
# Trial on single text string
text = 'पहिला,  स्तंभ आपल्याला अंदाज देतो ?? '

cleaned_text = clean_text(text)
char_sequence_1 = dev1.text2characters(cleaned_text)
char_sequence_2 = dev2.text2characters(cleaned_text)
char_sequence_3 = dev3.text2characters(cleaned_text)

print('\nText: ',cleaned_text,'\n\nWith Vowels: ',char_sequence_1,'\n\nWithout vowels: ',char_sequence_2,
      '\n\nVowels Seperated: ',char_sequence_3)


Text:  पहिला स्तंभ आपल्याला अंदाज देतो 

With Vowels:  प हि ला स् तं भ आ प ल् या ला अं दा ज दे तो  

Without vowels:  प ह ल स त भ आ प ल य ल अ द ज द त  

Vowels Seperated:  प ह ि ल ा स ् त ं भ आ प ल ् य ा ल ा अ ं द ा ज द े त ो 


In [11]:
# Text to Sequence conversion 

text = 'हरिदास होय मला वाटते की हा एक महत्त्वाचा मुद्दा आहे की भारतीय संदर्भामुळे आपण विशेषतः आमच्या शैक्षणिक प्रक्रियेद्वारे प्रवेश करू शकता जिथे प्रवेश परीक्षा असते आणि जी आपल्याला विभागांमध्ये ठेवते आणि काही आपण त्या विशिष्ट क्रियाकलापांवर अडकले आहात असे कसे वाटते'
cleaned_text = clean_text(text)

token_dict = dev1.tokenize_characters(cleaned_text)
text_seq = dev1.text_to_sequence(cleaned_text, token_dict)

print('\n Text:\n\n',clean_text,'---->' ,len((cleaned_text).split()))
print('\n Character array:\n\n',dev1.text2characters(cleaned_text),'---->' ,len(dev1.text2characters(cleaned_text).split()))
print('\n Sequence array:\n\n',text_seq[0],'---->',len(text_seq[0]))

Total Unique Tokens (Characters): 66
Records converted: 1

 Text:

 <function clean_text at 0x7faf1a779dc0> ----> 41

 Character array:

 ह रि दा स हो य म ला वा ट ते की हा ए क म ह त् त् वा चा मु द् दा आ हे की भा र ती य सं द र् भा मु ळे आ प ण वि शे ष तः आ म च् या शै क् ष णि क प् र क् रि ये द् वा रे प् र वे श क रू श क ता जि थे प् र वे श प री क् षा अ स ते आ णि जी आ प ल् या ला वि भा गां म ध् ये ठे व ते आ णि का ही आ प ण त् या वि शि ष् ट क् रि या क ला पां व र अ ड क ले आ हा त अ से क से वा ट ते  ----> 135

 Sequence array:

 [3, 4, 31, 14, 17, 1, 58, 48, 60, 59, 32, 34, 7, 47, 5, 58, 3, 25, 25, 60, 16, 45, 21, 31, 39, 63, 34, 41, 51, 55, 1, 12, 44, 8, 41, 45, 36, 39, 46, 62, 66, 38, 18, 53, 39, 58, 42, 10, 52, 56, 18, 6, 5, 50, 51, 56, 4, 11, 21, 60, 43, 50, 51, 49, 2, 5, 61, 2, 5, 30, 26, 37, 50, 51, 49, 2, 46, 24, 56, 23, 13, 14, 32, 39, 6, 9, 39, 46, 54, 10, 48, 66, 41, 40, 58, 15, 11, 57, 28, 32, 39, 6, 64, 22, 39, 46, 62, 25, 10, 66, 19, 33, 59, 56, 4, 10, 5, 48, 27, 28, 51, 13, 29, 5, 20

In [12]:
#------------------------Trial on marathi dataset-------------------------------------#

dev = Devanagari()
token_dict = dev.tokenize_characters(x_train)
x_train_tokenized = dev.text_to_sequence(x_train, token_dict)

Total Unique Tokens (Characters): 791
Total records:  41997
Records converted:  41997


In [13]:
i = random.randint(0,len(x_train))
print('Text: \n {} \nNum Words: {} \n\n Sequence: \n {}\n Sequence len: {}'.format(x_train[i],len(x_train[i].split()),
                                                                    x_train_tokenized[i],len(x_train_tokenized[i])))

Text: 
 तर आता आपण या प्रयोगशाळेत जे काही केले त्याबद्दल आपण या योजनेकडे पहाल तेव्हा जे सामान्यतः ईंधन सेल ऑपरेशन आहे जे तेथे आहेत त्यात विशिष्ट फरक आहेत स्वारस्य तर हे फरक काय आहेत ते पहा 
Num Words: 35 

 Sequence: 
 [497, 772, 749, 623, 749, 67, 679, 114, 76, 772, 91, 57, 240, 633, 497, 58, 391, 221, 183, 716, 228, 114, 267, 422, 159, 482, 749, 67, 679, 114, 91, 445, 19, 694, 750, 67, 515, 482, 35, 280, 515, 58, 420, 363, 690, 689, 770, 563, 281, 759, 40, 482, 458, 67, 256, 100, 759, 749, 685, 58, 35, 741, 749, 685, 497, 228, 114, 497, 499, 321, 36, 186, 378, 772, 694, 749, 685, 497, 342, 782, 772, 342, 689, 497, 772, 685, 378, 772, 694, 391, 689, 749, 685, 497, 35, 67, 515]
 Sequence len: 97


In [14]:
char_count = pd.Series([len(x) for x in x_train_tokenized])
char_count.describe()

count    41997.000000
mean        72.168917
std         58.518590
min          0.000000
25%         35.000000
50%         58.000000
75%         93.000000
max       1137.000000
dtype: float64

In [21]:
cnt = 0
for x in char_count:
    if x > 300:
        cnt = cnt + 1
        
print(cnt)

358


In [160]:
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

char_count.iplot(kind = 'hist', xTitle = "char count", yTitle = 'records', 
                            title = "Histogram depicting distibution of character count across training data" )

In [169]:
from keras.preprocessing.sequence import pad_sequences

x_train_pad = pad_sequences(x_train_tokenized, 200, padding= 'post')

In [170]:
i = random.randint(0,len(x_train))
print('Text: \n {} \nNum Words: {} \n\n Sequence: \n {}\n Sequence len: {}'.format(x_train[i],len(x_train[i].split()),
                                                                    x_train_pad[i],len(x_train_pad[i])))

Text: 
 तसेच अधिकृत वापरकर्त्यांना डॉक्युमेंटमध्ये अयोग्य सुधारण्यापासून रोखते 
Num Words: 7 

 Sequence: 
 [258 610 110 682  24 363 258 739 628 255 553 548 646 189 122 514 445 269
 464 250 798  33  76 682 781 176 239 131 358 255 470 797 515 711 750 701
 274  80   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]
 Sequence len: 200
