In [6]:
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import pandas as pd
from nltk import pos_tag, word_tokenize
from textblob import TextBlob, Word
import unicodedata

In [121]:
class Preprocess:
    """Cleaner for code-mixed Marathi (Devanagari) / English text.

    ``clean_text`` strips URLs, HTML tags and punctuation, drops stop words,
    separates Latin and Devanagari runs that were glued into one token, and
    replaces purely numeric tokens with the placeholder ``#N``.
    """

    # --------------------------------------- Constructor ---------------------------------------

    def __init__(self, stopword_list):
        """Remember the stop words that ``clean_text`` should drop.

        Args:
            stopword_list: container of tokens; only ``in`` membership tests
                are performed on it, so a list or a set both work.
        """
        self.data_path = ''
        self.stopword_list = stopword_list

    # --------------------------------------- Preprocess ---------------------------------------

    def expand_concatenations(self, word):
        """Split a token at its first Latin/Devanagari script boundary.

        A token that starts with an ASCII letter is cut at its first
        Devanagari character; any other token is cut at its first
        non-Devanagari character. When the side being cut off (the Latin head
        in the first case, the non-Devanagari tail in the second) is shorter
        than two characters it is discarded as noise; otherwise the two sides
        are joined by a single space. Only the first boundary is handled.

        Args:
            word: a single whitespace-free token.

        Returns:
            The token, possibly shortened or with one space inserted. A token
            that starts with a non-Devanagari, non-letter character (e.g. a
            digit) comes back with a leading space, which callers remove by
            re-splitting on whitespace.
        """
        # unicodedata.name() raises ValueError for code points without a name
        # (control characters, unassigned code points); the '' default makes
        # such characters count as "not Devanagari" instead of crashing.
        if re.match(r'[a-zA-Z]', word):
            for i, char in enumerate(word):
                if 'DEVANAGARI ' in unicodedata.name(char, ''):
                    head, tail = word[:i], word[i:]
                    return tail if len(head) < 2 else head + " " + tail
            return word

        # A token starting with a digit never starts with a letter, so it
        # needs no separate test to end up in this branch.
        for i, char in enumerate(word):
            if 'DEVANAGARI ' not in unicodedata.name(char, ''):
                head, tail = word[:i], word[i:]
                return head if len(tail) < 2 else head + " " + tail
        return word

    def clean_text(self, text):
        """Normalise one raw sentence.

        Args:
            text: the raw value; it is converted with ``str()`` first, so
                non-string cells (e.g. NaN) are accepted.

        Returns:
            The cleaned tokens, each followed by a single space (so a
            non-empty result ends with a trailing space); ``""`` when nothing
            survives.
        """
        text = str(text)

        # Cleaning the urls
        text = re.sub(r'https?://\S+|www\.\S+', '', text)

        # Cleaning the html elements
        text = re.sub(r'<.*?>', '', text)

        # Removing the punctuations
        # NOTE(review): inside the class `;-@` is parsed as a range, so it
        # covers ; < = > ? @ and '=' is blanked as well. Probably unintended,
        # but kept as-is so existing outputs do not change - confirm before
        # tightening.
        text = re.sub(r'[!#?,.:";-@#$%^&*_~<>()-]', ' ', text)

        # Removing stop words
        kept = [word for word in text.split() if word not in self.stopword_list]

        # Expanding noisy concatenations (Eg: algorithmआणि  -> algorithm आणि )
        # Re-splitting picks up the spaces inserted by the expansion.
        tokens = ' '.join(self.expand_concatenations(word) for word in kept).split()

        pieces = []
        for word in tokens:
            if re.match(r'\d+', word):
                # \d and isnumeric() also accept Devanagari digits. Tokens that
                # start with a digit but are not fully numeric (e.g. "2T2")
                # are dropped.
                if word.isnumeric():
                    pieces.append('#N')
            elif re.match(r'[a-zA-Z]+', word) and len(word) > 1:
                pieces.append(word.lower())
            else:
                # Devanagari words, single letters and anything else pass
                # through unchanged (single letters keep their case).
                pieces.append(word)

        return ''.join(piece + ' ' for piece in pieces)

In [124]:
def analyze_vocab(vocab_words):
    """Split vocabulary tokens into three buckets by their leading characters.

    Args:
        vocab_words: iterable of string tokens.

    Returns:
        A tuple ``(numbers, english_words, marathi_words)`` of lists, in
        iteration order:
        * ``numbers``: tokens that start with a digit (``\\d`` matches any
          Unicode decimal digit for ``str`` patterns, not only 0-9);
        * ``english_words``: tokens that start with an ASCII letter;
        * ``marathi_words``: every remaining token. This is a catch-all, so
          empty strings and symbols end up here too.
    """
    numbers = []
    english_words = []
    marathi_words = []
    for word in vocab_words:
        # Raw strings: '\d' in a plain literal is an invalid escape sequence.
        if re.match(r'\d+', word):
            numbers.append(word)
        elif re.match(r'[a-zA-Z]+', word):
            english_words.append(word)
        else:
            marathi_words.append(word)
    return numbers, english_words, marathi_words

def custom_analyzer(text):
    """Generate, in order, every run of one or more word characters in ``text``."""
    yield from re.findall(r'\w{1,}', text)

def bow_vectorize(x_train, x_val):
    """Fit a bag-of-words model on the training texts and encode both splits.

    The vocabulary is learned from ``x_train`` only; ``x_val`` is encoded with
    that same fitted vectorizer. Tokenisation is delegated to
    ``custom_analyzer``.

    Args:
        x_train: training documents (iterated twice: once to fit, once to transform).
        x_val: validation documents.

    Returns:
        ``(bow_vectorizer, bow_x_train, bow_x_val)``: the fitted
        ``CountVectorizer`` and the two transformed matrices.
    """
    # fit() returns the vectorizer itself, so construction and fitting chain.
    bow_vectorizer = CountVectorizer(analyzer=custom_analyzer).fit(x_train)
    bow_x_train, bow_x_val = (bow_vectorizer.transform(docs) for docs in (x_train, x_val))
    return bow_vectorizer, bow_x_train, bow_x_val

def check_alphanumeric_words(text):
    """Collect the distinct tokens that mix letters and digits.

    Args:
        text: iterable of strings; each one is split on whitespace.

    Returns:
        A set of the tokens containing at least one alphabetic character and
        at least one digit character.
    """
    def _mixes_letters_and_digits(token):
        has_letter = any(symbol.isalpha() for symbol in token)
        return has_letter and any(symbol.isdigit() for symbol in token)

    return {
        token
        for entry in text
        for token in entry.split()
        if _mixes_letters_and_digits(token)
    }

In [125]:
from collections import Counter

# Read the Marathi training set and clean every sentence of its 'text' column.
df = pd.read_csv('../Technodifacation/Data/training_data_marathi.csv')

# The empty list means no stop words are removed during cleaning.
pp = Preprocess([])
text = df['text'].apply(pp.clean_text).tolist()
alpha_numeric = set()

In [126]:
from keras.preprocessing.text import Tokenizer


# Build a vocabulary from the cleaned sentences in `text` with the Keras
# Tokenizer imported above.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
word_index = tokenizer.word_index
        
# NOTE(review): the label below says "Gensim", but the object being measured is
# the Keras Tokenizer - the text looks copy-pasted; confirm before renaming,
# since the recorded cell output uses the same label.
print('Gensim tokenizer: Unique tokens = {} , Alphanumeric = {}'.format(len(word_index),len(check_alphanumeric_words(word_index))))

# Bucket the vocabulary into digit-leading / ASCII-letter-leading / other tokens.
# en1 and mar1 are reused by the later cell that compares tokenizers.
num , en1 , mar1 = analyze_vocab(word_index)
print('#Num = {} , #Eng = {}, #Mar = {} '.format(len(num),len(en1),len(mar1)))

Gensim tokenizer: Unique tokens = 51915 , Alphanumeric = 96
#Num = 0 , #Eng = 937, #Mar = 50978 


In [127]:
from indicnlp.tokenize import indic_tokenize

# Distinct tokens produced by the Indic NLP trivial tokenizer over the cleaned
# sentences in `text`.
indic_nlp_tokens = set()
for record in text:
    tokens = indic_tokenize.trivial_tokenize(record, lang = 'mar')
    indic_nlp_tokens.update(tokens)


# NOTE(review): the label says "Gensim" although these tokens come from the
# Indic NLP tokenizer - looks copy-pasted; confirm before renaming.
print('Gensim tokenizer: Unique tokens = {} , Alphanumeric = {}'.format(len(indic_nlp_tokens),len(check_alphanumeric_words(indic_nlp_tokens))))

# Same three-way bucketing as for the other tokenizer; en2 / mar2 feed the
# comparison cell.
num , en2 , mar2 = analyze_vocab(indic_nlp_tokens)
print('#Num = {} , #Eng = {}, #Mar = {} '.format(len(num),len(en2),len(mar2)))

Gensim tokenizer: Unique tokens = 51942 , Alphanumeric = 96
#Num = 0 , #Eng = 961, #Mar = 50981 


In [129]:
# Tokens that the Indic NLP tokenizer produced (en2 / mar2) but that are absent
# from the other tokenizer's vocabulary (en1 / mar1).
# Membership is tested against set copies: `x not in <list>` rescans the whole
# list for every token, which is quadratic for vocabularies of this size. The
# resulting lists keep the same elements in the same order.
_en1_seen = set(en1)
_mar1_seen = set(mar1)
ex1 = [x for x in en2 if x not in _en1_seen]
ex2 = [x for x in mar2 if x not in _mar1_seen]

print('Extra words tokenized by Indic - NLP:',ex1,ex2)

Extra words tokenized by Indic - NLP: ['D', 'S', 'V', 'L', 'F', 'H', 'A', 'Z', 'R', 'B', 'W', 'O', 'X', 'N', 'I', 'P', 'Y', 'G', 'U', 'K', 'C', 'M', 'T', 'E'] ['', '°C', 'σD', '`', '#']


In [99]:
from sklearn.feature_extraction.text import CountVectorizer

# Clean the 'text' column of the training and test CSVs with the shared
# Preprocess instance `pp`, then turn both into bag-of-words matrices.
training_data = pd.read_csv("../Technodifacation/Data/training_data_marathi.csv")
training_data['text'] = training_data['text'].apply(pp.clean_text)
x_train = training_data['text'].tolist()

val_data = pd.read_csv("../Technodifacation/Data/test_data_marathi.csv")
val_data['text'] = val_data['text'].apply(pp.clean_text)
x_val = val_data['text'].tolist()

# The vectorizer is fitted on x_train only, so both matrices share its columns.
bow_vectorizer, bow_x_train, bow_x_val = bow_vectorize(x_train, x_val)
print(bow_x_train.shape)
print(bow_x_val.shape)

(41997, 5501)
(3780, 5501)


In [123]:
# Demo: run the cleaner on one hand-written, deliberately noisy sentence and
# print it before and after cleaning.
if __name__ == '__main__':
   
    # NOTE(review): df, stopword_list and lemmatizer are assigned here but not
    # used by the active code below (Preprocess receives a literal []); only
    # the commented-out lines refer to df / stopword_list.
    df = pd.read_csv('../Technodifacation/Data/training_data_marathi.csv')
    stopword_list = []
    lemmatizer = WordNetLemmatizer()

    # Disabled: load a whitespace-separated stop-word file into stopword_list.
#     with open ('../Technodifacation/Data/marathi_stopwords.txt','r',encoding='utf') as st:
#         st_content = st.read()
#         st_list = set(st_content.split())
#         stopword_list = st_list
    
    # No stop words are removed in this demo.
    pp = Preprocess([])
    
    # Disabled: clean the whole column and pick a random row as the sample.
#     df['text'] = df['text'].apply(lambda x : pp.clean_text(x))
#     sample_text = df.sample()['text'].values[0]

    # Sample mixing glued Latin/Devanagari tokens, ASCII and Devanagari digits,
    # alphanumeric tokens and repeated punctuation.
    sample_text = "mस्केलेबल algorithmआणि  एमएनओMOM आयपीIPs  वारेB ०००० किलोमीट विशिष्ट-- 19022323239  ० great 2T2 ,H2O, 9909च, Having Caring Sharing शब्दाचा उच्चार कसा केला गेला आणि 99 Working समन्वय साधण्याचा प्रयत्न करा जेव्हा 87929999 एका बिंदूबरोबर इतर गोष्टींचा एका!!! ११ 00 १ Google computer architecture graphic show.!!!"
    preprocessed_text = pp.clean_text(sample_text)
    print('\nBefore:\t',sample_text,'\n\nAfter:\t',preprocessed_text)


Before:	 mस्केलेबल algorithmआणि  एमएनओMOM आयपीIPs  वारेB ०००० किलोमीट विशिष्ट-- 19022323239  ० great 2T2 ,H2O, 9909च, Having Caring Sharing शब्दाचा उच्चार कसा केला गेला आणि 99 Working समन्वय साधण्याचा प्रयत्न करा जेव्हा 87929999 एका बिंदूबरोबर इतर गोष्टींचा एका!!! ११ 00 १ Google computer architecture graphic show.!!! 

After:	 स्केलेबल algorithm आणि एमएनओ mom आयपी ips वारे #N किलोमीट विशिष्ट #N #N great h2o having caring sharing शब्दाचा उच्चार कसा केला गेला आणि #N working समन्वय साधण्याचा प्रयत्न करा जेव्हा #N एका बिंदूबरोबर इतर गोष्टींचा एका #N #N #N google computer architecture graphic show 
