In [1]:
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import unicodedata
from nltk import pos_tag, word_tokenize
from textblob import TextBlob, Word

In [2]:
class Preprocess:
    """Cleaner for code-mixed Marathi/English text.

    The pipeline in :meth:`preprocess_data` strips URLs, HTML tags and a fixed
    set of punctuation characters, drops stop words, splits Latin/Devanagari
    run-ons, replaces purely numeric tokens with ``<Numeric>`` and lowercases
    tokens that start with an ASCII letter.
    """

    # Stop-word file read when no explicit list is handed to the constructor.
    DEFAULT_STOPWORDS_PATH = "../marathi-stopwords.txt"

    # Patterns are compiled once at class creation instead of on every call.
    _URL_RE = re.compile(r'https?://\S+|www\.\S+')
    _HTML_RE = re.compile(r'<.*?>')
    # Same character set as the former '[!#?,.:";-@#$%^&*_~<>()-]' pattern,
    # whose ';-@' range silently covered ';', '<', '=', '>', '?' and '@'.
    # The members are now spelled out so the set is visible at a glance.
    _PUNCT_RE = re.compile(r'[!#?,.:";<=>@$%^&*_~()\-]')
    _DIGIT_START_RE = re.compile(r'\d+')
    _LATIN_START_RE = re.compile(r'[a-zA-Z]+')

    # --------------------------------------- Constructor ------------------------------------------------

    def __init__(self, stopword_list=None, stopwords_path=DEFAULT_STOPWORDS_PATH):
        """Set up the stop-word list.

        Args:
            stopword_list: optional iterable of stop words. When given
                (even if empty) it is used as-is and no file is read.
            stopwords_path: path of a UTF-8 text file holding one stop word
                per line; only read when ``stopword_list`` is ``None``.

        Raises:
            OSError: if the stop-word file has to be read and cannot be opened.
        """
        if stopword_list is not None:
            self.stopword_list = list(stopword_list)
            return

        # Explicit encoding: the file holds Devanagari text, so the platform
        # default codec must not be relied upon. ``with`` closes the handle.
        with open(stopwords_path, encoding="utf-8") as stopword_file:
            stripped_lines = (line.strip() for line in stopword_file)
            # Blank or whitespace-padded entries could never equal a token
            # produced by str.split(), so they are normalised/dropped here.
            self.stopword_list = [entry for entry in stripped_lines if entry]

    # --------------------------------------- Clean Data -------------------------------------------------

    def read_data(self, path):
        """Load the CSV file at ``path`` and return it as a pandas DataFrame."""
        return pd.read_csv(path)

    # --------------------------------------- Expand concatenations --------------------------------------

    def expand_concatenations(self, word):
        """Split a token where a Latin prefix runs into Devanagari text.

        Only tokens that start with an ASCII letter are examined, and only the
        first Devanagari character is considered:

        * a prefix shorter than two characters is discarded
          (``"mक"`` -> ``"क"``);
        * otherwise a single space is inserted at the boundary
          (``"algoक"`` -> ``"algo क"``).

        Any other token is returned unchanged.
        """
        if not self._LATIN_START_RE.match(word):
            return word

        for index, char in enumerate(word):
            # The '' default keeps unnamed code points (e.g. control
            # characters) from raising ValueError.
            if 'DEVANAGARI ' in unicodedata.name(char, ''):
                if index < 2:
                    return word[index:]
                return word[:index] + " " + word[index:]
        return word

    # --------------------------------------- Preprocess -------------------------------------------------

    def preprocess_data(self, text):
        """Clean one piece of text and return it as a space-joined string.

        ``text`` is coerced with ``str()`` first, so non-string values are
        cleaned via their string form.

        Token rules, applied after URL/HTML/punctuation removal:

        * tokens found in ``self.stopword_list`` are dropped (exact,
          case-sensitive match, performed before lowercasing);
        * tokens starting with a digit become ``<Numeric>`` when the whole
          token is numeric, and are dropped otherwise (e.g. ``2T2``);
        * tokens starting with an ASCII letter are lowercased;
        * everything else is kept unchanged.
        """
        # Cleaning the urls
        text = self._URL_RE.sub('', str(text))

        # Cleaning the html elements
        text = self._HTML_RE.sub('', text)

        # Removing the punctuations
        text = self._PUNCT_RE.sub(' ', text)

        # Removing stop words (set lookup instead of a list scan per token).
        # NOTE(review): this runs before run-ons are split, so a stop word
        # glued to a Latin prefix survives the filter - confirm the ordering
        # is intended.
        stopwords = set(self.stopword_list)
        words = [word for word in text.split() if word not in stopwords]

        # Expanding noisy concatenations (Eg: algorithmआणि  -> algorithm आणि )
        words = [self.expand_concatenations(word) for word in words]

        tokens = []
        for word in words:
            if self._DIGIT_START_RE.match(word):
                # Digit-initial but not fully numeric tokens are dropped.
                if word.isnumeric():
                    tokens.append('<Numeric>')
            elif self._LATIN_START_RE.match(word):
                tokens.append(word.lower())
            else:
                tokens.append(word)

        return " ".join(tokens)

    def clean_text(self, text):
        """Alias of :meth:`preprocess_data`."""
        return self.preprocess_data(text)

In [6]:
if __name__ == '__main__':
    # Smoke test: run the preprocessor on one hand-written code-mixed
    # sentence and print the text before and after cleaning.

    # Not referenced anywhere in this cell; kept only in case another cell
    # still expects these names to exist.
    stopword_list = []
    lemmatizer = WordNetLemmatizer()

    # Build the preprocessor with its default stop-word source and use
    # preprocess_data, the cleaning entry point the class defines.
    pp = Preprocess()

    sample_text = "mस्केलेबल algorithmआणि  १ २ ३ ४  ६ ५ ७ ८ ९ विशिष्ट-- 19022323239  ० great 2T2 ,H2O, 9909च, Having Caring Sharing शब्दाचा उच्चार कसा केला गेला आणि 99 Working समन्वय साधण्याचा प्रयत्न करा जेव्हा 87929999 एका बिंदूबरोबर इतर गोष्टींचा एका!!! ११ 00 १ Google ०computer architecture 5 graphic show.!!!"
    preprocessed_text = pp.preprocess_data(sample_text)
    print('\nBefore:\t',sample_text,'\n\nAfter:\t',preprocessed_text)


Before:	 mस्केलेबल algorithmआणि  १ २ ३ ४  ६ ५ ७ ८ ९ विशिष्ट-- 19022323239  ० great 2T2 ,H2O, 9909च, Having Caring Sharing शब्दाचा उच्चार कसा केला गेला आणि 99 Working समन्वय साधण्याचा प्रयत्न करा जेव्हा 87929999 एका बिंदूबरोबर इतर गोष्टींचा एका!!! ११ 00 १ Google ० computer architecture 5 graphic show.!!! 

After:	 स्केलेबल algorithm आणि <Numeric> <Numeric> <Numeric> <Numeric> <Numeric> <Numeric> <Numeric> <Numeric> <Numeric> विशिष्ट <Numeric> <Numeric> great h2o having caring sharing शब्दाचा उच्चार कसा केला गेला आणि <Numeric> working समन्वय साधण्याचा प्रयत्न करा जेव्हा <Numeric> एका बिंदूबरोबर इतर गोष्टींचा एका <Numeric> <Numeric> <Numeric> google <Numeric> computer architecture <Numeric> graphic show 


In [66]:
def analyze_vocab(vocab_words):
    """Bucket vocabulary entries by the kind of character they start with.

    Args:
        vocab_words: iterable of strings.

    Returns:
        A ``(numbers, english_words, marathi_words)`` tuple of lists, each in
        iteration order of ``vocab_words``:

        * ``numbers``: entries starting with a digit (``\\d`` also matches
          non-ASCII digits such as Devanagari ones);
        * ``english_words``: entries starting with an ASCII letter;
        * ``marathi_words``: every remaining entry. This is a catch-all, so
          it also receives anything that is neither digit- nor Latin-initial.
    """
    numbers, english_words, marathi_words = [], [], []
    for word in vocab_words:
        # Raw strings: '\d' in a plain literal is an invalid escape sequence.
        if re.match(r'\d+', word):
            numbers.append(word)
        elif re.match(r'[a-zA-Z]+', word):
            english_words.append(word)
        else:
            marathi_words.append(word)
    return numbers, english_words, marathi_words

In [107]:
from collections import Counter

# Build the vocabulary of the cleaned training corpus and count how many
# entries mix letters and digits.
df = pd.read_csv('../Technodifacation/Data/training_data_marathi.csv')

# Build the preprocessor with its default stop-word source and clean every
# row through preprocess_data, the cleaning entry point the class defines.
pp = Preprocess()
text = df['text'].apply(pp.preprocess_data).to_list()

alpha_numeric = set()
vocab = set()

for t in text:
    for word in t.split():
        word = word.lower()
        vocab.add(word)
        # `ch` rather than `chr`, so the builtin is not shadowed.
        has_letter = any(ch.isalpha() for ch in word)
        if has_letter and any(ch.isdigit() for ch in word):
            alpha_numeric.add(word)

print(len(vocab),len(alpha_numeric))

# Split the vocabulary into digit-initial, Latin-initial and remaining words.
num , en , mar = analyze_vocab(vocab)
print(len(set(en)))

51968 124
928


In [108]:
# Checking whether any Latin/Devanagari concatenations are still unexpanded
# in the English vocabulary; prints each offender with its expanded form.
import unicodedata

for word in en:
    for i, ch in enumerate(word):
        # The '' default keeps unnamed code points (e.g. control characters)
        # from raising ValueError.
        if 'DEVANAGARI ' in unicodedata.name(ch, ''):
            # A prefix shorter than two characters is dropped; otherwise a
            # space is inserted at the script boundary.
            exp_word = word[i:] if i < 2 else word[:i] + " " + word[i:]
            print(word , '\t--->\t', exp_word)
            break