In [61]:
import os

In [16]:
from itertools import chain

In [1]:
import regex as re

In [2]:
def get_ngram(word, n):
    '''
    Get all possible ngram of a string;
    For example:
    string = 'abcdef'
    possible ngrams:
    ['abc', 'bcd', 'cde', 'def']
    '''
    
    pat_1 = fr'(?=(\w{{{n}}}))'
    pat_1 = re.compile(pat_1)
    
    result = re.findall(pat_1, word)
    
    return result

In [3]:
test = "Hello my name is Akbar"

In [5]:
with open('Quran_Aya_Per_Line.txt', encoding='utf-8') as q:
    quran = [line.strip() for line in q.readlines()]

In [9]:
test = quran[:7]

In [12]:
list_quran_bigram = [re.findall(r'(?=(\b\w+\s+\w+\b))', i) for i in quran]

In [19]:
list_quran_bigram = list(chain(*list_quran_bigram))

In [28]:
with open('english_to_be_translated.txt', encoding='utf-8') as en:
    test = [line.strip() for line in en.readlines()]

In [29]:
list_test_bigram = [re.findall(r'(?=(\b\w+\s+\w+\b))', i) for i in test]

In [32]:
list_test_bigram = [re.findall(r'(?=(\b\S+\s+\S+\b))', i) for i in test]

In [33]:
list_test_bigram = list(chain(*list_test_bigram))

In [36]:
list_test_bigram[:3]

['7. Nuisances', '. Nuisances', 'The Tenant']

In [35]:
en_puncs = ".?!,\"\'/'"

In [42]:
def en_clean(word):
    '''
    given an english word, remove punctuations, excess space and 
    leave normal letters in lowercase format.
    '''
    ar_punc_regex = f"[{''.join(['،', '؛', '؟', '.', '/'])}]"
    word = re.sub(ar_punc_regex, ' ', word)
    ar_c = r'[^a-zA-Z\-\s]'
    word = re.sub(ar_c, '', word)
    word = re.sub(r' {2,}', ' ', word)
    word = re.sub(r'\t+', ' ', word)
    word = word.strip()
    return word.lower()

In [44]:
list_test_bigram = [en_clean(bigram) for bigram in list_test_bigram]

In [45]:
list_test_bigram

['nuisances',
 'nuisances',
 'the tenant',
 'tenant will',
 'will not',
 'not do',
 'do anything',
 'anything on',
 'on the',
 'the demised',
 'demised premises',
 'premises which',
 'which may',
 'may be',
 'be illegal',
 'illegal or',
 'or immoral',
 'immoral or',
 'or be',
 'be noisy',
 'noisy noisome',
 'noisome',
 'noisome noxious',
 'noxious',
 'noxious dangerous',
 'dangerous',
 'dangerous or',
 'or offensive',
 'offensive or',
 'or which',
 'which may',
 'may be',
 'be a',
 'a nuisance',
 'nuisance damage',
 'damage',
 'damage annoyance',
 'annoyance',
 'annoyance disturbance',
 'disturbance',
 'disturbance or',
 'or inconvenience',
 'inconvenience or',
 'or which',
 'which may',
 'may in',
 'in any',
 'any way',
 'way be',
 'be injurious',
 'injurious to',
 'to the',
 'the neighborhood',
 'neighborhood or',
 'or to',
 'to the',
 'the public',
 'public local',
 'local',
 'local or',
 'or any',
 'any other',
 'other authorities',
 'authorities or',
 'or to',
 'to the',
 'the lan

In [46]:
test

['7. Nuisances',
 'The Tenant will not do anything on the Demised Premises which may be illegal or immoral or be noisy, noisome, noxious, dangerous or offensive or which may be a nuisance, damage, annoyance, disturbance or inconvenience or which may in any way be injurious to the neighborhood or to the public, local or any other authorities or to the Landlord or to any owners, tenants, or occupiers of Neighboring Property or The Financial Center or the Development.',
 '8. Alienation',
 'The Tenant will not assign, transfer or sublet the Demised Premises, without written consent of the Landlord. The Landlord may assign, sublet or transfer its obligations and benefits under this Lease in whole upon providing the Tenant with twenty-eight (28) days prior written notice.',
 '9. Observe Covenants',
 'The Tenant will observe and perform all matters contained or referred to in the deeds and documents referred to in this Lease and (without prejudice to any other rights of the Landlord) will kee

In [82]:
# Generator Function
def normalize_text_files(folder):
    '''
    A generator function that yields lines of cleaned text with no stopwords or nonwords.
    '''
    ar_punc_regex = f"[{''.join(['،', '؛', '؟', '.', '/'])}]"
    ar_punc_regex = re.compile(ar_punc_regex)
    ar_c = r'[^a-zA-Z\-\s]'
    ar_c = re.compile(ar_c)
    
    ##Stopwords Set
    stop_words = "how very nbsp will years year be other only often first how him rather another but he have many three near yours won us to we'll usually ours haven't their couldn't together thing please with through an is been ain within since vi one do much your she's after all iii is't self for could away done thus always some also doing will where and does iv aren wasn than well about a truth till even mrs just get are therefore little didn't yet until me who won't they almost far doesn't in we because third else once enough himself father's such be from while should things when too weren't thoughts speed you've not hadn't less on between without was ii why them here's i ha never v that's yourself each last his being say age any shouldn't if make shall had themselves nor there which herself said her though half it's i'll these viii needn't by so out same were didn am ll myself my give hour now shouldn she you'd way ve vii still keep take no here mustn't cannot indeed our as you'll having under couldn gone you're can yes at theirs i' before long let's whose hers the two nothing few may over whom what wish isn't might des there's again ix mightn't should've hadn given wrong come yourselves shan old wasn't own he's sit this its thousand seen that'll o must what's hasn enter very alone below wouldn aren't x tell it mine love more during don't name those wouldn't hasn't others has every forth then itself further against would into both none mightn did most ourselves or you back ever that"
    stop_words = stop_words.split()
    stop_words = set(stop_words)

    
    list_txt_files = [file for file in os.listdir(folder) if file[-4:].lower()=='.txt']
    if list_txt_files:
        for file in list_txt_files:
            file_abs = os.path.join(os.path.abspath(f'{folder}/'), file)
            print(f'Processing {file} ...')
            with open(f'{file_abs}', encoding='utf-8') as file_01:
                for line in file_01.readlines():
                    list_words = []
                    for word in line.split():
                        word = re.sub(ar_punc_regex, ' ', word)
                        word = re.sub(ar_c, '', word)
                        word = word.strip()
                        word =  word.lower()
                        if word not in stop_words:
                            list_words.append(word)
                    list_words.append('\n')
                    yield ' '.join(list_words).strip()

In [83]:
words_gen = normalize_text_files('test_folder/')

In [84]:
with open('normalized_text_2.txt', mode='w', encoding='utf-8') as new_file:
    for line in words_gen:
        new_file.write(f"{line}\n")

Processing english_to_be_translated.txt ...
