## Normalization Functions

In [1]:
# Segmentation

def segmentation(txt, encoding='utf-8'):
    """Split a text file into sentences and save the sentence list.

    The file at ``txt`` is read, segmented with the Punkt model loaded through
    ``nltk.data.load``, and the resulting list is written (as its Python list
    representation) to ``./Output_Files/Segmented_Text``.

    Args:
        txt: Path of the text file to segment.
        encoding: Encoding used to read the input and write the output.
            Defaults to UTF-8 so non-ASCII (e.g. Arabic) text does not depend
            on the platform's locale encoding.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    import os
    import nltk

    # NOTE(review): this unpickles a model file; only load pickles from a
    # trusted source. Recent NLTK releases appear to ship Punkt in a
    # different (non-pickle) format - confirm against the installed version.
    punkt = nltk.data.load(r'./tokenizers/punkt/english.pickle')

    with open(txt, 'r', encoding=encoding) as f:
        comingText = f.read()

    segmentedText = punkt.tokenize(comingText)

    # Create the output folder on first use instead of failing.
    os.makedirs('./Output_Files', exist_ok=True)
    with open(r'./Output_Files/Segmented_Text', 'w', encoding=encoding) as outputFile:
        outputFile.write(f'{segmentedText}')


# segmentation('society2.txt')

In [2]:
# Tokenization

def tokenization(txt, encoding='utf-8'):
    """Tokenize a text file into words and save the token list.

    The file at ``txt`` is read, passed to ``nltk.word_tokenize``, and the
    resulting list is written (as its Python list representation) to
    ``./Output_Files/Tokenized_Text``.

    Args:
        txt: Path of the text file to tokenize.
        encoding: Encoding used to read the input and write the output.
            Defaults to UTF-8 so non-ASCII (e.g. Arabic) text does not depend
            on the platform's locale encoding.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    import os
    import nltk

    with open(txt, 'r', encoding=encoding) as f:
        comingText = f.read()

    tokenizedText = nltk.word_tokenize(comingText)

    # Create the output folder on first use instead of failing.
    os.makedirs('./Output_Files', exist_ok=True)
    with open(r'./Output_Files/Tokenized_Text', 'w', encoding=encoding) as outputFile:
        outputFile.write(f'{tokenizedText}')
    
#tokenization('algeria.txt')

In [3]:
#Stopword Removing

def stopwordRemoving(txt, stopwordsPath='arabic-stop-words.txt', encoding='utf-8'):
    """Remove stop words from a text file and save the cleaned text.

    Every line of the file at ``txt`` is split on single spaces; tokens that
    appear in the stop-word list are dropped and the remaining tokens are
    re-joined with single spaces. Line breaks are preserved. The result is
    written to ``./Output_Files/Stopword_Removed_Text``.

    Args:
        txt: Path of the text file to clean.
        stopwordsPath: Path of the stop-word list, one entry per line.
        encoding: Encoding used for every file read or written. Defaults to
            UTF-8 so Arabic text does not depend on the platform's locale
            encoding.

    Raises:
        OSError: If a file cannot be read or the output cannot be written.
    """
    import os

    # A set gives constant-time membership tests; the context manager makes
    # sure the stop-word file is closed.
    with open(stopwordsPath, 'r', encoding=encoding) as f:
        stopWords = set(f.read().splitlines())

    with open(txt, 'r', encoding=encoding) as f:
        comingText = f.read()

    # Work line by line: splitting the whole text on ' ' alone would glue the
    # last word of a line to the first word of the next ("word\nword"), so
    # stop words next to a line break would never be matched.
    cleanedLines = [
        ' '.join(token for token in line.split(' ') if token not in stopWords)
        for line in comingText.split('\n')
    ]
    resultText = '\n'.join(cleanedLines)

    # Create the output folder on first use instead of failing.
    os.makedirs('./Output_Files', exist_ok=True)
    with open(r'./Output_Files/Stopword_Removed_Text', 'w', encoding=encoding) as outputFile:
        outputFile.write(f'{resultText}')

# stopwordRemoving('society2.txt')

In [4]:
# Punctuation Removal

def punctuationsRemoving(txt, encoding='utf-8'):
    """Strip every punctuation character from a text file and save the result.

    A character is treated as punctuation when its Unicode general category
    starts with ``'P'``, which covers Arabic marks as well as ASCII ones. The
    result is written to ``./Output_Files/Punctuations_Removed_Text``.

    Args:
        txt: Path of the text file to clean.
        encoding: Encoding used to read the input and write the output.
            Defaults to UTF-8 so Arabic text does not depend on the
            platform's locale encoding.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    import os
    import unicodedata as ud

    with open(txt, 'r', encoding=encoding) as f:
        comingText = f.read()

    ResultText = ''.join(c for c in comingText if not ud.category(c).startswith('P'))

    # Create the output folder on first use instead of failing.
    os.makedirs('./Output_Files', exist_ok=True)
    with open(r'./Output_Files/Punctuations_Removed_Text', 'w', encoding=encoding) as outputFile:
        outputFile.write(f'{ResultText}')
    
# punctuationsRemoving(text)

In [5]:
# ISRI Stemmer (Root-based stemmer)

def ISRI_Stemmer(txt, encoding='utf-8'):
    """Stem every word of a text file with NLTK's ISRI stemmer and save it.

    Every line of the file at ``txt`` is split on single spaces, each
    non-empty token is passed to ``nltk.ISRIStemmer().stem`` and the stems
    are re-joined with single spaces. Line breaks are preserved. The result
    is written to ``./Output_Files/Root-Based_Stemming_Text``.

    Args:
        txt: Path of the text file to stem.
        encoding: Encoding used to read the input and write the output.
            Defaults to UTF-8 so Arabic text does not depend on the
            platform's locale encoding.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    import os
    import nltk

    st = nltk.ISRIStemmer()

    with open(txt, 'r', encoding=encoding) as f:
        comingText = f.read()

    # Work line by line so the stemmer never receives a token with an
    # embedded newline ("word\nword"). Empty tokens (from consecutive spaces
    # or blank lines) are passed through untouched.
    stemmedLines = [
        ' '.join(st.stem(w) if w else w for w in line.split(' '))
        for line in comingText.split('\n')
    ]
    resultStem = '\n'.join(stemmedLines)

    # Create the output folder on first use instead of failing.
    os.makedirs('./Output_Files', exist_ok=True)
    with open(r'./Output_Files/Root-Based_Stemming_Text', 'w', encoding=encoding) as outputFile:
        outputFile.write(f'{resultStem}')

# ISRI_Stemmer('society2.txt')

In [6]:
#Light Stemmer (tashaphyne)

def lightStemmer(txt, encoding='utf-8'):
    """Light-stem every word of a text file with Tashaphyne and save it.

    Every line of the file at ``txt`` is split on single spaces, each
    non-empty token is passed to ``ArabicLightStemmer().light_stem`` and the
    stems are re-joined with single spaces. Line breaks are preserved. The
    result is written to ``./Output_Files/Light_Stemming_Text``.

    Args:
        txt: Path of the text file to stem.
        encoding: Encoding used to read the input and write the output.
            Defaults to UTF-8 so Arabic text does not depend on the
            platform's locale encoding.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    import os
    from tashaphyne.stemming import ArabicLightStemmer

    lightStem = ArabicLightStemmer()

    with open(txt, 'r', encoding=encoding) as f:
        comingText = f.read()

    # Work line by line so the stemmer never receives a token with an
    # embedded newline ("word\nword"). Empty tokens (from consecutive spaces
    # or blank lines) are passed through untouched.
    stemmedLines = [
        ' '.join(lightStem.light_stem(token) if token else token
                 for token in line.split(' '))
        for line in comingText.split('\n')
    ]
    resultText = '\n'.join(stemmedLines)

    # Create the output folder on first use instead of failing.
    os.makedirs('./Output_Files', exist_ok=True)
    with open(r'./Output_Files/Light_Stemming_Text', 'w', encoding=encoding) as outputFile:
        outputFile.write(f'{resultText}')

# lightStemmer('society2.txt')

In [7]:
#All preprocessing steps (Normalization of data)

def normalization(txt, stopwordsPath='arabic-stop-words.txt', encoding='utf-8'):
    """Run the full preprocessing pipeline on a text file and save the result.

    For every line of the file at ``txt`` the steps are applied in this order:

    1. split on single spaces and drop tokens found in the stop-word list;
    2. remove every character whose Unicode general category starts with 'P';
    3. split on single spaces again and stem each non-empty token with
       ``nltk.ISRIStemmer().stem``.

    Line breaks are preserved. The result is written to
    ``./Output_Files/Normalized_Text``.

    Args:
        txt: Path of the text file to normalize.
        stopwordsPath: Path of the stop-word list, one entry per line.
        encoding: Encoding used for every file read or written. Defaults to
            UTF-8 so Arabic text does not depend on the platform's locale
            encoding.

    Raises:
        OSError: If a file cannot be read or the output cannot be written.
    """
    import os
    import unicodedata as ud
    import nltk

    with open(txt, 'r', encoding=encoding) as f:
        comingText = f.read()

    # A set gives constant-time membership tests; the context manager makes
    # sure the stop-word file is closed.
    with open(stopwordsPath, 'r', encoding=encoding) as f:
        stopWords = set(f.read().splitlines())

    st = nltk.ISRIStemmer()

    # Work line by line: splitting the whole text on ' ' alone would glue the
    # last word of a line to the first word of the next ("word\nword"), so
    # such tokens would escape stop-word removal and confuse the stemmer.
    normalizedLines = []
    for line in comingText.split('\n'):
        # Stop-word removal
        SWText = ' '.join(token for token in line.split(' ') if token not in stopWords)

        # Punctuation removal
        puncText = ''.join(c for c in SWText if not ud.category(c).startswith('P'))

        # ISRI (root-based) stemming; empty tokens are passed through.
        normalizedLines.append(
            ' '.join(st.stem(w) if w else w for w in puncText.split(' '))
        )
    result = '\n'.join(normalizedLines)

    # Create the output folder on first use instead of failing.
    os.makedirs('./Output_Files', exist_ok=True)
    with open(r'./Output_Files/Normalized_Text', 'w', encoding=encoding) as outputFile:
        outputFile.write(f'{result}')

## GUI with tkinter

In [None]:
from tkinter import *
import tkinter.messagebox

# Small Tk window: one path entry plus one button per preprocessing function.
main = Tk()
main.title('Home')
main.geometry("500x300")

# Hint shown in the entry box until the user types a path.
PLACEHOLDER = 'Enter Path of Dataset:- '

#---------- Entry (textbox)-----------------
text = StringVar()
textbox = Entry(main, textvariable=text)

# Called when the entry box is clicked: clear the placeholder hint.
def click(*args):
    if textbox.get() == PLACEHOLDER:
        textbox.delete(0, 'end')

# Called when the mouse pointer leaves the entry box: restore the hint if the
# box is empty, then hand keyboard focus back to the window.
def leave(*args):
    if textbox.get() == '':
        textbox.insert(0, PLACEHOLDER)
    main.focus()

# Run one preprocessing function on the entered path and tell the user what
# happened. Without this, an exception raised inside a button callback is only
# printed to stderr and the window gives no feedback at all.
def runStep(step):
    path = textbox.get()
    if path.strip() in ('', PLACEHOLDER.strip()):
        tkinter.messagebox.showwarning(
            'Missing path', 'Please enter the path of the dataset first.')
        return
    try:
        step(path)
    except Exception as error:  # UI boundary: report the failure, do not hide it
        tkinter.messagebox.showerror(
            'Error', f'{step.__name__} failed:\n{type(error).__name__}: {error}')
    else:
        tkinter.messagebox.showinfo(
            'Done', f'{step.__name__} finished. Output written to ./Output_Files')

# Show the hint initially and wire up the mouse events.
textbox.insert(0, PLACEHOLDER)
textbox.bind("<Button-1>", click)
textbox.bind("<Leave>", leave)

# A single geometry-manager call is enough; this is the final position the
# entry box ended up at.
textbox.place(x=150, y=15)

#------------Buttons-------------
# The lambdas look the functions up when clicked, so the cells defining them
# only need to have been run before a button is pressed.
segmentationBtn = Button(main, text="Segmentation", fg="Red", command=lambda: runStep(segmentation))
tokenizationBtn = Button(main, text="Tokenization", fg="Red", command=lambda: runStep(tokenization))
stopwordRemovingBtn = Button(main, text="Stopword Removing", fg="Red", command=lambda: runStep(stopwordRemoving))
punctuationsRemovingBtn = Button(main, text="Punctuations Removing", fg="Red", command=lambda: runStep(punctuationsRemoving))
rootBasedStemmerBtn = Button(main, text="Root-Based Stemmer", fg="Red", command=lambda: runStep(ISRI_Stemmer))
lightStemmerBtn = Button(main, text="Light-Based Stemmer", fg="Red", command=lambda: runStep(lightStemmer))
normalizationBtn = Button(main, text="Normalization", fg="Red", command=lambda: runStep(normalization))

segmentationBtn.place(x=150, y=80)
tokenizationBtn.place(x=150, y=110)
stopwordRemovingBtn.place(x=150, y=140)
punctuationsRemovingBtn.place(x=150, y=170)
rootBasedStemmerBtn.place(x=150, y=200)
lightStemmerBtn.place(x=150, y=230)
normalizationBtn.place(x=150, y=269)


# Blocks this cell until the window is closed.
main.mainloop()