In [3]:
import nltk
import re
import html
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer


In [8]:
import pandas as pd
import unicodedata as ud

In [9]:

class PreProcess:
    '''Text pre-processing steps for a single text column of a DataFrame.

    Every public method rewrites ``data[column_name]`` in place and returns
    the very same DataFrame object that was handed to the constructor (no
    copy is made, so the caller's frame is modified too).

    Cells of the column that are not ``str`` (for example NaN coming from an
    empty CSV field) are left untouched by every step instead of raising.

    # Input parameters: Dataframe, Column_name on which function needs to be applied
    # Output parameters: Return dataframe after applying operations
    '''
    # todo: Pass functions as a list of arguments to apply in the class
    # todo: make set of words before applying all operations to reduce processing time.

    # Inclusive code-point bounds of the Unicode "Arabic" block (U+0600..U+06FF).
    _ARABIC_FIRST = 0x0600
    _ARABIC_LAST = 0x06FF
    # One or more consecutive whitespace characters (space, tab, newline, ...).
    _WHITESPACE_RUN = re.compile(r'\s+')

    def __init__(self, data, column_name):
        """Remember the frame and the column to clean.

        :param data: DataFrame holding the text; it is mutated by the steps.
        :param column_name: name of the column the steps operate on.
        """
        self.data = data
        self.column_name = column_name
        # NOTE(review): PorterStemmer and WordNetLemmatizer are NLTK's
        # English-oriented tools -- confirm they are intended for Arabic text.
        self.stemmer = PorterStemmer()
        self.lemmatiser = WordNetLemmatizer()

    def _apply(self, func):
        """Run ``func`` on every string cell of the target column, in place.

        Non-string cells are passed through unchanged. Returns ``self.data``.
        """
        self.data[self.column_name] = self.data[self.column_name].apply(
            lambda cell: func(cell) if isinstance(cell, str) else cell)
        return self.data

    def remove_non_arabic(self):
        """Keep only characters of the Arabic block, separated by spaces.

        Characters U+0600..U+06FF (both ends included) are kept. Any
        whitespace character becomes a plain space so that words separated by
        a newline or tab are not glued together. Everything else is dropped.
        """
        first, last = self._ARABIC_FIRST, self._ARABIC_LAST

        def keep_arabic(text):
            kept = []
            for ch in text:
                if first <= ord(ch) <= last:
                    kept.append(ch)
                elif ch.isspace():
                    kept.append(' ')
            return "".join(kept)

        return self._apply(keep_arabic)

    def remove_arabic_numbers_puncts(self):
        """Drop punctuation and decimal digits as classified by Unicode.

        A character is removed when its ``unicodedata`` category starts with
        ``P`` (punctuation) or ``Nd`` (decimal digit). This is not limited to
        Arabic characters despite the method name.
        """
        unwanted = ('P', 'Nd')
        return self._apply(
            lambda text: ''.join(c for c in text
                                 if not ud.category(c).startswith(unwanted)))

    def clean_html(self):
        """Decode HTML entities (e.g. ``&amp;`` becomes ``&``)."""
        return self._apply(html.unescape)

    def remove_spaces(self):
        """Collapse every run of whitespace to one space, then lower-case.

        Leading and trailing whitespace is reduced to a single space, not
        stripped.
        """
        collapse = self._WHITESPACE_RUN.sub
        return self._apply(lambda text: collapse(' ', text).lower())

    def remove_punctuation(self):
        """Delete the ASCII punctuation characters of ``string.punctuation``."""
        table = str.maketrans("", "", string.punctuation)
        return self._apply(lambda text: text.translate(table))

    def stemming(self, stemmer=None):
        """Stem each whitespace-separated token and re-join with single spaces.

        :param stemmer: optional object exposing ``stem(word)`` (for example
            ``SnowballStemmer('arabic')``). When omitted, the stemmer created
            in ``__init__`` is used, which is the previous behaviour.
        """
        active = self.stemmer if stemmer is None else stemmer
        return self._apply(
            lambda text: " ".join(active.stem(token) for token in text.split()))

    def lemmatization(self):
        """Lemmatize each whitespace-separated token with ``self.lemmatiser``."""
        lemmatize = self.lemmatiser.lemmatize
        return self._apply(
            lambda text: " ".join(lemmatize(token) for token in text.split()))

    def stop_words(self):
        """Remove NLTK's Arabic stop words and duplicate tokens.

        Each remaining token is kept once, at the position of its first
        occurrence, so the result is the same on every run.
        """
        # A frozenset makes the per-token membership test constant time.
        stop = frozenset(stopwords.words('arabic'))

        def drop_stop_words(text):
            # dict.fromkeys de-duplicates while preserving insertion order.
            unique = dict.fromkeys(
                token for token in text.split() if token not in stop)
            return " ".join(unique)

        return self._apply(drop_stop_words)

### Pre-processing the different datasets

## 1. The Bot or Not dataset

### train

In [12]:
# Load the training split and show its first rows before any cleaning.
df = pd.read_csv(
    '/home/ashhadulislam/projects/hbku/fall_2019/app_deep_learning/nlp_arabic/data/QICC_Bot_OR_Not/train.csv'
)
print(df.head())

# PreProcess must already be defined in the kernel before this cell runs.
pre_processor = PreProcess(df, "Tweet Text")

# Every step rewrites the "Tweet Text" column in place and returns the frame
# held by pre_processor, so only the last return value is bound to `data`.
pre_processor.clean_html()
pre_processor.remove_non_arabic()
pre_processor.remove_arabic_numbers_puncts()
pre_processor.remove_spaces()
pre_processor.remove_punctuation()
pre_processor.stemming()
pre_processor.lemmatization()
data = pre_processor.stop_words()



   Sl Number                                         Tweet Text  Is_Bot
0          1  الوطن حكومةً وشعبًا يفتخر ويعتز بــ #نجاح_موسم...       0
1          2  اللهم صل على سيدنا محمد #Bahrain #البحرين #الس...       0
2          3  #محمد_بن_سلمان_يهدم_ايران من أقوال محمد بن سلم...       0
3          4  #اعجبتني #السعوديه_اليابان #السعودية #جبس #ديك...       0
4          5  وزير النفط: قريباً.. عودة الإنتاج بالمنطقة الم...       0


In [20]:
# Write the cleaned training frame to disk without the index column.
data.to_csv(
    "/home/ashhadulislam/projects/hbku/fall_2019/app_deep_learning/nlp_arabic/data/QICC_Bot_OR_Not/train_pre_processed.csv",
    index=False,
)

### test

In [14]:
# Load the test split and show its first rows before any cleaning.
df_super_test = pd.read_csv(
    "/home/ashhadulislam/projects/hbku/fall_2019/app_deep_learning/nlp_arabic/data/QICC_Bot_OR_Not/test.csv"
)
print(df_super_test.head())

# PreProcess must already be defined in the kernel before this cell runs.
pre_processor = PreProcess(df_super_test, "Tweet Text")

# Every step rewrites the "Tweet Text" column in place and returns the frame
# held by pre_processor, so only the last return value is bound below.
pre_processor.clean_html()
pre_processor.remove_non_arabic()
pre_processor.remove_arabic_numbers_puncts()
pre_processor.remove_spaces()
pre_processor.remove_punctuation()
pre_processor.stemming()
pre_processor.lemmatization()
data_super_test = pre_processor.stop_words()

   Sl No                                         Tweet Text  Is_Bot
0      1  هذه طائرة تدريب في #السعودية للشباب الصغار ......       0
1      2  د. عبدالرحمن العناد: تدرك ميليشيات الحوثي أن ا...       1
2      3  د. محمد الثقفي: نجاح حج هذا العام وسلامة ضيوف ...       1
3      4  سلطة بابا غنوج baba ganoush #حلا #كيك #معجنات ...       0
4      5  #السعودية.. النيابة تطالب بإعدام الشيخ #سلمان_...       0


In [19]:
# Write the cleaned test frame to disk without the index column.
data_super_test.to_csv(
    "/home/ashhadulislam/projects/hbku/fall_2019/app_deep_learning/nlp_arabic/data/QICC_Bot_OR_Not/test_pre_processed.csv",
    index=False,
)


In [18]:
# Number of rows per Is_Bot label in the cleaned test frame.
data_super_test["Is_Bot"].value_counts()

0    925
1    690
Name: Is_Bot, dtype: int64