In [1]:
#Imports
import pandas as pd
import numpy as np
import missingno as msno
import re

In [2]:
#load Dataset
df=pd.read_csv('../data/apps_ar_reviews_dataset.csv')

# Print Shape of dataset
print('Shape of dataset ',df.shape)
# Print Columns
print(df.columns)
# Print dataframe
df

Shape of dataset  (51766, 13)
Index(['UserName', 'Review', 'Score', 'Date/Time', 'thumbsUp', 'Version',
       'app', 'lan_review', 'firstName', 'Gender', 'ReviewLength',
       'words_count', 'Polarity sentiment'],
      dtype='object')


Unnamed: 0,UserName,Review,Score,Date/Time,thumbsUp,Version,app,lan_review,firstName,Gender,ReviewLength,words_count,Polarity sentiment
0,نواف الحربي,تحميلك للتطبيق حتى لو بالخطأ يدخلك في عزل لمد...,1,2021-01-27 08:41:10,1,1.7,tetaman,Arabic,نواف,Male,164,31,neutral
1,ahmed ghazala,لا يعمل ولا يستجيب في مرحلة التسجيل لايتم ارس...,1,2021-01-23 21:49:16,0,1.7,tetaman,Arabic,ahmed,Male,55,10,neutral
2,ELSyed Attia,للأسف تطبيق سيئ جداً بقالي يومين بحاول اسجل و...,1,2021-01-23 00:51:06,1,1.7,tetaman,Arabic,ELSyed,Male,67,12,negative
3,انور القدسي,ما قدرت اسجل في التطبيق ، اوصل ل آخر خطوه وهي...,1,2021-01-21 23:55:30,0,,tetaman,Arabic,انور,Male,89,19,neutral
4,اسلام سعيد,نزلت البرنامج اكثر من مره وسجلت بياناتي كلها ...,1,2021-01-21 15:25:36,0,1.7,tetaman,Arabic,اسلام,Male,248,44,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51761,عبدالله الأسمري,ممتاز,5,2019-09-25 12:49:11,2,1,sehhaty,Non-Arabic,عبدالله,Male,7,1,positive
51762,خالد عقيل,ممتاز,5,2019-09-24 18:03:41,1,1,sehhaty,Non-Arabic,خالد,Male,7,1,positive
51763,عبدالله عتودي,ممتاز,5,2019-09-23 19:49:51,1,1,sehhaty,Non-Arabic,عبدالله,Male,7,1,positive
51764,خوالي تاج راسي,Ok,5,2019-09-23 18:50:00,0,1,sehhaty,Non-Arabic,خوالي,Male,4,1,negative


In [3]:
# Summary about the dataframe 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51766 entries, 0 to 51765
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   UserName            51766 non-null  object
 1   Review              51766 non-null  object
 2   Score               51766 non-null  int64 
 3   Date/Time           51766 non-null  object
 4   thumbsUp            51766 non-null  int64 
 5   Version             51766 non-null  object
 6   app                 51766 non-null  object
 7   lan_review          51766 non-null  object
 8   firstName           51766 non-null  object
 9   Gender              51766 non-null  object
 10  ReviewLength        51766 non-null  int64 
 11  words_count         51766 non-null  int64 
 12  Polarity sentiment  51766 non-null  object
dtypes: int64(4), object(9)
memory usage: 5.1+ MB


## Cleaning

In [4]:
#pip install PyArabic

In [6]:
import re 
def cleaning(text):
    
 Arabic_numbers = ['٤','١','٢','٣','٥','٦','٧','٨','٩','٠']
 special_character = ['؟','،','?',',','!','.',':','"','""','‘‘','‘','؛','↓',"'", '‰',
                      '`','€',';','ç','ı','À','@','٬','~᷂','٫','⁩◕','.',
                      '=','#','$','%','^','&','*','()',')','(','\\','/',
                      '((', '_', '"','"', '…','-','×','ツ','+','%','٪','⁩ლ']

#remove emojis
 text= remove_emoji(text)
    
#replace special characters with whitespaces 
 for word in range(0, len(special_character)):
     text = text.replace(special_character[word], ' ') 
              
#replace  arabic numbers with whitespaces 
 for word in range(0, len(Arabic_numbers)):
     text = text.replace(Arabic_numbers[word], ' ') 
    
#remove english words letters and numbers
 text = re.sub(r'[0-9a-zA-Z]+',' ', text)

 return text

In [8]:
import emoji
def remove_emoji(text):
    return emoji.demojize(text)

## Preprocessing

In [5]:
# tokenize to remove stopwords and join into sentences again
from nltk.corpus import stopwords
import pyarabic.araby as araby
def stop_word_removal(text):
 stop_words = set(stopwords.words("arabic"))
 words = araby.tokenize(text)
 text = " ".join([w for w in words if not w in stop_words])
 return text

In [9]:
import pyarabic.araby as araby
def normalization(text):
#replace Ta'a and Hamza'a and Ya'a
 text = re.sub("[إأٱآا]", "ا", text)
 text = re.sub("ى", "ي", text)
 text = re.sub("ة", "ه", text)
#remove extra whitespace
 text = re.sub('\s+', ' ', text)   
#remove tashkeel
 text = araby.strip_tashkeel(text)
 return text

In [10]:
def pre_processing(text):
#Normalization 
 text = normalization(text)
#stop words removal
 text = stop_word_removal(text)
#Cleaning
 text = cleaning(text)
 return text

In [11]:
#pre_processing the review column
df['Review'] = df['Review'].apply(lambda x:pre_processing(x))
df

Unnamed: 0,UserName,Review,Score,Date/Time,thumbsUp,Version,app,lan_review,firstName,Gender,ReviewLength,words_count,Polarity sentiment
0,نواف الحربي,تحميلك للتطبيق حتي بالخطا يدخلك عزل لمده يو...,1,2021-01-27 08:41:10,1,1.7,tetaman,Arabic,نواف,Male,164,31,neutral
1,ahmed ghazala,يعمل يستجيب مرحله التسجيل لايتم ارسال الكود,1,2021-01-23 21:49:16,0,1.7,tetaman,Arabic,ahmed,Male,55,10,neutral
2,ELSyed Attia,للاسف تطبيق سيئ جدا بقالي يومين بحاول اسجل يرس...,1,2021-01-23 00:51:06,1,1.7,tetaman,Arabic,ELSyed,Male,67,12,negative
3,انور القدسي,قدرت اسجل التطبيق اوصل اخر خطوه وهي كود التح...,1,2021-01-21 23:55:30,0,,tetaman,Arabic,انور,Male,89,19,neutral
4,اسلام سعيد,نزلت البرنامج اكثر مره وسجلت بياناتي كلها لاني...,1,2021-01-21 15:25:36,0,1.7,tetaman,Arabic,اسلام,Male,248,44,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51761,عبدالله الأسمري,ممتاز,5,2019-09-25 12:49:11,2,1,sehhaty,Non-Arabic,عبدالله,Male,7,1,positive
51762,خالد عقيل,ممتاز,5,2019-09-24 18:03:41,1,1,sehhaty,Non-Arabic,خالد,Male,7,1,positive
51763,عبدالله عتودي,ممتاز,5,2019-09-23 19:49:51,1,1,sehhaty,Non-Arabic,عبدالله,Male,7,1,positive
51764,خوالي تاج راسي,,5,2019-09-23 18:50:00,0,1,sehhaty,Non-Arabic,خوالي,Male,4,1,negative


### Remove Reviews Without Sentiment

In [12]:
df["Polarity sentiment"].value_counts()

positive                 39132
negative                  9327
neutral                   3293
No_sentment_extracted       14
Name: Polarity sentiment, dtype: int64

In [13]:
df = df.drop(df[df['Polarity sentiment'] == "No_sentment_extracted"].index)

### Remove Non Arabic Reviews

In [14]:
#count of non arabic reviews
df["lan_review"].value_counts()

Arabic        30566
Non-Arabic    21186
Name: lan_review, dtype: int64

In [15]:
# drop non arabic reviews
df = df.drop(df[df['lan_review'] == "Non-Arabic"].index)
df["lan_review"].value_counts()

Arabic    30566
Name: lan_review, dtype: int64

### Extra cleaning

In [16]:
#remove punctuation 
p = re.compile(r'[^\w\s]+')
df['Review'] = [p.sub('', x) for x in df['Review'].tolist()]

#remove tabs and new lines from the text 
df['Review'] = df.Review.str.replace("\xa0"," ") 
df['Review'] = df.Review.str.replace("\n"," ")
df['Review'] = df['Review'].replace("\t"," ", regex=True)

In [17]:
# check for nulls 
df['Review'].isnull().sum()

0

In [18]:
#remove blanks by replacing them with Nan
df['Review'].replace(r'^\s*$', np.nan, regex=True,inplace=True)

In [19]:
# check for nulls 
df['Review'].isnull().sum()

84

In [20]:
# drop nans
df.dropna(subset=['Review'], inplace=True)
# drop duplicates 
df=df.drop_duplicates(subset=['Review'])

In [21]:
df

Unnamed: 0,UserName,Review,Score,Date/Time,thumbsUp,Version,app,lan_review,firstName,Gender,ReviewLength,words_count,Polarity sentiment
0,نواف الحربي,تحميلك للتطبيق حتي بالخطا يدخلك عزل لمده يو...,1,2021-01-27 08:41:10,1,1.7,tetaman,Arabic,نواف,Male,164,31,neutral
1,ahmed ghazala,يعمل يستجيب مرحله التسجيل لايتم ارسال الكود,1,2021-01-23 21:49:16,0,1.7,tetaman,Arabic,ahmed,Male,55,10,neutral
2,ELSyed Attia,للاسف تطبيق سيئ جدا بقالي يومين بحاول اسجل يرس...,1,2021-01-23 00:51:06,1,1.7,tetaman,Arabic,ELSyed,Male,67,12,negative
3,انور القدسي,قدرت اسجل التطبيق اوصل اخر خطوه وهي كود التح...,1,2021-01-21 23:55:30,0,,tetaman,Arabic,انور,Male,89,19,neutral
4,اسلام سعيد,نزلت البرنامج اكثر مره وسجلت بياناتي كلها لاني...,1,2021-01-21 15:25:36,0,1.7,tetaman,Arabic,اسلام,Male,248,44,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51736,محمد ابو ابراهيم,تجريبي,5,2019-10-30 13:15:57,0,1.1.0,sehhaty,Arabic,محمد,Male,8,1,negative
51741,a-_-33 m,كانت تجربه سهله وميسره,5,2019-10-21 11:00:28,0,1,sehhaty,Arabic,a-_-33,Not_found,24,4,positive
51745,ريما المحيميد,برنامج جيد جدا واجهت مشكله فتحه الكمبيوتر وايض...,5,2019-10-20 01:35:25,1,1,sehhaty,Arabic,ريما,Female,103,18,positive
51746,استغفر الله سبحان الله,اطبع التقرير الطبي,5,2019-10-16 12:54:24,0,1,sehhaty,Arabic,استغفر,Not_found,24,4,neutral


In [22]:
#write the dataframe into csv file
df.to_csv('../data/Clean_apps_ar_reviews.csv',index=False)