In [102]:
import pandas as pd 
import numpy as np 

In [103]:
df = pd.read_csv("10_Reviews.csv" , index_col=0)

In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62884 entries, 0 to 62883
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   BusinessId     62884 non-null  int64 
 1   review_id      62884 non-null  int64 
 2   rating         62884 non-null  int64 
 3   comment        41021 non-null  object
 4   creation_date  62884 non-null  object
dtypes: int64(3), object(2)
memory usage: 2.9+ MB


In [105]:
df['BusinessId'].value_counts().head(50)

BusinessId
39976     4596
77092     3333
48246     1813
24519     1553
257509    1204
216993    1186
15908     1167
51267      967
1045       881
2897       808
24974      676
44461      656
258697     623
86675      585
214933     507
17184      493
96402      476
89771      466
7855       444
218848     435
42825      408
64629      393
238860     393
49372      392
669        389
212567     388
235945     377
90277      375
235942     375
1491       371
205        368
27327      368
176761     345
204828     334
235951     325
287679     312
117172     308
1428       302
3114       299
242704     287
55142      279
54686      275
200629     257
8889       255
39658      249
205289     235
15853      213
4341       211
107585     207
93932      206
Name: count, dtype: int64

In [106]:
df['creation_date'] = df['creation_date'].astype('datetime64[ns]')

In [107]:
df['comment'].fillna('No Comment' , inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['comment'].fillna('No Comment' , inplace=True)


In [108]:
df.isnull().sum()

BusinessId       0
review_id        0
rating           0
comment          0
creation_date    0
dtype: int64

### clean data for seg

In [109]:
import pyarabic.araby as ar

# import Stemmer
import functools, operator
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

In [110]:
arabic_stop_words=[]
with open ('Arabic_stop_words.txt',encoding='utf-8') as f :
    for i in f.readlines() :
        arabic_stop_words.append(i)
        arabic_stop_words[-1]=arabic_stop_words[-1][:-1]

In [111]:
arabic_stop_words

['،',
 'ء',
 'ءَ',
 'آ',
 'آب',
 'آذار',
 'آض',
 'آل',
 'آمينَ',
 'آناء',
 'آنفا',
 'آه',
 'آهاً',
 'آهٍ',
 'آهِ',
 'أ',
 'أبدا',
 'أبريل',
 'أبو',
 'أبٌ',
 'أجل',
 'أجمع',
 'أحد',
 'أخبر',
 'أخذ',
 'أخو',
 'أخٌ',
 'أربع',
 'أربعاء',
 'أربعة',
 'أربعمئة',
 'أربعمائة',
 'أرى',
 'أسكن',
 'أصبح',
 'أصلا',
 'أضحى',
 'أطعم',
 'أعطى',
 'أعلم',
 'أغسطس',
 'أفريل',
 'أفعل به',
 'أفٍّ',
 'أقبل',
 'أكتوبر',
 'أل',
 'ألا',
 'ألف',
 'ألفى',
 'أم',
 'أما',
 'أمام',
 'أمامك',
 'أمامكَ',
 'أمد',
 'أمس',
 'أمسى',
 'أمّا',
 'أن',
 'أنا',
 'أنبأ',
 'أنت',
 'أنتم',
 'أنتما',
 'أنتن',
 'أنتِ',
 'أنشأ',
 'أنه',
 'أنًّ',
 'أنّى',
 'أهلا',
 'أو',
 'أوت',
 'أوشك',
 'أول',
 'أولئك',
 'أولاء',
 'أولالك',
 'أوّهْ',
 'أى',
 'أي',
 'أيا',
 'أيار',
 'أيضا',
 'أيلول',
 'أين',
 'أيّ',
 'أيّان',
 'أُفٍّ',
 'ؤ',
 'إحدى',
 'إذ',
 'إذا',
 'إذاً',
 'إذما',
 'إذن',
 'إزاء',
 'إلى',
 'إلي',
 'إليكم',
 'إليكما',
 'إليكنّ',
 'إليكَ',
 'إلَيْكَ',
 'إلّا',
 'إمّا',
 'إن',
 'إنَّ',
 'إى',
 'إياك',
 'إياكم',
 'إياكما',
 'إياكن',


In [112]:
import numpy as np
import pandas as pd
import re

#============= Read CSV and apply data preperation =============#


def data_preprocessing (data_frame):
    # clean-up: remove #tags, http links and special symbols
    # data_frame['comment']= data_frame['comment'].apply(lambda x: x[2:-2])
    data_frame['comment']= data_frame['comment'].apply(lambda x: re.sub(r'http\S+', '', x))
    data_frame['comment'] = data_frame['comment'].apply(lambda x: re.sub(r'[@|#]\S*', '', x))
    data_frame['comment'] = data_frame['comment'].apply(lambda x: re.sub(r'"+', '', x))

    # Remove arabic signs
    data_frame['comment'] = data_frame['comment'].apply(lambda x: re.sub(r'([@A-Za-z0-9_ـــــــــــــ]+)|[^\w\s]|#|http\S+', '', x))

    # Remove repeated letters like "الللللللللللللللله" to "الله"
    data_frame['comment'] = data_frame['comment'].apply(lambda x: x[0:2] + ''.join([x[i] for i in range(2, len(x)) if x[i]!=x[i-1] or x[i]!=x[i-2]]))

    # remove stop words
    data_frame['comment'] = data_frame['comment'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in arabic_stop_words]))
    
    from nltk.stem.isri import ISRIStemmer
    data_frame['comment']=data_frame['comment'].apply(lambda x:ISRIStemmer().stem(x))

    return data_frame

In [113]:
import re
import string
import functools
import operator
import pyarabic.araby as ar
import emoji

def data_cleaning(text):
    # Remove URLs
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^http?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"https\S+", "", text)
    
    # Replace multiple whitespaces with a single space
    text = re.sub(r'\s+', ' ', text)
    
    # Remove numbers
    text = re.sub(r"\d+", " ", text)
    
    # Strip tashkeel and tatweel (diacritics and elongation)
    text = ar.strip_tashkeel(text)
    text = ar.strip_tatweel(text)
    
    # Replace special characters
    text = text.replace("#", " ")
    text = text.replace("@", " ")
    text = text.replace("_", " ")
    
    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    
    # Remove emojis using emoji.replace_emoji
    text = emoji.replace_emoji(text, replace="")  # Replace emojis with an empty string
    
    # Remove repeated characters
    text = re.sub(r'(.)\1+', r'\1', text)
    
    # Normalize Arabic letters
    text = text.replace("آ", "ا")
    text = text.replace("إ", "ا")
    text = text.replace("أ", "ا")
    text = text.replace("ؤ", "و")
    text = text.replace("ئ", "ي")
    text = text.replace("ة", "ه")
    
    return text


In [114]:
# df['comment'].apply(lambda x: data_cleaning(x))
df=data_preprocessing(df)
df 

Unnamed: 0,BusinessId,review_id,rating,comment,creation_date
0,60202,364189,1,كنت أتعامل معهم أسعارهم أرخص آخر مرة حصلت لي ع...,2024-05-11 05:56:29.327
1,60202,356203,1,مصداقية اسعار المنتج,2024-03-06 08:08:24.190
2,60202,317781,1,متجر سئ مصداقيته بيانات البضاعة وايضا مواعيد ا...,2023-02-18 15:41:20.320
3,60202,312616,1,,2023-01-18 10:17:30.503
4,60202,309154,1,,2023-01-01 03:54:11.487
...,...,...,...,...,...
62879,44461,61446,5,مايميز المتجر سرعة الرد والتوصيل السريع وجودة ...,2018-08-28 19:16:42.617
62880,44461,60746,5,تعامل متجر توليب تجربه تستحق اعادتها مره النوا...,2018-07-31 02:51:11.543
62881,44461,60443,5,تجربتي جمييله وانسانه ذوق واخلاق ومتعاونه ربي ...,2018-07-22 16:43:45.897
62882,44461,60180,5,متجر توليب جميل والتعامل اجمل,2018-07-16 08:09:20.407


In [64]:
df['comment'].apply(lambda x: data_cleaning(x))
df=data_preprocessing(df)
df 

Unnamed: 0,BusinessId,review_id,rating,comment,creation_date
0,60202,364189,1,كنت أتعامل معهم أسعارهم أرخص لكن آخر مرة حصلت ...,2024-05-11 05:56:29.327
1,60202,356203,1,عدم المصداقية في اسعار المنتجات,2024-03-06 08:08:24.190
2,60202,317781,1,متجر سئ جدا في مصداقيته في بيانات البضاعة وايض...,2023-02-18 15:41:20.320
3,60202,312616,1,,2023-01-18 10:17:30.503
4,60202,309154,1,,2023-01-01 03:54:11.487
...,...,...,...,...,...
62879,44461,61446,5,اكثر مايميز المتجر سرعة الرد والتوصيل السريع و...,2018-08-28 19:16:42.617
62880,44461,60746,5,تعامل مع متجر توليب تجربه تستحق اعادتها اكثر م...,2018-07-31 02:51:11.543
62881,44461,60443,5,تجربتي معها جمييله وانسانه ذوق واخلاق ومتعاونه...,2018-07-22 16:43:45.897
62882,44461,60180,5,متجر توليب جميل جدا والتعامل اجمل,2018-07-16 08:09:20.407


In [115]:
df.to_csv('10_reviews_without_Arabic_stop_words.csv')

In [66]:
# import pandas as pd
# import requests
# from concurrent.futures import ThreadPoolExecutor, as_completed

# # Farasa API endpoint
# farasa_api = "http://farasa.qcri.org/msa/webapi/segment"

# # Function to segment text using Farasa API
# def segment_comment(comment):
#     try:
#         response = requests.post(farasa_api, data={'text': comment})
#         if response.status_code == 200:
#             return response.json()['text']
#         else:
#             return f"Error {response.status_code}"
#     except Exception as e:
#         return f"Error: {str(e)}"

# # Function to process the DataFrame with threading
# def process_comments(df):
#     results = []
    
#     # Use ThreadPoolExecutor to send requests concurrently
#     with ThreadPoolExecutor(max_workers=5) as executor:
#         # Submit tasks to the executor for each comment
#         future_to_comment = {executor.submit(segment_comment, comment): comment for comment in df['comment']}
        
#         for future in as_completed(future_to_comment):
#             comment = future_to_comment[future]
#             try:
#                 result = future.result()
#                 results.append(result)
#             except Exception as exc:
#                 results.append(f'Error: {exc}')
    
#     return results

# # Apply segmentation using threading
# df['segmented_comment'] = process_comments(df)

# # Display the DataFrame with segmented comments
# print(df[['comment', 'segmented_comment']])
