In [1]:
import pandas as pd 
import numpy as np 

In [2]:
df = pd.read_csv("reviews.csv" , index_col=0)

In [3]:
df

Unnamed: 0,BusinessId,review_id,rating,comment,creation_date
0,268517,368276,5,تعامل راقي ومتعاونين جدا يعطيكم العافيه والطلب...,2024-09-03T10:46:56.273
1,268517,366180,5,التعامل يفتح النفس ويسون نفس المطلوب والتوصيل ...,2024-06-10T17:02:09.137
2,268517,366102,5,اسلوب وذوق ماشاءالله,2024-06-07T19:02:15.573
3,268517,361135,5,تعامل رائع ان شاء الله مب اخر مره اطلب منها,2024-05-08T15:44:14.163
4,268517,359092,5,اشكرهم على حسن التعامل والاسلوب والتنسيق الحلو...,2024-04-03T21:42:03.303
...,...,...,...,...,...
47172,277496,326234,5,,2023-03-30T20:29:55.22
47173,277496,326233,5,,2023-03-30T20:29:52.063
47174,277496,326232,5,,2023-03-30T20:29:49.333
47175,277496,326231,5,,2023-03-30T20:29:46.557


In [4]:
df['comment'][46805]

nan

In [5]:
df['creation_date'][504]

'2021-05-01T21:59:58.75'

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47177 entries, 0 to 47176
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   BusinessId     47177 non-null  int64 
 1   review_id      47177 non-null  int64 
 2   rating         47177 non-null  int64 
 3   comment        29505 non-null  object
 4   creation_date  47177 non-null  object
dtypes: int64(3), object(2)
memory usage: 3.2+ MB


In [7]:
df['BusinessId'].value_counts().head(50)

BusinessId
39976     4596
77092     3333
48246     1813
24519     1553
257509    1204
216993    1186
15908     1167
51267      967
1045       881
2897       808
24974      676
44461      656
258697     623
86675      585
214933     507
17184      493
96402      476
89771      466
7855       444
218848     435
42825      408
64629      393
238860     393
49372      392
669        389
212567     388
235945     377
90277      375
235942     375
1491       371
205        368
27327      368
176761     345
204828     334
235951     325
287679     312
117172     308
1428       302
3114       299
242704     287
55142      279
54686      275
200629     257
8889       255
39658      249
205289     235
15853      213
4341       211
107585     207
40924      206
Name: count, dtype: int64

In [8]:
df['creation_date'] = df['creation_date'].astype('datetime64[ns]')

In [9]:
df['comment'].fillna('No Comment' , inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['comment'].fillna('No Comment' , inplace=True)


In [10]:
df.isnull().sum()

BusinessId       0
review_id        0
rating           0
comment          0
creation_date    0
dtype: int64

In [11]:
import pyarabic.araby as ar

# import Stemmer
import functools, operator
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

In [12]:
arabic_stop_words=[]
with open ('./Arabic_stop_words.txt',encoding='utf-8') as f :
    for i in f.readlines() :
        arabic_stop_words.append(i)
        arabic_stop_words[-1]=arabic_stop_words[-1][:-1]

In [13]:
import numpy as np
import pandas as pd
import re

#============= Read CSV and apply data preperation =============#


def data_preprocessing (data_frame):
    # clean-up: remove #tags, http links and special symbols
    # data_frame['comment']= data_frame['comment'].apply(lambda x: x[2:-2])
    data_frame['comment']= data_frame['comment'].apply(lambda x: re.sub(r'http\S+', '', x))
    data_frame['comment'] = data_frame['comment'].apply(lambda x: re.sub(r'[@|#]\S*', '', x))
    data_frame['comment'] = data_frame['comment'].apply(lambda x: re.sub(r'"+', '', x))

    # Remove arabic signs
    data_frame['comment'] = data_frame['comment'].apply(lambda x: re.sub(r'([@A-Za-z0-9_ـــــــــــــ]+)|[^\w\s]|#|http\S+', '', x))

    # Remove repeated letters like "الللللللللللللللله" to "الله"
    data_frame['comment'] = data_frame['comment'].apply(lambda x: x[0:2] + ''.join([x[i] for i in range(2, len(x)) if x[i]!=x[i-1] or x[i]!=x[i-2]]))

    # remove stop words
    data_frame['comment'] = data_frame['comment'].apply(lambda x: '' if x in arabic_stop_words else x)

    from nltk.stem.isri import ISRIStemmer
    data_frame['comment']=data_frame['comment'].apply(lambda x:ISRIStemmer().stem(x))

    return data_frame

In [14]:
import re
import string
import functools
import operator
import pyarabic.araby as ar
import emoji

def data_cleaning(text):
    # Remove URLs
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^http?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"https\S+", "", text)
    
    # Replace multiple whitespaces with a single space
    text = re.sub(r'\s+', ' ', text)
    
    # Remove numbers
    text = re.sub(r"\d+", " ", text)
    
    # Strip tashkeel and tatweel (diacritics and elongation)
    text = ar.strip_tashkeel(text)
    text = ar.strip_tatweel(text)
    
    # Replace special characters
    text = text.replace("#", " ")
    text = text.replace("@", " ")
    text = text.replace("_", " ")
    
    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    
    # Remove emojis using emoji.replace_emoji
    text = emoji.replace_emoji(text, replace="")  # Replace emojis with an empty string
    
    # Remove repeated characters
    text = re.sub(r'(.)\1+', r'\1', text)
    
    # Normalize Arabic letters
    text = text.replace("آ", "ا")
    text = text.replace("إ", "ا")
    text = text.replace("أ", "ا")
    text = text.replace("ؤ", "و")
    text = text.replace("ئ", "ي")
    text = text.replace("ة", "ه")
    
    return text


In [15]:
df['comment'].apply(lambda x: data_cleaning(x))
df=data_preprocessing(df)
df 

Unnamed: 0,BusinessId,review_id,rating,comment,creation_date
0,268517,368276,5,تعامل راقي ومتعاونين جدا يعطيكم العافيه والطلب...,2024-09-03 10:46:56.273
1,268517,366180,5,تعامل يفتح النفس ويسون نفس المطلوب والتوصيل سرريع,2024-06-10 17:02:09.137
2,268517,366102,5,اسلوب وذوق ماشاءالله,2024-06-07 19:02:15.573
3,268517,361135,5,تعامل رائع ان شاء الله مب اخر مره اطلب من,2024-05-08 15:44:14.163
4,268517,359092,5,اشكرهم على حسن التعامل والاسلوب والتنسيق الحلو...,2024-04-03 21:42:03.303
...,...,...,...,...,...
47172,277496,326234,5,,2023-03-30 20:29:55.220
47173,277496,326233,5,,2023-03-30 20:29:52.063
47174,277496,326232,5,,2023-03-30 20:29:49.333
47175,277496,326231,5,,2023-03-30 20:29:46.557


In [16]:
(df[df['comment'].index == 17]['comment'].values.tolist()[0])

In [23]:
# df.to_csv('try_model.csv')

In [24]:
# import pandas as pd
# import requests
# from concurrent.futures import ThreadPoolExecutor, as_completed

# # Farasa API endpoint
# farasa_api = "http://farasa.qcri.org/msa/webapi/segment"

# # Function to segment text using Farasa API
# def segment_comment(comment):
#     try:
#         response = requests.post(farasa_api, data={'text': comment})
#         if response.status_code == 200:
#             return response.json()['text']
#         else:
#             return f"Error {response.status_code}"
#     except Exception as e:
#         return f"Error: {str(e)}"

# # Function to process the DataFrame with threading
# def process_comments(df):
#     results = []
    
#     # Use ThreadPoolExecutor to send requests concurrently
#     with ThreadPoolExecutor(max_workers=5) as executor:
#         # Submit tasks to the executor for each comment
#         future_to_comment = {executor.submit(segment_comment, comment): comment for comment in df['comment']}
        
#         for future in as_completed(future_to_comment):
#             comment = future_to_comment[future]
#             try:
#                 result = future.result()
#                 results.append(result)
#             except Exception as exc:
#                 results.append(f'Error: {exc}')
    
#     return results

# # Apply segmentation using threading
# df['segmented_comment'] = process_comments(df)

# # Display the DataFrame with segmented comments
# print(df[['comment', 'segmented_comment']])


                                                 comment segmented_comment
0      تعامل راقي ومتعاونين جدا يعطيكم العافيه والطلب...         Error 403
1      تعامل يفتح النفس ويسون نفس المطلوب والتوصيل سرريع         Error 403
2                                   اسلوب وذوق ماشاءالله         Error 403
3              تعامل رائع ان شاء الله مب اخر مره اطلب من         Error 403
4      اشكرهم على حسن التعامل والاسلوب والتنسيق الحلو...         Error 403
...                                                  ...               ...
47172                                                            Error 403
47173                                                            Error 403
47174                                                            Error 403
47175                                                            Error 403
47176                                                            Error 403

[47177 rows x 2 columns]


In [None]:
# df.to_csv('mohammed dataset.csv')