In [491]:
import pandas as pd 
import numpy as np 
import re

In [492]:
df = pd.read_csv("10_Reviews.csv" , index_col=0)

In [493]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62884 entries, 0 to 62883
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   BusinessId     62884 non-null  int64 
 1   review_id      62884 non-null  int64 
 2   rating         62884 non-null  int64 
 3   comment        41021 non-null  object
 4   creation_date  62884 non-null  object
dtypes: int64(3), object(2)
memory usage: 2.9+ MB


In [494]:
df['creation_date'] = df['creation_date'].astype('datetime64[ns]')

In [495]:
df['comment'].fillna('لا يوجد' , inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['comment'].fillna('لا يوجد' , inplace=True)


In [496]:
df['comment'][df['comment'] == 'ممتاز '] = df['comment'][df['comment'] == 'ممتاز '].replace('ممتاز ' , 'ممتاز') 

In [497]:
df.isnull().sum()

BusinessId       0
review_id        0
rating           0
comment          0
creation_date    0
dtype: int64

### clean data for seg

In [498]:
import pyarabic.araby as ar

# import Stemmer
import functools, operator
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

In [499]:
arabic_stop_words=[]
with open ('Stop_Words.txt',encoding='utf-8') as f :
    for i in f.readlines() :
        arabic_stop_words.append(i)
        arabic_stop_words[-1]=arabic_stop_words[-1][:-1]

In [500]:
import re
import string
import emoji
from nltk.stem.isri import ISRIStemmer

def data_preprocessing(df, arabic_stop_words):
  
    # Function for general text cleaning
    def data_cleaning(text):

        # Remove URLs
        text = re.sub(r'http\S+', '', text)
        
        # Remove usernames and non-Arabic words (alphanumeric words)
        text = re.sub(r'\b[A-Za-z0-9_]+\b', '', text)  # Remove any non-Arabic alphanumeric words

        # Handle Windows line breaks (\r\n)
        text = text.replace('\r\n', ' ')
        
        # Handle Unix line breaks (\n)
        text = text.replace('\n', ' ')
        
        # Handle old Mac line breaks (\r)
        text = text.replace('\r', ' ')
        
        # Handle tabs (\t)
        text = text.replace('\t', ' ')

        # Remove numbers
        text = re.sub(r"\d+", "", text)
        
        # Remove special characters and punctuation, keeping Arabic letters and spaces
        text = re.sub(r'[^\w\sء-ي]', '', text)
        
        # Remove emojis
        text = emoji.replace_emoji(text, replace="")
        
        # Normalize Arabic letters
        text = text.replace("آ", "ا").replace("إ", "ا").replace("أ", "ا")
        text = text.replace("ؤ", "و").replace("ئ", "ي").replace("ة", "ه")
        
        return text
    
    # Function to handle additional cleaning for spaces and punctuation
    def clean_text(text):
        # Remove all punctuation
        translator = str.maketrans('', '', string.punctuation)
        text = text.translate(translator)
        
        # Replace multiple spaces with a single space
        text = re.sub(r'\s+', ' ', text)
        
        # Remove leading and trailing spaces
        text = text.strip()
        
        return text

    # Remove repeated letters (e.g., "الللللله" to "الله")
    def remove_repeated_letters(text):
        return text[0:2] + ''.join([text[i] for i in range(2, len(text)) if text[i] != text[i-1] or text[i] != text[i-2]])

    # Apply the cleaning functions
    df['comment'] = df['comment'].apply(data_cleaning)
    df['comment'] = df['comment'].apply(clean_text)
    df['comment'] = df['comment'].apply(remove_repeated_letters)
    
    # Remove stop words
    df['comment'] = df['comment'].apply(
        lambda x: ' '.join([word for word in x.split() if word not in arabic_stop_words])
    )
    

    return df

In [501]:
df = data_preprocessing(df, arabic_stop_words)

In [502]:
df['comment'][df['comment'] == ''] = df['comment'][df['comment'] == ''].replace('' , 'لا يوجد') 
df['comment'][df['comment'] == 'يوجد'] = df['comment'][df['comment'] == 'يوجد'].replace('يوجد' , 'لا يوجد') 

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['comment'][df['comment'] == ''] = df['comment'][df['comment'] == ''].replace('' , 'لا يوجد')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#

In [504]:
df.to_csv('10_reviews_Stop_Words.csv')

In [506]:
# import re
# import pandas as pd

# # Function to check for cleanliness issues
# def check_data_cleanliness(text):
#     issues = {}
    
#     # Check for line breaks, tabs
#     if re.search(r'[\r\n\t]', text):
#         issues['Line Breaks or Tabs'] = True
    
#     # Check for special characters (excluding Arabic letters, numbers, and spaces)
#     if re.search(r'[^\w\sء-ي]', text):
#         issues['Special Characters'] = True
    
#     # Check for any digits (assuming we do not want them)
#     if re.search(r'\d+', text):
#         issues['Contains Numbers'] = True
    
#     # Check for multiple spaces
#     if re.search(r'\s{2,}', text):
#         issues['Multiple Spaces'] = True
    
#     return issues

# # Apply the function to the 'comment' column
# df['cleanliness_issues'] = df['comment'].apply(check_data_cleanliness)

# # Filter out rows with any issues and show the problematic comments
# issues_found = df[df['cleanliness_issues'].apply(bool)]

# # Display the problematic comments
# print(issues_found[['comment', 'cleanliness_issues']])

Empty DataFrame
Columns: [comment, cleanliness_issues]
Index: []
