In [90]:
import numpy as np 
import pandas as pd 
from afinn import Afinn
from langdetect import detect




In [91]:
reviews = pd.read_csv("data_reviews.csv")
reviews.head()

Unnamed: 0.1,Unnamed: 0,Review_Text,Language,Rating (1-5),Country,Sentiment,Aspect_Focus
0,0,La livraison a été très rapide et le repas éta...,French,5,Algeria,Positive,Delivery Speed
1,1,الأكل وصل باردًا ولم يكن طازجًا كما توقعت.,Arabic,2,Algeria,Negative,Food Quality
2,2,The food was delicious and the delivery was pr...,English,5,Algeria,Positive,Food Quality
3,3,La plateforme a des problèmes de connexion fré...,French,2,Algeria,Negative,App Usability
4,4,خدمة التوصيل ممتازة والأسعار مناسبة.,Arabic,5,Algeria,Positive,Pricing


# Data Exploration 

In [92]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1138 entries, 0 to 1137
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    1138 non-null   int64 
 1   Review_Text   1138 non-null   object
 2   Language      1138 non-null   object
 3   Rating (1-5)  1138 non-null   object
 4   Country       1138 non-null   object
 5   Sentiment     1138 non-null   object
 6   Aspect_Focus  1138 non-null   object
dtypes: int64(1), object(6)
memory usage: 62.4+ KB


In [93]:
reviews.isnull().sum()


Unnamed: 0      0
Review_Text     0
Language        0
Rating (1-5)    0
Country         0
Sentiment       0
Aspect_Focus    0
dtype: int64

# Countries

In [94]:
countries = reviews['Country'].unique()

print("Unique countries:", countries)


Unique countries: ['Algeria' 'Country']


In [95]:
reviews['Country'].value_counts()


Country
Algeria    1136
Country       2
Name: count, dtype: int64

In [96]:
aspects = reviews['Aspect_Focus'].unique()
print("Unique aspects:", aspects)


Unique aspects: ['Delivery Speed' 'Food Quality' 'App Usability' 'Pricing'
 'Order Accuracy' 'Packaging' 'Customer Service' 'Aspect_Focus'
 'Burger Quality' 'Food Temperature']


In [97]:
reviews['Aspect_Focus'].value_counts()

Aspect_Focus
Food Quality        339
Delivery Speed      312
App Usability       181
Pricing              87
Packaging            78
Order Accuracy       72
Customer Service     60
Burger Quality        6
Aspect_Focus          2
Food Temperature      1
Name: count, dtype: int64

In [98]:
language = reviews['Language'].unique()
print("Languages :", language)


Languages : ['French' 'Arabic' 'English' 'Language']


In [99]:
cleaned_data = reviews[~reviews['Language'].isin(['French', 'Arabic', 'English'])]


cleaned_data.head()


Unnamed: 0.1,Unnamed: 0,Review_Text,Language,Rating (1-5),Country,Sentiment,Aspect_Focus
521,521,Review_Text,Language,Rating (1-5),Country,Sentiment (Positive/Negative),Aspect_Focus
622,622,Review_Text,Language,Rating (1-5),Country,Sentiment (Positive/Negative),Aspect_Focus


In [100]:
reviews['Language'].value_counts()

Language
French      392
English     377
Arabic      367
Language      2
Name: count, dtype: int64

In [101]:
packaging_data = reviews[reviews['Aspect_Focus'] == 'packaging']

print(packaging_data)


Empty DataFrame
Columns: [Unnamed: 0, Review_Text, Language, Rating (1-5), Country, Sentiment, Aspect_Focus]
Index: []


# Restaurant Reviews


In [102]:
reviews2 = pd.read_csv('restaurant_reviews.csv')

In [103]:
reviews2.head()

Unnamed: 0,title,text,stars
0,gAAAAABnUH4NgkTQ9eyZpcZr0JpVZKT2A_-T3BUnPBPZps...,Sauce vipo infecte piquante sans aucun goût ni...,2
1,gAAAAABnUH4NyGLVcUk0aT-6YySNpN-Atb905EoDedSWV2...,"La cuisine est médiocre, la sauce Vipo est san...",1
2,gAAAAABnUH4NDFwRDSycnVfF_qiCYlFhB60ebSobqCmoWh...,الاجواء و الاضاءة روعة\n النظافة في القمة\n ال...,5
3,gAAAAABnUH4NRqzylGPMu-ODANAHON7KKDiz4uJhnHJUfQ...,"J'ai vraiment aimé cette place, le service est...",4
4,gAAAAABnUH4NiAgbqy6d_5sxsXSJ5Ly_TOLP2DMYqka9_4...,"Cozy place, good burgers for the price u pay, ...",4


In [104]:
reviews2['review_sentiment'] = reviews2['stars'].apply(lambda x: 'negative' if x < 3 else 'positive')
reviews2 = reviews2.drop(columns=['title'])
reviews2.head()



Unnamed: 0,text,stars,review_sentiment
0,Sauce vipo infecte piquante sans aucun goût ni...,2,negative
1,"La cuisine est médiocre, la sauce Vipo est san...",1,negative
2,الاجواء و الاضاءة روعة\n النظافة في القمة\n ال...,5,positive
3,"J'ai vraiment aimé cette place, le service est...",4,positive
4,"Cozy place, good burgers for the price u pay, ...",4,positive


In [105]:
afinn = Afinn()

In [106]:
# Multilingual lexicons for specific issues
bad_words = ['bad', 'terrible', 'awful', 'horrible', 'worse', 'mauvais', 'horrible', 'فضيع', 'سيء']
sexualization_keywords = [
    'sexual', 'inappropriate', 'harassment', 'sexuel', 'inapproprié', 'تحرش', 'غير لائق'
]
security_keywords = [
    'robbery', 'stolen', 'violence', 'attack', 'scam', 'fraud', 
    'vol', 'violence', 'attaque', 'escroquerie', 'سرقة', 'عنف', 'هجوم', 'احتيال'
]
extreme_keywords = bad_words + sexualization_keywords + security_keywords

In [107]:
# Function to detect language and process text
def preprocess_text(review):
    try:
        lang = detect(review)
    except:
        lang = "unknown"  # Handle cases where language detection fails
    return review.lower(), lang

In [108]:
def calculate_gravity_with_afinn(review):
    review_lower, lang = preprocess_text(review)
    
    gravity = 0 
    
    for keyword in extreme_keywords:
        if keyword in review_lower:
            gravity += 1  
    # Calculate AFINN sentiment score (only for English reviews)
    sentiment_score = 0
    if lang == 'en':
        sentiment_score = afinn.score(review)
    
    # Adjust gravity based on sentiment score
    if sentiment_score < -3:  # Strongly negative
        gravity += 2
    elif sentiment_score < 0:  # Mildly negative
        gravity += 1
    
    return gravity, lang

In [109]:
reviews2['text'] = reviews2['text'].astype(str).fillna('')

In [110]:
reviews2[['gravity', 'language']] = reviews2['text'].apply(
    lambda review: pd.Series(calculate_gravity_with_afinn(review))
)


In [111]:
reviews2.head()

Unnamed: 0,text,stars,review_sentiment,gravity,language
0,Sauce vipo infecte piquante sans aucun goût ni...,2,negative,0,fr
1,"La cuisine est médiocre, la sauce Vipo est san...",1,negative,0,fr
2,الاجواء و الاضاءة روعة\n النظافة في القمة\n ال...,5,positive,0,ar
3,"J'ai vraiment aimé cette place, le service est...",4,positive,0,fr
4,"Cozy place, good burgers for the price u pay, ...",4,positive,0,en


In [112]:
reviews2['gravity'].unique()

array([0, 2, 1, 3, 5, 4])

In [113]:
reviews2['gravity'].value_counts()

gravity
0    8198
1     339
2      84
3      24
4       7
5       2
Name: count, dtype: int64

In [114]:
reviews2['gravity']

0       0
1       0
2       0
3       0
4       0
       ..
8649    0
8650    0
8651    0
8652    0
8653    0
Name: gravity, Length: 8654, dtype: int64

In [115]:
reviews2['language'].unique()

array(['fr', 'ar', 'en', 'it', 'tr', 'unknown', 'tl', 'fi', 'so', 'da',
       'ro', 'sw', 'lt', 'fa', 'cy', 'id', 'nl', 'es', 'ca', 'hu', 'sk',
       'de', 'pt', 'et', 'no', 'sl', 'af', 'vi', 'ur', 'lv', 'hr', 'cs',
       'pl', 'ru', 'sv', 'zh-cn', 'ko', 'sq'], dtype=object)

In [116]:
high_gravity_reviews = reviews2[reviews2['gravity'].isin([4, 5])]

In [117]:
high_gravity_reviews.head(20)

Unnamed: 0,text,stars,review_sentiment,gravity,language
660,"Disgusting burger horrible service, too much h...",1,negative,5,en
716,horrible service. this is the 3rd time we’ve o...,1,negative,4,en
1171,- Les livreurs dyalhom ytbelaw bnat ness ki yw...,1,negative,4,en
2803,The food was good but there was a lack of the ...,3,positive,4,en
4618,- service was terrible.\n - waiters not friend...,1,negative,4,en
4620,The service was terrible. The waiters were ver...,1,negative,4,en
5964,"This place fell off hard,i called for a takeou...",1,negative,4,en
7098,"Such a bad experience, the service was awful, ...",1,negative,4,en
7866,Horrible service\nRude waitress\nHow can't we ...,1,negative,5,en


In [118]:
for text in high_gravity_reviews['text']:
    print(text)
    print('\n')

Disgusting burger horrible service, too much hype too much sauce, terrible experience, i found a hair, the food was cold, never going back thanks for nothing,


horrible service. this is the 3rd time we’ve ordered from them where they’ve forgotten something but they still charge you for it. if you have ran out of a certain item, call and apologise instead of sending the order without and still charging the customer you absolute idiots


- Les livreurs dyalhom ytbelaw bnat ness ki yweslolhom la commande, mon amie lyouma ki wsltha la commande 9alha omb3d n3aytlek.
 - Aujourd’hui, un de leurs livreurs quand nous avons eu notre commande, il a dit à ma meilleure ami qu’il la rappellera plus tard, quel horrible service !
 - Today One of their delivery guys when we had our order, he told my bestfriend that he will call her back later, such a horrible service !


The food was good but there was a lack of the menu, so many things weren't available, and the wait was horrible, so I ordered a burg

In [119]:
reviews2['text'].iloc[1001]

"J'ai récemment eu l'occasion de visiter le restaurant American Burger et j'ai été séduit par leur délicieux burger Hollandia. La combinaison de saveurs était excellente et la taille généreuse du burger était impressionnante. La viande était juteuse et les accompagnements ajoutaient une touche de fraîcheur.\n \n Cependant, j'ai été un peu déçu de constater qu'ils ne fournissaient pas de gants avec le burger. Étant donné que j'ai pris un Triple qui est assez copieux et savoureux, il peut être un peu désordonné à déguster. Des gants jetables seraient très appréciés pour faciliter la dégustation sans se salir les mains.\n \n Je tiens à souligner que la qualité du burger en lui-même était excellente et que cela vaut la peine d'être essayé. Cependant, j'encouragerais le restaurant American Burger à envisager d'inclure des gants jetables avec leurs burgers triples. Cela permettrait aux clients de profiter pleinement de leur repas sans se soucier de se salir les mains.\n \n Malgré cet aspect,

In [120]:
detected_languages = reviews2['language'].unique()

In [121]:
print(detected_languages)

['fr' 'ar' 'en' 'it' 'tr' 'unknown' 'tl' 'fi' 'so' 'da' 'ro' 'sw' 'lt'
 'fa' 'cy' 'id' 'nl' 'es' 'ca' 'hu' 'sk' 'de' 'pt' 'et' 'no' 'sl' 'af'
 'vi' 'ur' 'lv' 'hr' 'cs' 'pl' 'ru' 'sv' 'zh-cn' 'ko' 'sq']


In [122]:
reviews2['language'].value_counts()

language
fr         5242
en         1260
ar         1008
it          179
ca          120
tl          106
so           75
id           71
es           59
de           58
ro           55
ur           43
unknown      42
pt           40
nl           33
af           30
sw           26
tr           22
et           21
cy           18
fi           16
fa           15
sl           10
hu           10
lt            9
sk            9
pl            9
cs            9
da            8
no            8
zh-cn         8
vi            8
hr            7
lv            6
ru            4
sv            4
ko            4
sq            2
Name: count, dtype: int64

In [123]:
filtered_reviews = reviews2[reviews2['language'].isin(['fr', 'en', 'ar'])]

In [124]:
filtered_reviews.head()

Unnamed: 0,text,stars,review_sentiment,gravity,language
0,Sauce vipo infecte piquante sans aucun goût ni...,2,negative,0,fr
1,"La cuisine est médiocre, la sauce Vipo est san...",1,negative,0,fr
2,الاجواء و الاضاءة روعة\n النظافة في القمة\n ال...,5,positive,0,ar
3,"J'ai vraiment aimé cette place, le service est...",4,positive,0,fr
4,"Cozy place, good burgers for the price u pay, ...",4,positive,0,en


In [125]:
language_mapping = {'fr': 'French', 'en': 'English', 'ar': 'Arabic'}
filtered_reviews['language'] = filtered_reviews['language'].map(language_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_reviews['language'] = filtered_reviews['language'].map(language_mapping)


In [126]:
filtered_reviews.head()

Unnamed: 0,text,stars,review_sentiment,gravity,language
0,Sauce vipo infecte piquante sans aucun goût ni...,2,negative,0,French
1,"La cuisine est médiocre, la sauce Vipo est san...",1,negative,0,French
2,الاجواء و الاضاءة روعة\n النظافة في القمة\n ال...,5,positive,0,Arabic
3,"J'ai vraiment aimé cette place, le service est...",4,positive,0,French
4,"Cozy place, good burgers for the price u pay, ...",4,positive,0,English


In [127]:
filtered_reviews.drop(columns=['stars', 'gravity'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_reviews.drop(columns=['stars', 'gravity'], inplace=True)


In [128]:
filtered_reviews.head()

Unnamed: 0,text,review_sentiment,language
0,Sauce vipo infecte piquante sans aucun goût ni...,negative,French
1,"La cuisine est médiocre, la sauce Vipo est san...",negative,French
2,الاجواء و الاضاءة روعة\n النظافة في القمة\n ال...,positive,Arabic
3,"J'ai vraiment aimé cette place, le service est...",positive,French
4,"Cozy place, good burgers for the price u pay, ...",positive,English


In [129]:
# Rename columns in filtered_reviews to match the final format
filtered_reviews = filtered_reviews[['text', 'review_sentiment']].rename(columns={'text': 'review_text', 'review_sentiment': 'sentiment'})

# Rename columns in reviews to match the final format
reviews = reviews[['Review_Text', 'Sentiment']].rename(columns={'Review_Text': 'review_text', 'Sentiment': 'sentiment'})

# Concatenate the DataFrames
merged_reviews = pd.concat([filtered_reviews[['review_text', 'sentiment']], reviews[['review_text', 'sentiment']]])

# Display the merged DataFrame
print(merged_reviews)


                                            review_text sentiment
0     Sauce vipo infecte piquante sans aucun goût ni...  negative
1     La cuisine est médiocre, la sauce Vipo est san...  negative
2     الاجواء و الاضاءة روعة\n النظافة في القمة\n ال...  positive
3     J'ai vraiment aimé cette place, le service est...  positive
4     Cozy place, good burgers for the price u pay, ...  positive
...                                                 ...       ...
1133  The delivery was fast and food was good. The b...  Positive
1134  Fast service, but the food was disappointing. ...  Negative
1135  Delivery was on time, and the food was hot. Th...  Positive
1136  Service was fast, but the burger was not good,...  Negative
1137  The food arrived quickly, but it was cold. The...  Negative

[8648 rows x 2 columns]


In [130]:
# Standardize sentiment labels to lowercase
merged_reviews['sentiment'] = merged_reviews['sentiment'].str.lower()

# Handle possible variations and replace them
merged_reviews['sentiment'] = merged_reviews['sentiment'].replace({
    'positive': 'positive',
    'negative': 'negative',
    'positif': 'positive',  # If 'Positif' is meant to be 'positive'
    'sentiment (positive/negative)': 'positive',  # If you want to treat this as positive
    '': 'negative'  # If there's any empty value, consider it negative
})

# Check the distribution of the sentiment column
print(merged_reviews['sentiment'].value_counts())


sentiment
positive    5991
negative    2657
Name: count, dtype: int64


Naive Bayes


In [131]:
merged_reviews['sentiment'].value_counts()

sentiment
positive    5991
negative    2657
Name: count, dtype: int64

# UTILS

In [132]:
import re
import string
import emoji
from langdetect import detect, LangDetectException
import nltk
nltk.download('stopwords')  # For stopwords
nltk.download('punkt')      # For tokenization
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import contractions
from nltk.tokenize import word_tokenize



# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Load stopwords for English, French, and Arabic
stop_words_en = set(stopwords.words('english'))
stop_words_fr = set(stopwords.words('french'))
stop_words_ar = set(stopwords.words('arabic'))

# # Clean emojis from text
# def strip_emoji(text):
#     """Remove emojis from the review text."""
#     return emoji.get_emoji_regexp().sub("", text)

# Remove punctuations, stopwords, links, mentions, and non-ASCII characters
def strip_all_entities(text, lang='en'):
    """Clean review text by removing links, mentions, punctuation, stopwords, and non-ASCII characters."""
    # Normalize newlines and convert to lowercase
    text = re.sub(r'\r|\n', ' ', text.lower()) 
    
    # Remove links and mentions
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)  
    
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7f]', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords based on the detected language
    if lang == 'fr':
        stop_words = stop_words_fr
    elif lang == 'ar':
        stop_words = stop_words_ar
    else:
        stop_words = stop_words_en
    
    # Remove stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)
    
    return text

# Clean hashtags in review text
def clean_hashtags(review):
    """Remove hashtags at the end and clean middle hashtags in the review."""
    review = re.sub(r'(\s+#[\w-]+)+\s*$', '', review).strip()  # Remove ending hashtags
    return re.sub(r'#([\w-]+)', r'\1', review).strip()  # Remove '#' symbol from middle hashtags

# Filter special characters such as & and $ present in some words
def filter_chars(text):
    """Filter out unwanted characters like $ and & from the review text."""
    return ' '.join('' if ('$' in word) or ('&' in word) else word for word in text.split())

# Remove multiple spaces
def remove_mult_spaces(text):
    """Remove multiple spaces from review text."""
    return re.sub(r"\s\s+", " ", text)

# Function to check if the review text is in English, and return an empty string if it's not
# def filter_non_english(text):
#     """Filter out non-English reviews."""
#     try:
#         lang = detect(text)
#     except LangDetectException:
#         lang = "unknown"
#     return text if lang == "en" else ""

# Expand contractions like "don't" to "do not"
def expand_contractions(text):
    """Expand contractions in the review text."""
    return contractions.fix(text)

# Remove numbers from the review text
def remove_numbers(text):
    """Remove numbers from the review text."""
    return re.sub(r'\d+', '', text)

# Lemmatize words in the review
def lemmatize(text, lang='en'):
    """Lemmatize words in the review text to their base forms."""
    if lang != 'en':
        return text  # Skip lemmatization for non-English text
    
    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

# Remove short words from the review
def remove_short_words(text, min_len=2):
    """Remove words that are shorter than the specified length."""
    words = text.split()
    long_words = [word for word in words if len(word) >= min_len]
    return ' '.join(long_words)

# Replace elongated words (like "sooooon") with their base form ("soon")
def replace_elongated_words(text):
    """Replace elongated words in the review with their base form."""
    regex_pattern = r'\b(\w+)((\w)\3{2,})(\w*)\b'
    return re.sub(regex_pattern, r'\1\3\4', text)

# Remove repeated punctuation marks (e.g., "!!!", "???")
def remove_repeated_punctuation(text):
    """Remove repeated punctuation marks in the review text."""
    return re.sub(r'[\?\.\!]+(?=[\?\.\!])', '', text)

# Remove extra whitespace
def remove_extra_whitespace(text):
    """Remove extra whitespace from the review text."""
    return ' '.join(text.split())

# Remove URL shorteners
def remove_url_shorteners(text):
    """Remove common URL shorteners from the review text."""
    return re.sub(r'(?:http[s]?://)?(?:www\.)?(?:bit\.ly|goo\.gl|t\.co|tinyurl\.com|tr\.im|is\.gd|cli\.gs|u\.nu|url\.ie|tiny\.cc|alturl\.com|ow\.ly|bit\.do|adoro\.to)\S+', '', text)

# Remove spaces at the beginning and end of the review
def remove_spaces_reviews(review):
    """Remove spaces at the beginning and end of the review text."""
    return review.strip()

# Remove short reviews based on the number of words
def remove_short_reviews(review, min_words=3):
    """Remove reviews that are shorter than a specified number of words."""
    words = review.split()
    return review if len(words) >= min_words else ""


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zackb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zackb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [133]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import make_pipeline

In [None]:
def preprocess_reviews(reviews):
    # reviews['review_text'] = reviews['review_text'].apply(lambda x: strip_emoji(x))
    reviews['review_text'] = reviews['review_text'].apply(lambda x: strip_all_entities(x))
    reviews['review_text'] = reviews['review_text'].apply(lambda x: clean_hashtags(x))
    reviews['review_text'] = reviews['review_text'].apply(lambda x: filter_chars(x))
    reviews['review_text'] = reviews['review_text'].apply(lambda x: remove_mult_spaces(x))
    reviews['review_text'] = reviews['review_text'].apply(lambda x: expand_contractions(x))
    reviews['review_text'] = reviews['review_text'].apply(lambda x: remove_numbers(x))
    reviews['review_text'] = reviews['review_text'].apply(lambda x: lemmatize(x))
    reviews['review_text'] = reviews['review_text'].apply(lambda x: remove_short_words(x))

    return reviews

SyntaxError: unterminated string literal (detected at line 7) (1843806987.py, line 7)

In [135]:
reviews.head()

Unnamed: 0,review_text,sentiment
0,La livraison a été très rapide et le repas éta...,Positive
1,الأكل وصل باردًا ولم يكن طازجًا كما توقعت.,Negative
2,The food was delicious and the delivery was pr...,Positive
3,La plateforme a des problèmes de connexion fré...,Negative
4,خدمة التوصيل ممتازة والأسعار مناسبة.,Positive


In [138]:
# Apply preprocessing to merged_reviews
merged_reviews = preprocess_reviews(merged_reviews)

# Check the result
merged_reviews.head()


KeyError: 'cleaned_text'