# Imports 

In [7]:
# Data Processing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Optional Text Preprocessing
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# Optional Visualization
import matplotlib.pyplot as plt
import seaborn as sns


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zackb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zackb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zackb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data preprocessing 

In [5]:
reviews = pd.read_csv('merged_reviews.csv')
reviews.head()

Unnamed: 0,review_text,sentiment
0,sauce vipo infecte piquante sans aucun got ni ...,negative
1,la cuisine est mdiocre la sauce vipo est sans ...,negative
2,menu,positive
3,jai vraiment aim cette place le service est ch...,positive
4,cozy place good burgers price pay chicken twis...,positive


# String preprocessing

In [6]:
reviews['review_text'] = reviews['review_text'].str.lower()
reviews['review_text'] = reviews['review_text'].str.replace('[^\w\s]', '')
reviews['review_text'] = reviews['review_text'].str.replace('\s+', ' ')
reviews['review_text'] = reviews['review_text'].str.strip()
reviews.head()

Unnamed: 0,review_text,sentiment
0,sauce vipo infecte piquante sans aucun got ni ...,negative
1,la cuisine est mdiocre la sauce vipo est sans ...,negative
2,menu,positive
3,jai vraiment aim cette place le service est ch...,positive
4,cozy place good burgers price pay chicken twis...,positive



# Stopwords removal

In [8]:
from nltk.corpus import stopwords

# Load stopwords for English, French, and Arabic
STOPWORDS_EN = set(stopwords.words('english'))
STOPWORDS_FR = set(stopwords.words('french'))
STOPWORDS_AR = set(stopwords.words('arabic'))

# Custom function to remove stopwords
def remove_stopwords(text, language='english'):
    """Remove stopwords based on the specified language."""
    if language == 'english':
        stopwords_set = STOPWORDS_EN
    elif language == 'french':
        stopwords_set = STOPWORDS_FR
    elif language == 'arabic':
        stopwords_set = STOPWORDS_AR
    else:
        stopwords_set = STOPWORDS_EN  # Default to English if language is unknown

    # Remove stopwords from text
    return " ".join([word for word in str(text).split() if word not in stopwords_set])

In [12]:
# Apply the function to the DataFrame
reviews['review_text'] = reviews['review_text'].apply(lambda text: remove_stopwords(text, language='english')) 
reviews['review_text'] = reviews['review_text'].apply(lambda text: remove_stopwords(text, language='french'))
reviews.head()

Unnamed: 0,review_text,sentiment
0,sauce vipo infecte piquante sans aucun got ni ...,negative
1,cuisine mdiocre sauce vipo sans vouloir paratr...,negative
2,menu,positive
3,jai vraiment aim cette place service chaleureu...,positive
4,cozy place good burgers price pay chicken twis...,positive


# Model Architecture

In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

test_food = pd.read_csv('test_food.csv')

# Step 1: Vectorize the text data
vectorizer = TfidfVectorizer(stop_words=None, max_features=5000)

# Fit and transform the 'Text' column to numerical data
X = vectorizer.fit_transform(reviews['review_text'])

# Assuming that the sentiment is stored in 'sentiment' column (0 for negative, 1 for positive)
y = reviews['sentiment']

# Step 2: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train the Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Step 4: Make predictions
y_pred = nb_model.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.8699

Classification Report:
              precision    recall  f1-score   support

    negative       0.94      0.62      0.75       541
    positive       0.85      0.98      0.91      1189

    accuracy                           0.87      1730
   macro avg       0.90      0.80      0.83      1730
weighted avg       0.88      0.87      0.86      1730

