# Imports 

In [24]:
# Data Processing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Optional Text Preprocessing
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# Optional Visualization
import matplotlib.pyplot as plt
import seaborn as sns


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zackb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zackb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zackb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data preprocessing 

In [37]:
reviews = pd.read_csv('merged_reviews.csv')
reviews.rename(columns={'Review_Text': 'review_text', 'Sentiment': 'sentiment'}, inplace=True)
reviews.head()

Unnamed: 0,review_text,sentiment
0,La livraison a été très rapide et le repas éta...,Positive
1,الأكل وصل باردًا ولم يكن طازجًا كما توقعت.,Negative
2,The food was delicious and the delivery was pr...,Positive
3,La plateforme a des problèmes de connexion fré...,Negative
4,خدمة التوصيل ممتازة والأسعار مناسبة.,Positive


# String preprocessing

In [38]:
reviews['review_text'] = reviews['review_text'].str.lower()
reviews['review_text'] = reviews['review_text'].str.replace('[^\w\s]', '')
reviews['review_text'] = reviews['review_text'].str.replace('\s+', ' ')
reviews['review_text'] = reviews['review_text'].str.strip()
reviews.head()

Unnamed: 0,review_text,sentiment
0,la livraison a été très rapide et le repas éta...,Positive
1,الأكل وصل باردًا ولم يكن طازجًا كما توقعت.,Negative
2,the food was delicious and the delivery was pr...,Positive
3,la plateforme a des problèmes de connexion fré...,Negative
4,خدمة التوصيل ممتازة والأسعار مناسبة.,Positive



# Stopwords removal

In [39]:
from nltk.corpus import stopwords

# Load stopwords for English, French, and Arabic
STOPWORDS_EN = set(stopwords.words('english'))
STOPWORDS_FR = set(stopwords.words('french'))
STOPWORDS_AR = set(stopwords.words('arabic'))

# Custom function to remove stopwords
def remove_stopwords(text, language='english'):
    """Remove stopwords based on the specified language."""
    if language == 'english':
        stopwords_set = STOPWORDS_EN
    elif language == 'french':
        stopwords_set = STOPWORDS_FR
    elif language == 'arabic':
        stopwords_set = STOPWORDS_AR
    else:
        stopwords_set = STOPWORDS_EN  # Default to English if language is unknown

    # Remove stopwords from text
    return " ".join([word for word in str(text).split() if word not in stopwords_set])

In [41]:
# Apply the function to the DataFrame
reviews['review_text'] = reviews['review_text'].apply(lambda text: remove_stopwords(text, language='english')) 
reviews['review_text'] = reviews['review_text'].apply(lambda text: remove_stopwords(text, language='french'))
reviews['review_text'] = reviews['review_text'].apply(lambda text: remove_stopwords(text, language='arabic'))
reviews.head()

Unnamed: 0,review_text,sentiment
0,livraison très rapide repas bnina!,Positive
1,الأكل وصل باردًا ولم يكن طازجًا توقعت.,Negative
2,food delicious delivery prompt.,Positive
3,plateforme problèmes connexion fréquents.,Negative
4,خدمة التوصيل ممتازة والأسعار مناسبة.,Positive


# Model Architecture

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder

test_food = pd.read_csv('test_food.csv')

# Step 1: Handle missing values in both reviews and test_food datasets
reviews['review_text'].fillna('missing', inplace=True)
test_food['review_text'].fillna('missing', inplace=True)

# Step 2: Encode sentiment labels ('negative' = 0, 'positive' = 1)
label_encoder = LabelEncoder()
reviews['sentiment'] = label_encoder.fit_transform(reviews['sentiment'])  # Binary labels: 'negative' and 'positive'

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(reviews['review_text'], reviews['sentiment'], test_size=0.2, random_state=42)

# Step 4: Create a pipeline with TfidfVectorizer and MultinomialNB
model_pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english', max_features=5000),
    MultinomialNB()
)

# Step 5: Train the model on the training data
model_pipeline.fit(X_train, y_train)

# Step 6: Evaluate the model using F1 score on the test set
y_pred = model_pipeline.predict(X_test)
f1 = f1_score(y_test, y_pred, average='binary')  # For binary classification
print(f"F1 Score on Test Set: {f1:.4f}")

# Optionally: Full classification report on the validation set
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred))

# Step 7: Predict on the test data (test_food.csv)
X_test_food = test_food['review_text']
y_pred_food = model_pipeline.predict(X_test_food)

# Step 8: Prepare the submission file
# Assuming 'iD' is the identifier column in the test data
submission = test_food[['iD']].copy()  # Copy the 'iD' column
submission['sentiment'] = y_pred_food  # Add the predicted sentiment labels (0 or 1)

# Step 9: Save the submission file
submission.to_csv('submission.csv', index=False)

print("Submission file saved as 'submission.csv'")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reviews['review_text'].fillna('missing', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_food['review_text'].fillna('missing', inplace=True)


ValueError: pos_label=1 is not a valid label. It should be one of [0, 2]