# Imports 

In [8]:
# Data Processing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score

# Optional Text Preprocessing
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# Optional Visualization
import matplotlib.pyplot as plt
import seaborn as sns


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zackb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zackb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zackb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data preprocessing 

In [9]:
reviews = pd.read_csv('merged_reviews.csv')
reviews.rename(columns={'Review_Text': 'review_text', 'Sentiment': 'sentiment'}, inplace=True)
reviews.head()

Unnamed: 0,review_text,sentiment
0,La livraison a été très rapide et le repas éta...,Positive
1,الأكل وصل باردًا ولم يكن طازجًا كما توقعت.,Negative
2,The food was delicious and the delivery was pr...,Positive
3,La plateforme a des problèmes de connexion fré...,Negative
4,خدمة التوصيل ممتازة والأسعار مناسبة.,Positive


# String preprocessing

In [10]:
reviews['sentiment'].value_counts()

sentiment
Positive                         6677
Negative                         3111
Sentiment (Positive/Negative)       2
Positif                             2
Name: count, dtype: int64

In [11]:
reviews['sentiment'] = reviews['sentiment'].replace({
    'Positive': 'Positive', 
    'Negative': 'Negative', 
    'Sentiment (Positive/Negative)': 'Positive',  # assuming this corresponds to Positive
    'Positif': 'Positive'  # assuming 'Positif' is equivalent to Positive
})

In [12]:
reviews['sentiment'].value_counts()

sentiment
Positive    6681
Negative    3111
Name: count, dtype: int64

In [15]:
import re

def preprocess_text(text):
    if not isinstance(text, str):  # Check if the text is a string
        return ""  # Return an empty string if it's not
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove multiple spaces
    text = text.strip()  # Remove leading/trailing spaces
    return text

# Apply the function to the 'review_text' column
reviews['review_text'] = reviews['review_text'].apply(preprocess_text)


In [16]:
import spacy

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop])

# Apply lemmatization and stopword removal
reviews['review_text'] = reviews['review_text'].apply(lemmatize_text)



# Stopwords removal

In [None]:
from nltk.corpus import stopwords

# Load stopwords for English, French, and Arabic
STOPWORDS_EN = set(stopwords.words('english'))
STOPWORDS_FR = set(stopwords.words('french'))
STOPWORDS_AR = set(stopwords.words('arabic'))

# Custom function to remove stopwords
def remove_stopwords(text, language='english'):
    """Remove stopwords based on the specified language."""
    if language == 'english':
        stopwords_set = STOPWORDS_EN
    elif language == 'french':
        stopwords_set = STOPWORDS_FR
    elif language == 'arabic':
        stopwords_set = STOPWORDS_AR
    else:
        stopwords_set = STOPWORDS_EN  # Default to English if language is unknown

    # Remove stopwords from text
    return " ".join([word for word in str(text).split() if word not in stopwords_set])

In [None]:
# # Apply the function to the DataFrame
# reviews['review_text'] = reviews['review_text'].apply(lambda text: remove_stopwords(text, language='english')) 
# reviews['review_text'] = reviews['review_text'].apply(lambda text: remove_stopwords(text, language='french'))
# reviews['review_text'] = reviews['review_text'].apply(lambda text: remove_stopwords(text, language='arabic'))
# reviews.head()

Unnamed: 0,review_text,sentiment
0,livraison très rapide repas bnina!,positive
1,الأكل وصل باردًا ولم يكن طازجًا توقعت.,negative
2,food delicious delivery prompt.,positive
3,plateforme problèmes connexion fréquents.,negative
4,خدمة التوصيل ممتازة والأسعار مناسبة.,positive


# Model Architecture

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Load the dataset
test_food = pd.read_csv('test_food.csv')

# Drop duplicate reviews based on the 'review_text' column
reviews.drop_duplicates(subset='review_text', inplace=True)

# Handle missing values (None, NaN) in review_text and other columns if necessary
reviews['review_text'].fillna('', inplace=True)  # Replace NaNs with empty strings for text data
reviews.dropna(subset=['review_text'], inplace=True)  # Drop rows with missing text after filling NaNs

# Ensure there are no empty review texts left
reviews = test_food[test_food['review_text'].str.strip() != '']

# Encode the sentiment labels: 'Positive' -> 1, 'Negative' -> 0
label_encoder = LabelEncoder()
reviews['sentiment'] = label_encoder.fit_transform(reviews['sentiment'])

# Separate features (X) and target (y)
X = reviews['review_text']
y = reviews['sentiment']  # Now encoded as 0 (Negative) and 1 (Positive)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Parameter grid for GridSearchCV with more refined settings
param_grid = {
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],  # Unigrams, bigrams, trigrams
    'tfidfvectorizer__max_features': [5000, 10000, 15000],  # Max features
    'tfidfvectorizer__min_df': [1, 2],  # Minimum document frequency for feature inclusion
    'tfidfvectorizer__max_df': [0.85, 0.90, 0.95],  # Maximum document frequency to ignore common terms
    'multinomialnb__alpha': [0.5, 1.0, 1.5, 2.0]  # Smoothing parameter for Naive Bayes
}

# Create a pipeline with vectorizer, Naive Bayes model, and imputer for any remaining missing values in features
model_pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=10000),
    SimpleImputer(strategy='most_frequent'),  # Fill missing values in numeric columns (if any)
    MultinomialNB()
)

# Perform GridSearchCV to find the best parameters and improve F1 score
grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters and F1 score from GridSearch
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best F1 score found by grid search: {grid_search.best_score_:.4f}")

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='binary')  # Adjust for multi-class if necessary

print(f"F1 Score on Test Set: {f1:.4f}")

# Cross-validation to get the best score and prevent overfitting
cross_val_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
print(f"Cross-validated F1 scores: {cross_val_scores}")
print(f"Average Cross-validated F1 score: {cross_val_scores.mean():.4f}")

# Prepare the submission file (ensure to include the proper column for predictions)
submission = test_food[['iD']].copy()  # Assuming 'iD' is the identifier column
submission['sentiment'] = y_pred  # Add the predicted sentiment labels

# Save the submission file
submission.to_csv('submission5.csv', index=False)
print("Submission file saved as 'submission3.csv'")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reviews['review_text'].fillna('', inplace=True)  # Replace NaNs with empty strings for text data


KeyError: 'sentiment'