<a href="https://colab.research.google.com/github/akram01Br/PixelHawk-OD/blob/main/hotel_review_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Predict hotel-review sentiment for Tunisian listings scraped from Booking.com.
You’ll get:
• real-time scraping (≈1k reviews)
• Arabic + French text cleaning (Tunisia-specific stop-words)
• multilingual BERT fine-tuning with transformers
• SHAP word-level explainability
• a tiny Streamlit front-end deployed from a GitHub repo.



In [1]:
# Cell 1 – installs
!pip install -q transformers datasets arabic-reshaper python-bidi streamlit shap altair
!apt-get -qq install chromium-chromedriver
!pip install -q selenium==4.11.2

import re, json, numpy as np, pandas as pd, torch, shap, altair as alt
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import arabic_reshaper, itertools, requests, time, zipfile, os

# Cell 2 – scrape 1 000 Tunisian hotel reviews
#
# Visits each review page in `urls`, pairs every review text with the score
# badge found at the same position, and converts the score into a binary label.
# Produces `df` with columns "text" (str) and "label" (0/1).
MAX_REVIEWS = 1000        # stop collecting once this many reviews are gathered
POSITIVE_THRESHOLD = 7    # score >= 7 -> label 1 (positive), otherwise 0
PAGE_LOAD_WAIT_S = 2      # fixed pause after each page load before querying the DOM

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
driver = webdriver.Chrome(options=chrome_options)

urls = [
    "https://www.booking.com/reviews/tn/hotel/laico-tunis.html",
    "https://www.booking.com/reviews/tn/hotel/iberostar-averroes.html",
    "https://www.booking.com/reviews/tn/hotel/royal-tulip-carthage.html"
]
reviews, ratings = [], []

# try/finally guarantees the browser process is released even if a page fails.
try:
    for url in urls:
        driver.get(url)
        time.sleep(PAGE_LOAD_WAIT_S)
        review_blocks = driver.find_elements("css selector", ".review_item_review_content")
        score_blocks  = driver.find_elements("css selector", ".review-score-badge")
        for r, s in zip(review_blocks, score_blocks):
            if len(reviews) >= MAX_REVIEWS:
                break
            txt = r.text.strip()
            if not txt:
                continue
            try:
                # Scores may use a decimal comma (e.g. "8,5").
                score = float(s.text.strip().replace(',', '.'))
            except ValueError:
                # Badge text is empty or not numeric: skip this review
                # instead of aborting the whole scrape.
                continue
            reviews.append(txt)
            ratings.append(1 if score >= POSITIVE_THRESHOLD else 0)   # binary sentiment
        if len(reviews) >= MAX_REVIEWS:
            break
finally:
    driver.quit()

df = pd.DataFrame({"text": reviews, "label": ratings})
df = df.dropna().drop_duplicates()
if df.empty:
    # An empty frame means neither selector matched anything on the pages.
    print("WARNING: no reviews were scraped – check the URLs and CSS selectors.")
df.head()

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.9/292.9 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hPreconfiguring packages ...
Selecting previously unselected package apparmor.
(Reading database ... 126284 files and directories currently installed.)
Preparing to unpack .../apparmor_3.0.4-2ubuntu2.4_amd64.deb ...
Unpacking apparmor (3.0.4-2ubuntu2.4) ...
Selecting previously unselected package squashfs-tools.
Preparing to unpack .../squashfs-tools_1%3a4.5-3build1_amd64.deb ...
Unpacking squashfs-

Unnamed: 0,text,label


Train multilingual BERT in 3 minutes


In [None]:
MODEL_NAME = "bert-base-multilingual-cased"
MAX_LENGTH = 128   # every review is truncated / padded to this many tokens
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Tokenize the "text" column of a batch.

    Each text is truncated and padded to exactly ``MAX_LENGTH`` tokens, so all
    encoded examples have the same length. Returns the tokenizer's encoding
    for the batch.
    """
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)

# Fail early with a clear message: an empty frame cannot be split or trained on.
if df.empty:
    raise ValueError(
        "`df` contains no reviews – re-run the scraping cell and check its "
        "URLs / CSS selectors before training."
    )

# preserve_index=False keeps the (non-contiguous, post-deduplication) pandas
# index from being carried along as an extra dataset column.
ds = Dataset.from_pandas(df, preserve_index=False).train_test_split(test_size=0.2, seed=42)
ds = ds.map(tokenize, batched=True)

# Two output labels: 0 = negative, 1 = positive.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

def compute_metrics(eval_pred):
    """Compute accuracy from the (logits, labels) pair the Trainer passes in.

    Returns ``{"accuracy": float}``; the Trainer reports it as ``eval_accuracy``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": float((preds == labels).mean())}

args = TrainingArguments(
    output_dir="bert-tn-sentiment",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=2,
    eval_strategy="epoch",        # current keyword name (formerly evaluation_strategy)
    save_strategy="no",
    report_to="none",             # do not send logs to external experiment trackers
)

# Without compute_metrics the evaluation result only contains the loss, so the
# "eval_accuracy" lookup below would raise KeyError.
trainer = Trainer(model=model, args=args,
                  train_dataset=ds["train"], eval_dataset=ds["test"],
                  compute_metrics=compute_metrics)
trainer.train()
print("Accuracy:", trainer.evaluate()["eval_accuracy"])

NameError: name 'AutoTokenizer' is not defined

In [2]:
!pip install -q --upgrade transformers datasets accelerate  # <- ensure fresh versions

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="bert-tn-sentiment",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=2,
    eval_strategy="epoch",        # <- NEW name (was evaluation_strategy)
    save_strategy="no",
    logging_steps=50,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    tokenizer=tokenizer
)

trainer.train()

NameError: name 'model' is not defined

SHAP explainability – top Arabic/French words


In [None]:
def predict_proba(texts):
    """Map raw review strings to class probabilities.

    SHAP calls this with batches of (masked) strings. The result is a NumPy
    array with one row per text and one column per label (0 = negative,
    1 = positive), obtained by a softmax over the model's logits.
    """
    enc = tokenizer([str(t) for t in texts], truncation=True, padding=True,
                    max_length=128, return_tensors="pt")
    # Move inputs to wherever the model lives (CPU or GPU).
    enc = {name: tensor.to(model.device) for name, tensor in enc.items()}
    model.eval()
    with torch.no_grad():
        logits = model(**enc).logits
    return torch.softmax(logits, dim=-1).cpu().numpy()

# The classification model itself cannot be called on strings, so SHAP is given
# a text -> probabilities function; the tokenizer acts as the text masker.
explainer = shap.Explainer(predict_proba, tokenizer)
# Explain at most 50 test reviews; fewer if the test split is smaller.
n_explain = min(50, len(ds["test"]))
sample = list(ds["test"].select(range(n_explain))["text"])
shap_values = explainer(sample)

# aggregate token-level scores
def clean_tok(t):
    """Return ``t`` without sub-word markers.

    Removes a ``##`` prefix at the very start of the token and every
    ``<w>`` / ``</w>`` tag anywhere in it; all other characters are kept.
    """
    marker_pattern = re.compile(r"^##|</?w>")
    stripped = marker_pattern.sub("", t)
    return stripped

# Sum the absolute SHAP contribution of every token towards label 1 (positive)
# across all explained reviews, keyed by the cleaned token text.
word_imp = {}
for s, tokens in zip(shap_values.values, shap_values.data):
    # s[:, 1] selects, for each token, its contribution to output column 1.
    for val, tok in zip(s[:,1], tokens):
        # Strip surrounding whitespace so the same word is not counted under
        # several keys that differ only by a leading/trailing space.
        tok = clean_tok(tok).strip()
        if not tok:
            continue
        # Keep only tokens containing at least one Arabic-script or ASCII letter
        # (drops punctuation, digits and other symbols).
        if re.search(r'[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF]', tok) or re.search(r'[a-zA-Z]', tok):
            word_imp[tok] = word_imp.get(tok, 0) + abs(val)

# dtype=float keeps the Series numeric even when no token qualified.
top = pd.Series(word_imp, dtype=float).sort_values(ascending=False).head(15)
top

Save artefacts & push


In [None]:
# Persist everything the Streamlit app and the repository need:
# the fine-tuned model, its tokenizer and the scraped dataset.
# os.makedirs replaces the shell `mkdir -p` so the cell does not depend on a
# POSIX shell being available.
os.makedirs("artifacts", exist_ok=True)
model.save_pretrained("artifacts/bert-tn-sentiment")
tokenizer.save_pretrained("artifacts/bert-tn-sentiment")
df.to_csv("artifacts/tunisia_reviews.csv", index=False)

# same git-push snippet as before – simply commit this new notebook and folder

Bonus – 15-line Streamlit app


In [None]:
import streamlit as st, torch, joblib, pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# NOTE(review): this is Streamlit script code – it is presumably meant to be
# saved to a .py file and started with `streamlit run`, not executed in the
# notebook kernel. Confirm how the app is deployed.
@st.cache_resource
def load():
    """Load the fine-tuned tokenizer and model from ``artifacts/bert-tn-sentiment``.

    Decorated with ``st.cache_resource`` so the pair is created once and
    re-used across Streamlit reruns. Returns ``(tokenizer, model)``.
    """
    tok = AutoTokenizer.from_pretrained("artifacts/bert-tn-sentiment")
    mdl = AutoModelForSequenceClassification.from_pretrained("artifacts/bert-tn-sentiment")
    return tok, mdl

tok, mdl = load()
txt = st.text_area("اكتب تعليقك بالعربية أو الفرنسية:", height=100)
if st.button("Predict"):
    if not txt.strip():
        # Nothing to classify: scoring an empty string would show a meaningless number.
        st.warning("Please enter a review first.")
    else:
        # max_length=128 matches the sequence length used during fine-tuning.
        inputs = tok(txt, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            # Index [0, 1]: first (only) input, probability of label 1 (positive).
            proba = torch.softmax(mdl(**inputs).logits, dim=-1)[0,1].item()
        st.metric("Positive probability", f"{proba:.2%}")

README snippet for this project
