# Reviews

In [1]:
import re
import html
from collections import defaultdict
from typing import Dict, List

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from tqdm import tqdm

from db import MongoDB
from models import User, BookRating

In [2]:
# Open the database connection; `mongo` is the handle passed to every
# model call and `get_collection` lookup in the cells below.
mongo = MongoDB().connect()
# Make sure the indexes declared by the User model exist before users are
# looked up or created.
# NOTE(review): which indexes these are is defined in models.User — not visible here.
User.ensure_indexes(mongo)

In [3]:
# Read every document from the "reviews_raw" collection (empty filter) and
# materialize the result into a list, so the processing loop iterates over an
# in-memory sequence with a known length.
reviews_raw_collection = mongo.get_collection("reviews_raw")
reviews_dict_list = list(reviews_raw_collection.find({}))

In [4]:
def clean_for_sentiment(text: str) -> str | None:
    try:
        if len(text) < 20 or detect(text) != "en":
            return None
    except LangDetectException:
        return None
    text = html.unescape(text)
    text = re.sub(r"http\S+|www\S+|@\S+|#\S+", " ", text)
    text = re.sub(r"[^a-zA-Z0-9\s.!?']", " ", text)
    text = re.sub(r"\s+", " ", text).strip().lower()
    return text if len(text) >= 5 else None

In [5]:
def clean_for_summarization(text: str) -> str | None:
    try:
        if len(text) < 20 or detect(text) != "en":
            return None
    except LangDetectException:
        return None
    text = html.unescape(text)
    text = re.sub(r"http\S+|www\S+", " ", text)
    text = re.sub(r"[^a-zA-Z0-9\s.,!?'\"]", " ", text)
    text = re.sub(r"\s+", " ", text).strip().lower()
    return text if len(text) >= 20 else None

In [6]:
# Accumulators filled by the processing loop below.
# One cleaned document per book, later inserted into "reviews_clean".
reviews: List[dict] = []
# username -> User, so each user is fetched or created at most once per run.
users_cache: Dict[str, User] = {}
# username -> ratings collected across all books, written to the DB afterwards.
pending_ratings: Dict[str, List[BookRating]] = defaultdict(list)

In [7]:
# Build one cleaned document per book and collect per-user ratings.
# NOTE: this loop appends to `reviews` and `pending_ratings`; re-run the cell
# that initializes them before re-running this one, or entries are duplicated.
for reviews_dict in tqdm(reviews_dict_list):
    book_id = reviews_dict["_id"]
    review_doc = {
        "_id": book_id,
        "sentiment_text": [],
        "summarization_text": [],
    }

    rating_sum = 0.0
    rating_count = 0

    # Only the review records are needed, not their keys; `or {}` also covers
    # a document whose "_reviews" field is stored as null.
    for reviews_data in (reviews_dict.get("_reviews") or {}).values():
        rating = reviews_data.get("rating")
        has_rating = isinstance(rating, (int, float))

        # --- accumulate featured rating ---
        if has_rating:
            rating_sum += float(rating)
            rating_count += 1

        # --- collect ratings to append for each user ---
        username = reviews_data.get("username")
        if username and has_rating:
            # get or create user (cached -> DB -> new)
            user = users_cache.get(username)
            if user is None:
                found = User.get_by_username(mongo, username)
                user = found if found else User.create(mongo, username)
                users_cache[username] = user

            pending_ratings[username].append(
                BookRating(book_id=str(book_id), rating=float(rating))
            )

        # --- clean texts ---
        # A missing, null or non-string review is treated as empty text so the
        # cleaners reject it instead of raising.
        raw_text = reviews_data.get("review")
        if not isinstance(raw_text, str):
            raw_text = ""

        sent_text = clean_for_sentiment(raw_text)
        if sent_text:
            review_doc["sentiment_text"].append(sent_text)

        summ_text = clean_for_summarization(raw_text)
        if summ_text:
            review_doc["summarization_text"].append(summ_text)

    # --- featured rating: mean of numeric ratings, None when there are none ---
    review_doc["featured_rating"] = round(rating_sum / rating_count, 2) if rating_count else None
    reviews.append(review_doc)

100%|██████████| 228/228 [01:41<00:00,  2.24it/s]


In [8]:
# --- append ratings to each user in one DB call per user ---
# Every username in `pending_ratings` was put in `users_cache` by the
# processing loop before its first rating was queued, so the lookup below
# cannot miss as long as both were filled by the same run.
for username, ratings_list in pending_ratings.items():
    user = users_cache[username]
    user.add_ratings(mongo, ratings_list)

In [9]:
# Persist the cleaned per-book documents into "reviews_clean".
# Each document reuses the `_id` of its raw source document.
# NOTE(review): because `_id` is reused, re-running this cell against a
# collection that already holds these documents will presumably fail with a
# duplicate-key error — confirm and clear the collection first if needed.
reviews_collection = mongo.get_collection("reviews_clean")
reviews_collection.insert_many(reviews)

InsertManyResult(['73f65a70-8825-5401-bcce-2313da0cc6f8', 'e940f1b3-d333-50b7-a2b6-b95603134438', '74ec4ed6-0c42-5425-86bf-51af8793a280', '8368087f-946e-5eb0-b630-a614124d550f', 'f5c32f05-2e3c-5eac-baec-7504adf01a18', 'ec3cd213-bfdb-5356-ac43-15483385b247', 'ae52e56a-0ee4-506d-94f5-6d51d55ff5e3', 'a8664b84-fe0d-5741-8e2f-3e6f5260b871', 'f516548a-b783-5e8d-89f5-80dee74ad7b7', '71400d60-1f3b-5134-808c-bc00234b128c', 'aab400eb-5caa-55c2-b5d6-5376888a4c03', '6be338b2-4f47-5437-9ffa-0b92e5cb8fbf', '0fbce2e5-73d8-53c8-aa32-c3ea066c0d51', '0dc7fa81-0b51-539e-a16a-8ff440e0511d', '5f627705-6291-523e-be78-13220dc0e91e', '7374f7d8-bdc4-5705-b393-cf6741e29a65', 'ba01d28e-b0f1-53a7-b8d2-9d808d3a75c9', '30da9c11-f736-5f41-9166-fc65f663e2bb', '7eec2f86-f544-569f-94ec-17a001daa5f1', '3d9a1a52-057b-5f01-940b-5115bb7d5a1f', '75391d1b-b2d3-50cc-88c5-9e345d654a41', 'a94cd22b-04f2-58ea-8248-0b3ecceebbf6', 'a934a030-087d-51c6-9b9d-2ca76fe92349', '34cb6464-45cc-56e3-8f1d-42e5e5c734af', '25627f1c-e442-5f39-98

---