1. Get data
2. EDA (exploratory data analysis)
3. Prepare data (feature engineering)
4. Select, train, and fine-tune the model

In [24]:
import os
import pandas as pd
from pymongo import MongoClient
from typing import Dict, List, Any, Optional , Tuple
import json
from datetime import datetime


In [2]:
import math
import re
from typing import Dict, List, Any

In [70]:

# Configuration
# The connection string can be overridden through the MONGO_URI environment
# variable; the database and collection names are fixed defaults.
MONGO_URI = os.getenv("MONGO_URI", "mongodb://localhost:27017")
DB_NAME = "digikala"
PRODUCTS_COLLECTION = "products"


class ProductDataReader:
    """
    Read product documents from a MongoDB collection and turn them into a
    pandas DataFrame, optionally normalising the nested specification table
    into a flat set of ASCII-only feature columns.
    """

    # Persian (U+06F0..U+06F9) and Arabic-Indic (U+0660..U+0669) digits -> ASCII.
    _DIGIT_TABLE = {
        **{0x06F0 + i: str(i) for i in range(10)},
        **{0x0660 + i: str(i) for i in range(10)},
    }
    # Arabic yeh/kaf -> Persian yeh/kaf, zero-width non-joiner -> plain space.
    # Used so that spelling variants of the same Persian word compare equal.
    _FOLD_TABLE = {0x064A: "\u06cc", 0x0643: "\u06a9", 0x200C: " "}

    def __init__(self, mongo_uri: str = MONGO_URI, db_name: str = DB_NAME):
        """
        Initialize the MongoDB connection.

        Args:
            mongo_uri: MongoDB connection string
            db_name: Database name
        """
        self.client = MongoClient(mongo_uri)
        self.db = self.client[db_name]
        self.products_collection = self.db[PRODUCTS_COLLECTION]

    def get_collection_info(self) -> Dict[str, Any]:
        """
        Get basic information about the products collection.

        The database and collection names are read from the live handles, so
        they stay correct when the reader was created with a non-default
        ``db_name``.

        Returns:
            Dictionary containing collection statistics, or an empty dict if
            the query fails (the error is printed).
        """
        try:
            return {
                "total_documents": self.products_collection.count_documents({}),
                "sample_document": self.products_collection.find_one(),
                "collection_name": self.products_collection.name,
                "database_name": self.db.name,
            }
        except Exception as e:
            print(f"Error getting collection info: {e}")
            return {}

    def get_specifications(self, spec_groups: List[Dict[str, Any]]) -> Dict[str, str]:
        """
        Normalise a product's specification table into flat ASCII features.

        Args:
            spec_groups: List of groups, each a dict with an ``"attributes"``
                list of ``{"title": ..., "values": [...]}`` dicts. Anything
                that does not have this shape is skipped.

        Returns:
            Dictionary with a fixed set of keys (``region``, ``os``,
            ``display_resolution``, ...). Every value is a stripped ASCII
            string; fields that could not be extracted are ``""``, and the
            yes/no style fields default to ``"no"``.
        """
        digit_table = ProductDataReader._DIGIT_TABLE
        fold_table = ProductDataReader._FOLD_TABLE

        # ---------- text helpers ----------
        def to_ascii(s: Any) -> str:
            """Convert digits to ASCII and drop every non-printable-ASCII char."""
            if s is None:
                return ""
            s = str(s).translate(digit_table)
            # Line breaks/tabs become a space so neighbouring tokens are not
            # glued together ("back\r\nIP68" must not become "backIP68").
            s = re.sub(r"[\t\r\n]+", " ", s)
            return "".join(ch for ch in s if 32 <= ord(ch) <= 126)

        def fold(s: Any) -> str:
            """Lower-case, unify Persian spelling variants, collapse whitespace."""
            if s is None:
                return ""
            return " ".join(str(s).translate(fold_table).split()).lower()

        def times_to_x(s: Any) -> str:
            """Replace the multiplication sign with 'x' before ASCII stripping."""
            return "" if s is None else str(s).replace("\u00d7", "x")

        def join_vals(v: Any) -> str:
            if not v:
                return ""
            if isinstance(v, (list, tuple)):
                return ", ".join(str(x) for x in v if x is not None)
            return str(v)

        # ---------- extractors ----------
        def first_number(text: Any) -> str:
            m = re.search(r"(\d+(?:\.\d+)?)", to_ascii(text))
            return m.group(1) if m else ""

        def extract_year(text: Any) -> str:
            m = re.search(r"\b(20\d{2}|19\d{2})\b", to_ascii(text))
            return m.group(1) if m else ""

        def extract_size_3nums_mm(text: Any) -> str:
            # The multiplication sign is mapped to 'x' first; otherwise
            # "7.6x58.6" written with that sign and no spaces would collapse
            # into "7.658.6" once non-ASCII characters are removed.
            nums = re.findall(r"\d+(?:\.\d+)?", to_ascii(times_to_x(text)))
            return "x".join(nums)

        def extract_resolution(text: Any) -> str:
            t = to_ascii(times_to_x(text)).lower()
            m = re.search(r"(\d{3,5})\s*[x*]\s*(\d{3,5})", t)
            return f"{m.group(1)}x{m.group(2)}" if m else ""

        def extract_inch(text: Any) -> str:
            t = to_ascii(text).lower()
            m = re.search(r"(\d+(?:\.\d+)?)\s*(inch|in|'|\")", t)
            if m:
                return m.group(1)
            m = re.search(r"\b(\d+(?:\.\d+)?)\b", t)
            return m.group(1) if m else ""

        def aspect_ratio_from_res(res: str) -> str:
            parts = res.split("x")
            if len(parts) != 2:
                return ""
            try:
                w, h = int(parts[0]), int(parts[1])
            except ValueError:
                return ""
            g = math.gcd(w, h) or 1
            # The smaller side is always written first (e.g. "9:20").
            short, long_ = sorted((w // g, h // g))
            return f"{short}:{long_}"

        def ppi_from(res: str, inch: str) -> str:
            try:
                if "x" not in res:
                    return ""
                w, h = [int(x) for x in res.split("x")]
                d = float(inch)
            except (ValueError, TypeError):
                return ""
            if d <= 0:
                return ""
            return str(int(round(math.sqrt(w * w + h * h) / d)))

        def has_ip_rating(ascii_text: str) -> bool:
            return re.search(r"\bip\s*([5-9]\d)\b", ascii_text, flags=re.I) is not None

        def map_category(value: Any) -> str:
            """Map the Persian/English market segment to high/mid/low."""
            folded = fold(value)
            # Spaces are removed so "two words", ZWNJ-joined and fully joined
            # spellings all reduce to the same string.
            compact = folded.replace(" ", "")
            if "پرچم" in compact:
                return "high"
            if "میانرده" in compact:
                return "mid"
            if "پایینرده" in compact:
                return "low"
            # English fallbacks
            t = to_ascii(folded)
            if "flagship" in t:
                return "high"
            if "mid" in t:
                return "mid"
            if "low" in t or "entry" in t:
                return "low"
            return ""

        def sim_count_from(text: Any) -> str:
            """Return the SIM count from digits, Persian words or English words."""
            raw = fold(text)
            num = first_number(raw)
            if num:
                return num
            fa_words = (("یک", "1"), ("تک", "1"), ("دو", "2"), ("سه", "3"), ("چهار", "4"))
            for word, count in fa_words:
                if word in raw:
                    return count
            en_words = (("single", "1"), ("dual", "2"), ("triple", "3"), ("quad", "4"))
            ascii_raw = to_ascii(raw)
            for word, count in en_words:
                if word in ascii_raw:
                    return count
            return ""

        def gb_or_number(text: Any) -> str:
            """Prefer an explicit "<n> GB"; otherwise fall back to the first number."""
            m = re.search(r"\b(\d{1,4})\s*(gb|gib|gigabyte|gigabytes)\b", to_ascii(text).lower())
            return m.group(1) if m else first_number(text)

        def networks_from(text: Any) -> str:
            t = to_ascii(text).lower()
            nets = []
            if "5g" in t:
                nets.append("5G")
            if "4g" in t or "lte" in t:
                nets.append("4G")
            if "3g" in t:
                nets.append("3G")
            if "2g" in t:
                nets.append("2G")
            return ", ".join(nets) if nets else "no"

        # ---------- flatten spec table ----------
        # Malformed groups/attributes are skipped individually instead of
        # aborting the whole table; the first value seen for a title wins.
        flat: Dict[str, str] = {}
        for group in spec_groups if isinstance(spec_groups, list) else []:
            if not isinstance(group, dict):
                continue
            attrs = group.get("attributes")
            if not isinstance(attrs, (list, tuple)):
                continue
            for attr in attrs:
                if not isinstance(attr, dict):
                    continue
                title = str(attr.get("title", "")).strip()
                value = join_vals(attr.get("values", []))
                if title and value and title not in flat:
                    flat[title] = value

        folded_flat = [(fold(title), value) for title, value in flat.items()]

        def vby(keys: List[str]) -> str:
            """Return the value of the first attribute whose title is in *keys*.

            Titles are compared after ``fold``, so ZWNJ/space and Arabic/Persian
            letter variants of a key do not need to be listed separately.
            """
            wanted = {fold(k) for k in keys}
            for title, value in folded_flat:
                if title in wanted:
                    return value
            return ""

        # ASCII view of every value, used for whole-table keyword searches.
        any_text = to_ascii(" ".join(flat.values())).lower()

        # ---------- outputs ----------
        out = {
            "region": "",
            "os": "",
            "introduce_date": "",
            "category": "",
            "model": "",
            "size": "",
            "weight": "",
            "simcard_numbers": "",
            "anti_water": "no",
            "display_technology": "",
            "refresh_rate": "",
            "screen_brightness": "",
            "size_screen_inch": "",
            "display_to_body_ratio": "",
            "aspect_ratio": "",
            "display_resolution": "",
            "pixel_per_inch": "",
            "cpu_model": "",
            "gpu_model": "",
            "storage_gb": "",
            "ram_gb": "",
            "sd_card_support": "no",
            "internet": "no",
            "camera_num": "",
            "camera_resolution_mp": "",
            "camera_num_features": "",
            "video": "",
            "selfie_resolution_mp": "",
            "selfie_features": "no",
            "battery_power_mah": "",
            "fast_charge": "no",
        }

        # simple direct
        out["model"] = to_ascii(vby(["مدل", "model"]))
        out["size"] = extract_size_3nums_mm(vby(["ابعاد", "size", "dimension", "dimensions"]))
        out["weight"] = first_number(vby(["وزن", "weight"]))

        # category mapping (high/mid/low)
        out["category"] = map_category(vby(["دسته بندی", "category"]))

        # region
        out["region"] = to_ascii(vby(["region", "منطقه", "منطقه عرضه", "ریجن"]))

        # os: explicit field first, then a keyword search over the whole table
        os_val = to_ascii(vby(["سیستم عامل", "os", "operating system"])).strip()
        if not os_val:
            if "android" in any_text:
                os_val = "Android"
            elif "ios" in any_text or "i os" in any_text:
                os_val = "iOS"
        out["os"] = os_val

        # introduce year
        out["introduce_date"] = extract_year(
            vby(["تاریخ معرفی", "زمان معرفی", "introduce date", "introduction date"])
        )

        # sim count (Persian aware)
        out["simcard_numbers"] = sim_count_from(
            vby(["تعداد سیم کارت", "سیم کارت", "sim count", "sim cards"])
        )

        # anti-water: any IP5x-IP9x code anywhere in the table
        out["anti_water"] = "yes" if has_ip_rating(any_text) else "no"

        # display
        out["display_technology"] = to_ascii(
            vby(["فناوری صفحه نمایش", "display technology", "panel", "فناوری نمایش"])
        )
        out["refresh_rate"] = first_number(
            vby(["نرخ به روزرسانی تصویر", "نرخ بروزرسانی", "refresh rate"])
        )
        out["screen_brightness"] = first_number(vby(["روشنایی صفحه نمایش", "روشنایی", "brightness"]))
        out["size_screen_inch"] = extract_inch(
            vby(["اندازه", "اندازه صفحه", "اندازه صفحه نمایش", "display size"])
        )
        out["display_to_body_ratio"] = first_number(
            vby(["نسبت صفحه نمایش به بدنه", "نسبت نمایشگر به بدنه",
                 "screen-to-body ratio", "display to body ratio"])
        )

        out["display_resolution"] = extract_resolution(
            vby(["رزولوشن صفحه نمایش", "رزولوشن", "resolution"])
        )
        # Explicit values win; otherwise derive from resolution / diagonal.
        out["aspect_ratio"] = (
            to_ascii(vby(["نسبت تصویر", "aspect ratio"])).strip()
            or aspect_ratio_from_res(out["display_resolution"])
        )
        out["pixel_per_inch"] = (
            first_number(vby(["تراکم پیکسلی", "ppi", "pixel density"]))
            or ppi_from(out["display_resolution"], out["size_screen_inch"])
        )

        # CPU/GPU
        out["cpu_model"] = to_ascii(vby(["تراشه", "چیپست", "chipset", "soc"]))
        out["gpu_model"] = to_ascii(vby(["پردازنده گرافیکی", "gpu"]))

        # Memory
        out["storage_gb"] = gb_or_number(vby(["حافظه داخلی", "storage", "internal storage"]))
        out["ram_gb"] = gb_or_number(vby(["مقدار RAM", "ram", "حافظه رم"]))

        # SD support: the value must mention (micro)SD and must not be negated.
        # Negation is checked on whole English words (so "nano" is not "no")
        # and on the Persian text itself, which the ASCII view cannot contain.
        sd_txt = vby(["پشتیبانی از کارت حافظه", "sd", "microsd", "کارت حافظه"])
        sd_ascii = to_ascii(sd_txt).lower()
        sd_folded = fold(sd_txt)
        sd_negated = (
            re.search(r"\b(no|none|not|without)\b", sd_ascii) is not None
            or "faqd" in sd_ascii
            or any(word in sd_folded for word in ("فاقد", "ندارد", "بدون"))
        )
        out["sd_card_support"] = "yes" if "sd" in sd_ascii and not sd_negated else "no"

        # Networks
        out["internet"] = networks_from(" ".join([
            vby(["شبکه های مخابراتی", "network", "networks"]),
            vby(["شبکه های ارتباطی قابل پشتیبانی", "communication networks"]),
        ]))

        # Cameras
        out["camera_num"] = first_number(
            vby(["تعداد دوربین های پشت گوشی", "rear cameras", "number of rear cameras"])
        )
        out["camera_resolution_mp"] = first_number(
            vby(["رزولوشن دوربین اصلی", "دوربین اصلی", "main camera resolution"])
        )
        cam_feat_txt = vby(["مشخصات دوربین اصلی", "ویژگی های دوربین", "camera features"])
        if cam_feat_txt:
            # Split the raw text (Latin/Persian comma, pipe, hyphen, line break)
            # so Persian-only features and Persian commas are counted too.
            features = [p for p in re.split(r"[,\u060c|\-\r\n]", cam_feat_txt) if p.strip()]
            out["camera_num_features"] = str(len(features)) if features else ""
        out["video"] = to_ascii(
            vby(["سایر مشخصات فیلمبرداری", "کیفیت فیلمبرداری", "video", "video recording"])
        )

        out["selfie_resolution_mp"] = first_number(
            vby(["رزولوشن دوربین سلفی", "selfie camera resolution"])
        )
        selfie_feat = vby(["مشخصات دوربین سلفی", "selfie features"])
        out["selfie_features"] = "yes" if selfie_feat.strip() else "no"

        # Battery / charging
        battery_specs = vby(["مشخصات باتری"])
        out["battery_power_mah"] = first_number(
            vby(["ظرفیت باتری", "battery capacity"]) or battery_specs
        )
        charge_specs = " ".join([
            battery_specs,
            vby(["قابلیت های شارژ", "قابلیت های شارژر", "شارژ"]),
        ])
        charge_ascii = to_ascii(charge_specs).lower()
        charge_folded = fold(charge_specs)
        has_fast = (
            any(k in charge_ascii for k in ("fast charge", "quick charge", "super fast"))
            or any(k in charge_folded for k in ("فست شارژ", "شارژ سریع"))
        )
        out["fast_charge"] = "yes" if has_fast else "no"

        # final ASCII clean + trim of every field
        return {key: to_ascii(value).strip() for key, value in out.items()}

    def process_colors(self, colors: List[str]) -> str:
        """
        Process colors list into a comma-separated string.

        Args:
            colors: List of color strings (``None`` items are skipped; a bare
                string is returned unchanged)

        Returns:
            Comma-separated string of colors
        """
        if not colors:
            return ""
        if isinstance(colors, str):
            return colors
        return ", ".join(str(c) for c in colors if c is not None)

    def process_images(self, images: List[str]) -> str:
        """
        Process images list into a comma-separated string.

        Args:
            images: List of image URLs (``None`` items are skipped; a bare
                string is returned unchanged)

        Returns:
            Comma-separated string of image URLs
        """
        if not images:
            return ""
        if isinstance(images, str):
            return images
        return ", ".join(str(url) for url in images if url is not None)

    def process_suggestions(self, suggestions: Dict) -> str:
        """
        Process suggestions dictionary into a string representation.

        Args:
            suggestions: Suggestions dictionary

        Returns:
            JSON string of the suggestions (non-ASCII text kept as-is), or
            ``""`` when there are none
        """
        if not suggestions:
            return ""
        return json.dumps(suggestions, ensure_ascii=False)

    def process_comments_overview(self, comments_overview: Dict) -> Dict[str, str]:
        """
        Process comments overview into separate fields.

        Args:
            comments_overview: Comments overview dictionary

        Returns:
            Dictionary with the ``overview``, ``advantages`` and
            ``disadvantages`` fields (``""`` for missing keys)
        """
        fields = ("overview", "advantages", "disadvantages")
        if not comments_overview:
            return {name: "" for name in fields}
        return {name: comments_overview.get(name, "") for name in fields}

    def read_products_to_dataframe(self,
                                 limit: Optional[int] = None,
                                 filter_query: Optional[Dict] = None,
                                 include_specifications: bool = True) -> pd.DataFrame:
        """
        Read product data from MongoDB and convert to pandas DataFrame.

        Args:
            limit: Maximum number of documents to retrieve (None for all)
            filter_query: MongoDB filter query to apply
            include_specifications: Whether to add the normalised
                specification fields from ``get_specifications`` as columns.
                A specification field whose name is already used by a basic
                column (currently ``category``) is stored as ``spec_<name>``
                so the product-level value is not overwritten.

        Returns:
            pandas DataFrame containing product data; an empty DataFrame when
            nothing matches or when reading fails (the error is printed)
        """
        try:
            cursor = self.products_collection.find(filter_query or {})
            if limit:
                cursor = cursor.limit(limit)

            documents = list(cursor)
            if not documents:
                print("No documents found matching the criteria.")
                return pd.DataFrame()

            print(f"Retrieved {len(documents)} documents from MongoDB.")

            basic_fields = (
                "_id", "title_en", "title_fa", "brand", "category", "price",
                "rate", "count_raters", "popularity", "num_questions", "num_comments",
            )
            processed_docs = []
            for doc in documents:
                row = {field: doc.get(field) for field in basic_fields}

                # Complex fields are serialised to plain strings
                row["colors"] = self.process_colors(doc.get("colors", []))
                row["images"] = self.process_images(doc.get("images", []))
                row["suggestions"] = self.process_suggestions(doc.get("suggestions", {}))
                row.update(self.process_comments_overview(doc.get("comments_overview", {})))

                if include_specifications:
                    specs = self.get_specifications(doc.get("specifications") or [])
                    for key, value in specs.items():
                        column = f"spec_{key}" if key in row else key
                        row[column] = value

                processed_docs.append(row)

            df = pd.DataFrame(processed_docs)

            # Convert numeric columns; unparsable values become NaN
            numeric_columns = ["price", "rate", "count_raters", "popularity", "num_questions", "num_comments"]
            for col in numeric_columns:
                if col in df.columns:
                    df[col] = pd.to_numeric(df[col], errors='coerce')

            print(f"Created DataFrame with shape: {df.shape}")
            print(f"Columns: {list(df.columns)}")

            return df

        except Exception as e:
            print(f"Error reading data from MongoDB: {e}")
            return pd.DataFrame()

    def get_data_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Get summary statistics of the DataFrame.

        Args:
            df: pandas DataFrame

        Returns:
            Dictionary containing shape, columns, missing-value counts, dtypes,
            ``describe()`` output for numeric data and, for each object-typed
            column, its number of unique values and most common values.
            ``{"error": "DataFrame is empty"}`` for an empty frame.
        """
        if df.empty:
            return {"error": "DataFrame is empty"}

        has_numeric = not df.select_dtypes(include=['number']).empty
        summary = {
            "shape": df.shape,
            "columns": list(df.columns),
            "missing_values": df.isnull().sum().to_dict(),
            "data_types": df.dtypes.to_dict(),
            "numeric_summary": df.describe().to_dict() if has_numeric else {},
            "categorical_summary": {}
        }

        # Categorical columns summary
        for col in df.select_dtypes(include=['object']).columns:
            summary["categorical_summary"][col] = {
                "unique_values": df[col].nunique(),
                "most_common": df[col].value_counts().head().to_dict()
            }

        return summary

    def close_connection(self):
        """Close MongoDB connection."""
        if self.client is not None:
            self.client.close()
            print("MongoDB connection closed.")


def main():
    """
    Example usage of the ProductDataReader class.

    Connects with the default settings, prints basic collection statistics,
    then prints the stored and the normalised specifications of five randomly
    sampled products. Any error is printed, and the connection is always
    closed at the end.
    """
    sample_size = 5

    # Initialize reader
    reader = ProductDataReader()

    try:
        # Get collection info
        print("=== Collection Information ===")
        info = reader.get_collection_info()
        print(f"Total documents: {info.get('total_documents', 'Unknown')}")
        print(f"Database: {info.get('database_name', 'Unknown')}")
        print(f"Collection: {info.get('collection_name', 'Unknown')}")

        print("=== 5 Random Sample Specifications ===")
        # $sample picks random documents; $project keeps only the fields shown.
        pipeline = [
            {"$sample": {"size": sample_size}},
            {"$project": {"_id": 1, "title_en": 1, "title_fa": 1, "specifications": 1}},
        ]
        samples = reader.products_collection.aggregate(pipeline)
        for index, doc in enumerate(samples, 1):
            raw_specs = doc.get("specifications", [])

            print(f"\n--- Sample #{index} ---")
            print(f"_id: {doc.get('_id')}")
            print(f"title_en: {doc.get('title_en') or ''}")
            print(f"title_fa: {doc.get('title_fa') or ''}")

            # Original specification data (as stored)
            print("Original specifications:")
            print(json.dumps(raw_specs, indent=2, ensure_ascii=False))

            # Normalized view produced by the reader
            print("Normalized specifications:")
            print(json.dumps(reader.get_specifications(raw_specs), indent=2, ensure_ascii=True))

        # Optional follow-up, kept disabled: load the products into a
        # DataFrame, print a summary and export it to CSV.
        # df = reader.read_products_to_dataframe(limit=1000)  # Limit for testing
        # if not df.empty:
        #     print(f"DataFrame shape: {df.shape}")
        #     print(f"Columns: {list(df.columns)}")
        #     print("=== First 5 Rows ===")
        #     print(df.head())
        #     print("=== Data Summary ===")
        #     summary = reader.get_data_summary(df)
        #     print("Missing values per column:")
        #     for col, missing in summary["missing_values"].items():
        #         if missing > 0:
        #             print(f"  {col}: {missing}")
        #     output_file = f"digikala_products_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        #     df.to_csv(output_file, index=False, encoding='utf-8')
        #     print(f"Data saved to: {output_file}")
        # else:
        #     print("No data retrieved.")

    except Exception as e:
        print(f"Error: {e}")

    finally:
        # Close connection
        reader.close_connection()


# Run the example only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

=== Collection Information ===
Total documents: 2353
Database: digikala
Collection: products
=== 5 Random Sample Specifications ===

--- Sample #1 ---
_id: 252852
title_en: Apple iPhone SE 32GB Mobile Phone
title_fa: گوشی موبایل اپل مدل iPhone SE ظرفیت 32 گیگابایت
Original specifications:
[
  {
    "title": "مشخصات کلی",
    "attributes": [
      {
        "title": "ابعاد",
        "values": [
          "7.6 × 58.6 × 123.8 میلی‌متر"
        ]
      },
      {
        "title": "وزن",
        "values": [
          "113 گرم"
        ]
      },
      {
        "title": "توضیحات بدنه",
        "values": [
          "قاب پشتی کاملا فلزی از جنس آلومینیوم\r\nمجهز به حس‌گر اثر انگشت (Fingerprint Sensor) "
        ]
      },
      {
        "title": "تعداد سیم کارت",
        "values": [
          "یک عدد "
        ]
      },
      {
        "title": "نوع سیم کارت",
        "values": [
          "سایز نانو (8.8 × 12.3 میلی‌متر) "
        ]
      }
    ]
  },
  {
    "title": "صفحه نمایش",
    "at