In [12]:
from pathlib import Path

import pandas as pd

In [None]:
# Load the saved Smartprix listing page as text.
# A forward slash is used on purpose: the previous 'scraped\smartprix_phones.html'
# depended on "\s" being an unrecognised escape sequence (which newer Python
# versions warn about) and could only resolve on Windows. Forward slashes are
# accepted by open() on Windows, Linux and macOS alike.
with open('scraped/smartprix_phones.html', 'r', encoding='utf-8') as f:
    html = f.read()

In [14]:
from bs4 import BeautifulSoup

In [15]:
# Parse the page text loaded earlier into a searchable tree with the lxml parser.
soup = BeautifulSoup(markup=html, features='lxml')

In [16]:
# Collect the product cards. A multi-class string is compared against the
# exact value of the class attribute (per the BeautifulSoup docs), so only
# <div> cards carrying precisely these four classes in this order are returned.
containers = soup.find_all(
    'div',
    class_='sm-product has-tag has-features has-actions',
)

In [22]:
def find_spec(specs, *needles):
    """Return the first spec line that contains every needle, ignoring case.

    Args:
        specs: Iterable of spec strings (one per ``<li>`` of a product card).
        *needles: Substrings that must all occur in the same spec line.
            Matching is case-insensitive on both sides.

    Returns:
        The first matching spec, unchanged, or ``None`` when nothing matches.
        When called without needles, the first spec (if any) is returned.
    """
    # Lower-case the needles once up front. Previously only the spec side was
    # lower-cased, so a needle containing an upper-case letter could never match.
    wanted = [needle.lower() for needle in needles]
    for spec in specs:
        haystack = spec.lower()
        if all(needle in haystack for needle in wanted):
            return spec
    return None

# Keyword groups for each spec field. Groups are tried top to bottom and the
# first spec line containing every needle of a group is used, so the order
# encodes priority (specific chipset names before generic clock-speed mentions).
CONNECTIVITY_NEEDLES = (
    ("5g",), ("4g",), ("3g",), ("volte",), ("wi-fi",), ("wifi",), ("sim",),
)
PROCESSOR_NEEDLES = (
    ("dimensity",),   # MediaTek Dimensity
    ("snapdragon",),  # Qualcomm Snapdragon
    ("helio",),       # MediaTek Helio
    ("unisoc",),      # Unisoc
    ("spreadtrum",),  # Spreadtrum
    ("apple",),       # Apple A-series (for iPhones)
    ("bionic",),
    ("exynos",),      # Exynos (Samsung)
    ("tensor",),      # Google Tensor
    ("processor",),   # generic processor mentions from here on
    ("ghz",),
    ("mhz",),
    ("khz",),
)
BATTERY_NEEDLES = (("battery",), ("mah",))
DISPLAY_NEEDLES = (("inches",), ("display",))
CAMERA_NEEDLES = (("camera",), ("mp",))
OS_NEEDLES = (("android",), ("ios",), ("symbian",), ("windows", "phone"))


def _first_spec(specs, needle_groups):
    """Return the first spec matched by any needle group, trying groups in order."""
    for needles in needle_groups:
        hit = find_spec(specs, *needles)
        if hit:
            return hit
    return None


def _text_or_none(node):
    """Return the stripped text of a tag, or None when the tag was not found."""
    return node.get_text(strip=True) if node else None


def _parse_card(card):
    """Turn one product card into a flat dict of raw, unparsed spec strings."""
    # All <li> specs of the card as a list of strings.
    specs = [li.get_text(strip=True) for li in card.select('ul.sm-feat.specs li')]

    return {
        # Basics
        "model":   _text_or_none(card.h2),
        "price":   _text_or_none(card.select_one('span.price')),
        # Rating: any div.score whose class attribute contains "rank-".
        "rating":  _text_or_none(card.select_one('div.score[class*="rank-"] b')),

        # Connectivity - full connectivity spec including SIM and network
        "connectivity": _first_spec(specs, CONNECTIVITY_NEEDLES),

        # Processor - named chipsets first, generic mentions last
        "processor": _first_spec(specs, PROCESSOR_NEEDLES),

        # NOTE(review): in the recorded sample output "ram" and "storage" both
        # resolve to the same combined line ('8 GB RAM, 128 GB inbuilt').
        # Values are kept as-is so the raw CSV does not change; split them
        # here or downstream if separate figures are required.
        "ram":     find_spec(specs, "ram"),
        "storage": find_spec(specs, "inbuilt"),

        "battery": _first_spec(specs, BATTERY_NEEDLES),
        "display": _first_spec(specs, DISPLAY_NEEDLES),

        # Camera - rear and front cameras
        "camera":  _first_spec(specs, CAMERA_NEEDLES),

        # OS - named systems first, then fall back to a version-looking line
        # (starts with a lower-case "v" and contains at least one digit).
        "os": _first_spec(specs, OS_NEEDLES) or next(
            (s for s in specs if s.startswith("v") and any(ch.isdigit() for ch in s)),
            None,
        ),

        # Memory Card Support
        "memory_card": find_spec(specs, "memory", "card"),
    }


phones = [_parse_card(card) for card in containers]

# Example: print first parsed phone
print(f"Total phones extracted: {len(phones)}")
if not phones:
    # Stop with a clear message instead of an IndexError on phones[0].
    raise ValueError(
        "No product cards were parsed - check the saved HTML and the card selector."
    )
print("\nSample phone (first entry):")
print(phones[0])


Total phones extracted: 1007

Sample phone (first entry):
{'model': 'BlackZone S25 Ultra Maxx', 'price': '₹8,999', 'rating': '63', 'connectivity': 'Dual Sim, 3G, 4G, VoLTE, Wi-Fi', 'processor': 'Spreadtrum, Octa Core, 1.6 GHz Processor', 'ram': '8 GB RAM, 128 GB inbuilt', 'storage': '8 GB RAM, 128 GB inbuilt', 'battery': '5000 mAh Battery', 'display': '6.6 inches, 720 x 1612 px Display', 'camera': '32 MP Rear & 16 MP Front Camera', 'os': 'Android v14', 'memory_card': None}


In [None]:
# Write the raw scrape to disk. The parent folder is created first because
# to_csv does not create missing directories, which would break a run on a
# fresh checkout.
out_path = Path("data/raw/smartphones.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
# The utf-8-sig codec prepends a BOM to the written file.
pd.DataFrame(phones).to_csv(out_path, index=False, encoding="utf-8-sig")