In [4]:
"""
LALAFO AUTOMOTIVE DATA EXTRACTION SCRIPT
========================================

Purpose:
--------
Extracts car listings from Lalafo API, enriches them with detailed
metadata (parameters + SEO information), and exports brand-specific
Excel datasets.

This module performs:
- API communication
- Pagination
- Parameter extraction via ID mapping
- Data aggregation
- Excel export

No data cleaning or normalization is performed here.
"""

import requests
from datetime import datetime
import pandas as pd
from pandas import ExcelWriter


# ============================================================
# API CONFIGURATION
# ============================================================

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept": "application/json, text/plain, */*",
    "device": "pc"
}

BASE_SEARCH_URL = "https://lalafo.kg/api/search/v3/feed/search"
BASE_DETAILS_URL = "https://lalafo.kg/api/search/v3/feed/details"
BASE_SEO_URL = "https://lalafo.kg/api/seo/v3/metas/details"


# ============================================================
# API REQUEST FUNCTIONS
# ============================================================

def get_json(params):
    """Fetch paginated listing results."""
    response = requests.get(BASE_SEARCH_URL, headers=HEADERS, params=params)
    return response.json()


def get_auto_param_json(ad_id):
    """Fetch detailed parameters for a listing."""
    response = requests.get(
        f"{BASE_DETAILS_URL}/{ad_id}",
        headers=HEADERS,
        params={"expand": "url"}
    )
    return response.json()


def get_auto_title_json(ad_id):
    """Fetch SEO metadata for a listing."""
    response = requests.get(
        BASE_SEO_URL,
        headers=HEADERS,
        params={"ad_id": ad_id}
    )
    return response.json()


# ============================================================
# PARAMETER EXTRACTION
# ============================================================

def get_param_value(auto_params, param_id):
    """Extract parameter value by parameter ID."""
    for i in range(16):
        try:
            if auto_params[i]['id'] == param_id:
                return auto_params[i]['value']
        except (IndexError, KeyError):
            print("Index error")
            return None


# ============================================================
# DATA TRANSFORMATION
# ============================================================

def get_data_from_json(json_file, page):
    """
    Transform raw JSON response into structured dictionary objects.
    """
    domen = "https://lalafo.kg"

    result = []
    counter = 0

    for d in json_file["items"]:
        try:
            post_id = d["id"]
            created_time = d["created_time"]
            phone = d["mobile"]
            price = d["price"]
            url_goods = d["url"]

            vip_post = d["is_vip"]
            city = d["city"]

            try:
                nameseller = d["user"]["username"]
            except:
                nameseller = ""

            json_auto_params = get_auto_param_json(post_id)
            json_auto_title = get_auto_title_json(post_id)

            brand = json_auto_title["h1"]
            title = json_auto_title["title"]

            if json_auto_params["description"] == "":
                description = json_auto_title["title"].split("➤")[0]
            else:
                description = json_auto_params["description"]

            if "images" in json_auto_params and json_auto_params["images"]:
                image = json_auto_params["images"][0]["original_url"]
            else:
                image = ""

            # Parameter mapping (IDs preserved exactly)
            model = get_param_value(json_auto_params["params"], 49)
            condition = get_param_value(json_auto_params["params"], 29)
            year = get_param_value(json_auto_params["params"], 62)
            run_km = get_param_value(json_auto_params["params"], 56)
            fuel_type = get_param_value(json_auto_params["params"], 65)
            body_type = get_param_value(json_auto_params["params"], 63)
            transmission_type = get_param_value(json_auto_params["params"], 64)
            wheel_drive = get_param_value(json_auto_params["params"], 244)
            wheel_side = get_param_value(json_auto_params["params"], 106)
            color = get_param_value(json_auto_params["params"], 105)
            engine_capacity = get_param_value(json_auto_params["params"], 66)
            vin_code = get_param_value(json_auto_params["params"], 1156)
            tech_condition = get_param_value(json_auto_params["params"], 1155)
            clearance_rastamojka = get_param_value(json_auto_params["params"], 1157)
            in_stock = get_param_value(json_auto_params["params"], 242)
            payment = get_param_value(json_auto_params["params"], 1154)

            result.append({
                "post_id": post_id,
                "created_time": datetime.fromtimestamp(created_time).strftime("%d-%m-%Y %H:%M:%S"),
                "city": city,
                "brand": brand,
                "model": model,
                "title": title,
                "description": description,
                "price": price,
                "condition": condition,
                "year": year,
                "run_km": run_km,
                "fuel_type": fuel_type,
                "body_type": body_type,
                "transmission_type": transmission_type,
                "wheel_drive": wheel_drive,
                "wheel_side": wheel_side,
                "color": color,
                "engine_capacity": engine_capacity,
                "vin_code": vin_code,
                "tech_condition": tech_condition,
                "clearance": clearance_rastamojka,
                "image": image,
                "vip_status": vip_post,
                "url": domen + str(url_goods),
                "name_seller": nameseller,
                "payment": payment,
                "phone": phone,
            })

            counter += 1
            print(counter, "- element added (", brand, model, "); page -", page)

        except Exception as e:
            print("Error:", e)
            continue

    return result


# ============================================================
# EXPORT
# ============================================================

def save_excel(data, name):
    """Save collected data into Excel file."""
    df = pd.DataFrame(data)
    writer = ExcelWriter(f"../../../templates/dataset/lalafo_results_{name}.xlsx")
    df.to_excel(writer, "data")
    writer._save()
    print(f"Все сохранено в lalafo_results_{name}.xlsx")


# ============================================================
# MAIN PAGINATION LOGIC
# ============================================================

def get_cars_by_brand(brand_name, brand_id, amount, pages):
    params = {
        "expand": "url",
        "price[from]": 100000,
        "currency": "KGS",
        "per-page": amount,
        "page": 1,
        "sort_by": "newest",
        "category_id": brand_id,
    }

    all_data = []

    for i in range(1, pages):
        json_data = get_json(params)
        data = get_data_from_json(json_data, i)
        all_data.extend(data)

        if len(data) < params["per-page"]:
            break

        params["page"] += 1

    save_excel(all_data, brand_name)


# ============================================================
# EXECUTION
# ============================================================

car_brands = {
    "honda": "1570",
    "mercedes": "1585",
    "volkswagen": "1610",
    "hyundai": "1610",
    "audi": "1555",
    "bmw": "1557",
    "mitsubishi": "1589",
    "kia": "1576",
}

if __name__ == "__main__":
    for brand_name, brand_id in car_brands.items():
        get_cars_by_brand(brand_name, brand_id, 100, 20)
        print(f"Excel file for {brand_name} cars has been saved.")

KeyError: 'items'

In [None]:
# ============================================================
# SHOWCASE OUTPUT (For README / Portfolio)
# ============================================================

def generate_showcase_outputs(df, brand_name):
    """
    Generate small showcase artifacts for GitHub README:
    - Preview CSV (first 20 rows)
    - Summary statistics
    - Price distribution chart
    """

    import os
    import matplotlib.pyplot as plt

    showcase_dir = "showcase_outputs"
    os.makedirs(showcase_dir, exist_ok=True)

    # 1️⃣ Preview CSV
    preview_path = f"{showcase_dir}/{brand_name}_preview.csv"
    df.head(20).to_csv(preview_path, index=False)

    # 2️⃣ Summary statistics
    summary_path = f"{showcase_dir}/{brand_name}_summary.csv"
    df.describe(include="all").to_csv(summary_path)

    # 3️⃣ Price distribution chart
    if "price" in df.columns:
        plt.figure()
        df["price"].dropna().astype(float).hist(bins=30)
        plt.title(f"{brand_name} Price Distribution")
        plt.xlabel("Price (KGS)")
        plt.ylabel("Count")

        chart_path = f"{showcase_dir}/{brand_name}_price_distribution.png"
        plt.savefig(chart_path)
        plt.close()

    print(f"Showcase files generated for {brand_name}")