In [None]:
import json
import os
import time

import pandas as pd
import requests
from tqdm import tqdm

# =========================
# Configuration
# =========================

# The key is taken from the OPENROUTER_API_KEY environment variable when it is
# set and non-empty, so the real secret never has to be pasted into the
# notebook. The literal placeholder is kept as the fallback value.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY") or "API_Key"
# Chat-completions endpoint that every enrichment request is POSTed to.
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"

# Model identifier sent in the "model" field of each request payload.
OPENROUTER_MODEL = "nex-agi/deepseek-v3.1-nex-n1:free"

# Source table (read with pandas) and destination for the enriched table.
INPUT_CSV = "cleaned_companies_data.csv"
OUTPUT_CSV = "companies_enriched_deepseek.csv"

# Passed as the `timeout` argument of each requests.post call.
REQUEST_TIMEOUT = 90
# Pause (time.sleep) after each processed company in the main loop.
RATE_LIMIT_SECONDS = 3
# Default for extract_company_info(max_retries=...). It bounds
# `range(max_retries)`, i.e. it is the TOTAL number of attempts, not the
# number of retries after the first attempt.
MAX_RETRIES = 2

# =========================
# Helper Functions
# =========================

def get_fallback_data():
    """Build the placeholder record used when enrichment cannot be completed.

    Returns:
        dict: A new dictionary on every call, with ``primary_industry`` set
        to ``"Unknown"`` and ``related_industries``, ``products_services``
        and ``expanded_keywords`` each set to its own fresh empty list.
    """
    placeholder = {"primary_industry": "Unknown"}
    # Each list field gets a distinct list object so callers can mutate one
    # without affecting the others.
    for list_field in ("related_industries", "products_services", "expanded_keywords"):
        placeholder[list_field] = []
    return placeholder

def _extract_json_text(raw_content):
    """Return the substring of a model reply that should hold the JSON object.

    Strips surrounding whitespace, unwraps a markdown code fence when one is
    present, and then trims to the span from the first ``{`` to the last ``}``
    so that leading or trailing commentary does not break parsing. ``None``
    is treated as an empty reply.
    """
    text = (raw_content or "").strip()

    # Remove markdown code fences if present.
    if "```json" in text:
        text = text.split("```json", 1)[1].split("```", 1)[0]
    elif "```" in text:
        text = text.split("```", 1)[1].split("```", 1)[0]
    text = text.strip()

    # Keep only the outermost {...} span; leave the text untouched when no
    # such span exists so json.loads reports the parse error.
    start = text.find("{")
    end = text.rfind("}") + 1
    if start != -1 and end > start:
        text = text[start:end]
    return text


def _as_string_list(value):
    """Coerce a model-supplied field into a list of strings."""
    if value is None:
        return []
    if isinstance(value, str):
        return [value] if value.strip() else []
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value if item is not None]
    return [str(value)]


def extract_company_info(company_name, description, max_retries=MAX_RETRIES):
    """
    Calls OpenRouter to extract industry, products, and synonyms.

    Args:
        company_name: Company name inserted into the prompt and log messages.
        description: Company description inserted into the prompt.
        max_retries: Total number of request attempts (the loop runs
            ``range(max_retries)``); a value of 0 or less skips the request.

    Returns:
        dict: A dictionary containing at least the keys ``primary_industry``
        (a non-empty string), ``related_industries``, ``products_services``
        and ``expanded_keywords`` (each a list of strings). The result of
        ``get_fallback_data()`` is returned when the request keeps failing,
        the reply has no usable ``choices``, the JSON cannot be parsed, the
        parsed value is not an object, or a required key is missing.

    HTTP errors, connection errors and JSON parse errors are retried until
    the attempts are exhausted; any other exception returns the fallback
    immediately. Nothing is raised to the caller.
    """

    prompt = f"""Extract information from this company description and return ONLY a JSON object.

Company: {company_name}
Description: {description}

Return this exact JSON format with no extra text:
{{"primary_industry": "specific industry",
  "related_industries": ["synonym1", "synonym2"],
  "products_services": ["product1", "product2"],
  "expanded_keywords": ["keyword1", "keyword2"]}}"""

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": "http://localhost",
        "X-Title": "Company Enrichment Script"
    }

    payload = {
        "model": OPENROUTER_MODEL,
        "messages": [
            {
                "role": "system",
                "content": "You extract data and return only valid JSON with no markdown, no explanation, no extra text."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "temperature": 0.2,
        "max_tokens": 400
    }

    list_keys = ("related_industries", "products_services", "expanded_keywords")
    required_keys = ("primary_industry",) + list_keys

    for attempt in range(max_retries):
        try:
            response = requests.post(
                OPENROUTER_URL,
                headers=headers,
                json=payload,
                timeout=REQUEST_TIMEOUT
            )

            if response.status_code != 200:
                print(f"‚ö†Ô∏è HTTP {response.status_code} for {company_name}")
                if attempt < max_retries - 1:
                    time.sleep(2)
                    continue
                return get_fallback_data()

            result = response.json()

            if not isinstance(result, dict) or not result.get("choices"):
                print(f"‚ö†Ô∏è Invalid response structure for {company_name}")
                return get_fallback_data()

            # A null content is treated as an empty reply so that it goes
            # through the JSON-parse retry path instead of aborting.
            content = (result["choices"][0]["message"]["content"] or "").strip()

            # Debug: dump the first raw reply once. The "already shown" flag
            # lives on the function object, so this function does not depend
            # on any variable owned by the calling script.
            if not getattr(extract_company_info, "_debug_shown", False):
                extract_company_info._debug_shown = True
                print("\nüîç Debug ‚Äì Raw OpenRouter response:")
                print(content[:300], "\n")

            extracted = json.loads(_extract_json_text(content))

            if not isinstance(extracted, dict) or not all(
                k in extracted for k in required_keys
            ):
                print(f"‚ö†Ô∏è Missing keys for {company_name}")
                return get_fallback_data()

            # Normalise field types so callers can json.dumps / ', '.join
            # the values without further checks.
            for key in list_keys:
                extracted[key] = _as_string_list(extracted[key])
            primary = extracted["primary_industry"]
            if primary is None or not str(primary).strip():
                extracted["primary_industry"] = "Unknown"
            else:
                extracted["primary_industry"] = str(primary).strip()

            return extracted

        except json.JSONDecodeError:
            print(f"‚ö†Ô∏è JSON parse error for {company_name} (attempt {attempt+1})")
            if attempt < max_retries - 1:
                time.sleep(2)
            else:
                return get_fallback_data()

        except requests.exceptions.RequestException as e:
            print(f"‚ö†Ô∏è Connection error for {company_name}: {str(e)[:80]}")
            if attempt < max_retries - 1:
                time.sleep(3)
            else:
                return get_fallback_data()

        except Exception as e:
            # Deliberate best-effort boundary: one bad reply must not stop
            # the whole batch, so log and fall back.
            print(f"‚ö†Ô∏è Unexpected error for {company_name}: {str(e)[:80]}")
            return get_fallback_data()

    return get_fallback_data()

def create_searchable_text(company_name, description, extracted):
    """Creates enhanced searchable text with all synonyms.

    Args:
        company_name: Company name, rendered on the ``Company:`` line.
        description: Company description, rendered on the ``Description:`` line.
        extracted: Mapping that may contain ``primary_industry``,
            ``related_industries``, ``products_services`` and
            ``expanded_keywords``. Missing or ``None`` values render as empty.

    Returns:
        str: Six newline-separated ``Label: value`` lines; list-valued
        fields are joined with ``", "``.
    """

    def joined(field):
        # Tolerate imperfect model output: None -> empty, a bare string -> a
        # single item (not split into characters), non-string items -> str().
        value = extracted.get(field)
        if value is None:
            items = []
        elif isinstance(value, (list, tuple)):
            items = value
        else:
            items = [value]
        return ", ".join(str(item) for item in items if item is not None)

    # A None industry renders as empty rather than the text "None".
    industry = extracted.get("primary_industry")
    if industry is None:
        industry = ""

    parts = [
        f"Company: {company_name}",
        f"Description: {description}",
        f"Industry: {industry}",
        f"Related Industries: {joined('related_industries')}",
        f"Products/Services: {joined('products_services')}",
        f"Keywords: {joined('expanded_keywords')}",
    ]
    return "\n".join(parts)

# =========================
# Main Pipeline
# =========================

print("üì• Loading CSV...")
df = pd.read_csv(INPUT_CSV)
df.columns = df.columns.str.strip()

print(f"Found {len(df)} companies")
print(f"Columns: {df.columns.tolist()}")

enriched_data = []

print("\n‚öôÔ∏è Processing companies via OpenRouter...\n")

for _, row in tqdm(df.iterrows(), total=len(df)):
    company_name = str(row["companyName"]).strip()
    description = str(row["description"]).strip()

    if not description or description.lower() == "nan":
        print(f"Skipping {company_name} ‚Äì empty description")
        continue

    extracted = extract_company_info(company_name, description)

    enriched_record = {
        "companyName": company_name,
        "original_description": description,
        "primary_industry": extracted["primary_industry"],
        "related_industries": json.dumps(extracted["related_industries"]),
        "products_services": json.dumps(extracted["products_services"]),
        "expanded_keywords": json.dumps(extracted["expanded_keywords"]),
        "searchable_text": create_searchable_text(
            company_name, description, extracted
        )
    }

    enriched_data.append(enriched_record)
    time.sleep(RATE_LIMIT_SECONDS)

# =========================
# Save Results
# =========================

# Pass the column list explicitly: when every row was skipped,
# `enriched_data` is empty and a bare pd.DataFrame([]) would have no columns,
# which makes the column selection below raise KeyError and writes a CSV with
# no header. The names and order match the keys of each enriched record.
output_columns = [
    "companyName",
    "original_description",
    "primary_industry",
    "related_industries",
    "products_services",
    "expanded_keywords",
    "searchable_text",
]
enriched_df = pd.DataFrame(enriched_data, columns=output_columns)
enriched_df.to_csv(OUTPUT_CSV, index=False)

print(f"\n‚úÖ Enriched data saved to: {OUTPUT_CSV}")

print("\nüìä Sample results:")
print(enriched_df[["companyName", "primary_industry"]].head(10))

# Show one full record, decoding the JSON-encoded list columns for display.
if not enriched_df.empty:
    first = enriched_df.iloc[0]
    print("\nüîç Example enriched record:")
    print(f"Company: {first['companyName']}")
    print(f"Primary Industry: {first['primary_industry']}")
    print("Related Industries:", json.loads(first["related_industries"]))
    print("Products/Services:", json.loads(first["products_services"]))
    print("Expanded Keywords:", json.loads(first["expanded_keywords"]))


üì• Loading CSV...
Found 105 companies
Columns: ['companyName', 'description', 'description_word_count']

‚öôÔ∏è Processing companies via OpenRouter...



  0%|          | 0/105 [00:00<?, ?it/s]


üîç Debug ‚Äì Raw OpenRouter response:
{"primary_industry": "commercial vehicle manufacturing",
  "related_industries": ["automotive", "transportation equipment"],
  "products_services": ["trucks", "buses", "vans", "construction vehicles", "spare parts", "vehicle services", "freight transportation platform", "dealer finance", "customer f 



100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 105/105 [29:36<00:00, 16.92s/it]


‚úÖ Enriched data saved to: companies_enriched_deepseek.csv

üìä Sample results:
                  companyName                   primary_industry
0                   Traton SE   commercial vehicle manufacturing
1                2G Energy AG                     Energy Systems
2         MTU Aero Engines AG              Aerospace and Defense
3       Deutsche Lufthansa AG                           Aviation
4           Siemens Energy AG                  Energy Technology
5  Siemens Aktiengesellschaft                         Technology
6    Daimler Truck Holding AG   Commercial Vehicle Manufacturing
7            Deutsche Post AG                          Logistics
8                   Nordex SE                onshore wind energy
9              Rheinmetall AG  Defense and Automotive Technology

üîç Example enriched record:
Company: Traton SE
Primary Industry: commercial vehicle manufacturing
Related Industries: ['automotive', 'transportation equipment']
Products/Services: ['trucks', 'buses',


