In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# DataForSEO backlink enrichment
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd
import requests
from urllib.parse import urlparse
import base64
import time
import json

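# === Credentials ===
# SECURITY NOTE(review): the API login below is hard-coded in source and is
# visible to anyone who can read this notebook. Rotate it and load it from an
# environment variable or a secret store instead.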
USERNAME = "support@sociosquares.com"
PASSWORD = "135b6aab5381a28b"
AUTH_TOKEN = base64.b64encode(f"{USERNAME}:{PASSWORD}".encode()).decode()
HEADERS = {
    "Authorization": f"Basic {AUTH_TOKEN}",
    "Content-Type": "application/json"
}

# === San Francisco file paths ===
# Input CSVs (top-3 and top-10 filtered results) and the directory the
# enriched CSVs are written to.
SF_TOP3_PATH = "/content/drive/MyDrive/Part 2/San Francisco/top_3_results_filtered.csv"
SF_TOP10_PATH = "/content/drive/MyDrive/Part 2/San Francisco/top_10_results_filtered.csv"
OUTPUT_PATH = "/content/drive/MyDrive/Part 2/San Francisco/"

def extract_domain(url):
    """Return the host part of ``url`` with one leading ``www.`` removed.

    Args:
        url: Absolute URL string, e.g. ``"https://www.example.com/page"``.

    Returns:
        The network location without a leading ``www.`` prefix; an empty
        string when the URL has no network location (for example when the
        scheme is missing); ``None`` when ``url`` is not a string or cannot
        be parsed.
    """
    # Non-string cells (NaN from pandas, None, bytes) carry no usable domain.
    if not isinstance(url, str):
        return None
    try:
        netloc = urlparse(url).netloc
    except ValueError:
        # urlparse rejects malformed hosts such as an unclosed IPv6 bracket.
        return None
    # Strip only a leading "www."; str.replace would also mangle hosts that
    # merely contain "www." elsewhere (e.g. "notwww.example.com").
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]
    return netloc

def get_domain_summary(domain):
    """Fetch domain-level backlink summary metrics from DataForSEO.

    Sends ``domain`` as the ``target`` of a ``backlinks/summary/live`` request
    and reads the first result of the first task in the response.

    Args:
        domain: Target to look up (sent with ``mode="as_is"``).

    Returns:
        dict | None: ``domain_rank``, ``backlinks_count``,
        ``referring_domains`` and ``referring_main_domains`` (absent or null
        values become 0), or ``None`` when the request fails, a status code is
        not the expected one, or the response holds no result.
    """
    try:
        print(f"    📊 Getting summary for: {domain}")

        response = requests.post(
            "https://api.dataforseo.com/v3/backlinks/summary/live",
            headers=HEADERS,
            json=[{"target": domain, "mode": "as_is"}],
            # Without a timeout a stalled connection would block the run
            # indefinitely; a timeout error is handled by the except below.
            timeout=60,
        )

        summary = None
        if response.status_code == 200:
            data = response.json()
            # Both the response envelope and the first task must report 20000.
            if data and data.get("status_code") == 20000 and data.get("tasks"):
                task_result = data["tasks"][0]
                if task_result.get("status_code") == 20000:
                    result_data = task_result.get("result") or []
                    if result_data:
                        summary = result_data[0]

        if summary is None:
            print(f"    ⚠️ No summary data for {domain}")
            return None

        # A JSON null arrives as None; coerce it to 0 so the ":," format below
        # cannot raise and discard the whole summary.
        domain_rank = summary.get("rank") or 0
        backlinks_count = summary.get("backlinks") or 0
        referring_domains = summary.get("referring_domains") or 0
        referring_main_domains = summary.get("referring_main_domains") or 0

        print(f"    ✅ Summary: Rank={domain_rank}, Backlinks={backlinks_count:,}")

        return {
            "domain_rank": domain_rank,
            "backlinks_count": backlinks_count,
            "referring_domains": referring_domains,
            "referring_main_domains": referring_main_domains
        }

    except Exception as e:
        # Best-effort lookup: one failing domain must not abort the batch.
        print(f"    ❌ Summary error for {domain}: {e}")
        return None

def get_domain_backlinks_sample(domain):
    """Fetch up to five sample backlink source URLs for a domain.

    Sends ``domain`` to the DataForSEO ``backlinks/backlinks/live`` endpoint
    with ``limit=5`` and collects the ``url_from`` of each returned item.

    Args:
        domain: Target to look up (sent with ``mode="as_is"``).

    Returns:
        list: Non-empty ``url_from`` values of the returned items; an empty
        list when the request fails or the response holds no result.
    """
    try:
        print(f"    🔗 Getting backlinks sample for: {domain}")

        response = requests.post(
            "https://api.dataforseo.com/v3/backlinks/backlinks/live",
            headers=HEADERS,
            json=[{"target": domain, "limit": 5, "mode": "as_is"}],
            # Without a timeout a stalled connection would block the run
            # indefinitely; a timeout error is handled by the except below.
            timeout=60,
        )

        items = None
        if response.status_code == 200:
            data = response.json()
            # Both the response envelope and the first task must report 20000.
            if data and data.get("status_code") == 20000 and data.get("tasks"):
                task_result = data["tasks"][0]
                if task_result.get("status_code") == 20000:
                    result_data = task_result.get("result") or []
                    if result_data:
                        # "items" may be present but null; treat that as an
                        # empty sample rather than letting iteration raise.
                        items = result_data[0].get("items") or []

        if items is None:
            print(f"    ⚠️ No backlinks sample for {domain}")
            return []

        # Keep only items that actually carry a source URL.
        source_urls = (item.get("url_from") for item in items)
        top_backlinks = [url_from for url_from in source_urls if url_from]

        print(f"    ✅ Found {len(top_backlinks)} sample backlinks")
        return top_backlinks

    except Exception as e:
        # Best-effort lookup: one failing domain must not abort the batch.
        print(f"    ❌ Backlinks error for {domain}: {e}")
        return []

def get_page_rank_for_domain(domain):
    """Fetch the DataForSEO rank of a domain's home page.

    Builds ``https://<domain>/`` and sends it as the ``target`` of a
    ``backlinks/summary/live`` request, then reads ``rank`` from the first
    result of the first task.

    Args:
        domain: Bare domain name; the home-page URL is derived from it.

    Returns:
        The ``rank`` value of the home page, or 0 when it is absent, null,
        the request fails, or the response holds no result.
    """
    try:
        main_page_url = f"https://{domain}/"
        print(f"    📄 Getting page rank for: {main_page_url}")

        response = requests.post(
            "https://api.dataforseo.com/v3/backlinks/summary/live",
            headers=HEADERS,
            json=[{"target": main_page_url, "mode": "as_is"}],
            # Without a timeout a stalled connection would block the run
            # indefinitely; a timeout error is handled by the except below.
            timeout=60,
        )

        page_summary = None
        if response.status_code == 200:
            data = response.json()
            # Both the response envelope and the first task must report 20000.
            if data and data.get("status_code") == 20000 and data.get("tasks"):
                task_result = data["tasks"][0]
                if task_result.get("status_code") == 20000:
                    result_data = task_result.get("result") or []
                    if result_data:
                        page_summary = result_data[0]

        if page_summary is None:
            print(f"    ⚠️ No page rank for {domain}")
            return 0

        # A JSON null arrives as None; report it as 0 like a missing key so
        # callers always receive a number.
        page_rank = page_summary.get("rank") or 0

        print(f"    ✅ Page rank: {page_rank}")
        return page_rank

    except Exception as e:
        # Best-effort lookup: one failing domain must not abort the batch.
        print(f"    ❌ Page rank error for {domain}: {e}")
        return 0

def enrich_single_domain(domain):
    """Collect every enrichment metric for one domain.

    Calls the summary, page-rank and backlink-sample lookups in that order,
    pausing one second between consecutive API calls, and prints a short
    result summary.

    Args:
        domain: Domain name to enrich.

    Returns:
        dict with ``domain_authority``, ``page_authority``,
        ``backlinks_count``, ``top_backlinks``, ``trust_flow``,
        ``referring_domains`` and ``referring_main_domains``. Summary-based
        fields are 0 when no summary was returned.
    """
    print(f"\n  🎯 ENRICHING: {domain}")
    print("  " + "=" * 50)

    # 1. Domain summary (a missing summary falls back to all-zero metrics).
    summary = get_domain_summary(domain) or {}

    time.sleep(1)  # pause between API calls

    # 2. Rank of the home page.
    homepage_rank = get_page_rank_for_domain(domain)

    time.sleep(1)  # pause between API calls

    # 3. Sample backlinks.
    sample_links = get_domain_backlinks_sample(domain)

    authority = summary.get("domain_rank", 0)
    record = {
        "domain_authority": authority,
        "page_authority": homepage_rank,
        "backlinks_count": summary.get("backlinks_count", 0),
        "top_backlinks": sample_links,
        # 4. No trust-flow value is fetched here; the domain rank is reused
        #    as an approximation.
        "trust_flow": authority,
        "referring_domains": summary.get("referring_domains", 0),
        "referring_main_domains": summary.get("referring_main_domains", 0),
    }

    # Result summary for the log.
    print(f"  📊 RESULTS:")
    print(f"    Domain Authority: {record['domain_authority']}")
    print(f"    Page Authority: {record['page_authority']}")
    print(f"    Backlinks Count: {record['backlinks_count']:,}")
    print(f"    Top Backlinks: {len(record['top_backlinks'])}")
    print(f"    Referring Domains: {record['referring_domains']}")

    return record

def process_sf_file_complete(file_path, file_type):
    """Enrich every unique domain of one San Francisco results CSV.

    Reads the CSV, derives a ``domain`` column from ``link``, enriches each
    unique domain, maps the metrics back onto the rows and writes the result
    to ``San_Francisco_<file_type>_COMPLETE_enriched.csv`` under
    ``OUTPUT_PATH``.

    Args:
        file_path: Path of the input CSV.
        file_type: Label used in log output and in the output file name.

    Returns:
        ``(df, enrichment_results)`` on success; ``None`` when the file is
        missing, a required column is absent, or any exception is raised
        while processing.
    """
    label = file_type.upper()
    print(f"\n🌉 PROCESSING: San Francisco {label} - COMPLETE VERSION")
    print("=" * 80)

    if not os.path.exists(file_path):
        print(f"❌ File not found: {file_path}")
        return None

    try:
        # Load the input.
        df = pd.read_csv(file_path)
        row_count = len(df)
        print(f"📄 Loaded {row_count} rows from {file_type}")

        # Validate the required columns.
        missing_columns = [
            name for name in ("keyword", "city", "ranking", "link")
            if name not in df.columns
        ]
        if missing_columns:
            print(f"❌ Missing columns: {missing_columns}")
            return None

        # Derive domains and gather the unique set plus their frequencies.
        df["domain"] = df["link"].apply(extract_domain)
        all_domains = df["domain"].dropna().unique()
        domain_total = len(all_domains)
        domain_counts = df["domain"].value_counts()

        print(f"🌐 Found {domain_total} unique domains")
        print(f"📊 Total rows to process: {row_count}")
        print(f"🎯 Will enrich ALL {domain_total} domains")
        print(f"💰 Estimated cost: ${domain_total * 0.06:.2f} (3 API calls per domain)")
        print(f"⏱️ Estimated time: {domain_total * 4 / 60:.1f} minutes")

        # Most frequent domains.
        print(f"\n🔝 Top 10 domains by frequency:")
        for rank, (name, occurrences) in enumerate(domain_counts.head(10).items(), start=1):
            print(f"  {rank}. {name} ({occurrences} occurrences)")

        # Short grace period before the API calls start.
        print(f"\n⚠️ This will make {domain_total * 3} API calls!")
        print(f"🚀 Starting enrichment process in 3 seconds...")
        time.sleep(3)

        print(f"\n📊 STARTING ENRICHMENT PROCESS...")
        print("=" * 80)

        enrichment_results = {}
        total_cost = 0

        for position, domain in enumerate(all_domains, start=1):
            # Skip empty / missing domains.
            if not domain or pd.isna(domain):
                continue

            print(f"\n📍 DOMAIN {position}/{domain_total}: {domain}")
            print(f"🔄 Progress: {(position / domain_total * 100):.1f}%")

            enrichment_results[domain] = enrich_single_domain(domain)

            # Estimated cost: roughly three API calls per domain.
            total_cost += 0.06

            # Pause between domains, but not after the last one.
            if position < domain_total:
                print(f"  💤 Waiting 3 seconds...")
                time.sleep(3)

        # Map the per-domain metrics back onto every row.
        print(f"\n📋 APPLYING RESULTS TO DATAFRAME...")
        output_columns = (
            "domain_authority",
            "page_authority",
            "backlinks_count",
            "top_backlinks",
            "trust_flow",
            "referring_domains",
            "referring_main_domains",
        )
        for column in output_columns:
            if column == "top_backlinks":
                # The list of sample URLs is flattened into one string.
                df[column] = df["domain"].map(
                    lambda d: "; ".join(enrichment_results.get(d, {}).get("top_backlinks", []))
                )
            else:
                df[column] = df["domain"].map(
                    lambda d, metric=column: enrichment_results.get(d, {}).get(metric, 0)
                )

        # Persist the enriched rows.
        output_filename = f"San_Francisco_{file_type}_COMPLETE_enriched.csv"
        df.to_csv(os.path.join(OUTPUT_PATH, output_filename), index=False)

        # Summary statistics.
        backlinks = df["backlinks_count"]
        total_backlinks = backlinks.sum()
        rows_with_backlinks = (backlinks > 0).sum()
        max_backlinks = backlinks.max()
        max_domain_authority = df["domain_authority"].max()
        max_page_authority = df["page_authority"].max()

        print(f"\n🎉 {label} COMPLETE ENRICHMENT FINISHED!")
        print("=" * 80)
        print(f"💾 Saved to: {output_filename}")
        print(f"📊 STATISTICS:")
        print(f"  📈 Total backlinks: {total_backlinks:,}")
        print(f"  🎯 Rows with backlinks: {rows_with_backlinks:,}/{len(df):,}")
        print(f"  🏆 Max backlinks: {max_backlinks:,}")
        print(f"  🏅 Max domain authority: {max_domain_authority}")
        print(f"  🏅 Max page authority: {max_page_authority}")
        print(f"  💰 Actual cost: ${total_cost:.2f}")

        return df, enrichment_results

    except Exception as e:
        print(f"❌ Error processing {file_type}: {e}")
        import traceback
        traceback.print_exc()
        return None

def main():
    """Run the DataForSEO enrichment for the San Francisco TOP3 and TOP10 files.

    Prints a pre-run estimate, processes both input files through
    ``process_sf_file_complete`` (waiting 10 seconds between them), then
    prints a per-file report and the API-call / cost totals. The top-domain
    analysis and the success banner are printed only when both files were
    processed successfully; otherwise an "incomplete" warning is printed.
    """
    print("🌉 SAN FRANCISCO COMPLETE DOMAIN ENRICHMENT SYSTEM")
    print("=" * 100)
    print("🎯 Target: San Francisco TOP3 and TOP10 files - ALL DOMAINS, ALL METRICS")
    print("📊 Metrics: Domain Authority, Page Authority, Backlinks, Trust Flow, Referring Domains")
    print("⚠️ WARNING: This will process ALL unique domains (expensive!)")

    # Pre-analysis: read both files only to count domains and estimate cost.
    # Failure here is non-fatal; the real processing reports its own errors.
    try:
        preview3 = pd.read_csv(SF_TOP3_PATH)
        preview10 = pd.read_csv(SF_TOP10_PATH)
        preview3["domain"] = preview3["link"].apply(extract_domain)
        preview10["domain"] = preview10["link"].apply(extract_domain)

        domains3 = set(preview3["domain"].dropna().unique())
        domains10 = set(preview10["domain"].dropna().unique())
        all_unique_domains = domains3.union(domains10)

        estimated_cost = len(all_unique_domains) * 0.06
        estimated_time = len(all_unique_domains) * 4 / 60

        print(f"\n📊 PRE-ANALYSIS:")
        print(f"  TOP3 unique domains: {len(domains3)}")
        print(f"  TOP10 unique domains: {len(domains10)}")
        print(f"  Combined unique domains: {len(all_unique_domains)}")
        print(f"  💰 Estimated total cost: ${estimated_cost:.2f}")
        print(f"  ⏱️ Estimated total time: {estimated_time:.1f} minutes")

    except Exception as e:
        print(f"⚠️ Could not pre-analyze files: {e}")

    inputs = (("top3", SF_TOP3_PATH), ("top10", SF_TOP10_PATH))
    results_summary = {}
    enrichments = {}  # label -> per-domain metrics, successful files only
    total_api_calls = 0

    for index, (label, path) in enumerate(inputs):
        # Pause before every file except the first.
        if index:
            print(f"\n💤 WAITING 10 seconds before {label.upper()}...")
            time.sleep(10)

        print(f"\n🚀 STARTING {label.upper()} COMPLETE ENRICHMENT...")
        outcome = process_sf_file_complete(path, label)

        if not outcome:
            results_summary[label] = {"status": "Failed"}
            continue

        frame, enriched = outcome
        enrichments[label] = enriched
        results_summary[label] = {
            "status": "Success",
            "rows": len(frame),
            "domains_processed": len(enriched)
        }
        # Three API calls are made per enriched domain.
        total_api_calls += len(enriched) * 3

    # Final report.
    print(f"\n🎉 SAN FRANCISCO COMPLETE ENRICHMENT FINISHED!")
    print("=" * 100)

    for file_type, result in results_summary.items():
        if result["status"] == "Success":
            print(f"✅ {file_type.upper()}: {result['rows']:,} rows, {result['domains_processed']} domains enriched")
        else:
            print(f"❌ {file_type.upper()}: Failed")

    print(f"\n📊 FINAL STATISTICS:")
    print(f"🔧 Total API calls made: {total_api_calls}")
    print(f"💰 Actual total cost: ~${total_api_calls * 0.02:.2f}")
    print(f"📂 Output location: {OUTPUT_PATH}")
    print(f"📄 Generated files:")
    # List only the files that were actually written.
    for label in enrichments:
        print(f"  - San_Francisco_{label}_COMPLETE_enriched.csv")

    if len(enrichments) != len(inputs):
        # Do not claim success when at least one file failed.
        print(f"\n⚠️ ENRICHMENT INCOMPLETE: at least one file failed (see errors above).")
        return

    # Cross-file domain analysis (both files succeeded).
    print(f"\n🏆 TOP PERFORMING DOMAINS ANALYSIS:")

    # Merge the per-domain results; later files win on duplicate domains.
    all_domains = {}
    for label, _ in inputs:
        all_domains.update(enrichments[label])

    by_backlinks = sorted(all_domains.items(), key=lambda x: x[1]["backlinks_count"], reverse=True)
    by_domain_auth = sorted(all_domains.items(), key=lambda x: x[1]["domain_authority"], reverse=True)

    print(f"\n🔗 Top 5 by Backlinks:")
    for rank, (domain, data) in enumerate(by_backlinks[:5], start=1):
        print(f"  {rank}. {domain}: {data['backlinks_count']:,} backlinks")

    print(f"\n🏅 Top 5 by Domain Authority:")
    for rank, (domain, data) in enumerate(by_domain_auth[:5], start=1):
        print(f"  {rank}. {domain}: {data['domain_authority']} authority")

    print(f"\n✨ COMPLETE ENRICHMENT SUCCESSFUL!")
    print(f"🎯 ALL METRICS POPULATED: Domain Authority, Page Authority, Trust Flow, Backlinks!")

if __name__ == "__main__":
    main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

  🎯 ENRICHING: goldinglawyers.com
    📊 Getting summary for: goldinglawyers.com
    ✅ Summary: Rank=333, Backlinks=14,254
    📄 Getting page rank for: https://goldinglawyers.com/
    ✅ Page rank: 48
    🔗 Getting backlinks sample for: goldinglawyers.com
    ✅ Found 5 sample backlinks
  📊 RESULTS:
    Domain Authority: 333
    Page Authority: 48
    Backlinks Count: 14,254
    Top Backlinks: 5
    Referring Domains: 1052
  💤 Waiting 3 seconds...

📍 DOMAIN 397/656: morganlewis.com
🔄 Progress: 60.5%

  🎯 ENRICHING: morganlewis.com
    📊 Getting summary for: morganlewis.com
    ✅ Summary: Rank=417, Backlinks=316,237
    📄 Getting page rank for: https://morganlewis.com/
    ✅ Page rank: 252
    🔗 Getting backlinks sample for: morganlewis.com
    ✅ Found 5 sample backlinks
  📊 RESULTS:
    Domain Authority: 417
    Page Authority: 252
    Backlinks Count: 316,237
    Top Backlinks: 5
    Referring Domains: 9440
  💤 Waiting 3 s

In [None]:
#Moz API
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd
import requests
import base64
import time
import json
from urllib.parse import urlparse
from datetime import datetime, timezone

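# === Moz API credentials ===
# SECURITY NOTE(review): the access ID and secret key below are hard-coded in
# source and visible to anyone who can read this notebook. Rotate them and
# load them from an environment variable or a secret store instead.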
MOZ_ACCESS_ID = "mozscape-pn863y8erR"
MOZ_SECRET_KEY = "CpBLhVJlFaCEe9EPQkOoLjO6UF0iKUUf"

# === Path settings ===
BASE_PATH = "/content/drive/MyDrive/Part 2"
OUTPUT_BASE_PATH = "/content/drive/MyDrive/Part 2/Moz_Enriched_Results"

# Create the output directory (no error if it already exists)
os.makedirs(OUTPUT_BASE_PATH, exist_ok=True)

def extract_domain(url):
    """Return the host part of ``url`` with one leading ``www.`` removed.

    Args:
        url: Absolute URL string, e.g. ``"https://www.example.com/page"``.

    Returns:
        The network location without a leading ``www.`` prefix; an empty
        string when the URL has no network location (for example when the
        scheme is missing); ``None`` when ``url`` is not a string or cannot
        be parsed.
    """
    # Non-string cells (NaN from pandas, None, bytes) carry no usable domain.
    if not isinstance(url, str):
        return None
    try:
        netloc = urlparse(url).netloc
    except ValueError:
        # urlparse rejects malformed hosts such as an unclosed IPv6 bracket.
        return None
    # Strip only a leading "www."; str.replace would also mangle hosts that
    # merely contain "www." elsewhere (e.g. "notwww.example.com").
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]
    return netloc

def safe_join_backlinks(domain, moz_results):
    """Join a domain's ``top_backlinks`` entries into one ``"; "``-separated string.

    Args:
        domain: Key to look up in ``moz_results``.
        moz_results: Mapping of domain -> metrics dict that may contain a
            ``top_backlinks`` list of strings, dicts or other values.

    Returns:
        The joined string. Dict entries contribute their ``source`` value,
        else their ``url`` value, else the dict's string form; every entry is
        converted to ``str``. Returns ``""`` when the domain or its backlinks
        are missing or empty, or when an unexpected error occurs.
    """
    try:
        backlinks = moz_results.get(domain, {}).get("top_backlinks", [])
        if not backlinks:
            return ""

        parts = []
        for link in backlinks:
            if isinstance(link, dict):
                # Prefer an explicit URL field; fall back to the dict itself.
                link = link.get('source', '') or link.get('url', '') or link
            # Coerce every entry (including a non-string 'source'/'url'
            # value) so the join cannot fail and drop the whole column value.
            parts.append(link if isinstance(link, str) else str(link))

        return "; ".join(parts)
    except Exception as e:
        # Best-effort formatting: a malformed entry yields an empty cell.
        print(f"    ⚠️ Error processing backlinks for {domain}: {e}")
        return ""

def get_moz_url_metrics(urls_batch):
    """Request Moz URL metrics for a batch of URLs.

    Posts ``{"targets": urls_batch}`` to the Moz ``v2/url_metrics`` endpoint
    using HTTP Basic authentication built from ``MOZ_ACCESS_ID`` and
    ``MOZ_SECRET_KEY``.

    Args:
        urls_batch: List of URLs sent as the request's ``targets``.

    Returns:
        list: The response's ``results`` entries, or an empty list when the
        status is not 200, ``results`` is absent or null, or the request
        raises.
    """
    endpoint = "https://lsapi.seomoz.com/v2/url_metrics"

    credentials = f"{MOZ_ACCESS_ID}:{MOZ_SECRET_KEY}"
    headers = {
        "Authorization": f"Basic {base64.b64encode(credentials.encode()).decode()}",
        "Content-Type": "application/json"
    }

    payload = {
        "targets": urls_batch
    }

    try:
        print(f"    🔍 Requesting Moz data for {len(urls_batch)} URLs...")

        # Without a timeout a stalled connection would block the run
        # indefinitely; a timeout error is handled by the except below.
        response = requests.post(endpoint, headers=headers, json=payload, timeout=60)

        print(f"    📊 Response status: {response.status_code}")

        if response.status_code != 200:
            print(f"    ❌ Error: {response.status_code} - {response.text}")
            return []

        # "results" may be missing or null; normalise both to an empty list.
        results = response.json().get('results') or []
        print(f"    ✅ Got data for {len(results)} URLs")
        return results

    except Exception as e:
        # Best-effort lookup: a failed batch is reported and skipped.
        print(f"    ❌ Exception: {e}")
        return []

def process_moz_results(moz_results):
    """Index Moz result entries by root domain, keeping the metrics used here.

    Args:
        moz_results: Iterable of result dicts from the Moz API (``None`` is
            treated as empty). Entries without a ``root_domain`` are skipped.

    Returns:
        dict: ``root_domain`` -> dict with ``domain_authority``,
        ``page_authority``, ``spam_score``, ``root_domains_to_root_domain``,
        ``external_pages_to_root_domain``, ``pages_to_root_domain`` (missing
        or null values become 0) and an empty ``top_backlinks`` list.
    """
    metric_names = (
        'domain_authority',
        'page_authority',
        'spam_score',
        'root_domains_to_root_domain',
        'external_pages_to_root_domain',
        'pages_to_root_domain',
    )
    processed_data = {}

    for result in moz_results or []:
        domain = result.get('root_domain')
        if not domain:
            continue

        entry = {}
        for name in metric_names:
            value = result.get(name)
            # A JSON null arrives as None; store 0 (the same default as a
            # missing key) so later ":," formatting, sorting and comparisons
            # always see a number.
            entry[name] = 0 if value is None else value
        entry['top_backlinks'] = []  # fresh empty list per domain
        processed_data[domain] = entry

    return processed_data

def enrich_domains_with_moz(domains_list, city_name, batch_size=10):
    """Enrich domains through the Moz API in batches.

    Each non-blank domain is turned into ``https://<domain>/`` and the URLs
    are sent ``batch_size`` at a time, pausing two seconds between batches.

    Args:
        domains_list: Domain names; empty / blank entries are ignored.
        city_name: Label used in the log output only.
        batch_size: Number of URLs per API request.

    Returns:
        dict: Merged per-domain metrics from all batches that returned data.
    """
    print(f"  📊 Moz enrichment for {len(domains_list)} domains in {city_name}")

    # Build the home-page URL for every usable domain.
    urls_to_process = [
        f"https://{name.strip()}/"
        for name in domains_list
        if name and name.strip()
    ]
    url_count = len(urls_to_process)

    print(f"  🎯 Processing {url_count} URLs in batches of {batch_size}")

    enriched = {}
    total_batches = (url_count + batch_size - 1) // batch_size

    for batch_num, start in enumerate(range(0, url_count, batch_size), start=1):
        chunk = urls_to_process[start:start + batch_size]

        print(f"\n  📦 Batch {batch_num}/{total_batches}: {len(chunk)} URLs")

        raw_results = get_moz_url_metrics(chunk)

        if raw_results:
            parsed = process_moz_results(raw_results)
            enriched.update(parsed)

            # Log only the first five domains of the batch to save space.
            for name, metrics in list(parsed.items())[:5]:
                da = metrics['domain_authority']
                pa = metrics['page_authority']
                backlinks = metrics['external_pages_to_root_domain']
                print(f"    ✅ {name}: DA={da}, PA={pa}, Backlinks={backlinks:,}")

            hidden = len(parsed) - 5
            if hidden > 0:
                print(f"    ... and {hidden} more domains")

        # Pause between batches, but not after the last one.
        if batch_num < total_batches:
            print(f"    💤 Waiting 2 seconds...")
            time.sleep(2)

    # Make sure every entry carries a backlinks list.
    for metrics in enriched.values():
        metrics.setdefault('top_backlinks', [])

    return enriched

def process_sf_file_complete(file_type="top3"):
    """Enrich one San Francisco results CSV with Moz metrics.

    Reads ``top_3_results_filtered.csv`` (for ``"top3"``) or
    ``top_10_results_filtered.csv`` (any other value) from the
    ``San Francisco`` folder under ``BASE_PATH``, enriches its unique domains
    and writes ``San_Francisco_<file_type>_moz_enriched.csv`` to
    ``OUTPUT_BASE_PATH``.

    Args:
        file_type: ``"top3"`` selects the top-3 file; anything else selects
            the top-10 file. Also used in log output and the output name.

    Returns:
        ``(df, moz_results)`` on success; ``None`` when the input file is
        missing or any exception is raised while processing.
    """
    print(f"\n🌉 Processing San Francisco {file_type.upper()} - COMPLETE VERSION")
    print("=" * 80)

    # Resolve the input file.
    file_pattern = (
        "top_3_results_filtered.csv"
        if file_type == "top3"
        else "top_10_results_filtered.csv"
    )
    file_path = os.path.join(BASE_PATH, "San Francisco", file_pattern)

    if not os.path.exists(file_path):
        print(f"❌ File not found: {file_path}")
        return None

    try:
        # Load the data.
        df = pd.read_csv(file_path)
        print(f"📄 Loaded {len(df)} rows from {file_type}")

        # Derive the domains.
        df["domain"] = df["link"].apply(extract_domain)
        unique_domains = df["domain"].dropna().unique()
        domain_total = len(unique_domains)
        estimated_cost = domain_total * 0.01

        print(f"🌐 Found {domain_total} unique domains")
        print(f"💰 Estimated cost: ${estimated_cost:.2f}")
        print(f"⏱️ Estimated time: {domain_total * 0.2 / 60:.1f} minutes")

        # Show a few example domains.
        print(f"🔝 Sample domains: {list(unique_domains[:5])}")

        # Moz enrichment.
        moz_results = enrich_domains_with_moz(unique_domains, f"San Francisco {file_type}")

        # Map the per-domain metrics back onto every row.
        print(f"\n📋 Applying Moz results to dataframe...")

        # (output column, metric key in the Moz result), in output order.
        numeric_columns = (
            ("domain_authority", "domain_authority"),
            ("page_authority", "page_authority"),
            ("spam_score", "spam_score"),
            ("backlinks_count", "external_pages_to_root_domain"),
            ("referring_domains", "root_domains_to_root_domain"),
            ("pages_to_domain", "pages_to_root_domain"),
        )
        for column, metric in numeric_columns:
            df[column] = df["domain"].map(
                lambda d, key=metric: moz_results.get(d, {}).get(key, 0)
            )
        df["top_backlinks"] = df["domain"].map(lambda d: safe_join_backlinks(d, moz_results))
        df["trust_flow"] = None  # no trust-flow value is taken from Moz here

        # Persist the enriched rows.
        output_filename = f"San_Francisco_{file_type}_moz_enriched.csv"
        df.to_csv(os.path.join(OUTPUT_BASE_PATH, output_filename), index=False)

        # Summary statistics.
        authority = df["domain_authority"]
        has_authority = authority > 0
        enriched_rows = has_authority.sum()
        max_da = authority.max()
        avg_da = authority[has_authority].mean()
        max_backlinks = df["backlinks_count"].max()
        total_backlinks = df["backlinks_count"].sum()

        print(f"\n📊 {file_type.upper()} Results Summary:")
        print(f"💾 Saved: {output_filename}")
        print(f"📈 Enriched rows: {enriched_rows:,}/{len(df):,}")
        print(f"🏆 Max Domain Authority: {max_da}")
        print(f"📊 Avg Domain Authority: {avg_da:.1f}")
        print(f"🔗 Max Backlinks: {max_backlinks:,}")
        print(f"📊 Total Backlinks: {total_backlinks:,}")
        print(f"💰 Actual cost: ~${estimated_cost:.2f}")

        return df, moz_results

    except Exception as e:
        print(f"❌ Error processing San Francisco {file_type}: {e}")
        import traceback
        traceback.print_exc()
        return None

def main():
    """Run the Moz enrichment for the San Francisco TOP3 and TOP10 files.

    Processes both files through ``process_sf_file_complete`` (waiting five
    seconds between them), then prints a per-file report and overall totals.
    The top-domain analysis and the success banner are printed only when both
    files were processed successfully; otherwise an "incomplete" warning is
    printed.
    """
    print("🌉 San Francisco Complete Moz Enrichment System")
    print("=" * 100)
    print("🎯 Processing San Francisco TOP3 and TOP10 files")
    print("📊 Will generate 2 separate enriched CSV files")

    file_types = ("top3", "top10")
    results_summary = {}
    moz_by_file = {}  # file type -> per-domain metrics, successful files only
    total_cost = 0

    for index, label in enumerate(file_types):
        # Pause before every file except the first.
        if index:
            print(f"\n💤 Waiting 5 seconds before {label.upper()}...")
            time.sleep(5)

        print(f"\n🚀 Starting {label.upper()} enrichment...")
        outcome = process_sf_file_complete(label)

        if not outcome:
            results_summary[label] = {"status": "Failed"}
            continue

        frame, moz_metrics = outcome
        moz_by_file[label] = moz_metrics
        results_summary[label] = {
            "status": "Success",
            "rows": len(frame),
            "domains_processed": len(moz_metrics),
            "max_da": frame["domain_authority"].max(),
            "total_backlinks": frame["backlinks_count"].sum()
        }
        # Cost is estimated per file, at 0.01 per enriched domain.
        total_cost += len(moz_metrics) * 0.01

    # Count each domain once even when it appears in both files; adding the
    # per-file counts would double-count shared domains.
    total_domains = len(set().union(*moz_by_file.values()))

    # Final report.
    print(f"\n🎉 San Francisco Complete Enrichment Finished!")
    print("=" * 100)

    for file_type, result in results_summary.items():
        if result["status"] == "Success":
            print(f"✅ {file_type.upper()}: {result['rows']:,} rows, {result['domains_processed']} domains")
            print(f"   Max DA: {result['max_da']}, Total Backlinks: {result['total_backlinks']:,}")
        else:
            print(f"❌ {file_type.upper()}: Failed")

    print(f"\n📊 Final Statistics:")
    print(f"🌐 Total unique domains processed: {total_domains}")
    print(f"💰 Total cost: ~${total_cost:.2f}")
    print(f"📂 Output files saved to: {OUTPUT_BASE_PATH}")
    print(f"📄 Generated files:")
    # List only the files that were actually written.
    for label in moz_by_file:
        print(f"  - San_Francisco_{label}_moz_enriched.csv")

    if len(moz_by_file) != len(file_types):
        # Do not claim success when at least one file failed.
        print(f"\n⚠️ San Francisco enrichment incomplete: at least one file failed (see errors above).")
        return

    # Cross-file domain analysis (both files succeeded).
    print(f"\n🏆 Top Performing Domains Analysis:")

    # Merge the per-domain results; later files win on duplicate domains.
    all_domains = {}
    for label in file_types:
        all_domains.update(moz_by_file[label])

    # Sort by DA; a missing/None authority sorts as 0 instead of raising.
    sorted_by_da = sorted(
        all_domains.items(),
        key=lambda x: x[1]["domain_authority"] or 0,
        reverse=True,
    )

    print(f"🥇 Top 5 by Domain Authority:")
    for rank, (domain, data) in enumerate(sorted_by_da[:5], start=1):
        da = data['domain_authority']
        pa = data['page_authority']
        backlinks = data['external_pages_to_root_domain']
        print(f"  {rank}. {domain}: DA={da}, PA={pa}, Backlinks={backlinks:,}")

    print(f"\n✨ San Francisco enrichment completed successfully!")
    print(f"🎯 Ready for analysis with complete Domain Authority and Page Authority data!")

if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🌉 San Francisco Complete Moz Enrichment System
🎯 Processing San Francisco TOP3 and TOP10 files
📊 Will generate 2 separate enriched CSV files

🚀 Starting TOP3 enrichment...

🌉 Processing San Francisco TOP3 - COMPLETE VERSION
📄 Loaded 1633 rows from top3
🌐 Found 246 unique domains
💰 Estimated cost: $2.46
⏱️ Estimated time: 0.8 minutes
🔝 Sample domains: ['tslo.com', 'eastbaybusinesslawyer.com', 'sfbar.org', 'coblentzlaw.com', 'lscarlsonlaw.com']
  📊 Moz enrichment for 246 domains in San Francisco top3
  🎯 Processing 246 URLs in batches of 10

  📦 Batch 1/25: 10 URLs
    🔍 Requesting Moz data for 10 URLs...
    📊 Response status: 200
    ✅ Got data for 10 URLs
    ✅ tslo.com: DA=25, PA=25, Backlinks=5,275
    ✅ eastbaybusinesslawyer.com: DA=12, PA=24, Backlinks=177
    ✅ sfbar.org: DA=52, PA=45, Backlinks=52,412
    ✅ coblentzlaw.com: DA=33, PA=36, Backlinks=8,28

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd
import requests
import base64
import time
import json
import glob
from urllib.parse import urlparse
from datetime import datetime, timezone

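# === Moz API credentials ===
# SECURITY NOTE(review): the access ID and secret key below are hard-coded in
# source and visible to anyone who can read this notebook. Rotate them and
# load them from an environment variable or a secret store instead.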
MOZ_ACCESS_ID = "mozscape-pn863y8erR"
MOZ_SECRET_KEY = "CpBLhVJlFaCEe9EPQkOoLjO6UF0iKUUf"

# === Path settings ===
BASE_PATH = "/content/drive/MyDrive/Part 1"
OUTPUT_BASE_PATH = "/content/drive/MyDrive/Part 1/Moz_Enriched_Results"
PROGRESS_FILE = "/content/drive/MyDrive/Part 1/Moz_Enriched_Results/processing_progress.json"

# Create the output directory (no error if it already exists)
os.makedirs(OUTPUT_BASE_PATH, exist_ok=True)

def extract_domain(url):
    """Return the host part of ``url`` with one leading ``www.`` removed.

    Args:
        url: Absolute URL string, e.g. ``"https://www.example.com/page"``.

    Returns:
        The network location without a leading ``www.`` prefix; an empty
        string when the URL has no network location (for example when the
        scheme is missing); ``None`` when ``url`` is not a string or cannot
        be parsed.
    """
    # Non-string cells (NaN from pandas, None, bytes) carry no usable domain.
    if not isinstance(url, str):
        return None
    try:
        netloc = urlparse(url).netloc
    except ValueError:
        # urlparse rejects malformed hosts such as an unclosed IPv6 bracket.
        return None
    # Strip only a leading "www."; str.replace would also mangle hosts that
    # merely contain "www." elsewhere (e.g. "notwww.example.com").
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]
    return netloc

def safe_join_backlinks(domain, moz_results):
    """Join a domain's ``top_backlinks`` entries into one ``"; "``-separated string.

    Args:
        domain: Key to look up in ``moz_results``.
        moz_results: Mapping of domain -> metrics dict that may contain a
            ``top_backlinks`` list of strings, dicts or other values.

    Returns:
        The joined string. Dict entries contribute their ``source`` value,
        else their ``url`` value, else the dict's string form; every entry is
        converted to ``str``. Returns ``""`` when the domain or its backlinks
        are missing or empty, or when an unexpected error occurs.
    """
    try:
        backlinks = moz_results.get(domain, {}).get("top_backlinks", [])
        if not backlinks:
            return ""

        parts = []
        for link in backlinks:
            if isinstance(link, dict):
                # Prefer an explicit URL field; fall back to the dict itself.
                link = link.get('source', '') or link.get('url', '') or link
            # Coerce every entry (including a non-string 'source'/'url'
            # value) so the join cannot fail and drop the whole column value.
            parts.append(link if isinstance(link, str) else str(link))

        return "; ".join(parts)
    except Exception as e:
        # Best-effort formatting: a malformed entry yields an empty cell.
        print(f"    ⚠️ Error processing backlinks for {domain}: {e}")
        return ""

def get_moz_url_metrics(urls_batch):
    """Request Moz URL metrics for a batch of URLs.

    Posts ``{"targets": urls_batch}`` to the Moz ``v2/url_metrics`` endpoint
    using HTTP Basic authentication built from ``MOZ_ACCESS_ID`` and
    ``MOZ_SECRET_KEY``.

    Args:
        urls_batch: List of URLs sent as the request's ``targets``.

    Returns:
        list: The response's ``results`` entries, or an empty list when the
        status is not 200, ``results`` is absent or null, or the request
        raises.
    """
    endpoint = "https://lsapi.seomoz.com/v2/url_metrics"

    credentials = f"{MOZ_ACCESS_ID}:{MOZ_SECRET_KEY}"
    headers = {
        "Authorization": f"Basic {base64.b64encode(credentials.encode()).decode()}",
        "Content-Type": "application/json"
    }

    payload = {"targets": urls_batch}

    try:
        # Without a timeout a stalled connection would block the run
        # indefinitely; a timeout error is handled by the except below.
        response = requests.post(endpoint, headers=headers, json=payload, timeout=60)

        if response.status_code != 200:
            print(f"    ❌ Error: {response.status_code} - {response.text}")
            return []

        # "results" may be missing or null; always hand back a list.
        return response.json().get('results') or []

    except Exception as e:
        # Best-effort lookup: a failed batch is reported and skipped.
        print(f"    ❌ Exception: {e}")
        return []

def process_moz_results(moz_results):
    """Index raw Moz result dicts by domain.

    Every result with a non-empty ``root_domain`` is stored under that root
    domain (when several results share a root domain the last one wins).
    When a result also carries a ``subdomain`` string, the same metrics are
    additionally stored under that host (minus a leading ``www.``), and such
    exact-host entries take precedence over root-domain entries. This lets
    callers that look up by URL host find sub-domain hosts such as
    ``blog.example.com`` instead of silently getting no match.

    NOTE(review): the ``subdomain`` field is assumed from the Moz URL-metrics
    response format; when it is absent the output is keyed by root domain
    only.

    Args:
        moz_results: Iterable of result dicts as returned by the Moz API.

    Returns:
        Dict mapping domain -> metrics dict with the keys
        ``domain_authority``, ``page_authority``, ``spam_score``,
        ``root_domains_to_root_domain``, ``external_pages_to_root_domain``,
        ``pages_to_root_domain`` (each defaulting to 0 when missing) and an
        empty ``top_backlinks`` list.
    """
    metric_keys = (
        'domain_authority',
        'page_authority',
        'spam_score',
        'root_domains_to_root_domain',
        'external_pages_to_root_domain',
        'pages_to_root_domain',
    )

    def build_entry(result):
        entry = {key: result.get(key, 0) for key in metric_keys}
        entry['top_backlinks'] = []
        return entry

    processed_data = {}
    exact_hosts = {}

    for result in moz_results:
        root_domain = result.get('root_domain')
        if not root_domain:
            continue

        processed_data[root_domain] = build_entry(result)

        host = result.get('subdomain')
        if isinstance(host, str) and host:
            if host.startswith('www.'):
                host = host[len('www.'):]
            exact_hosts[host] = build_entry(result)

    # Exact-host matches override the coarser root-domain fallback.
    processed_data.update(exact_hosts)
    return processed_data

def enrich_domains_with_moz(domains_list, city_name, batch_size=10):
    """Look up Moz metrics for a list of domains, one batch at a time.

    Args:
        domains_list: Iterable of domain strings; falsy or blank entries are
            skipped.
        city_name: Accepted for the caller's convenience; not used here.
        batch_size: Number of URLs sent per API request.

    Returns:
        Dict merging the processed results of every successful batch.
    """
    # Each non-blank domain becomes an https URL with a trailing slash.
    targets = [
        f"https://{name.strip()}/"
        for name in domains_list
        if name and name.strip()
    ]
    batch_total = (len(targets) + batch_size - 1) // batch_size

    collected = {}
    starts = range(0, len(targets), batch_size)
    for batch_index, start in enumerate(starts, start=1):
        chunk = targets[start:start + batch_size]
        print(f"    📦 Batch {batch_index}/{batch_total}: {len(chunk)} URLs", end=" ")

        raw = get_moz_url_metrics(chunk)
        if not raw:
            print("❌ Failed")
        else:
            parsed = process_moz_results(raw)
            collected.update(parsed)
            print(f"✅ Got {len(parsed)} domains")

        # Pause between requests, but not after the final batch.
        if batch_index < batch_total:
            time.sleep(2)

    return collected

def save_progress(progress_data):
    """Persist ``progress_data`` to ``PROGRESS_FILE`` as indented JSON.

    The data is first written to ``<PROGRESS_FILE>.tmp`` and then moved over
    the real file with ``os.replace``. A failed or interrupted write therefore
    never leaves a truncated progress file behind, which would otherwise make
    the next run lose its resume state.

    Args:
        progress_data: JSON-serialisable progress dict.

    Returns:
        None. Failures are printed, not raised (best-effort save).
    """
    temp_path = None
    try:
        temp_path = f"{PROGRESS_FILE}.tmp"
        with open(temp_path, 'w') as f:
            json.dump(progress_data, f, indent=2)
        os.replace(temp_path, PROGRESS_FILE)
    except Exception as e:
        print(f"⚠️ Failed to save progress: {e}")
        # Drop the partial temp file; the previous progress file is untouched.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass

def load_progress():
    """Read the saved progress dict from ``PROGRESS_FILE``.

    Returns:
        The parsed JSON content when the file exists and loads cleanly;
        otherwise a fresh progress dict with empty ``completed_files`` and
        ``failed_files`` lists, zero ``total_cost`` and ``total_domains``, and
        ``last_updated`` set to ``None``. Load errors are printed.
    """

    def fresh_state():
        return {
            "completed_files": [],
            "failed_files": [],
            "total_cost": 0,
            "total_domains": 0,
            "last_updated": None
        }

    try:
        if not os.path.exists(PROGRESS_FILE):
            return fresh_state()
        with open(PROGRESS_FILE, 'r') as handle:
            return json.load(handle)
    except Exception as e:
        print(f"⚠️ Failed to load progress: {e}")
        return fresh_state()

def _record_failure(progress_data, file_id, reason):
    """Append ``"<file_id> - <reason>"`` to the failure list and persist it."""
    progress_data["failed_files"].append(f"{file_id} - {reason}")
    save_progress(progress_data)


def process_single_file(city_name, file_type, progress_data):
    """Enrich one city's TOP3 or TOP10 CSV with Moz metrics and save it.

    The first CSV (in sorted order) matching the file type's pattern inside
    ``BASE_PATH/<city_name>`` is loaded, its ``link`` column is reduced to
    domains, the unique domains are sent to Moz, and the metrics are written
    back as new columns. The result is saved to
    ``OUTPUT_BASE_PATH/<city>_<type>_moz_enriched.csv``.

    Every failure is recorded in ``progress_data["failed_files"]`` and saved
    immediately, so early-exit failures survive an interrupted session just
    like processing errors do.

    Args:
        city_name: Name of the city sub-directory under ``BASE_PATH``.
        file_type: ``"top3"`` selects ``*top_3*.csv``; anything else selects
            ``*top_10*.csv``.
        progress_data: Progress dict (see ``load_progress``); mutated in place.

    Returns:
        True when the file was already completed or was processed
        successfully, False on any failure.
    """
    print(f"\n🏙️ Processing: {city_name} - {file_type.upper()}")
    print("=" * 70)

    # Skip files finished in an earlier run (resume support).
    file_id = f"{city_name}_{file_type}"
    if file_id in progress_data["completed_files"]:
        print(f"✅ Already completed: {file_id}")
        return True

    # Locate the input file.
    file_pattern = "*top_3*.csv" if file_type == "top3" else "*top_10*.csv"
    city_path = os.path.join(BASE_PATH, city_name)

    if not os.path.exists(city_path):
        print(f"❌ City directory not found: {city_path}")
        _record_failure(progress_data, file_id, "Directory not found")
        return False

    # glob order is arbitrary; sort so the same file is picked on every run.
    matching_files = sorted(glob.glob(os.path.join(city_path, file_pattern)))

    if not matching_files:
        print(f"❌ No {file_type} file found for {city_name}")
        _record_failure(progress_data, file_id, "File not found")
        return False

    file_path = matching_files[0]
    print(f"📄 Found: {os.path.basename(file_path)}")

    try:
        # Load the data.
        df = pd.read_csv(file_path)
        print(f"📊 Loaded {len(df):,} rows")

        # Verify the required columns are present.
        required_columns = ["keyword", "city", "ranking", "link"]
        if not all(col in df.columns for col in required_columns):
            print(f"❌ Missing required columns")
            _record_failure(progress_data, file_id, "Missing columns")
            return False

        # Extract domains.
        df["domain"] = df["link"].apply(extract_domain)
        unique_domains = df["domain"].dropna().unique()

        print(f"🌐 Found {len(unique_domains)} unique domains")
        print(f"💰 Estimated cost: ${len(unique_domains) * 0.01:.2f}")

        # Moz enrichment.
        print(f"📊 Starting Moz enrichment...")
        moz_results = enrich_domains_with_moz(unique_domains, city_name)

        if not moz_results:
            print(f"❌ No Moz results obtained")
            _record_failure(progress_data, file_id, "No Moz data")
            return False

        # Apply the results to the dataframe.
        print(f"📋 Applying results to dataframe...")

        # Output column -> Moz metric key; domains without data get 0.
        column_sources = {
            "domain_authority": "domain_authority",
            "page_authority": "page_authority",
            "spam_score": "spam_score",
            "backlinks_count": "external_pages_to_root_domain",
            "referring_domains": "root_domains_to_root_domain",
            "pages_to_domain": "pages_to_root_domain",
        }
        for column, metric in column_sources.items():
            df[column] = df["domain"].map(
                lambda d, metric=metric: moz_results.get(d, {}).get(metric, 0)
            )
        df["top_backlinks"] = df["domain"].map(lambda d: safe_join_backlinks(d, moz_results))
        df["trust_flow"] = None  # Moz does not provide Trust Flow

        # Save the result right away.
        output_filename = f"{city_name}_{file_type}_moz_enriched.csv"
        output_path = os.path.join(OUTPUT_BASE_PATH, output_filename)
        df.to_csv(output_path, index=False)

        # Summary statistics.
        enriched_rows = (df["domain_authority"] > 0).sum()
        max_da = df["domain_authority"].max()
        avg_da = df[df["domain_authority"] > 0]["domain_authority"].mean() if enriched_rows > 0 else 0
        total_backlinks = df["backlinks_count"].sum()
        actual_cost = len(unique_domains) * 0.01

        print(f"\n📊 Results Summary:")
        print(f"💾 Saved: {output_filename}")
        print(f"📈 Enriched: {enriched_rows:,}/{len(df):,} rows")
        print(f"🏆 Max DA: {max_da}, Avg DA: {avg_da:.1f}")
        print(f"🔗 Total Backlinks: {total_backlinks:,}")
        print(f"💰 Cost: ${actual_cost:.2f}")

        # Update progress; the timestamp is timezone-aware (UTC).
        progress_data["completed_files"].append(file_id)
        progress_data["total_cost"] += actual_cost
        progress_data["total_domains"] += len(unique_domains)
        progress_data["last_updated"] = datetime.now(timezone.utc).isoformat()

        # Persist progress.
        save_progress(progress_data)

        return True

    except Exception as e:
        print(f"❌ Error processing {city_name} {file_type}: {e}")
        _record_failure(progress_data, file_id, f"Processing error: {str(e)}")
        return False

def discover_all_cities():
    """List the city folders found directly under ``BASE_PATH``.

    A city is any non-hidden sub-directory of ``BASE_PATH``. The enrichment
    output folder (``OUTPUT_BASE_PATH``) is excluded: it lives inside
    ``BASE_PATH`` and would otherwise be reported as a city, inflating the
    file count and producing a spurious "file not found" failure for it.

    Returns:
        Alphabetically sorted list of directory names, or ``[]`` when
        ``BASE_PATH`` cannot be listed (the error is printed).
    """
    print("🔍 Discovering all cities...")

    output_dir = os.path.abspath(OUTPUT_BASE_PATH)
    cities = []
    try:
        for item in os.listdir(BASE_PATH):
            if item.startswith('.'):
                continue
            item_path = os.path.join(BASE_PATH, item)
            if not os.path.isdir(item_path):
                continue
            if os.path.abspath(item_path) == output_dir:
                continue  # our own results folder, not a city
            cities.append(item)
    except OSError as e:
        print(f"❌ Error discovering cities: {e}")
        return []

    cities.sort()
    print(f"📁 Found {len(cities)} cities")
    return cities

def main():
    """Run the Moz enrichment over the TOP3 and TOP10 file of every city.

    Loads saved progress, discovers the city folders, processes each city's
    two files with pauses in between, prints a progress update after every
    tenth city, and finishes with a summary that lists up to the last ten
    recorded failures.
    """
    rule = "=" * 100

    print("🌟 ALL CITIES MOZ ENRICHMENT SYSTEM")
    print(rule)
    print("🎯 Processing ALL cities TOP3 and TOP10 files")
    print("💾 Progressive saving - each file saved immediately")
    print("🔄 Resume capability - can restart from interruption")

    # Load progress from a previous session, if any.
    progress_data = load_progress()
    completed_count = len(progress_data["completed_files"])

    if completed_count > 0:
        print(f"\n📈 Resuming from previous session:")
        print(f"✅ Already completed: {completed_count} files")
        print(f"💰 Cost so far: ${progress_data['total_cost']:.2f}")
        print(f"🌐 Domains so far: {progress_data['total_domains']}")

    # Discover the cities.
    all_cities = discover_all_cities()
    if not all_cities:
        print("❌ No cities found!")
        return

    # Work out the total workload: two files (top3, top10) per city.
    city_total = len(all_cities)
    total_files = city_total * 2
    remaining_files = total_files - completed_count

    print(f"\n📊 Processing Overview:")
    print(f"🏙️ Total cities: {city_total}")
    print(f"📄 Total files: {total_files}")
    print(f"🔄 Remaining files: {remaining_files}")
    print(f"💰 Estimated remaining cost: ${remaining_files * 3:.2f} (avg $3/file)")
    print(f"⏱️ Estimated time: {remaining_files * 2:.0f} minutes (avg 2min/file)")

    # Start processing.
    print(f"\n🚀 Starting processing...")

    successful_files, failed_files = [], []
    start_time = time.time()

    for position, city in enumerate(all_cities, start=1):
        print(f"\n🏙️ CITY {position}/{city_total}: {city}")
        print(f"📊 Progress: {((position - 1)/city_total*100):.1f}%")

        for file_type in ("top3", "top10"):
            outcome = process_single_file(city, file_type, progress_data)
            bucket = successful_files if outcome else failed_files
            bucket.append(f"{city}_{file_type}")

            # Short pause between the TOP3 and the TOP10 file.
            if file_type == "top3":
                print("💤 Waiting 3 seconds...")
                time.sleep(3)

        # Pause between cities, except after the last one.
        if position < city_total:
            print("💤 Waiting 5 seconds before next city...")
            time.sleep(5)

        # Progress update after every tenth city.
        if position % 10 == 0:
            elapsed_time = time.time() - start_time
            avg_time_per_city = elapsed_time / position
            estimated_remaining = (city_total - position) * avg_time_per_city

            print(f"\n📈 Progress Update:")
            print(f"✅ Completed cities: {position}/{city_total}")
            print(f"💰 Total cost so far: ${progress_data['total_cost']:.2f}")
            print(f"⏱️ Estimated time remaining: {estimated_remaining/60:.1f} minutes")

    # Final report.
    total_completed = len(progress_data["completed_files"])
    failures = progress_data["failed_files"]
    total_failed = len(failures)

    print(f"\n🎉 ALL CITIES PROCESSING COMPLETE!")
    print(rule)
    print(f"✅ Successfully processed: {total_completed} files")
    print(f"❌ Failed: {total_failed} files")
    print(f"💰 Total cost: ${progress_data['total_cost']:.2f}")
    print(f"🌐 Total domains: {progress_data['total_domains']:,}")
    print(f"📂 Results saved to: {OUTPUT_BASE_PATH}")

    # Show the failures (only the last ten).
    if failures:
        print(f"\n❌ Failed files:")
        for failure in failures[-10:]:
            print(f"  - {failure}")
        if len(failures) > 10:
            print(f"  ... and {len(failures) - 10} more")

    print(f"\n✨ All enriched data ready for analysis!")
    print(f"🎯 {total_completed} CSV files with Domain Authority, Page Authority, and Backlinks data!")


if __name__ == "__main__":
    main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    📦 Batch 61/105: 10 URLs ✅ Got 10 domains
    📦 Batch 62/105: 10 URLs ✅ Got 10 domains
    📦 Batch 63/105: 10 URLs ✅ Got 10 domains
    📦 Batch 64/105: 10 URLs ✅ Got 9 domains
    📦 Batch 65/105: 10 URLs ✅ Got 10 domains
    📦 Batch 66/105: 10 URLs ✅ Got 10 domains
    📦 Batch 67/105: 10 URLs ✅ Got 10 domains
    📦 Batch 68/105: 10 URLs ✅ Got 10 domains
    📦 Batch 69/105: 10 URLs ✅ Got 10 domains
    📦 Batch 70/105: 10 URLs ✅ Got 9 domains
    📦 Batch 71/105: 10 URLs ✅ Got 10 domains
    📦 Batch 72/105: 10 URLs ✅ Got 10 domains
    📦 Batch 73/105: 10 URLs ✅ Got 10 domains
    📦 Batch 74/105: 10 URLs ✅ Got 10 domains
    📦 Batch 75/105: 10 URLs ✅ Got 10 domains
    📦 Batch 76/105: 10 URLs ✅ Got 10 domains
    📦 Batch 77/105: 10 URLs ✅ Got 10 domains
    📦 Batch 78/105: 10 URLs ✅ Got 10 domains
    📦 Batch 79/105: 10 URLs ✅ Got 10 domains
    📦 Batch 80/105: 10 URLs ✅ Got 10 domains
    📦 Batch 81/105: 10 URLs ✅ Got 10 