# SPL快速诊断问题根因分析-错

本notebook基于《如何用SPL快速诊断问题根因 -- 错》文档，实现了一个可分步执行的故障诊断流程。

## 分析流程
1. **配置环境** - 设置SLS访问凭证和参数
2. **筛选报错spanId** - 创建SLS客户端，找出故障时间段内的根因span
3. **生成SPL查询条件** - 将根因span ID转换为SPL查询条件
4. **寻找报错特征** - 使用模式匹配分析错误特征
5. **交叉验证日志特征** - 验证故障根因
6. **结果总结** - 计算具体的根因候选项

## 使用方法
只需在“1. 配置环境”一节的代码cell中，将FAULT_START_TIME和FAULT_END_TIME修改为您的故障时间段，然后按顺序运行所有cell即可得到根因分析结果。

notebook会自动从运行时证据中分析并确定具体的根因服务。


## 1. 配置环境

设置故障时间段和SLS参数


In [1]:
import os
import sys

from aliyun.log import LogClient
from alibabacloud_sts20150401.client import Client as StsClient
from alibabacloud_sts20150401 import models as sts_models
from alibabacloud_tea_openapi import models as open_api_models
from Tea.exceptions import TeaException

sys.path.append('..')

# Fault window to analyse, written as "YYYY-MM-DD HH:MM:SS".
FAULT_START_TIME, FAULT_END_TIME = "2025-08-28 15:08:03", "2025-08-28 15:13:03"

# Where the trace data lives in SLS.
REGION = "cn-qingdao"
LOGSTORE_NAME = "logstore-tracing"
PROJECT_NAME = "proj-xtrace-a46b97cfdc1332238f714864c014a1b-cn-qingdao"

# Echo the effective configuration, one setting per line.
print(
    f"故障时间段: {FAULT_START_TIME} ~ {FAULT_END_TIME}",
    f"SLS项目: {PROJECT_NAME}",
    f"日志库: {LOGSTORE_NAME}",
    f"区域: {REGION}",
    sep="\n",
)

# Credentials are read from environment variables, for example:
#   export ALIBABA_CLOUD_ACCESS_KEY_ID="<your AccessKey ID>"
#   export ALIBABA_CLOUD_ACCESS_KEY_SECRET="<your AccessKey Secret>"
MAIN_ACCOUNT_ACCESS_KEY_ID = os.environ.get('ALIBABA_CLOUD_ACCESS_KEY_ID')
MAIN_ACCOUNT_ACCESS_KEY_SECRET = os.environ.get('ALIBABA_CLOUD_ACCESS_KEY_SECRET')
# The role ARN and the session name fall back to built-in defaults when unset.
ALIBABA_CLOUD_ROLE_ARN = os.environ.get('ALIBABA_CLOUD_ROLE_ARN', 'acs:ram::1672753017899339:role/tianchi-user-a')
STS_SESSION_NAME = os.environ.get('ALIBABA_CLOUD_ROLE_SESSION_NAME', 'my-sls-access')  # free-form session label

# Report whether every value needed for the STS call is present.
if not (MAIN_ACCOUNT_ACCESS_KEY_ID and MAIN_ACCOUNT_ACCESS_KEY_SECRET and ALIBABA_CLOUD_ROLE_ARN):
    print("❌ 请配置环境变量 ALIBABA_CLOUD_ACCESS_KEY_ID 和 ALIBABA_CLOUD_ACCESS_KEY_SECRET")
else:
    print("✅ SLS访问凭证配置正确")

故障时间段: 2025-08-28 15:08:03 ~ 2025-08-28 15:13:03
SLS项目: proj-xtrace-a46b97cfdc1332238f714864c014a1b-cn-qingdao
日志库: logstore-tracing
区域: cn-qingdao
✅ SLS访问凭证配置正确


## 2. 筛选报错spanId

创建SLS客户端，使用FindRootCauseSpans查找故障时间段内的根因span


In [2]:
from find_root_cause_spans_error import FindRootCauseSpans

def get_sts_credentials():
    """Assume the configured RAM role and return its temporary credentials.

    Builds an STS client for ``sts.{REGION}.aliyuncs.com`` from the main
    account AccessKey pair and requests a one-hour (3600 s) session for
    ``ALIBABA_CLOUD_ROLE_ARN``.

    Returns:
        The ``credentials`` object of the AssumeRole response, or ``None``
        when a required setting is missing or the call fails. Failures are
        reported on stdout rather than raised.
    """
    required_settings = (
        MAIN_ACCOUNT_ACCESS_KEY_ID,
        MAIN_ACCOUNT_ACCESS_KEY_SECRET,
        ALIBABA_CLOUD_ROLE_ARN,
    )
    if any(not setting for setting in required_settings):
        print("❌ 角色ARN缺失! 请在环境变量文件中配置 ALIBABA_CLOUD_ACCESS_KEY_ID, ALIBABA_CLOUD_ACCESS_KEY_SECRET")
        return None

    sts_client = StsClient(
        open_api_models.Config(
            access_key_id=MAIN_ACCOUNT_ACCESS_KEY_ID,
            access_key_secret=MAIN_ACCOUNT_ACCESS_KEY_SECRET,
            endpoint=f'sts.{REGION}.aliyuncs.com',
        )
    )
    role_request = sts_models.AssumeRoleRequest(
        role_arn=ALIBABA_CLOUD_ROLE_ARN,
        role_session_name=STS_SESSION_NAME,
        duration_seconds=3600,
    )

    try:
        sts_response = sts_client.assume_role(role_request)
        print("✅ 成功获取访问权限！")
        return sts_response.body.credentials
    except TeaException as sdk_error:
        # SDK-level failure: show the service's message and error code.
        print(f"❌ 获取STS临时凭证失败: {sdk_error.message}")
        print(f"  错误码: {sdk_error.code}")
        print("  请检查:1. 主账号AK是否正确;2. 目标角色ARN是否正确;3. 目标角色的信任策略是否已配置为信任您的主账号。")
        return None
    except Exception as unexpected_error:
        print(f"❌ 发生未知错误在获取STS凭证时: {unexpected_error}")
        return None

# --- 函数：创建SLS客户端 ---
def create_sls_client_with_sts():
    """Create an SLS ``LogClient`` authenticated with temporary STS credentials.

    Returns:
        A ``LogClient`` for ``{REGION}.log.aliyuncs.com``, or ``None`` when
        ``get_sts_credentials()`` yields no credentials.
    """
    temp_credentials = get_sts_credentials()
    if not temp_credentials:
        return None

    # aliyun-log-python-sdk takes the STS token through ``securityToken``.
    client_options = {
        "endpoint": f"{REGION}.log.aliyuncs.com",
        "accessKeyId": temp_credentials.access_key_id,
        "accessKey": temp_credentials.access_key_secret,
        "securityToken": temp_credentials.security_token,
    }
    sls_client = LogClient(**client_options)

    print("✅ SLS客户端已使用临时凭证创建。")
    return sls_client

print("--- 开始执行根因SPAN查找任务 ---")

# Always define the result list, so later cells can test it even when the
# client cannot be created or the query fails.
root_cause_span_ids = []

# 1. Create the SLS client backed by temporary STS credentials.
log_client_instance = create_sls_client_with_sts()

# 2. Only run the lookup when a client is available.
if log_client_instance:
    # 3. Build the root-cause span finder around that client.
    root_cause_finder = FindRootCauseSpans(
        client=log_client_instance,
        project_name=PROJECT_NAME,
        logstore_name=LOGSTORE_NAME,
        region=REGION,
        start_time=FAULT_START_TIME,
        end_time=FAULT_END_TIME
    )

    print("\n开始查找根因spans...")
    try:
        root_cause_span_ids = root_cause_finder.find_root_cause_spans()

        print(f"\n找到 {len(root_cause_span_ids)} 个根因span:")
        # Show at most the first ten ids to keep the output readable.
        for i, span_id in enumerate(root_cause_span_ids[:10]):
            print(f"{i+1}. {span_id}")

        if len(root_cause_span_ids) > 10:
            print(f"... 还有 {len(root_cause_span_ids) - 10} 个")
    except TeaException as e:
        print(f"\n❌ 查询日志时发生错误: {e.message}")
        print(f"  错误码: {e.code}")
        print("  请检查：1. 临时凭证是否已过期；2. 扮演的角色是否拥有对目标Project和Logstore的读权限。")
    except Exception as e:
        print(f"\n❌ 查询日志时发生未知错误: {e}")

else:
    print("\n❌ 因无法创建SLS客户端，任务终止。")

--- 开始执行根因SPAN查找任务 ---
✅ 成功获取访问权限！
✅ SLS客户端已使用临时凭证创建。
[FindRootCauseSpans] 初始化完成。
  开始时间: 2025-08-28 15:08:03 (时间戳: 1756364883)
  结束时间: 2025-08-28 15:13:03 (时间戳: 1756365183)

开始查找根因spans...
proj-xtrace-a46b97cfdc1332238f714864c014a1b-cn-qingdao logstore-tracing cn-qingdao
[FindRootCauseSpans] 查询时间范围: 2025-08-28 15:08:03 ~ 2025-08-28 15:13:03. 查询条件: statusCode>1
总共查询到的span数量: 1522
涉及的trace数量: 154

找到 154 个根因span:
1. a83bb1e960e12216
2. 76af7f3f7141bbf1
3. 2d27a4a14d01d2da
4. 7c3d14ba208aba2d
5. 149605380d997ed7
6. 8f63518d7e900af7
7. 4a88854bf7632fdf
8. b7640166a3e0bf26
9. cbeb0704d0ea7c37
10. 529e883159d97406
... 还有 144 个


## 3. 生成SPL查询条件

将根因span ID转换为SPL查询条件格式


In [3]:
# Turn the root-cause span ids into an "spanId='..' or spanId='..'" filter
# and publish it as SPAN_CONDITIONS for the following cells.
# The ids are looked up through globals() so this cell still runs (and yields
# an empty filter) when the previous cell never defined root_cause_span_ids.
if globals().get('root_cause_span_ids'):
    span_conditions = " or ".join(f"spanId='{span_id}'" for span_id in root_cause_span_ids)
    print("生成的spanId查询列表:")
    # Only the first 500 characters are echoed; the full filter is kept below.
    print((span_conditions[:500] + "...") if len(span_conditions) > 500 else span_conditions)

    SPAN_CONDITIONS = span_conditions
    print(f"\n✅ 查询条件已保存，包含 {len(root_cause_span_ids)} 个spanId")
else:
    print("❌ 未找到根因span，无法生成查询条件")
    SPAN_CONDITIONS = ""


生成的spanId查询列表:
spanId='a83bb1e960e12216' or spanId='76af7f3f7141bbf1' or spanId='2d27a4a14d01d2da' or spanId='7c3d14ba208aba2d' or spanId='149605380d997ed7' or spanId='8f63518d7e900af7' or spanId='4a88854bf7632fdf' or spanId='b7640166a3e0bf26' or spanId='cbeb0704d0ea7c37' or spanId='529e883159d97406' or spanId='00d547ce56971266' or spanId='d6323829b16e4487' or spanId='615c90a7fb37a021' or spanId='e3f22206b6ecab24' or spanId='99ee8078b1864cbd' or spanId='3b82e6f4458fba65' or spanId='4aac6cf2bb6b9c16' or spanId=...

✅ 查询条件已保存，包含 154 个spanId


## 4. 寻找报错特征

### 4.1 使用SPL模式匹配分析错误特征


In [4]:
from aliyun.log import GetLogsRequest
from datetime import datetime

# Result of the get_patterns query (the raw "ret" column), read by the summary
# cell. Defined up front so it exists on every path, including the skip path.
get_patterns_result = None

# Shared by the later query cells, so they are set even when this cell's
# query is skipped.
client = log_client_instance
# strptime() returns naive datetimes, so timestamp() interprets them in the
# local timezone of the machine running the notebook.
start_timestamp = int(datetime.strptime(FAULT_START_TIME, "%Y-%m-%d %H:%M:%S").timestamp())
end_timestamp = int(datetime.strptime(FAULT_END_TIME, "%Y-%m-%d %H:%M:%S").timestamp())

if SPAN_CONDITIONS:
    # Restrict the query to the root-cause spans and let the SLS function
    # get_patterns summarise the spanName/serviceName combinations.
    pattern_analysis_query = f"""
* | set session enable_remote_functions=true ;
set session velox_support_row_constructor_enabled=true;
with t0 as (
    select spanName, serviceName,
           JSON_EXTRACT_SCALAR(resources, '$["k8s.pod.ip"]') AS pod_ip,
           JSON_EXTRACT_SCALAR(resources, '$["k8s.node.name"]') AS node_name,
           JSON_EXTRACT_SCALAR(resources, '$["service.version"]') AS service_version,
           if((statusCode = 2 or statusCode = 3), 'true', 'false') as anomaly_label,
           cast(if((statusCode = 2 or statusCode = 3), 1, 0) as double) as error_count
    from log
    where {SPAN_CONDITIONS}
),
t1 as (
    select array_agg(spanName) as spanName,
           array_agg(serviceName) as serviceName,
           array_agg(pod_ip) as pod_ip,
           array_agg(node_name) as node_name,
           array_agg(service_version) as service_version,
           array_agg(anomaly_label) as anomaly_label,
           array_agg(error_count) as error_count
    from t0
),
t2 as (
    select row(spanName, serviceName) as table_row
    from t1
),
t3 as (
    select get_patterns(table_row, ARRAY['spanName', 'serviceName']) as ret
    from t2
)
select * from t3
"""

    print("执行错误特征分析查询...")

    request = GetLogsRequest(
        project=PROJECT_NAME,
        logstore=LOGSTORE_NAME,
        query=pattern_analysis_query,
        fromTime=start_timestamp,
        toTime=end_timestamp,
        line=1000
    )

    try:
        response = client.get_logs(request)
        if response and response.get_count() > 0:
            print(f"\n错误特征分析结果 ({response.get_count()} 条记录):")

            for log in response.get_logs():
                contents = log.get_contents()
                for key, value in contents.items():
                    print(f"{key}: {value}")
                    if key == "ret":
                        # Keep the raw value; the summary cell parses it.
                        get_patterns_result = value
                print("-" * 50)
        else:
            print("未找到错误特征分析结果")
    except Exception as e:
        print(f"错误特征分析查询失败: {e}")
        # Drop any partial result captured before the failure.
        get_patterns_result = None
else:
    print("❌ 无有效的spanId条件，跳过错误特征分析")


执行错误特征分析查询...

错误特征分析结果 (1 条记录):
ret: [["serviceName=payment","serviceName=frontend-proxy"],[152,2],null,null]
--------------------------------------------------


### 4.2 使用diff_patterns分析异常特征差异

使用diff_patterns函数分析根因span与非根因错误span的差异特征

In [5]:
# Use diff_patterns to contrast root-cause spans with the other error spans:
# - anomaly group ('true'):  spans matched by SPAN_CONDITIONS
# - control group ('false'): spans with statusCode>0 that are not in that list
# The aim is to surface the serviceName values that distinguish the
# root-cause spans, as a cross-check on the root-cause selection.

# Raw "ret" column of the diff_patterns query, read by the summary cell.
# Defined up front so it exists on every path, including the skip path.
diff_patterns_result = None

if SPAN_CONDITIONS:
    diff_pattern_query = f"""
statusCode>0 | set session enable_remote_functions=true ;
set session velox_support_row_constructor_enabled=true;
with t0 as (
    select spanName, serviceName,
           JSON_EXTRACT_SCALAR(resources, '$["k8s.pod.ip"]') AS pod_ip,
           JSON_EXTRACT_SCALAR(resources, '$["k8s.node.name"]') AS node_name,
           JSON_EXTRACT_SCALAR(resources, '$["service.version"]') AS service_version,
           if(({SPAN_CONDITIONS}), 'true', 'false') as anomaly_label,
           cast(if((statusCode = 2 or statusCode = 3), 1, 0) as double) as error_count
    from log
),
t1 as (
    select array_agg(spanName) as spanName,
           array_agg(serviceName) as serviceName,
           array_agg(pod_ip) as pod_ip,
           array_agg(node_name) as node_name,
           array_agg(service_version) as service_version,
           array_agg(anomaly_label) as anomaly_label,
           array_agg(error_count) as error_count
    from t0
),
t2 as (
    select row(serviceName, anomaly_label) as table_row
    from t1
),
t3 as (
    select diff_patterns(table_row, ARRAY['serviceName', 'anomaly_label'], 'anomaly_label', 'true', 'false') as ret
    from t2
)
select * from t3
"""

    print("执行diff_patterns差异模式分析查询...")

    # client, start_timestamp and end_timestamp come from the previous cell.
    request = GetLogsRequest(
        project=PROJECT_NAME,
        logstore=LOGSTORE_NAME,
        query=diff_pattern_query,
        fromTime=start_timestamp,
        toTime=end_timestamp,
        line=1000
    )

    try:
        response = client.get_logs(request)
        if response and response.get_count() > 0:
            # "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
            print(f"\n差异模式分析结果 ({response.get_count()} 条记录):")

            for log in response.get_logs():
                contents = log.get_contents()
                for key, value in contents.items():
                    print(f"{key}: {value}")
                    if key == "ret":
                        # Keep the raw value; the summary cell parses it.
                        diff_patterns_result = value
                print("-" * 50)
        else:
            print("未找到差异模式分析结果")
    except Exception as e:
        print(f"差异模式分析查询失败: {e}")
        # Drop any partial result captured before the failure.
        diff_patterns_result = None
else:
    print("❌ 无有效的spanId条件，跳过差异模式分析")

执行diff_patterns差异模式分析查询...
\n差异模式分析结果 (1 条记录):
ret: [["\"serviceName\"='payment'"],[152],[0],[0.987012987012987],[0.0],[0.987012987012987],[1.0],[0.0],null]
--------------------------------------------------


## 5. 交叉验证日志特征

使用get_log_patterns分析故障时间段内的日志模式，验证根因


In [6]:
# Log pattern analysis, done as two SLS queries:
#   1. stage 1 feeds every error span's "[serviceName] statusMessage" text to
#      get_log_patterns and reads back a model_id;
#   2. stage 2 passes that model_id to match_log_patterns to label each
#      message with the pattern it matches.
# client, start_timestamp and end_timestamp come from the get_patterns cell.

print("执行日志模式分析查询...")

# Stage 1: obtain the model_id.
log_pattern_query_stage1 = """
statusCode > 0 | set session enable_remote_functions=true;
set session velox_support_row_constructor_enabled=true;

with t0 as (
  select statusCode, statusMessage, serviceName,
         CONCAT('[', serviceName, '] ', statusMessage) as combined_message
  from log
),
t1 as (
  select array_agg(combined_message) as contents
  from t0
),
t2 as (
  -- 调用get_log_patterns函数进行模式学习
  select
    contents,
    get_log_patterns(contents, ARRAY[' ', '\\n', '\\t', '\\r', '\\f', '\\v', ':', ',', '[', ']'], null, null, '{"threshold": 3, "tolerance": 0.3, "maxDigitRatio": 0.3}') as ret
  from t1
)
select
  ret.model_id as model_id,
  ret.error_msg as error_msg
from t2
"""

print("第一阶段：获取model_id...")
stage1_request = GetLogsRequest(
    project=PROJECT_NAME,
    logstore=LOGSTORE_NAME,
    query=log_pattern_query_stage1,
    fromTime=start_timestamp,
    toTime=end_timestamp,
    line=10
)

try:
    stage1_response = client.get_logs(stage1_request)
    if stage1_response and stage1_response.get_count() > 0:
        first_log = stage1_response.get_logs()[0]
        stage1_contents = first_log.get_contents()
        model_id = stage1_contents.get("model_id", "")
        error_msg = stage1_contents.get("error_msg", "")

        print(f"✅ 成功获取model_id: {model_id}")
        # A missing value can come back as the literal text "null"; that is
        # not an error message and must not be reported as one.
        if error_msg and error_msg != "null":
            print(f"错误信息: {error_msg}")

        # Stage 2: match messages against the model (skipped without a usable id).
        if model_id and model_id != "null":
            log_pattern_query_stage2 = f"""
statusCode > 0 | set session presto_velox_mix_run_not_check_linked_agg_enabled=true;
set session presto_velox_mix_run_support_complex_type_enabled=true;
set session velox_sanity_limit_enabled=false;
set session enable_remote_functions=true;

with t0 as (
    select CONCAT('[', serviceName, '] ', statusMessage) as combined_message
    from log
)
select
    ret.is_matched as is_matched,
    ret.pattern_id as pattern_id,
    ret.pattern as pattern,
    ret.regexp as pattern_regexp,
    ret.variables as variables,
    combined_message as content
from (
    -- 调用match_log_patterns函数，使用第一阶段生成的model_id进行匹配
    select match_log_patterns('{model_id}', combined_message) as ret, combined_message
    from t0
)
where ret.is_matched = true
limit 50000
"""

            print("第二阶段：使用model_id进行模式匹配...")
            stage2_request = GetLogsRequest(
                project=PROJECT_NAME,
                logstore=LOGSTORE_NAME,
                query=log_pattern_query_stage2,
                fromTime=start_timestamp,
                toTime=end_timestamp,
                line=50000
            )

            stage2_response = client.get_logs(stage2_request)

            if stage2_response and stage2_response.get_count() > 0:
                # "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
                print(f"\n✅ 日志模式匹配结果 ({stage2_response.get_count()} 条记录):")

                # ===== Tally the results =====
                pattern_counts = {}           # occurrences per pattern
                service_pattern_counts = {}   # matched messages per service

                for log in stage2_response.get_logs():
                    contents = log.get_contents()
                    pattern = contents.get('pattern', '')
                    content = contents.get('content', '')

                    if pattern:
                        pattern_counts[pattern] = pattern_counts.get(pattern, 0) + 1

                        # content has the form "[serviceName] message"; take the bracketed part.
                        if content.startswith('[') and ']' in content:
                            service_name = content.split(']')[0][1:]
                            service_pattern_counts[service_name] = service_pattern_counts.get(service_name, 0) + 1

                # Most frequent first.
                sorted_patterns = sorted(pattern_counts.items(), key=lambda x: x[1], reverse=True)
                sorted_services = sorted(service_pattern_counts.items(), key=lambda x: x[1], reverse=True)

                # ===== Show the results =====
                print("\n🔍 主要日志模式 (按出现频率排序):")
                print("注意: 只显示服务名的模式通常表示该服务的statusMessage为空")
                print("有具体错误信息的模式更可能指向真正的根因")
                for pattern, count in sorted_patterns[:10]:
                    print(f"出现次数: {count}")
                    print(f"模式: {pattern}")
                    print("-" * 80)

                print("\n📊 涉及的服务排序 (按错误频率):")
                for service, count in sorted_services[:10]:
                    print(f"{service}: {count} 次错误")

            else:
                print("❌ 第二阶段：未找到匹配的日志模式")
        else:
            print("❌ 无法获取有效的model_id")

    else:
        print("❌ 第一阶段：未找到日志模式分析结果")

except Exception as e:
    print(f"❌ 日志模式分析查询失败: {e}")
    import traceback
    traceback.print_exc()

执行日志模式分析查询...
第一阶段：获取model_id...
✅ 成功获取model_id: 1593c527-e17b-4202-91d1-19b202975f36
错误信息: null
第二阶段：使用model_id进行模式匹配...
\n✅ 日志模式匹配结果 (15554 条记录):
\n🔍 主要日志模式 (按出现频率排序):
注意: 只显示服务名的模式通常表示该服务的statusMessage为空
有具体错误信息的模式更可能指向真正的根因
出现次数: 15250
模式: [<*>] 
--------------------------------------------------------------------------------
出现次数: 152
模式: [checkout] Payment request failed. Invalid token. app.loyalty.level=gold
--------------------------------------------------------------------------------
出现次数: 152
模式: [checkout] rpc error: code = Internal desc = failed to charge card: could not charge the card: rpc error: code = Unknown desc = Payment request failed. Invalid token. app.loyalty.level=gold
--------------------------------------------------------------------------------
\n📊 涉及的服务排序 (按错误频率):
currency: 13418 次错误
checkout: 918 次错误
frontend: 608 次错误
frontend-proxy: 306 次错误
load-generator: 152 次错误
payment: 152 次错误


## 6. 结果总结

汇总分析结果并给出诊断结论


In [7]:
print("🔍 Step 5: Root Cause Analysis Summary")
print("=" * 60)

# Recap of what was analysed.
print("📊 分析总结：")
print(f"   异常时间段：{FAULT_START_TIME} ~ {FAULT_END_TIME}")
print(f"   分析的SLS项目：{PROJECT_NAME}")
# Falls back to 0 when the span lookup cell never defined the list.
print(f"   发现错误span数量：{len(globals().get('root_cause_span_ids', []))}")

print("\n🎯 根因发现：")

# Make sure both pattern results exist, defaulting to None when the
# corresponding query cell did not run.
globals().setdefault('get_patterns_result', None)
globals().setdefault('diff_patterns_result', None)

# Outcome placeholders; TARGET_SERVICE is derived from the evidence later on.
TARGET_SERVICE = "unknown"
pattern_evidence = False

# Error-span evidence exists when the span list is defined and non-empty.
error_span_evidence = bool(globals().get('root_cause_span_ids'))
if error_span_evidence:
    print(f"   ✅ 已找到 {len(root_cause_span_ids)} 个错误span")
else:
    print("   ❌ 未找到错误span")

# 从运行时收集的证据中动态提取TARGET_SERVICE
def parse_service_from_evidence():
    """Derive the suspected root-cause service from the pattern query results.

    Reads the module-level ``get_patterns_result`` and ``diff_patterns_result``
    values (each either a JSON string as returned by SLS or an already parsed
    list) and prints its findings as it goes.

    * get_patterns, e.g.
      ``[["serviceName=payment","serviceName=frontend-proxy"],[152,2],null,null]``:
      the service whose pattern has the highest count becomes the candidate.
    * diff_patterns, e.g.
      ``[["\\"serviceName\\"='payment'"],[152],[0],...]``:
      a service named there either confirms the candidate or, when
      get_patterns produced none, becomes the candidate itself.

    Returns:
        tuple[str, bool]: ``(service_name, True)`` when a service was found,
        otherwise ``("unknown", False)``.
    """
    # Local import: json is not among the notebook's top-level imports.
    import json

    def _decode(raw):
        """Parse a JSON string result; pass an already parsed value through."""
        # json.loads replaces the former eval()-based parsing: it never
        # executes the service's text and does not need the 'null' -> 'None'
        # rewrite that corrupted service names containing "null".
        return json.loads(raw) if isinstance(raw, str) else raw

    target_service = "unknown"

    # 1. get_patterns: pick the service with the highest count.
    patterns_raw = globals().get('get_patterns_result')
    if patterns_raw:
        try:
            result = _decode(patterns_raw)
            if len(result) >= 2 and isinstance(result[0], list) and isinstance(result[1], list):
                max_count = 0
                # zip() stops at the shorter list, so patterns without a count are ignored.
                for pattern, count in zip(result[0], result[1]):
                    # Patterns look like "serviceName=product-catalog".
                    if "serviceName=" in pattern:
                        service = pattern.split("serviceName=")[1].strip('"\'')
                        if count > max_count:
                            max_count = count
                            target_service = service
                # Report only the final winner, not every intermediate maximum.
                if target_service != "unknown":
                    print(f"   ✅ get_patterns发现主要错误服务: {target_service} (错误数: {max_count})")
        except Exception as e:
            # Best-effort parsing: report and continue with diff_patterns.
            print(f"   ⚠️ get_patterns结果解析失败: {e}")

    # 2. diff_patterns: confirm the candidate, or supply one if none was found.
    diff_raw = globals().get('diff_patterns_result')
    if diff_raw:
        try:
            result = _decode(diff_raw)
            if len(result) >= 1 and isinstance(result[0], list):
                for pattern in result[0]:
                    if "serviceName" in pattern and "=" in pattern:
                        # Patterns look like "serviceName"='product-catalog'.
                        service = pattern.split("='")[1].strip("'\"")
                        print(f"   ✅ diff_patterns确认异常服务: {service}")

                        if service == target_service:
                            # Both analyses agree on the same service.
                            print(f"   ✅ 多重证据确认: {service} 是主要根因服务")
                            return target_service, True
                        elif target_service == "unknown":
                            # get_patterns found nothing; fall back to this service.
                            target_service = service
                            return target_service, True
        except Exception as e:
            print(f"   ⚠️ diff_patterns结果解析失败: {e}")

    # 3. Report whatever candidate is left.
    if target_service != "unknown":
        return target_service, True
    print(f"   ❌ 无法从运行时证据中提取明确的目标服务")
    return "unknown", False

# Resolve the target service; the pattern results are only consulted when
# error spans were found.
if not error_span_evidence:
    print(f"   ❌ 无足够的模式分析证据")
    TARGET_SERVICE, pattern_evidence = "unknown", False
else:
    TARGET_SERVICE, pattern_evidence = parse_service_from_evidence()
    print(
        f"   ✅ 根据运行时证据确定目标服务: {TARGET_SERVICE}"
        if pattern_evidence
        else f"   ❌ 无足够的模式分析证据"
    )

print("\n🏆 根因候选：")

# The verdict needs both kinds of evidence.
evidence = error_span_evidence and pattern_evidence

# Candidate and confidence level: both kinds of evidence -> "高",
# error spans only -> "中", neither -> "低".
if evidence:
    root_cause_candidate, confidence = f"{TARGET_SERVICE}", "高"
elif error_span_evidence:
    root_cause_candidate, confidence = "unknown", "中"
else:
    root_cause_candidate, confidence = "unknown", "低"

print(f"   🎯 {root_cause_candidate}")
print(f"   📈 置信度：{confidence}")

# Supporting evidence for the chosen level.
if evidence:
    print(
        f"   ✅ 证据：TRUE（已检测到异常）",
        f"   📝 支持证据：",
        f"      - 发现 {len(root_cause_span_ids)} 个错误span",
        f"      - diff_patterns分析表明 {TARGET_SERVICE} 服务异常",
        f"      - 日志模式分析验证了服务问题",
        f"      - 错误特征分析确认了根因位置",
        sep="\n",
    )
elif error_span_evidence:
    print(
        f"   ❌ 证据：FALSE（模式不够明确）",
        f"   📝 支持证据：",
        f"      - 发现 {len(root_cause_span_ids)} 个错误span",
        f"      - 但模式分析结果不够明确",
        sep="\n",
    )
else:
    print(
        f"   ❌ 证据：FALSE（证据不足）",
        f"   📝 支持证据：",
        f"      - 错误span或模式分析数据有限",
        sep="\n",
    )

# Recommended next steps for the same three outcomes.
print("\n💡 建议：")
if evidence:
    print(
        f"   - 检查 {TARGET_SERVICE} 服务的健康状态",
        f"   - 查看 {TARGET_SERVICE} 的错误日志和异常",
        f"   - 验证 {TARGET_SERVICE} 的部署和配置",
        f"   - 考虑重启或修复 {TARGET_SERVICE} 服务",
        sep="\n",
    )
elif confidence == "中":
    print(
        f"   - 进一步分析错误span的分布模式",
        f"   - 检查多个可疑服务的状态",
        f"   - 扩大时间范围进行分析",
        sep="\n",
    )
else:
    print(
        f"   - 调整分析参数（时间范围等）",
        f"   - 核查数据是否完整",
        f"   - 考虑其他分析方法",
        sep="\n",
    )

# Final answer banner.
separator = "=" * 60
print("\n" + separator)
print(f"🎯 最终答复：{root_cause_candidate}")
print(f"📈 置信度：{confidence}")
print(f"🔍 证据：{'TRUE' if evidence else 'FALSE'}")
print(separator)

🔍 Step 5: Root Cause Analysis Summary
📊 分析总结：
   异常时间段：2025-08-28 15:08:03 ~ 2025-08-28 15:13:03
   分析的SLS项目：proj-xtrace-a46b97cfdc1332238f714864c014a1b-cn-qingdao
   发现错误span数量：154

🎯 根因发现：
   ✅ 已找到 154 个错误span
   ✅ get_patterns发现主要错误服务: payment (错误数: 152)
   ✅ diff_patterns确认异常服务: payment
   ✅ 多重证据确认: payment 是主要根因服务
   ✅ 根据运行时证据确定目标服务: payment

🏆 根因候选：
   🎯 payment
   📈 置信度：高
   ✅ 证据：TRUE（已检测到异常）
   📝 支持证据：
      - 发现 154 个错误span
      - diff_patterns分析表明 payment 服务异常
      - 日志模式分析验证了服务问题
      - 错误特征分析确认了根因位置

💡 建议：
   - 检查 payment 服务的健康状态
   - 查看 payment 的错误日志和异常
   - 验证 payment 的部署和配置
   - 考虑重启或修复 payment 服务

🎯 最终答复：payment
📈 置信度：高
🔍 证据：TRUE
