# 五子棋AI数据分析 (Person E)

本笔记本对自对弈评估数据进行探索性分析（EDA）

In [None]:
# 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import os

# Configure the plotting style.
# Order matters: seaborn's set_style() rewrites 'font.sans-serif' with its own
# default font list, so the CJK-capable font has to be configured AFTER the
# seaborn calls. Otherwise the Chinese titles and labels used in the plots
# below fall back to fonts without CJK glyphs.
sns.set_style('whitegrid')
sns.set_context('notebook', font_scale=1.2)
plt.rcParams['font.sans-serif'] = ['SimHei']
# Draw minus signs as an ASCII hyphen instead of U+2212 (commonly needed with
# CJK fonts, which often do not provide that glyph).
plt.rcParams['axes.unicode_minus'] = False

print("✓ 库导入成功")

## 1. 数据加载与概览

In [None]:
# Load the most recent evaluation data.
def latest_by_mtime(paths):
    """Return the most recently modified path in *paths*, or None if empty.

    Modification time is used instead of ``os.path.getctime`` because on Unix
    ctime is the inode-change time (it moves on chmod/rename), so it does not
    reliably identify the newest results file.
    """
    if not paths:
        return None
    return max(paths, key=os.path.getmtime)


data_files = glob("../data/results/self_play/aggregated/results_*.csv")
latest_file = latest_by_mtime(data_files)
if latest_file is not None:
    df = pd.read_csv(latest_file)
    print(f"✓ 已加载数据: {latest_file}")
    print(f"  总记录数: {len(df)}")
else:
    # No aggregated results found: leave df as None so later cells can detect it.
    print("❌ 未找到数据文件")
    df = None

In [None]:
# Preview the first few rows of the loaded data
df.head()

In [None]:
# Basic information about the dataset (printed by DataFrame.info)
df.info()

In [None]:
# Descriptive statistics of the dataset
df.describe()

## 2. 数据质量检查

In [None]:
# Missing values: number of null entries in each column
missing_per_column = df.isna().sum()
print("缺失值统计:")
print(missing_per_column)

# Duplicates: number of rows flagged by DataFrame.duplicated()
duplicate_count = df.duplicated().sum()
print(f"\n重复记录数: {duplicate_count}")

In [None]:
# Collect every algorithm name that appears in either player column
seen_names = set(df['player1'].unique())
seen_names.update(df['player2'].unique())
algorithms = sorted(seen_names)

print(f"参与评估的AI算法 ({len(algorithms)}个):")
for algo in algorithms:
    print(f"  - {algo}")

## 3. 胜率分析

In [None]:
# Winner distribution
winner_counts = df['winner'].value_counts()
print("胜者分布:")
print(winner_counts)

# Visualization.
# value_counts() orders its rows by frequency, so a positional color list
# would give the same outcome a different color depending on the data.
# Colors are therefore looked up by label: player1 and player2 keep a fixed
# color, and any other outcome (presumably draws - TODO confirm the label
# used in the data) is drawn in gray.
outcome_colors = {'player1': 'green', 'player2': 'red'}
bar_colors = [outcome_colors.get(label, 'gray') for label in winner_counts.index]

plt.figure(figsize=(8, 6))
winner_counts.plot(kind='bar', color=bar_colors)
plt.title('胜者分布', fontsize=16, fontweight='bold')
plt.xlabel('胜者')
plt.ylabel('对局数')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Per-algorithm win rate, counting games played from either seat
win_stats = []

for algo in algorithms:
    games_played = 0
    games_won = 0
    # A game counts as a win when the 'winner' column names the seat
    # ('player1' or 'player2') that this algorithm occupied.
    for seat in ('player1', 'player2'):
        seat_games = df[df[seat] == algo]
        games_played += len(seat_games)
        games_won += (seat_games['winner'] == seat).sum()

    win_stats.append({
        'algorithm': algo,
        'total_games': games_played,
        'wins': games_won,
        # Guard against division by zero for an algorithm with no games
        'win_rate': games_won / games_played if games_played > 0 else 0
    })

# Best-performing algorithm first
win_df = pd.DataFrame(win_stats).sort_values('win_rate', ascending=False)
win_df

In [None]:
# Win-rate bar chart
plt.figure(figsize=(10, 6))
# win_df is sorted by win rate in descending order, while the 'RdYlGn'
# palette runs red -> yellow -> green. Reverse it so the highest win rate is
# drawn green and the lowest red, instead of the other way round.
colors = sns.color_palette('RdYlGn', len(win_df))[::-1]
plt.barh(win_df['algorithm'], win_df['win_rate'], color=colors)
plt.xlabel('胜率', fontsize=14)
plt.title('AI算法胜率对比', fontsize=16, fontweight='bold')
plt.xlim(0, 1)

# Value labels: bars sit at y = 0, 1, 2, ... in row order, so the enumerate
# position is the bar's y coordinate; the label goes just right of the bar end.
for position, rate in enumerate(win_df['win_rate']):
    plt.text(rate + 0.02, position, f"{rate:.1%}",
             va='center', fontsize=12)

plt.tight_layout()
plt.show()

## 4. 对局步数分析

In [None]:
# Summary statistics for the number of moves per game
moves = df['total_moves']
avg_moves = moves.mean()
median_moves = moves.median()
std_moves = moves.std()
fewest_moves = moves.min()
most_moves = moves.max()

print("对局步数统计:")
print(f"  平均步数: {avg_moves:.1f}")
print(f"  中位数: {median_moves:.0f}")
print(f"  标准差: {std_moves:.1f}")
print(f"  最少步数: {fewest_moves:.0f}")
print(f"  最多步数: {most_moves:.0f}")

In [None]:
# Histogram of game lengths, with the mean marked by a dashed red line
moves = df['total_moves']
mean_moves = moves.mean()

plt.figure(figsize=(10, 6))
plt.hist(moves, bins=30, color='skyblue', alpha=0.7, edgecolor='black')
plt.axvline(mean_moves, color='red', linestyle='--', linewidth=2,
            label=f"平均: {mean_moves:.1f}")
plt.title('对局步数分布', fontsize=16, fontweight='bold')
plt.xlabel('步数', fontsize=14)
plt.ylabel('频次', fontsize=14)
plt.grid(True, alpha=0.3)
plt.legend(fontsize=12)
plt.tight_layout()
plt.show()

## 5. 响应时间分析

In [None]:
# Response-time statistics per algorithm
time_stats = []

for algo in algorithms:
    # Pool the algorithm's average move time from both seats it played in
    times_first_seat = df.loc[df['player1'] == algo, 'player1_avg_time']
    times_second_seat = df.loc[df['player2'] == algo, 'player2_avg_time']
    pooled = pd.concat([times_first_seat, times_second_seat])

    summary = {'algorithm': algo}
    summary['mean_time'] = pooled.mean()
    summary['median_time'] = pooled.median()
    summary['std_time'] = pooled.std()
    time_stats.append(summary)

# Fastest algorithm (smallest mean time) first
time_df = pd.DataFrame(time_stats).sort_values('mean_time')
time_df

In [None]:
# Response-time box plot, one box per algorithm
plt.figure(figsize=(10, 6))

time_data = []
labels = []

for algo in algorithms:
    as_p1 = df[df['player1'] == algo]['player1_avg_time']
    as_p2 = df[df['player2'] == algo]['player2_avg_time']
    # Drop missing timings: a NaN in the sample would propagate into the
    # quartile computation and leave that algorithm's box undrawn.
    all_times = pd.concat([as_p1, as_p2]).dropna()
    time_data.append(all_times)
    labels.append(algo)

# Name the boxes through xticks rather than boxplot(labels=...): that keyword
# is deprecated since Matplotlib 3.9 (renamed to tick_labels), whereas setting
# the ticks explicitly works on both older and newer releases. Boxes are
# placed at x = 1..N by default.
plt.boxplot(time_data)
plt.xticks(range(1, len(labels) + 1), labels, rotation=45, ha='right')
plt.ylabel('响应时间 (秒)', fontsize=14)
plt.title('AI算法响应时间分布', fontsize=16, fontweight='bold')
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

## 6. 相关性分析

In [None]:
# Pairwise correlation between game length and both players' average move time
numeric_cols = ['total_moves', 'player1_avg_time', 'player2_avg_time']
corr = df.loc[:, numeric_cols].corr()

# Annotated heatmap with the color scale centered on zero correlation
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)
plt.figure(figsize=(8, 6))
sns.heatmap(corr, **heatmap_options)
plt.title('特征相关性矩阵', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

## 7. 关键发现总结

In [None]:
# Headline numbers pulled from the tables built in the earlier cells
banner = "=" * 60
print(banner)
print(" 关键发现")
print(banner)

top_winner = win_df.iloc[0]    # first row of win_df
fastest = time_df.iloc[0]      # first row of time_df
mean_moves = df['total_moves'].mean()
game_count = len(df)

print(f"\n1. 最高胜率: {top_winner['algorithm']} ({top_winner['win_rate']:.1%})")
print(f"2. 最快响应: {fastest['algorithm']} ({fastest['mean_time']:.4f}s)")
print(f"3. 平均对局长度: {mean_moves:.1f} 步")
print(f"4. 总评估对局: {game_count} 场")
print(f"\n5. 效率-性能权衡:")
# One line per algorithm, pairing its win rate with its mean response time
merged = win_df.merge(time_df, on='algorithm')
for record in merged.itertuples(index=False):
    print(f"   {record.algorithm:15s} | 胜率: {record.win_rate:.1%} | 时间: {record.mean_time:.4f}s")

## 8. 导出分析结果

In [None]:
# Write both summary tables to CSV, omitting the row index
exports = (
    (win_df, '../data/results/notebook_win_rates.csv'),
    (time_df, '../data/results/notebook_time_stats.csv'),
)
for table, destination in exports:
    table.to_csv(destination, index=False)

print("✓ 分析结果已导出")