# Python 统计分析完整代码示例

```python
# 基础库导入
import numpy as np
import pandas
import scipy as sp
import matplotlib.pyplot as plt
import seaborn
from statsmodels.formula.api import ols
import statsmodels.api as sm

# I. Representing and converting data
## 1. Build a DataFrame from NumPy arrays
t = np.linspace(-6, 6, num=20)  # 20 evenly spaced points from -6 to 6
sin_t, cos_t = np.sin(t), np.cos(t)
# One column per array; the dict keys become the column names.
df = pandas.DataFrame({'t': t, 'sin': sin_t, 'cos': cos_t})
print("从数组创建的DataFrame前5行：")
print(df.head(), "\n")

## 2. Load external data (brain_size.csv as the example)
# Fields are separated by ';' and a lone "." is read as a missing value.
data = pandas.read_csv(
    'examples/brain_size.csv',
    sep=';',
    na_values=".",
)
print("加载的brain_size数据形状：", data.shape)
print("数据列名：", list(data.columns), "\n")


# II. Working with the DataFrame
## 1. Inspect a single column
print("Gender列前5行数据：")
print(data['Gender'].head(), "\n")

## 2. Select rows and compute on them
# Keep only the rows whose Gender is 'Female', then average their VIQ.
female_viq_mean = data.loc[data['Gender'] == 'Female', 'VIQ'].mean()
print(f"女性VIQ平均值：{female_viq_mean:.2f}", "\n")

## 3. Group-wise operations (groupby)
# Split the rows by the value of the Gender column.
groupby_gender = data.groupby('Gender')
print("按性别分组的VIQ平均值：")
# Iterating a grouped column yields (group key, Series of that group's values).
for gender, group_data in groupby_gender['VIQ']:
    print(f"{gender}：{group_data.mean():.2f}")
print("\n所有数值列的分组平均值：")
# numeric_only=True restricts the aggregation to numeric columns, as the
# label above promises; without it, pandas >= 2.0 raises a TypeError as soon
# as the frame contains any non-numeric column besides the grouping key.
print(groupby_gender.mean(numeric_only=True), "\n")

## 4. Visualization (scatter matrices)
from pandas import plotting

# Draw one scatter matrix per group of columns, each in its own figure
# with its own title, and show it before moving to the next.
for cols, fig_title in (
    (['Weight', 'Height', 'MRI_Count'], 'Weight、Height、MRI_Count散点矩阵'),
    (['PIQ', 'VIQ', 'FSIQ'], 'PIQ、VIQ、FSIQ散点矩阵'),
):
    plotting.scatter_matrix(data[cols])
    plt.suptitle(fig_title)
    plt.show()


# III. Hypothesis testing
## 1. One-sample tests
# t-test of the VIQ column against a hypothesised population mean of 0.
t_stat, p_value = sp.stats.ttest_1samp(data['VIQ'], popmean=0)
print(f"单样本t检验（VIQ均值是否为0）：t={t_stat:.4f}, p={p_value:.4f}")

# Wilcoxon signed-rank test on the same column.
stat, p = sp.stats.wilcoxon(data['VIQ'])
print(f"Wilcoxon符号秩检验：统计量={stat:.4f}, p={p:.4f}", "\n")

## 2. Two-sample tests
# Split the VIQ scores into two samples by Gender using boolean masks.
female_viq = data.loc[data['Gender'] == 'Female', 'VIQ']
male_viq = data.loc[data['Gender'] == 'Male', 'VIQ']

# Independent-samples t-test between the two groups.
t_stat, p_value = sp.stats.ttest_ind(female_viq, male_viq)
print(f"两样本t检验（男女VIQ差异）：t={t_stat:.4f}, p={p_value:.4f}")

# Mann-Whitney U test on the same two samples.
stat, p = sp.stats.mannwhitneyu(female_viq, male_viq)
print(f"Mann-Whitney U检验：统计量={stat:.4f}, p={p:.4f}", "\n")

## 3. Paired tests
# Paired t-test: FSIQ and PIQ are two measurements on the same rows.
t_stat, p_value = sp.stats.ttest_rel(data['FSIQ'], data['PIQ'])
print(f"配对t检验（FSIQ与PIQ差异）：t={t_stat:.4f}, p={p_value:.4f}")

# Same question asked another way: one-sample t-test of the per-row
# difference against 0.
diff = data['FSIQ'].sub(data['PIQ'])
t_stat_diff, p_value_diff = sp.stats.ttest_1samp(diff, popmean=0)
print(f"差值单样本t检验：t={t_stat_diff:.4f}, p={p_value_diff:.4f}")

# Wilcoxon signed-rank test on the pairs, with method="approx" requested
# explicitly.
stat, p = sp.stats.wilcoxon(data['FSIQ'], data['PIQ'], method="approx")
print(f"Wilcoxon配对检验：统计量={stat:.4f}, p={p:.4f}", "\n")


# IV. Linear models and analysis of variance
## 1. Simple linear regression
# Synthetic data: the line y = -5 + 3x plus Gaussian noise scaled by 4,
# drawn from a generator with a fixed seed.
rng = np.random.default_rng(27446968)
x = np.linspace(-5, 5, 20)
y = -5 + 3*x + 4 * rng.normal(size=x.shape)
reg_data = pandas.DataFrame({'x': x, 'y': y})
# Fit y against x by ordinary least squares using the formula interface.
model = ols(formula="y ~ x", data=reg_data).fit()
print("简单线性回归模型摘要：")
print(model.summary(), "\n")

## 2. Categorical predictors
# C(...) in the formula marks Gender as a categorical variable.
model_gender = ols(formula="VIQ ~ C(Gender)", data=data).fit()
print("性别对VIQ影响的模型摘要：")
print(model_gender.summary(), "\n")

## 3. Reshape to long format and analyse
# Put FSIQ and PIQ into one shared 'iq' column; 'type' records which score
# each row came from.
data_fisq = pandas.DataFrame({'iq': data['FSIQ'], 'type': 'fsiq'})
data_piq = pandas.DataFrame({'iq': data['PIQ'], 'type': 'piq'})
# Stack the two frames on top of each other (original row labels are kept).
data_long = pandas.concat([data_fisq, data_piq])
model_iq = ols(formula="iq ~ type", data=data_long).fit()
print("不同类型IQ差异的模型摘要：")
print(model_iq.summary(), "\n")

## 4. Multiple regression
iris_data = pandas.read_csv('examples/iris.csv')
# Model sepal_width as a function of both name and petal_length.
model_multi = ols(
    formula='sepal_width ~ name + petal_length',
    data=iris_data,
).fit()
print("多元回归模型摘要（萼片宽度影响因素）：")
print(model_multi.summary(), "\n")

## 5. Post-hoc test (ANOVA)
# F-test of a single linear contrast over the fitted coefficients.
# NOTE(review): per the printed label, the +1/-1 entries are meant to pick
# the versicolor and virginica coefficients — confirm against the order of
# model_multi.params.
f_test_result = model_multi.f_test(r_matrix=[0, 1, -1, 0])
print("方差分析结果（versicolor与virginica系数差异）：")
print(f_test_result, "\n")


# 五、Seaborn可视化
## 1. 散点矩阵（pairplot）
# 假设wage_data为包含WAGE、AGE、EDUCATION、SEX的数据集
# seaborn.pairplot(wage_data, vars=['WAGE', 'AGE', 'EDUCATION'], kind='reg')
# seaborn.pairplot(wage_data, vars=['WAGE', 'AGE', 'EDUCATION'], kind='reg', hue='SEX')
# plt.show()

## 2. 单变量回归图（lmplot）
# seaborn.lmplot(y='WAGE', x='EDUCATION', data=wage_data)
# plt.title('工资与教育程度的回归关系')
# plt.show()

## 3. 交互作用检验
# 假设wage_data包含wage、education、gender列
# result = ols(formula='wage ~ education + gender + education * gender', data=wage_data).fit()
# print("交互作用模型摘要：")
# print(result.summary())