In [1]:
import pandas as pd
import numpy as np

import warnings
# NOTE(review): this blanket filter also hides pandas deprecation / chained
# assignment warnings. It is kept so the cell output stays unchanged.
warnings.filterwarnings('ignore')

SOURCE_CSV = r"C:\Users\le\Desktop\数据分析\cleaned_source.csv"
OUTPUT_CSV = "final_cleaned_data.csv"
COLUMN_NAMES = [
    "CustomerID", "Age", "Gender", "Tenure",
    "Usage Frequency", "Support Calls", "Payment Delay",
    "Subscription Type", "Contract Length", "Total Spend",
    "Last Interact", "Churn",
]


def _split_age_gender(df: pd.DataFrame) -> pd.DataFrame:
    """Split a merged "<age> <gender>" value stored in the Age column.

    The text before the first space becomes Age (numeric, NaN when it cannot
    be parsed). The text after it, when present, becomes Gender. Rows whose
    Age value carries no gender part keep their existing Gender value instead
    of having it overwritten.

    Args:
        df: Frame with "Age" and "Gender" columns; modified in place.

    Returns:
        The same frame, for convenience.
    """
    parts = df["Age"].astype(str).str.split(" ", n=1, expand=True)
    # When no row contains a space the split yields a single column, so make
    # sure column 1 always exists (all-NaN in that case).
    parts = parts.reindex(columns=[0, 1])

    # Work on object dtype so mixing strings and NaN never needs an upcast.
    split_gender = parts[1].astype(object)
    existing_gender = df["Gender"].astype(object)

    df["Age"] = pd.to_numeric(parts[0], errors="coerce")
    df["Gender"] = split_gender.where(split_gender.notna(), existing_gender)
    return df


def preprocess_churn_data(raw: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw churn table and return the cleaned copy.

    Steps, in order:
      1. Keep only the first ``len(COLUMN_NAMES)`` columns.
      2. Split merged Age/Gender values and fill unparseable ages with the
         median age. Age is numeric but stays float when a fill happened.
      3. Drop rows with a missing Payment Delay, then cast it to int.
      4. Fill missing Gender with the most frequent value ("Unknown" when the
         column has no values at all) and missing Last Interact with its
         median.
      5. Cap Total Spend at its 99th percentile.

    Args:
        raw: Frame with the columns listed in ``COLUMN_NAMES``. Not modified.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        ValueError: If a non-missing Payment Delay value cannot be cast to
            int.
    """
    df = raw.iloc[:, :len(COLUMN_NAMES)].copy()

    df = _split_age_gender(df)
    # Plain assignment instead of chained ``inplace=True``: the chained form
    # does not update the frame under pandas Copy-on-Write.
    df["Age"] = df["Age"].fillna(df["Age"].median())

    # Drop missing values *before* the int cast; casting NaN to int raises.
    df.dropna(subset=["Payment Delay"], inplace=True)
    df["Payment Delay"] = df["Payment Delay"].astype(int)

    gender_mode = df["Gender"].mode()
    gender_fill = gender_mode.iloc[0] if not gender_mode.empty else "Unknown"
    df["Gender"] = df["Gender"].fillna(gender_fill)
    df["Last Interact"] = df["Last Interact"].fillna(df["Last Interact"].median())

    # Outlier handling: cap Total Spend at the 99th percentile.
    spend_cap = df["Total Spend"].quantile(0.99)
    df["Total Spend"] = np.where(
        df["Total Spend"] > spend_cap, spend_cap, df["Total Spend"]
    )
    return df


# True both when run as a script and inside a notebook kernel, so ``df`` and
# ``churned_users`` remain available to later cells.
if __name__ == "__main__":
    # NOTE(review): skiprows=2 with header=None discards the first two lines
    # of the file; confirm the file really has two non-data lines at the top.
    df = pd.read_csv(
        SOURCE_CSV,
        skiprows=2,
        header=None,
        names=COLUMN_NAMES,
    )
    df = preprocess_churn_data(df)

    # Count churned users.
    churned_users = df[df["Churn"] == 1]
    print(f"流失用户数量：{len(churned_users)}")

    # Save the cleaned data.
    df.to_csv(OUTPUT_CSV, index=False)
    print("预处理完成！")

流失用户数量：30492
预处理完成！
