In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import random

In [2]:
## 0、数据处理成csv形式

In [3]:
# Column names are supplied explicitly through `names=` for both raw files.
columns = ['age', 'workclass', 'fnlwgt', 'education', 'educationNum', 'maritalStatus', 'occupation', 'relationship', 'race', 'sex',
          'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry', 'income']

# skipinitialspace=True discards blanks that directly follow a delimiter, so string
# fields are stored without leading padding in the exported CSVs.
# NOTE(review): the later `.str.strip()` on `income` suggests the raw fields are
# padded (", " delimiter) — confirm against the raw files.
df_train_set = pd.read_csv('./adult.data', names=columns, skipinitialspace=True)
# skiprows=1: the first line of the test file is not a valid record.
df_test_set = pd.read_csv('./adult.test', names=columns, skiprows=1, skipinitialspace=True)

# Persist both splits as headered CSV files (no index column).
df_train_set.to_csv('./train_adult.csv', index=False)
df_test_set.to_csv('./test_adult.csv', index=False)

In [4]:
## 1、数据读取

In [5]:
# Read back the two headered CSV files written by the export step.
df_train_set, df_test_set = (
    pd.read_csv(csv_path) for csv_path in ('./train_adult.csv', './test_adult.csv')
)

In [6]:
## 2、数据预处理

In [7]:
### 2.1 删除对应属性

In [8]:
# Remove the same two columns from both splits so their schemas stay aligned.
dropped_columns = ['fnlwgt', 'educationNum']
df_train_set = df_train_set.drop(columns=dropped_columns)
df_test_set = df_test_set.drop(columns=dropped_columns)

In [9]:
### 2.2 重复行记录处理

In [10]:
# Keep only the first occurrence of each fully identical row, in each split.
df_train_set = df_train_set.drop_duplicates(keep='first')
df_test_set = df_test_set.drop_duplicates(keep='first')

In [11]:
### 2.3 缺失值处理

In [12]:
# Discard every row that has at least one missing value, in each split.
df_train_set = df_train_set.dropna(axis=0, how='any')
df_test_set = df_test_set.dropna(axis=0, how='any')

In [13]:
### 2.4 异常值处理

In [14]:
# String-valued columns; a literal '?' anywhere in one of them marks the row for removal.
new_columns = ['workclass', 'education', 'maritalStatus', 'occupation', 'relationship', 'race', 'sex',
               'nativeCountry', 'income']

for col in new_columns:
    # Build the "contains a question mark" mask per split, then keep the complement.
    train_flagged = df_train_set[col].str.contains(r'\?', regex=True)
    test_flagged = df_test_set[col].str.contains(r'\?', regex=True)
    df_train_set = df_train_set[~train_flagged]
    df_test_set = df_test_set[~test_flagged]

# Renumber rows 0..n-1 now that rows have been removed; the old index is discarded.
df_train_set = df_train_set.reset_index(drop=True)
df_test_set = df_test_set.reset_index(drop=True)

In [15]:
### 2.5 连续型变量处理

In [16]:
# Bin edges for age; with labels=False pd.cut returns the integer position of the
# bin each value falls into instead of an interval label.
bins = [0, 25, 50, 75, 100]
for frame in (df_train_set, df_test_set):
    frame['age'] = pd.cut(frame['age'], bins, labels=False)

In [17]:
### 2.6 离散型变量处理

In [18]:
# 定义一个通用的映射函数，处理未知类别

In [19]:
def create_mapping(column_values):
    """Build a label -> integer code lookup for one column.

    Distinct values are numbered from 0 in the order `column_values.unique()`
    returns them. An extra 'unknown' key is then set to the current size of the
    lookup, to be used as the code for labels that were never seen.

    Args:
        column_values: object exposing `.unique()` (e.g. a pandas Series).

    Returns:
        dict mapping each distinct value, plus 'unknown', to an int code.
    """
    distinct_labels = column_values.unique()
    codes = dict(zip(distinct_labels, range(len(distinct_labels))))
    codes['unknown'] = len(codes)  # reserved code for unseen categories
    return codes

In [20]:
# ---- Training set ----
# Every per-column lookup is kept in `mappings` so the test set can be encoded
# with exactly the same codes.
mappings = {}

# income is encoded separately with a fixed lookup, so it is left out here.
categorical_cols = [name for name in new_columns if name != 'income']

for col in categorical_cols:
    mappings[col] = create_mapping(df_train_set[col])
    df_train_set[col] = df_train_set[col].map(mappings[col])

# Income encoding: surrounding whitespace is stripped before the lookup.
income_mapping = {'<=50K': 0, '>50K': 1}
mappings['income'] = income_mapping
df_train_set['income'] = df_train_set['income'].str.strip().map(income_mapping)

# ---- Test set: reuse the lookups built from the training set ----
for col in categorical_cols:
    mapping = mappings[col]
    mapping.setdefault('unknown', len(mapping))  # make sure a fallback code exists
    unknown_code = mapping['unknown']
    # Labels absent from the training lookup receive the fallback code.
    df_test_set[col] = df_test_set[col].map(lambda raw: mapping.get(raw, unknown_code))

# Income encoding: strip whitespace, remove '.' characters, then look up;
# anything not in the lookup becomes -1.
test_income = df_test_set['income'].str.strip().str.replace('.', '', regex=False)
df_test_set['income'] = test_income.map(lambda raw: income_mapping.get(raw, -1))

# Report the size of each split.
print("训练集样本数：", len(df_train_set))
print("测试集样本数：", len(df_test_set))

# Report per-column missing-value counts.
print("训练集缺失值情况：\n", df_train_set.isnull().sum())
print("测试集缺失值情况：\n", df_test_set.isnull().sum())

# Any value still missing at this point is replaced by -1.
for frame in (df_train_set, df_test_set):
    frame.fillna(-1, inplace=True)

训练集样本数： 26904
测试集样本数： 14130
训练集缺失值情况：
 age              0
workclass        0
education        0
maritalStatus    0
occupation       0
relationship     0
race             0
sex              0
capitalGain      0
capitalLoss      0
hoursPerWeek     0
nativeCountry    0
income           0
dtype: int64
测试集缺失值情况：
 age              0
workclass        0
education        0
maritalStatus    0
occupation       0
relationship     0
race             0
sex              0
capitalGain      0
capitalLoss      0
hoursPerWeek     0
nativeCountry    0
income           0
dtype: int64


In [21]:
## 3. 构造决策树，进行训练

In [22]:
def calc_gini(df):
    """Return the Gini impurity of the 'income' column of `df`.

    Computed as 1 minus the sum of squared class proportions, where the
    proportions come from `value_counts()` of the column.

    Args:
        df: DataFrame containing an 'income' column.

    Returns:
        The impurity as a float; 1.0 when the column has no counted values.
    """
    incomes = df['income']
    n_rows = len(incomes)
    squared_shares = 0
    for class_size in incomes.value_counts():
        squared_shares += (class_size / n_rows) ** 2
    return 1.0 - squared_shares

def split_dataset(df, index, value):
    """Partition `df` on whether one column equals `value`.

    Args:
        df: DataFrame to split.
        index: positional index of the column to test (into `df.columns`).
        value: value compared against with `==`.

    Returns:
        (left_df, right_df): rows where the column equals `value`, and all
        remaining rows. Both keep the original index labels.
    """
    column_name = df.columns[index]
    is_match = df[column_name] == value
    return df[is_match], df[~is_match]

def choose_best_feature_to_split(df):
    """Find the (feature, value) equality split with the lowest weighted Gini impurity.

    Every column except the label column 'income' is a candidate feature, and
    every distinct value of a feature is a candidate split point. A candidate
    divides `df` into rows where the feature equals the value and all other
    rows; candidates that leave either side empty are ignored. Ties keep the
    first candidate encountered (column order, then `unique()` order).

    The label is excluded by name, matching how `calc_gini` addresses it, so the
    result does not depend on 'income' being the last column.

    Args:
        df: DataFrame holding the feature columns and the 'income' label.

    Returns:
        ((feature_index, value), (left_df, right_df), weighted_gini) for the
        best split, where feature_index is a position in `df.columns`;
        (None, None, None) when no candidate produces two non-empty parts.
    """
    total_instances = len(df)  # loop-invariant, computed once
    best_gini = float('inf')
    best_feature_index = -1
    best_value = None
    best_splits = None

    for i, feature in enumerate(df.columns):
        if feature == 'income':
            continue  # never split on the label itself
        for value in df[feature].unique():
            left_df, right_df = split_dataset(df, i, value)
            if len(left_df) == 0 or len(right_df) == 0:
                continue  # not a real partition

            # Impurity of the two parts, each weighted by its share of the rows.
            weight_left = len(left_df) / total_instances
            weight_right = len(right_df) / total_instances
            gini_split = weight_left * calc_gini(left_df) + weight_right * calc_gini(right_df)

            if gini_split < best_gini:
                best_gini = gini_split
                best_feature_index = i
                best_value = value
                best_splits = (left_df, right_df)

    if best_feature_index == -1:
        # No valid split found
        return None, None, None
    return (best_feature_index, best_value), best_splits, best_gini

def build_decision_tree(df, columns, flags):
    """Recursively grow a binary decision tree for the 'income' label.

    A leaf is `{'label': <value>}`. An internal node is a dict with keys
    'feature_index', 'feature_name', 'value', 'left' and 'right': rows whose
    feature equals 'value' belong to 'left', all others to 'right'.
    'feature_index' is the column position within the frame seen at that node.

    Recursion stops when all labels agree, when 'income' is the only column
    left, or when no valid split exists; the last two cases yield a leaf
    carrying the most frequent label. The chosen feature column is dropped from
    both partitions before recursing.

    Args:
        df: DataFrame with feature columns and the 'income' label.
        columns: passed through unchanged to recursive calls; not read here.
        flags: passed through unchanged to recursive calls; not read here.

    Returns:
        Nested dict describing the tree.
    """
    labels = df['income']

    def majority_leaf():
        # Leaf predicting the most frequent label among the rows at this node.
        return {'label': labels.value_counts().idxmax()}

    # Pure node: every row already carries the same label.
    if labels.unique().size == 1:
        return {'label': labels.iloc[0]}

    # Only the label column remains, so there is nothing left to split on.
    if df.shape[1] == 1:
        return majority_leaf()

    split, partitions, _ = choose_best_feature_to_split(df)
    if split is None:
        return majority_leaf()

    position, split_value = split
    split_column = df.columns[position]

    # Left partition first, then right; the split column is removed from each.
    left_subtree, right_subtree = [
        build_decision_tree(part.drop(columns=[split_column]), columns, flags)
        for part in partitions
    ]

    return {
        'feature_index': position,
        'feature_name': split_column,
        'value': split_value,
        'left': left_subtree,
        'right': right_subtree,
    }

def save_decision_tree(cart, path='cart.npy'):
    """Persist a tree to disk with `np.save`.

    Args:
        cart: the tree object to store (a nested dict in this notebook).
        path: destination file; defaults to 'cart.npy', the location used
            when the function is called with the tree alone.
    """
    np.save(path, cart)

def load_decision_tree(path='cart.npy'):
    """Load a tree previously written with `np.save`.

    Args:
        path: file to read; defaults to 'cart.npy'.

    Returns:
        The stored object, unwrapped from the array `np.load` returns via `.item()`.
    """
    # SECURITY: allow_pickle=True unpickles the file contents, which can run
    # arbitrary code. Only load files produced by this notebook or another trusted source.
    stored = np.load(path, allow_pickle=True)
    return stored.item()

In [23]:
# Train on a copy so the preprocessed frame stays available without redoing preprocessing.
df_train = df_train_set.copy(deep=True)

In [24]:
# Column names of the training frame, and one zero per column.
columns = list(df_train.columns)
flags = [0] * len(columns)

In [25]:
# Grow the tree on the training copy, then write it to disk.
cart = build_decision_tree(df=df_train, columns=columns, flags=flags)
save_decision_tree(cart)

In [26]:
## 4. 评估

In [27]:
def classify(cart, df_row, columns):
    """Walk the tree from `cart` down to a leaf for one sample.

    At each internal node the sample's value for the node's feature is compared
    with the node's 'value': equal goes to 'left', otherwise 'right'.

    Args:
        cart: tree node dict (leaf `{'label': ...}` or internal node).
        df_row: mapping-like sample supporting `in` and `[]` by feature name.
        columns: not read by this function.

    Returns:
        The label stored in the leaf that is reached. If the sample lacks the
        feature a node asks for, a random 0 or 1 is returned instead.
    """
    node = cart
    while 'label' not in node:
        split_feature = node['feature_name']
        split_value = node['value']
        if split_feature not in df_row:
            # The sample has no such feature: fall back to a random label.
            return random.randint(0, 1)
        branch = 'left' if df_row[split_feature] == split_value else 'right'
        node = node[branch]
    return node['label']

def predict(cart, df, columns):
    """Classify every row of `df` with the tree `cart`.

    Args:
        cart: tree as accepted by `classify`.
        df: DataFrame whose rows are the samples; rows are taken by position.
        columns: forwarded to `classify` unchanged.

    Returns:
        List with one predicted label per row, in row order.
    """
    return [classify(cart, df.iloc[position, :], columns) for position in range(len(df))]

def calc_acc(pred_list, test_list):
    """Fraction of positions where prediction and reference label are equal.

    Args:
        pred_list: predicted labels (array-like).
        test_list: reference labels (array-like).

    Returns:
        Number of element-wise matches divided by `len(test_list)`.
    """
    predicted, actual = np.array(pred_list), np.array(test_list)
    matches = predicted == actual
    return np.sum(matches) / len(actual)

In [28]:
## 5. 预测和评估

In [29]:
# Run prediction on the test split and report accuracy.
cart = load_decision_tree()  # reload the persisted model
columns = list(df_train.columns)
test_list = df_test_set['income'].to_numpy()
pred_list = predict(cart=cart, df=df_test_set, columns=columns)
acc = calc_acc(pred_list=pred_list, test_list=test_list)
print("测试集上的准确率为:", acc)

测试集上的准确率为: 0.8167020523708421


In [30]:
# Attach the predictions as a new column and write the result to a new CSV file.
df_test_set = df_test_set.assign(prediction=pred_list)
df_test_set.to_csv(path_or_buf='test_predictions.csv', index=False)
print("预测结果已保存到 test_predictions.csv 文件中。")

预测结果已保存到 test_predictions.csv 文件中。
