In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [44]:
train_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 
                 'marital_status', 'occupation', 'relationship', 'race', 
                 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 
                 'native_country', 'income']
df_train = pd.read_csv('adult.data', names=train_columns, na_values=' ?')  # 使用' ?'标识缺失值


# 删除'occupation'和'native_country'中含有缺失值的行
df_train.dropna(subset=['occupation', 'native_country'], inplace=True)


df_train['income'] = df_train['income'].map(lambda x: 0 if x == ' <=50K' else 1)

# 预处理训练数据
X_train = df_train.drop('income', axis=1)
y_train = df_train['income']

df_train

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0


In [43]:
from sklearn.utils import resample

# 分离两个类别
df_majority = df_train[df_train['income'] == 0]
df_minority = df_train[df_train['income'] == 1]

# 上采样少数类
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # 采样替换
                                 n_samples=len(df_majority),    # 与多数类相同的样本数
                                 random_state=123) # 随机种子保证重现性

# 合并多数类与上采样后的少数类
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# 打乱数据顺序
df_upsampled = df_upsampled.sample(frac=1, random_state=123).reset_index(drop=True)

# 更新X_train, y_train
X_train = df_upsampled.drop('income', axis=1)
y_train = df_upsampled['income']

In [60]:
income_counts = df_train['income'].value_counts()

income_counts

income
0    22654
1     7508
Name: count, dtype: int64

In [45]:
# 加载测试数据
test_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 
                'marital_status', 'occupation', 'relationship', 'race', 
                'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 
                'native_country', 'income']
df_test= pd.read_csv('adult.test', names=test_columns, na_values=' ?')  # 使用' ?'标识缺失值
df_test=df_test.dropna()

# 删除'occupation'和'native_country'中含有缺失值的行
df_test.dropna(subset=['occupation', 'native_country'], inplace=True)

df_test['income'] = df_test['income'].map(lambda x: 0 if x == ' <=50K.' else 1)

# 预处理测试数据
X_test = df_test.drop('income', axis=1)
y_test = df_test['income']

df_test

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,0
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,0
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,1
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,1
6,34,Private,198693.0,10th,6.0,Never-married,Other-service,Not-in-family,White,Male,0.0,0.0,30.0,United-States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,33,Private,245211.0,Bachelors,13.0,Never-married,Prof-specialty,Own-child,White,Male,0.0,0.0,40.0,United-States,0
16277,39,Private,215419.0,Bachelors,13.0,Divorced,Prof-specialty,Not-in-family,White,Female,0.0,0.0,36.0,United-States,0
16279,38,Private,374983.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,0.0,50.0,United-States,0
16280,44,Private,83891.0,Bachelors,13.0,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455.0,0.0,40.0,United-States,0


In [40]:
from sklearn.utils import resample

# 分离两个类别
df_majority = df_test[df_test['income'] == 0]
df_minority = df_test[df_test['income'] == 1]

# 下采样多数类
df_majority_downsampled = resample(df_majority, 
                                   replace=False,    # 无需替换
                                   n_samples=len(df_minority),  # 少数类的样本数
                                   random_state=123) # 随机种子保证重现性

# 合并少数类与下采样后的多数类
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# 打乱数据顺序
df_downsampled = df_downsampled.sample(frac=1, random_state=123).reset_index(drop=True)

# 更新X_test, y_test
X_test = df_downsampled.drop('income', axis=1)
y_test = df_downsampled['income']

In [46]:
numeric_features = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
#categorical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    #('onehot', OneHotEncoder(handle_unknown='ignore'))])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # 注意这里设置sparse=False
])

# 组合处理器
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [47]:
# 对分类特征进行独热编码
df_train_encoded = pd.get_dummies(df_train, columns=categorical_features)

# 计算特征与income的相关性
correlation_matrix = df_train_encoded.corr()

# 提取与income相关性的系数
income_correlation = correlation_matrix['income'].sort_values(ascending=False)

print(income_correlation.head(20))

income                                1.000000
marital_status_ Married-civ-spouse    0.445418
relationship_ Husband                 0.401236
education_num                         0.335286
age                                   0.241998
hours_per_week                        0.229480
capital_gain                          0.221196
sex_ Male                             0.216699
occupation_ Exec-managerial           0.213442
occupation_ Prof-specialty            0.181458
education_ Bachelors                  0.178847
education_ Masters                    0.174126
education_ Prof-school                0.156472
capital_loss                          0.150053
workclass_ Self-emp-inc               0.137646
education_ Doctorate                  0.129162
relationship_ Wife                    0.125126
race_ White                           0.084735
workclass_ Federal-gov                0.057394
native_country_ United-States         0.040204
Name: income, dtype: float64


In [59]:
##LogisticRegression

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
])
clf.fit(X_train, y_train)


y_pred1 = clf.predict(X_test)
print(classification_report(y_test, y_pred1))

              precision    recall  f1-score   support

           0       0.94      0.80      0.86     11360
           1       0.57      0.84      0.68      3700

    accuracy                           0.81     15060
   macro avg       0.75      0.82      0.77     15060
weighted avg       0.85      0.81      0.82     15060



In [52]:
##RandomForest
from sklearn.ensemble import RandomForestClassifier


clf_rf = Pipeline(steps=[('preprocessor', preprocessor),
                         ('classifier', RandomForestClassifier(n_estimators=100, class_weight='balanced'))])


clf_rf.fit(X_train, y_train)


y_pred_rf = clf_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.88      0.93      0.90     11360
           1       0.73      0.61      0.66      3700

    accuracy                           0.85     15060
   macro avg       0.80      0.77      0.78     15060
weighted avg       0.84      0.85      0.84     15060



In [57]:
##xgboost
from xgboost import XGBClassifier

# 使用 XGBoost 
clf_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss'))])

# 训练模型
clf_xgb.fit(X_train, y_train)

# 使用测试数据评估模型
y_pred_xgb = clf_xgb.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92     11360
           1       0.77      0.66      0.71      3700

    accuracy                           0.87     15060
   macro avg       0.83      0.80      0.81     15060
weighted avg       0.87      0.87      0.87     15060



In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [16]:
##SVM
svm_clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', SVC(kernel='linear'))])


svm_clf.fit(X_train, y_train)


y_pred2 = svm_clf.predict(X_test)
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.85      0.77      0.81      3700
           1       0.79      0.86      0.82      3700

    accuracy                           0.82      7400
   macro avg       0.82      0.82      0.82      7400
weighted avg       0.82      0.82      0.82      7400



In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from keras.models import Sequential
from keras.layers import Dense

In [33]:
X_train

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [34]:
# 预处理训练数据和测试数据
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
X_train = X_train.toarray()
X_test = X_test.toarray()

# 构建神经网络模型
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# 编译模型
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 训练模型
model.fit(X_train, y_train, epochs=15, batch_size=10)

# 评估模型
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {loss}, Test accuracy: {accuracy}')

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test loss: 0.34633588790893555, Test accuracy: 0.8395750522613525


In [35]:
y_pred_prob = model.predict(X_test)
y_pred3 = (y_pred_prob > 0.5).astype("int32")



In [36]:
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89     11360
           1       0.66      0.71      0.68      3700

    accuracy                           0.84     15060
   macro avg       0.78      0.80      0.79     15060
weighted avg       0.84      0.84      0.84     15060

