In [2]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
adults = pd.read_csv('../data/adults.txt')
adults.head(2)

Unnamed: 0,age,workclass,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [4]:
train = adults[['occupation','hours_per_week','race']].copy()
target = adults['sex']

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,Normalizer,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [6]:
train.dtypes

occupation        object
hours_per_week     int64
race              object
dtype: object

In [10]:
train['hours_per_week'].dtypes == object

False

In [11]:
for column in train:
    if train[column].dtypes == object:
        unique_array = train[column].unique()
        def transform_data(item):
            return np.argwhere(item == unique_array)[0,0]
        train[column] = train[column].map(transform_data)

In [12]:
train

Unnamed: 0,occupation,hours_per_week,race
0,0,40,0
1,1,13,0
2,2,40,0
3,2,40,1
4,3,40,1
5,1,40,0
6,4,16,1
7,1,45,0
8,3,50,0
9,1,40,0


In [13]:
X_train,X_test,y_train,y_test = train_test_split(train,target)

In [15]:
KNeighborsClassifier(n_neighbors=5).fit(X_train,y_train).score(X_test,y_test)

0.7324652991033043

In [16]:
LogisticRegression().fit(X_train,y_train).score(X_test,y_test)

0.6861564918314703

In [17]:
SVC().fit(X_train,y_train).score(X_test,y_test)

0.743151946935266

# 优化处理

In [19]:
# 特征预处理
train1 = adults[['occupation','hours_per_week','race']].copy()
target1 = adults['sex']

In [26]:
# 对职业这一列进行独热编码
label_enc1 = LabelEncoder().fit_transform(train1['occupation'])
onehot_enc1 = OneHotEncoder().fit_transform(label_enc1.reshape(-1,1))
t1 = np.array(onehot_enc1.todense())
t1

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
# 对种族这一列进行独热编码
label_enc2 = LabelEncoder().fit_transform(train1['race'])
onehot_enc2 = OneHotEncoder().fit_transform(label_enc2.reshape(-1,1))
t2 = np.array(onehot_enc2.todense())
t2

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]])

In [32]:
t1.shape

(32561, 15)

In [33]:
t2.shape

(32561, 5)

In [34]:
train1['hours_per_week'].shape

(32561,)

In [38]:
# 得到一个度热编码后的新的数据集
train2 = np.concatenate((t1,t2,train1['hours_per_week'].values.reshape(-1,1)),axis=1)

normal_train = Normalizer().fit_transform(train2)
standard_train = StandardScaler().fit_transform(train2)
minmax_train = MinMaxScaler().fit_transform(train2)

In [47]:
# 对样本标签进行独热编码处理
target2 = np.array(OneHotEncoder().fit_transform(LabelEncoder().fit_transform(target).reshape(-1,1)).todense())
target2

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]])

In [52]:
X_train2,X_test2,y_train2,y_test2 = train_test_split(train2,target)

In [53]:
KNeighborsClassifier().fit(X_train2,y_train2).score(X_test2,y_test2)

0.7123203537648938

In [54]:
LogisticRegression().fit(X_train2,y_train2).score(X_test2,y_test2)

0.7301314334848299

In [55]:
SVC().fit(X_train2,y_train2).score(X_test2,y_test2)

0.7347991647217786

In [61]:
train3 = train['hours_per_week'].values.reshape(-1,1)
target

0          Male
1          Male
2          Male
3          Male
4        Female
5        Female
6        Female
7          Male
8        Female
9          Male
10         Male
11         Male
12       Female
13         Male
14         Male
15         Male
16         Male
17         Male
18         Male
19       Female
20         Male
21       Female
22         Male
23         Male
24       Female
25         Male
26         Male
27         Male
28         Male
29         Male
          ...  
32531    Female
32532      Male
32533      Male
32534    Female
32535      Male
32536    Female
32537      Male
32538    Female
32539      Male
32540    Female
32541    Female
32542      Male
32543    Female
32544    Female
32545    Female
32546    Female
32547      Male
32548      Male
32549    Female
32550      Male
32551      Male
32552      Male
32553      Male
32554      Male
32555      Male
32556    Female
32557      Male
32558    Female
32559      Male
32560    Female
Name: sex, Length: 32561

In [62]:
X_train3,X_test3,y_train3,y_test3 = train_test_split(train3,target)

In [63]:
KNeighborsClassifier().fit(X_train3,y_train3).score(X_test3,y_test3)

0.682962780985137

In [64]:
# 如果特征选择的不好，会导致不重要特征对算法影响力加大，所以导致算法评分会降低

# 重新对数据特征进行选择

In [69]:
train = adults.drop('sex',axis=1)
target = adults['sex']

In [76]:
train.dtypes

age                int64
workclass         object
final_weight       int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
salary            object
dtype: object

In [77]:
# 存放独热编码后的特征数据
datas = []
# 存放所有非对象类型的列标签
int_columns = []
for column in train:
    if train[column].dtypes == object:
        data = np.array(OneHotEncoder().fit_transform(LabelEncoder().fit_transform(train[column]).reshape(-1,1)).todense())
        datas.append(data)
    else:
        int_columns.append(column)

In [81]:
# 获取非对象类型的列，作为最原始的数据集
f1 = train[int_columns].values
# 再把所有独热编码的数据级联到数据集f1当中
for data in datas:
    f1 = np.hstack((f1,data))
f1.shape

(32561, 108)

In [86]:
# 把target进行数字编码
target = LabelEncoder().fit_transform(target)

In [89]:
# 特征选择
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier

#GBDT作为基模型的特征选择
train1 = SelectFromModel(GradientBoostingClassifier()).fit_transform(f1, target)

In [90]:
train1.shape

(32561, 28)

In [91]:
X_train,X_test,y_train,y_test = train_test_split(train1,target)

In [99]:
KNeighborsClassifier(n_neighbors=7).fit(X_train,y_train).score(X_test,y_test)

0.6583957744748803

In [100]:
LogisticRegression().fit(X_train,y_train).score(X_test,y_test)

0.6688367522417393

In [101]:
SVC().fit(X_train,y_train).score(X_test,y_test)

0.7058100970396757

In [103]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

#递归特征消除法，返回特征选择后的数据
#参数estimator为基模型
#参数n_features_to_select为选择的特征个数
f2 = RFE(estimator=LogisticRegression(), n_features_to_select=30).fit_transform(train1,target)

In [104]:
f2.shape

(32561, 28)

In [105]:
X_train1,X_test1,y_train1,y_test1 = train_test_split(f2,target)

In [106]:
LogisticRegression().fit(X_train,y_train).score(X_test,y_test)

0.6688367522417393

In [None]:
# 特征处理