In [1]:
import numpy as np
import pandas as pd
from pandas import Series ,DataFrame
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [2]:
# 读取文件中的adult.txt文件，使用KNN算法训练模型，然后使用模型预测一个人的年收入是否大于50

In [3]:
adults = pd.read_csv('./adults.txt')
adults.head()

Unnamed: 0,age,workclass,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
adults.shape

(32561, 15)

In [6]:
adults.columns

Index(['age', 'workclass', 'final_weight', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'salary'],
      dtype='object')

In [19]:
# 挑选出我认为对年收入有影响的属性
target = adults['salary']
data = adults[['age', 'workclass','education', 'race','sex','hours_per_week','native_country']]

In [20]:
data.dtypes

age                int64
workclass         object
education         object
race              object
sex               object
hours_per_week     int64
native_country    object
dtype: object

In [21]:
# 将字符串属性转变成整型
workclass = data['workclass'].unique()
workclass

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)

In [22]:
np.argwhere(workclass == 'State-gov')[0,0]+1

1

In [23]:
def convertstr2int(item):
    return np.argwhere(workclass == item)[0,0]+1

In [25]:
data.dtypes

age                int64
workclass          int64
education         object
race              object
sex               object
hours_per_week     int64
native_country    object
dtype: object

In [29]:
data.head(5)

Unnamed: 0,age,workclass,education,race,sex,hours_per_week,native_country
0,39,1,Bachelors,White,Male,40,United-States
1,50,2,Bachelors,White,Male,13,United-States
2,38,3,HS-grad,White,Male,40,United-States
3,53,3,11th,Black,Male,40,United-States
4,28,3,Bachelors,Black,Female,40,Cuba


In [24]:
data['workclass'] = data['workclass'].map(convertstr2int)
data.head

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


<bound method NDFrame.head of        age  workclass     education                race     sex  \
0       39          1     Bachelors               White    Male   
1       50          2     Bachelors               White    Male   
2       38          3       HS-grad               White    Male   
3       53          3          11th               Black    Male   
4       28          3     Bachelors               Black  Female   
5       37          3       Masters               White  Female   
6       49          3           9th               Black  Female   
7       52          2       HS-grad               White    Male   
8       31          3       Masters               White  Female   
9       42          3     Bachelors               White    Male   
10      37          3  Some-college               Black    Male   
11      30          1     Bachelors  Asian-Pac-Islander    Male   
12      23          3     Bachelors               White  Female   
13      32          3    Assoc-a

In [30]:
# 将所有需要替换的列一次性替换
cols = ['education','race','sex','native_country']
for col in cols :
    unique = data[col].unique()
    def convertstr2int(item):
        return np.argwhere(unique == item)[0,0]+1
    data[col] = data[col].map(convertstr2int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [32]:
display(data)

Unnamed: 0,age,workclass,education,race,sex,hours_per_week,native_country
0,39,1,1,1,1,40,1
1,50,2,1,1,1,13,1
2,38,3,2,1,1,40,1
3,53,3,3,2,1,40,1
4,28,3,1,2,2,40,2
5,37,3,4,1,2,40,1
6,49,3,5,2,2,16,3
7,52,2,2,1,1,45,1
8,31,3,4,1,2,50,1
9,42,3,1,1,1,40,1


In [33]:
data.head(5)

Unnamed: 0,age,workclass,education,race,sex,hours_per_week,native_country
0,39,1,1,1,1,40,1
1,50,2,1,1,1,13,1
2,38,3,2,1,1,40,1
3,53,3,3,2,1,40,1
4,28,3,1,2,2,40,2


In [41]:
x_train,x_test,y_train,y_test = train_test_split(data,target,test_size = 0.005)
x_test.shape

(163, 7)

In [42]:
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(x_train,y_train)
y_ = knn.predict(x_test)
y_

array(['>50K', '<=50K', '<=50K', '<=50K', '<=50K', '>50K', '<=50K',
       '>50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K',
       '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '>50K', '<=50K',
       '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '>50K', '<=50K',
       '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K',
       '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K',
       '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K',
       '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K',
       '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K',
       '>50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K',
       '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K',
       '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K',
       '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K',
       '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '>50K', '<=50K',
       '<=50K', '<=50K', '<

In [43]:
knn.score(x_test,y_test)

0.7730061349693251

In [44]:
# 将当前算法保存
from sklearn.externals import joblib
joblib.dump(knn,'salary_50k')

['salary_50k']

In [45]:
salary_50k = joblib.load('./salary_50k')