In [5]:
import numpy as np
import pandas as pd

In [6]:
# Load the raw training and test tables from CSV files in the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [7]:
# Preview only the first rows instead of rendering the whole frame as cell output.
train_data.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [8]:
# Show the distinct raw labels of three categorical columns.
# Note that the printed labels carry a leading space (e.g. ' Private').
print(
    set(train_data['education']), '\n\n',
    set(train_data['occupation']), '\n\n',
    set(train_data['workclass']),
)

{' Some-college', ' 11th', ' Preschool', ' Assoc-acdm', ' 7th-8th', ' 12th', ' HS-grad', ' Prof-school', ' Assoc-voc', ' 5th-6th', ' 10th', ' Doctorate', ' Masters', ' 1st-4th', ' 9th', ' Bachelors'} 

 {' Transport-moving', ' Exec-managerial', ' Sales', ' Adm-clerical', ' Handlers-cleaners', ' Other-service', ' Machine-op-inspct', ' Tech-support', ' Craft-repair', ' ?', ' Protective-serv', ' Farming-fishing', ' Armed-Forces', ' Prof-specialty', ' Priv-house-serv'} 

 {' Self-emp-not-inc', ' Private', ' Self-emp-inc', ' Federal-gov', ' Local-gov', ' State-gov', ' ?', ' Never-worked', ' Without-pay'}


In [9]:
# Understand the data with descriptive statistics, printed in this order:
# distinct values per column, the education distribution, column dtypes,
# the numeric summary, pairwise correlations, the frame shape and whether
# any value is missing.
# The correlation is restricted to numeric columns explicitly: calling
# corr() on a frame that still holds object columns raises on pandas >= 2.0,
# while older versions dropped those columns silently.
print(train_data.nunique(), '\n', train_data.education.value_counts(), '\n', train_data.dtypes, '\n',
      train_data.describe(), '\n', train_data.select_dtypes(include=[np.number]).corr(), '\n',
      train_data.shape, '\n', train_data.isnull().values.any())

age                  73
workclass             9
fnlwgt            21648
education            16
education_num        16
marital_status        7
occupation           15
relationship          6
race                  5
sex                   2
capital_gain        119
capital_loss         92
hours_per_week       94
native_country       42
income                2
dtype: int64 
  HS-grad         10501
 Some-college     7291
 Bachelors        5355
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: education, dtype: int64 
 age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object


In [4]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV

In [35]:
def output_result(filename, predict_value):
    """Write predictions to a two-column CSV file and print its first rows.

    Parameters
    ----------
    filename : str
        Path of the CSV file to create.
    predict_value : numpy.ndarray
        Predicted labels; ``predict_value.shape[0]`` gives the row count.

    The file has the columns ``id`` and ``label``. Ids are the strings
    '1', '2', ... in row order, and the DataFrame index is not written.
    """
    n_rows = predict_value.shape[0]
    row_ids = [str(position) for position in range(1, n_rows + 1)]

    submission = pd.DataFrame(columns=['id', 'label'])
    submission['id'] = row_ids
    submission['label'] = predict_value
    submission.to_csv(filename, index=False)

    print(submission.head())

In [10]:
def get_preprocessed_data(filename):
    """Read a comma-separated numeric table whose first line is a header.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The header fields as an array of strings, and the remaining rows
        converted to ``float64``.

    Raises
    ------
    ValueError
        If the file contains no header line, or if a data field cannot be
        converted to ``float64``.
    """
    features = None
    train_fea = []
    with open(filename) as f:
        # Iterate lazily instead of materialising every line with readlines().
        for line in f:
            # Drop the line terminator so the last header field does not keep
            # a trailing newline.
            line = line.rstrip('\r\n')
            # Skip blank lines (e.g. an extra newline at the end of the file),
            # which would otherwise break the float conversion below.
            if not line.strip():
                continue
            if features is None:
                features = line.split(',')
            else:
                train_fea.append(line.split(','))
    if features is None:
        raise ValueError('no header line found in {!r}'.format(filename))
    return np.array(features), np.array(train_fea).astype(np.float64)

In [11]:
# Parse the preprocessed feature tables (train and test) and the training labels.
features_X, train_fea_X = get_preprocessed_data('X_train')
features_X_test, test_fea_X = get_preprocessed_data('X_test')
features_Y, train_fea_Y = get_preprocessed_data('Y_train')
# Flatten the label table into a 1-D vector.
train_fea_Y = train_fea_Y.ravel()

In [16]:
# Grid-search the number of boosting stages (200..340 in steps of 20) with the
# remaining hyper-parameters fixed, scored by 5-fold cross-validated accuracy.
param_test1 = {'n_estimators': list(range(200, 350, 20))}
gsearch1 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        min_samples_split=300,
        min_samples_leaf=20,
        max_depth=8,
        max_features='sqrt',
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test1,
    scoring='accuracy',
    cv=5,
    n_jobs=12,
)
gsearch1.fit(train_fea_X, train_fea_Y)
# The last expression is displayed as the cell output: all scores, best setting, best score.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.87064, std: 0.00314, params: {'n_estimators': 200},
  mean: 0.87092, std: 0.00333, params: {'n_estimators': 220},
  mean: 0.87095, std: 0.00286, params: {'n_estimators': 240},
  mean: 0.87083, std: 0.00287, params: {'n_estimators': 260},
  mean: 0.87052, std: 0.00304, params: {'n_estimators': 280},
  mean: 0.87030, std: 0.00310, params: {'n_estimators': 300},
  mean: 0.87043, std: 0.00307, params: {'n_estimators': 320},
  mean: 0.87064, std: 0.00270, params: {'n_estimators': 340}],
 {'n_estimators': 240},
 0.870949909400817)

In [19]:
# Grid-search tree depth (7, 9, 11, 13) jointly with min_samples_split
# (50..500 in steps of 50), using n_estimators=240 and 5-fold accuracy.
param_test2 = {
    'max_depth': list(range(7, 14, 2)),
    'min_samples_split': list(range(50, 501, 50)),
}
gsearch2 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=240,
        min_samples_leaf=20,
        max_features='sqrt',
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test2,
    scoring='accuracy',
    cv=5,
    n_jobs=12,
)
gsearch2.fit(train_fea_X, train_fea_Y)
# The last expression is displayed as the cell output: all scores, best setting, best score.
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

([mean: 0.87089, std: 0.00225, params: {'max_depth': 7, 'min_samples_split': 50},
  mean: 0.87138, std: 0.00393, params: {'max_depth': 7, 'min_samples_split': 100},
  mean: 0.87089, std: 0.00314, params: {'max_depth': 7, 'min_samples_split': 150},
  mean: 0.87027, std: 0.00303, params: {'max_depth': 7, 'min_samples_split': 200},
  mean: 0.87107, std: 0.00314, params: {'max_depth': 7, 'min_samples_split': 250},
  mean: 0.87138, std: 0.00334, params: {'max_depth': 7, 'min_samples_split': 300},
  mean: 0.87070, std: 0.00310, params: {'max_depth': 7, 'min_samples_split': 350},
  mean: 0.87000, std: 0.00311, params: {'max_depth': 7, 'min_samples_split': 400},
  mean: 0.87107, std: 0.00294, params: {'max_depth': 7, 'min_samples_split': 450},
  mean: 0.87018, std: 0.00312, params: {'max_depth': 7, 'min_samples_split': 500},
  mean: 0.86938, std: 0.00262, params: {'max_depth': 9, 'min_samples_split': 50},
  mean: 0.86914, std: 0.00226, params: {'max_depth': 9, 'min_samples_split': 100},
  mean

In [21]:
# Grid-search min_samples_split (100..400) jointly with min_samples_leaf
# (20..100 in steps of 10), with max_depth fixed at 9 and 5-fold accuracy.
param_test3 = {
    'min_samples_split': list(range(100, 500, 100)),
    'min_samples_leaf': list(range(20, 101, 10)),
}
gsearch3 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=240,
        max_depth=9,
        max_features='sqrt',
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test3,
    scoring='accuracy',
    cv=5,
    n_jobs=12,
)
gsearch3.fit(train_fea_X, train_fea_Y)
# The last expression is displayed as the cell output: all scores, best setting, best score.
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: 0.86914, std: 0.00226, params: {'min_samples_leaf': 20, 'min_samples_split': 100},
  mean: 0.87052, std: 0.00301, params: {'min_samples_leaf': 20, 'min_samples_split': 200},
  mean: 0.87169, std: 0.00309, params: {'min_samples_leaf': 20, 'min_samples_split': 300},
  mean: 0.87163, std: 0.00345, params: {'min_samples_leaf': 20, 'min_samples_split': 400},
  mean: 0.87055, std: 0.00243, params: {'min_samples_leaf': 30, 'min_samples_split': 100},
  mean: 0.87009, std: 0.00390, params: {'min_samples_leaf': 30, 'min_samples_split': 200},
  mean: 0.87070, std: 0.00272, params: {'min_samples_leaf': 30, 'min_samples_split': 300},
  mean: 0.87077, std: 0.00282, params: {'min_samples_leaf': 30, 'min_samples_split': 400},
  mean: 0.87049, std: 0.00222, params: {'min_samples_leaf': 40, 'min_samples_split': 100},
  mean: 0.87037, std: 0.00340, params: {'min_samples_leaf': 40, 'min_samples_split': 200},
  mean: 0.87123, std: 0.00306, params: {'min_samples_leaf': 40, 'min_samples_split': 300},

In [23]:
# Fit a gradient-boosting model (learning_rate=0.1, 240 stages) and print its
# accuracy on the training data itself, which is not a held-out estimate.
gbm1 = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=240,
    max_depth=9,
    min_samples_leaf=20,
    min_samples_split=300,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
)
gbm1.fit(train_fea_X, train_fea_Y)
y_pred = gbm1.predict(train_fea_X)
# Fraction of training rows whose prediction equals the label.
print((y_pred == train_fea_Y).sum() / len(y_pred))

0.888179109978


In [24]:
# Grid-search the number of features considered per split (7..19, odd values)
# with the other hyper-parameters fixed, scored by 5-fold accuracy.
param_test4 = {'max_features': list(range(7, 20, 2))}
gsearch4 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=240,
        max_depth=9,
        min_samples_leaf=20,
        min_samples_split=300,
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test4,
    scoring='accuracy',
    cv=5,
    n_jobs=12,
)
gsearch4.fit(train_fea_X, train_fea_Y)
# The last expression is displayed as the cell output: all scores, best setting, best score.
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

([mean: 0.86972, std: 0.00348, params: {'max_features': 7},
  mean: 0.87055, std: 0.00269, params: {'max_features': 9},
  mean: 0.87080, std: 0.00260, params: {'max_features': 11},
  mean: 0.87144, std: 0.00322, params: {'max_features': 13},
  mean: 0.87070, std: 0.00183, params: {'max_features': 15},
  mean: 0.87034, std: 0.00288, params: {'max_features': 17},
  mean: 0.87061, std: 0.00212, params: {'max_features': 19}],
 {'max_features': 13},
 0.8714412948005282)

In [26]:
# Grid-search the row subsampling fraction with the other hyper-parameters
# fixed, scored by 5-fold accuracy.
param_test5 = {'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]}
gsearch5 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=240,
        max_depth=9,
        min_samples_leaf=20,
        min_samples_split=300,
        max_features='sqrt',
        random_state=10,
    ),
    param_grid=param_test5,
    scoring='accuracy',
    cv=5,
    n_jobs=12,
)
gsearch5.fit(train_fea_X, train_fea_Y)
# The last expression is displayed as the cell output: all scores, best setting, best score.
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_

([mean: 0.86972, std: 0.00214, params: {'subsample': 0.6},
  mean: 0.87034, std: 0.00306, params: {'subsample': 0.7},
  mean: 0.86969, std: 0.00320, params: {'subsample': 0.75},
  mean: 0.87169, std: 0.00309, params: {'subsample': 0.8},
  mean: 0.87037, std: 0.00255, params: {'subsample': 0.85},
  mean: 0.87052, std: 0.00240, params: {'subsample': 0.9}],
 {'subsample': 0.8},
 0.8716869875003839)

In [30]:
# Same tree settings as gbm1, but with learning_rate=0.05 and 480 stages.
# The printed value is accuracy on the training data, not a held-out estimate.
gbm2 = GradientBoostingClassifier(
    learning_rate=0.05,
    n_estimators=480,
    max_depth=9,
    min_samples_leaf=20,
    min_samples_split=300,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
)
gbm2.fit(train_fea_X, train_fea_Y)
y_pred = gbm2.predict(train_fea_X)
# Fraction of training rows whose prediction equals the label.
print((y_pred == train_fea_Y).sum() / len(y_pred))

0.889468996652


In [31]:
# Same tree settings as gbm1, but with learning_rate=0.01 and 2400 stages.
# The printed value is accuracy on the training data, not a held-out estimate.
gbm3 = GradientBoostingClassifier(
    learning_rate=0.01,
    n_estimators=2400,
    max_depth=9,
    min_samples_leaf=20,
    min_samples_split=300,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
)
gbm3.fit(train_fea_X, train_fea_Y)
y_pred = gbm3.predict(train_fea_X)
# Fraction of training rows whose prediction equals the label.
print((y_pred == train_fea_Y).sum() / len(y_pred))

0.88999109364


In [32]:
# Same tree settings as gbm1, but with learning_rate=0.005 and 4800 stages.
# The printed value is accuracy on the training data, not a held-out estimate.
gbm4 = GradientBoostingClassifier(
    learning_rate=0.005,
    n_estimators=4800,
    max_depth=9,
    min_samples_leaf=20,
    min_samples_split=300,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
)
gbm4.fit(train_fea_X, train_fea_Y)
y_pred = gbm4.predict(train_fea_X)
# Fraction of training rows whose prediction equals the label.
print((y_pred == train_fea_Y).sum() / len(y_pred))

0.88949970824


In [38]:
# Predict the test set with gbm3, cast the labels to integers and export them to CSV.
pre_test = gbm3.predict(test_fea_X).astype(int)
output_result(filename='output_gbm3.csv', predict_value=pre_test)

  id  label
0  1      0
1  2      0
2  3      0
3  4      1
4  5      0


In [39]:
# Predict the test set with gbm2, cast the labels to integers and export them to CSV.
pre_test = gbm2.predict(test_fea_X).astype(int)
output_result(filename='output_gbm2.csv', predict_value=pre_test)

  id  label
0  1      0
1  2      0
2  3      0
3  4      1
4  5      0


In [40]:
# Predict the test set with gbm1, cast the labels to integers and export them to CSV.
pre_test = gbm1.predict(test_fea_X).astype(int)
output_result(filename='output_gbm1.csv', predict_value=pre_test)

  id  label
0  1      0
1  2      0
2  3      0
3  4      1
4  5      0


In [50]:
# Persist the three fitted models to files in the working directory.
# The value of the last dump call is what the cell displays as its output.
from sklearn.externals import joblib # joblib module

joblib.dump(gbm3, 'gbm3.pkl')
joblib.dump(gbm2, 'gbm2.pkl')
joblib.dump(gbm1, 'gbm1.pkl')
# NOTE(review): the commented-out example below refers to 'save/clf.pkl',
# `clf3` and `X`, none of which appear elsewhere in the visible notebook.
# # Load the model
# gbm3 = joblib.load('save/clf.pkl')

# # Test the loaded model
# print(clf3.predict(X[0:1]))

['gbm1.pkl']

In [51]:
# Reload the gbm3 model from the file written by the dump cell.
test_model = joblib.load('gbm3.pkl')

In [53]:
# Predict the test set with the reloaded model, cast to integer labels.
test_pre = test_model.predict(test_fea_X).astype(int)

In [54]:
# Predict the test set with the in-memory gbm3 model, cast to integer labels.
pre_test = gbm3.predict(test_fea_X).astype(int)

In [56]:
# Count the positions where the reloaded and in-memory predictions differ.
(test_pre != pre_test).sum()

0

In [58]:
# Print the scikit-learn version this notebook is executed with.
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.19.1.
