# 决策树--非线性回归与分类

# 广告分类屏蔽
我们用网格搜索来寻找使决策树模型评价指标（F1 score）最优的超参数，然后在测试集上评估该决策树的效果。

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

try:
    # scikit-learn >= 0.18 exposes both helpers from model_selection.
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fall back to the legacy module layout on older installations.
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import train_test_split

In [10]:
# Load the raw data file. header=None: the file has no header row, so the
# columns get integer labels. low_memory=False: parse the file in one pass
# instead of chunk-wise.
df = pd.read_csv(
    'ad.data',
    header=None,
    low_memory=False,
)
# Preview the first rows (last expression of the cell, so it is displayed).
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1549,1550,1551,1552,1553,1554,1555,1556,1557,1558
0,125,125,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
1,57,468,8.2105,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
2,33,230,6.9696,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
3,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
4,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.


In [13]:
# The class label lives in the last column; every other column is a feature.
label_column = df.columns[-1]
response_variable_column = df[label_column]
explanatory_variable_columns = set(df.columns.values)
explanatory_variable_columns.remove(label_column)

# Encode the label as 1 for 'ad.' and 0 for everything else.
y = [1 if e == 'ad.' else 0 for e in response_variable_column]

# Select the features in the frame's own column order. Iterating the set
# directly would leave the column order up to the set's hash layout.
feature_columns = [c for c in df.columns if c in explanatory_variable_columns]
# Explicit copy so that X is an independent frame rather than a selection of df.
X = df.loc[:, feature_columns].copy()

In [17]:
# Display the first feature column; missing entries show up as '?' strings,
# which is why the column has dtype object.
X[0]

0        125
1         57
2         33
3         60
4         60
5         60
6         59
7         60
8         60
9         60
10         ?
11        90
12        90
13        90
14        33
15        60
16        60
17       125
18        60
19        30
20        90
21        90
22        90
23        90
24         ?
25        90
26        90
27        60
28        60
29        60
        ... 
3249       ?
3250       ?
3251      16
3252      24
3253       ?
3254      25
3255       ?
3256      55
3257       ?
3258       ?
3259      10
3260      11
3261       ?
3262     150
3263      16
3264     134
3265      23
3266      40
3267     158
3268      25
3269       ?
3270       ?
3271       ?
3272     106
3273      30
3274     170
3275     101
3276      23
3277       ?
3278      40
Name: 0, dtype: object

In [34]:
# Replace the missing-value marker (a '?' optionally preceded by spaces) with
# the sentinel -1, modifying X in place.
# The pattern is a raw string: '\?' is not a valid Python string escape, so
# the non-raw form triggers an invalid-escape warning on modern interpreters.
X.replace(to_replace=r' *\?', value=-1, regex=True, inplace=True)
# Split into training and test sets using the library's default proportions.
# No random_state is passed, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [50]:
# Build the pipeline around a DecisionTreeClassifier instance.
# criterion='entropy' makes the tree choose splits by information gain.
pipeline = Pipeline(steps=[
    ('clf', DecisionTreeClassifier(criterion='entropy')),
])
# Hyperparameter grid; the 'clf__' prefix routes each entry to the 'clf' step.
parameters = dict(
    clf__max_depth=(150, 155, 160),
    clf__min_samples_split=(2, 3),  # must not be 1, otherwise the fit below raises an error
    clf__min_samples_leaf=(1, 2, 3),
)

In [51]:
# Grid-search the parameter ranges defined above, ranking candidates by F1.
# n_jobs=-1 uses all available cores; verbose=1 prints progress.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
    scoring='f1',
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   17.4s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('clf', DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'clf__max_depth': (150, 155, 160), 'clf__min_samples_split': (2, 3), 'clf__min_samples_leaf': (1, 2, 3)},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=1)

In [52]:
# Best cross-validated score found by the search, and the winning parameters.
print('最佳效果：%0.3f' % (grid_search.best_score_,))
print('最优参数：')
print(grid_search.best_params_)
# Evaluate on the held-out test split and print per-class metrics.
predictions = grid_search.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

最佳效果：0.880
最优参数：
{'clf__max_depth': 150, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 3}
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       711
          1       0.91      0.91      0.91       109

avg / total       0.98      0.98      0.98       820



# 决策树集成
随机森林（random forest）是一种集成方法：它随机选取训练样本和解释变量的子集分别训练多棵决策树，得到一个决策树的集合。随机森林通常用集合中各决策树预测结果的均值或众数作为最终预测值；scikit-learn 里的随机森林使用各决策树预测概率的均值作为预测值。与单一决策树相比，随机森林不太容易受过拟合的影响。

In [53]:
# Swapping DecisionTreeClassifier for RandomForestClassifier is all that is needed.
from sklearn.ensemble import RandomForestClassifier


In [56]:
# Single-step pipeline whose 'clf' step is an entropy-based random forest.
pipeline = Pipeline(steps=[
    ('clf', RandomForestClassifier(criterion='entropy')),
])
# 4 * 3 * 2 * 3 = 72 parameter combinations to evaluate.
parameters = dict(
    clf__n_estimators=(5, 10, 20, 50),
    clf__max_depth=(50, 150, 250),
    clf__min_samples_split=(2, 3),  # presumably must not be 1 (open question from the original author)
    clf__min_samples_leaf=(1, 2, 3),
)
# Exhaustive search over the grid, ranking candidates by F1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
    scoring='f1',
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   30.8s
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:   33.9s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'clf__n_estimators': (5, 10, 20, 50), 'clf__max_depth': (50, 150, 250), 'clf__min_samples_split': (2, 3), 'clf__min_samples_leaf': (1, 2, 3)},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=1)

In [57]:
# Best cross-validated score found by the search.
print('最佳效果：%0.3f' % (grid_search.best_score_,))
print('最优参数：')
# Look up the tuned values on the best estimator, one searched parameter per
# line, in alphabetical order.
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

# Evaluate on the held-out test split and print per-class metrics.
predictions = grid_search.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

最佳效果：0.921
最优参数：
	clf__max_depth: 50
	clf__min_samples_leaf: 1
	clf__min_samples_split: 3
	clf__n_estimators: 50
             precision    recall  f1-score   support

          0       0.99      1.00      0.99       711
          1       0.97      0.91      0.94       109

avg / total       0.98      0.98      0.98       820

