In [1]:
from sklearn.datasets import fetch_california_housing, load_wine
from sklearn.metrics import r2_score,mean_absolute_error,accuracy_score,\
mean_squared_error
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier,\
ExtraTreeClassifier,ExtraTreeRegressor,export_graphviz
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
import numpy as np
import pandas as pd
import graphviz
import pydotplus

In [2]:
def evalue(model,y_test,x_test):
    y_predict=model.predict(x_test)
    mse=mean_squared_error(y_test,y_predict)
    mae=mean_absolute_error(y_test,y_predict)
    score=model.score(x_test,y_test)
    r2score=r2_score(y_test,y_predict)
    print("mse:{} \nmae:{}\nscore:{}\nr2_score:{}\n".format(mse,mae,score,r2score))

## 回归算法

### 导入数据

In [3]:
data_california_housing=fetch_california_housing()

In [4]:
data_california_housing.DESCR

'California housing dataset.\n\nThe original database is available from StatLib\n\n    http://lib.stat.cmu.edu/datasets/\n\nThe data contains 20,640 observations on 9 variables.\n\nThis dataset contains the average house value as target variable\nand the following input variables (features): average income,\nhousing average age, average rooms, average bedrooms, population,\naverage occupation, latitude, and longitude in that order.\n\nReferences\n----------\n\nPace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,\nStatistics and Probability Letters, 33 (1997) 291-297.\n\n'

In [5]:
data=data_california_housing.data
target=data_california_housing.target
features=data_california_housing.feature_names

In [6]:
show=pd.DataFrame(data,columns=features)

In [7]:
show.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


### 随机选择两个特征

In [8]:
l=[0,1,2,3,4,5,6,7]
import random
k=random.sample(l,2)
data_random_feature=show.iloc[:,k]

In [9]:
x_train,x_test,y_train,y_test=train_test_split(data_random_feature,target,test_size=0.2,random_state=1)

In [10]:
x_train.head()

Unnamed: 0,Longitude,AveBedrms
15961,-122.43,1.014184
1771,-122.35,0.957096
16414,-121.24,1.666667
5056,-118.35,1.149526
8589,-118.39,1.057325


### 决策树回归模型

In [11]:
regression_tree=DecisionTreeRegressor(max_depth=3,random_state=1)
regression_tree.fit(x_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

#### 随机选取两个特征后的模型评分

In [12]:
evalue(regression_tree,y_test,x_test)

mse:1.0188665503324579 
mae:0.7758569332541163
score:0.2232418520466214
r2_score:0.2232418520466214



#### 随机选取两个特征后的模型可视化

In [13]:
dot_data=export_graphviz(regression_tree,out_file=None,\
                        feature_names=[features[k[0]],features[k[1]]],rounded=True,filled=True)
graph=pydotplus.graph_from_dot_data(dot_data)
graph.write_png('../regressiontreeOf2Features.png')

True

#### 使用所有特征进行预测，并评估实验结果

In [14]:
x_train,x_test,y_train,y_test=train_test_split(data,target,test_size=0.2,random_state=1)
regression_tree=DecisionTreeRegressor(random_state=1)
randomforest=RandomForestRegressor(random_state=1)
regression_tree.fit(x_train,y_train)
randomforest.fit(x_train,y_train)
print("the RegressionTree result:\n")
evalue(regression_tree,y_test,x_test)
print("the RandomForest result:\n")
evalue(randomforest,y_test,x_test)

the RegressionTree result:

mse:0.48368777971778837 
mae:0.44637680882013675
score:0.6312486420928465
r2_score:0.6312486420928465

the RandomForest result:

mse:0.28395567445559267 
mae:0.3478378055448464
score:0.7835193591162573
r2_score:0.7835193591162573



In [16]:
x_train.shape

(16512, 8)

#### 使用网格交叉验证进行自动调参

In [17]:
parameter_regression={"criterion":["mse","mae"],\
#                       "splitter":["best","random"],\
#                       "max_features":["auto","sqrt","log2"],\
                      "max_depth":[3,5,7]
}

parameter_randomforest={"criterion":["mse","mae"],\
#                         "max_features":["auto","sqrt","log2"],\
#                         "max_depth":[3,5,7]
}

In [18]:
dtr=GridSearchCV(DecisionTreeRegressor(),parameter_regression,cv=5)
dtr.fit(x_train,y_train)
evalue(dtr,y_test,x_test)
print("best params:{}".format(dtr.best_params_))

mse:0.4220694346611607 
mae:0.45721869923788
score:0.6782249134902347
r2_score:0.6782249134902347

best params:{'criterion': 'mse', 'max_depth': 7}


In [None]:
rfr=GridSearchCV(RandomForestRegressor(),parameter_randomforest,cv=5)
rfr.fit(x_train,y_train)
evalue(rfr,y_test,x_test)
print("best params:{}".format(rfr.best_params_))

## 分类算法
### 导入数据及拆分数据

In [23]:
data=load_wine()

In [24]:
x_data=data.data
y_data=data.target
features=data.feature_names
target_class=data.target_names

In [25]:
print(target_class)
print(features)

['class_0' 'class_1' 'class_2']
['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [26]:
show=pd.DataFrame(x_data,columns=features)

In [27]:
show.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [28]:
x_train,x_test,y_train,y_test=train_test_split(x_data,y_data,test_size=0.2,random_state=1)

#### 决策树分类模型

In [33]:
parameter={"criterion":['gini','entropy'],"max_depth":[3,6,None],\
           "max_features":["auto","sqrt","log2",None]}
# 这里选择的超参数只选择了树的深度和使用信息增益度或gini指数作为分类依据
dtc=GridSearchCV(DecisionTreeClassifier(),parameter,cv=4)
dtc.fit(x_train,y_train)
evalue(dtc,y_test,x_test)
print("best params:{}".format(dtc.best_params_))

mse:0.05555555555555555 
mae:0.05555555555555555
score:0.9444444444444444
r2_score:0.9103362391033624

best params:{'criterion': 'entropy', 'max_depth': 6, 'max_features': 'log2'}


#### 随机森林分类模型

In [34]:
parameter={"criterion":['gini','entropy'],"max_depth":[3,6,None],\
           "max_features":["auto","sqrt","log2",None]}
# 这里选择的超参数只选择了树的深度和使用信息增益度或gini指数作为分类依据
rfc=GridSearchCV(RandomForestClassifier(),parameter,cv=4)
rfc.fit(x_train,y_train)
evalue(rfc,y_test,x_test)
print("best params:{}".format(rfc.best_params_))

mse:0.027777777777777776 
mae:0.027777777777777776
score:0.9722222222222222
r2_score:0.9551681195516812

best params:{'criterion': 'gini', 'max_depth': 6, 'max_features': 'sqrt'}
