## XOR分类


In [56]:
import numpy as np  
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import plot_model
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

pd.set_option('max_rows',8)
pd.set_option('max_columns',8)

In [57]:
# 生成xor数据
import pandas as pd 
xor_dataset = pd.DataFrame([[1,1,0],[1,0,1],[0,1,1],[0,0,0]],columns=['x0','x1','label'])
x,y = xor_dataset[['x0','x1']], xor_dataset['label']
xor_dataset.head()

# keras实现逻辑回归
from keras.layers import *
from keras.models import Sequential, Model
from tensorflow import random
np.random.seed(5) # 固定随机种子
random.set_seed(5)
model = Sequential()
model.add(Dense(1, input_dim=3, activation='sigmoid'))
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy')
xor_dataset['x2'] = xor_dataset['x0'] * xor_dataset['x1'] # 加入非线性特征
x,y = xor_dataset[['x0','x1','x2']], xor_dataset['label']
model.fit(x, y, epochs=10000,verbose=False)
print("正确标签：",y.values)
print("模型预测：",model.predict(x).round())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1)                 4         
Total params: 4
Trainable params: 4
Non-trainable params: 0
_________________________________________________________________
正确标签： [0 1 1 0]
模型预测： [[0.]
 [1.]
 [1.]
 [0.]]


In [60]:
model.get_weights()

[array([[ 4.6734157],
        [ 4.674698 ],
        [-9.370911 ]], dtype=float32),
 array([-2.3304145], dtype=float32)]

### GBDT +LR

In [3]:
# 加载数据集：查看数据介绍， 并转为DataFrame
dataset_cancer = datasets.load_breast_cancer()    # 加载癌细胞数据集

print(dataset_cancer['DESCR'])

df = pd.DataFrame(dataset_cancer.data, columns=dataset_cancer.feature_names)  

df['label'] = dataset_cancer.target

print(df.shape)

df.head()

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,...,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.8,1001.0,...,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,...,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,...,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,...,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,...,0.1625,0.2364,0.07678,0


In [46]:

drop_feas = ['label']
# 选择标签y及特征x
y = df.label
x = df.drop(drop_feas,axis=1)  # 删除相关性强特征及标签列


# holdout验证法： 按3：7划分测试集 训练集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,random_state=0)



In [47]:
## GBDT +LR 

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier


gbdt = GradientBoostingClassifier(n_estimators=50, random_state=10, subsample=0.8, max_depth=6,
                                  min_samples_split=20)
gbdt.fit(x_train, y_train) # GBDT 训练集训练

train_new_feature = gbdt.apply(x) # 返回数据在训练好的模型里每棵树中所处的叶子节点的位置
print(train_new_feature.shape)
train_new_feature = train_new_feature.reshape(-1, 50)
display(train_new_feature)
print(train_new_feature.shape)


enc = OneHotEncoder()
enc.fit(train_new_feature)
train_new_feature2 = np.array(enc.transform(train_new_feature).toarray())  # onehot表示

print(train_new_feature2.shape)
train_new_feature2




(569, 50, 1)


array([[21., 15., 21., ..., 11., 15., 20.],
       [22., 21., 15., ..., 13., 22., 23.],
       [22., 21., 22., ..., 28., 22., 23.],
       ...,
       [17., 19., 11., ..., 28., 22., 37.],
       [22., 22., 22., ..., 28., 22., 40.],
       [10.,  7.,  7., ..., 19.,  7., 35.]])

(569, 50)
(569, 668)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [52]:
x_all_feas = pd.concat([x.reset_index(drop=True),pd.DataFrame(train_new_feature2)],axis=1)
x_all_feas.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,...,664,665,666,667
0,17.99,10.38,122.8,1001.0,...,0.0,0.0,0.0,0.0
1,20.57,17.77,132.9,1326.0,...,0.0,0.0,0.0,0.0
2,19.69,21.25,130.0,1203.0,...,0.0,0.0,0.0,0.0
3,11.42,20.38,77.58,386.1,...,0.0,0.0,0.0,0.0
4,20.29,14.34,135.1,1297.0,...,0.0,0.0,0.0,0.0


In [53]:
# holdout验证法： 按3：7划分测试集 训练集
x_train, x_test, y_train, y_test = train_test_split(x_all_feas , y, test_size=0.3,random_state=0)

In [54]:
from sklearn.linear_model import LogisticRegression


model = LogisticRegression()
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [55]:
# 模型评估 f1score

def model_metrics(model, x, y):
    """
    评估函数
    """
    yhat = model.predict(x).round() #模型预测yhat，预测阈值按默认0.5划分
    result = {
              'f1_score': f1_score(y, yhat),
              'precision':precision_score(y, yhat),
              'recall':recall_score(y, yhat)
             }
    return result


# 模型评估结果
print("TRAIN")
print(model_metrics(model, x_train, y_train))

print("TEST")
print(model_metrics(model, x_test, y_test))

TRAIN
{'f1_score': 1.0, 'precision': 1.0, 'recall': 1.0}
TEST
{'f1_score': 0.9953917050691244, 'precision': 0.9908256880733946, 'recall': 1.0}
