In [1]:
# 使下面的代码支持python2和python3
from __future__ import division, print_function, unicode_literals

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os 

# 在每一次的运行后获得的结果与这个notebook的结果相同
np.random.seed(42) 

# 让matplotlib的图效果更好
%matplotlib inline
import matplotlib as mpl
import matplotlib as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# 设置保存图片的途径
PROJECT_ROOT_DIR = "."

def save_fig(fig_id, tight_layout=True):
    '''
    只需在clustering_test_202007121.ipynb文件所在目录处，建立一个images的文件夹，运行即可保存自动图片
    
    :param fig_id: 图片名称
    '''
    path = os.path.join(PROJECT_ROOT_DIR, "images", fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)
    
# 忽略掉没用的警告 (Scipy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", category=FutureWarning, module='sklearn',  lineno=196)

# get mark 

In [2]:
from sklearn import metrics

def get_marks(estimator, data, name=None, kmeans=None, af=None,db=None):
    """获取评分，有五种需要知道数据集的实际分类信息，有三种不需要，参考readme.txt
    
    :param estimator: 模型
    :param name: 初始方法
    :param data: 特征数据集
    """
    estimator.fit(data)
    print(20 * '*', name, 20 * '*')
    if kmeans:
        print("Mean Inertia Score: ", estimator.inertia_)
    elif af:
        cluster_centers_indices = estimator.cluster_centers_indices_
        print("The estimated number of clusters: ", len(cluster_centers_indices))
    print("Homogeneity Score         (均一性): ", metrics.homogeneity_score(labels, estimator.labels_))
    print("Completeness Score        (完整性): ", metrics.completeness_score(labels, estimator.labels_))
    print("V-Measure Score           (V量): ", metrics.v_measure_score(labels, estimator.labels_))
    print("Adjusted Rand Score       (调整后兰德指数): ", metrics.adjusted_rand_score(labels, estimator.labels_))
    print("Adjusted Mutual Info Score(调整后的共同信息): ", metrics.adjusted_mutual_info_score(labels, estimator.labels_))
    print("Calinski Harabasz Score:  (方差比指数) ", metrics.calinski_harabasz_score(data, estimator.labels_))
    print("Silhouette Score          (轮廓分数): ", metrics.silhouette_score(data, estimator.labels_))

In [3]:
# 读取数据集
df = pd.read_excel('Test_202007121quanbu.xlsx')
df

Unnamed: 0,TRUE VALUE,SiO2 (wt. %),TiO2 (wt. %),Al2O3 (wt. %),Cr2O3 (wt. %),FeO (wt. %),MnO (wt. %),MgO (wt. %),CaO (wt. %),Na2O (wt. %),IV (Al),H2O (wt. %)
0,1,47.342,2.663,6.784,0.208,6.835,0.132,12.553,22.460,0.527,0.225799,0.034081
1,1,47.223,2.698,5.606,0.010,7.286,0.090,12.715,23.135,0.240,0.213222,0.015711
2,1,48.296,1.728,5.993,1.089,6.002,0.068,13.117,22.856,0.522,0.196852,0.025636
3,1,49.363,1.903,7.919,0.080,6.565,0.096,12.257,20.559,0.767,0.176325,0.026104
4,1,46.939,2.597,7.438,0.159,7.048,0.071,11.941,23.016,0.521,0.240831,0.042122
5,1,50.545,0.952,6.945,0.166,5.501,0.103,13.297,21.008,0.950,0.138926,0.020295
6,1,50.822,1.584,4.595,0.544,5.298,0.090,13.679,21.909,0.974,0.119149,0.021745
7,1,47.879,2.304,6.473,0.410,6.368,0.130,12.542,22.506,0.562,0.204663,0.035665
8,1,46.900,2.780,7.083,0.018,7.028,0.126,12.071,22.784,0.542,0.235206,0.018804
9,1,47.444,2.251,9.002,0.157,6.717,0.075,12.097,20.883,0.933,0.236551,0.055946


In [4]:
# 查看数据集是否有空值，看需不需要插值
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2691 entries, 0 to 2690
Data columns (total 12 columns):
TRUE VALUE       2691 non-null int64
SiO2 (wt. %)     2691 non-null float64
TiO2 (wt. %)     2691 non-null float64
Al2O3 (wt. %)    2691 non-null float64
Cr2O3 (wt. %)    2691 non-null float64
FeO (wt. %)      2691 non-null float64
MnO (wt. %)      2691 non-null float64
MgO (wt. %)      2691 non-null float64
CaO (wt. %)      2691 non-null float64
Na2O  (wt. %)    2691 non-null float64
IV (Al)          2691 non-null float64
H2O (wt. %)      2691 non-null float64
dtypes: float64(11), int64(1)
memory usage: 252.4 KB


In [5]:
# 将真实的分类标签与特征分开
data = df.drop('TRUE VALUE', axis=1)
labels = df['TRUE VALUE']
# 获取数据的数量和特征的数量
n_samples, n_features = data.shape
# 获取分类标签的数量
n_labels = len(np.unique(labels))
np.unique(labels)

array([-1,  0,  1])

In [6]:
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_clusters_

3

In [58]:
from sklearn import metrics

def get_marks(estimator, data, name=None, kmeans=None, af=None):
    """获取评分，有五种需要知道数据集的实际分类信息，有三种不需要，参考readme.txt
    
    :param estimator: 模型
    :param name: 初始方法
    :param data: 特征数据集
    """
    estimator.fit(data)
    print(20 * '*', name, 20 * '*')
    if kmeans:
        print("Mean Inertia Score: ", estimator.inertia_)
    elif af:
        cluster_centers_indices = estimator.cluster_centers_indices_
        print("The estimated number of clusters: ", len(cluster_centers_indices))
    print("Homogeneity Score: ", metrics.homogeneity_score(labels, estimator.labels_))
    print("Completeness Score: ", metrics.completeness_score(labels, estimator.labels_))
    print("V Measure Score: ", metrics.v_measure_score(labels, estimator.labels_))
    print("Adjusted Rand Score: ", metrics.adjusted_rand_score(labels, estimator.labels_))
    print("Adjusted Mutual Info Score: ", metrics.adjusted_mutual_info_score(labels, estimator.labels_))
    print("Calinski Harabasz Score: ", metrics.calinski_harabasz_score(data, estimator.labels_))
    print("Silhouette Score: ", metrics.silhouette_score(data, estimator.labels_))

# DBSCAN

In [59]:
from sklearn.cluster import DBSCAN

db = DBSCAN(eps = 1 , min_samples = 2)
db.fit(data)

DBSCAN(algorithm='auto', eps=1, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=2, n_jobs=None, p=None)

In [60]:
cluster_centers_indices =  [[1, 1], [-1, -1], [1, -1],[-1,1]]
labels = db.labels_

In [61]:
cluster_centers_indices

[[1, 1], [-1, -1], [1, -1], [-1, 1]]

In [62]:
np.unique(labels)

array([-1,  0,  1,  2,  3], dtype=int64)

In [65]:
get_marks(db, data=data, db=True)

******************** None ********************
Homogeneity Score         (均一性):  1.0
Completeness Score        (完整性):  1.0
V-Measure Score           (V量):  1.0
Adjusted Rand Score       (调整后兰德指数):  1.0
Adjusted Mutual Info Score(调整后的共同信息):  1.0
Calinski Harabasz Score:  (方差比指数)  8.633645377918013
Silhouette Score          (轮廓分数):  -0.1650270066141785


In [73]:
from sklearn.model_selection import GridSearchCV

# 使用GridSearchCV自动寻找最优参数
params = {'eps':[0.3, 0.5, 0.8, 1], 'min_samples':[2, 5, 10, 15, 30]}
cluster = DBSCAN()
db_best_model = GridSearchCV(cluster, params, cv=5, scoring='silhouette_score', verbose=1)
db_best_model.fit(data)

ValueError: 'silhouette_score' is not a valid scoring value. Use sorted(sklearn.metrics.SCORERS.keys()) to get valid options.

In [50]:
# 最优模型的参数设置
db_best_model.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [None]:
# 最优模型的评分，使用调整的兰德系数(adjusted_rand_score)作为评分
db_best_model.best_score_

In [None]:
from sklearn.decomposition import PCA

# 使用普通PCA进行降维，将特征从11维降至3维
pca3 = PCA(n_components=n_labels)
reduced_data = pca3.fit_transform(data)
get_marks(AffinityPropagation(min_samples=-200, eps=0.8), 
          reduced_data, name="PCA-based AF", af=True)