# 模糊聚类分析在红酒数据集的运用

对于实验的说明：

实验步骤：

    1. 导入数据
    2. 评价标准
    3. 数据规格化
    4. 构造模糊相似矩阵
    5. 构造模糊等价矩阵
    6. 聚类 -- 调整lambda-截值

## 1. 导入数据

In [201]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine

In [202]:
# Load the wine dataset that ships with scikit-learn.
wine = load_wine()

In [203]:
# Notebook display: list the fields available on the loaded dataset object.
wine.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [204]:
# Feature matrix and class labels taken from the loaded dataset.
X = wine.data
y = wine.target

# Report the feature-matrix dimensions (the recorded run shows 178 rows x 13 columns).
print('shape', X.shape)

shape (178, 13)


### * 实验使用 10 条样本（约 6% 的数据）展示

In [205]:
from sklearn.model_selection import train_test_split

In [206]:
# Send 94% of the rows to the "test" split so that only a small subset remains
# in the "train" split (10 rows in the recorded run); that subset is what the
# rest of the notebook clusters. random_state is fixed so the subset is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.94, random_state=1)
print('X_train shape', X_train.shape)
print('y_train shape', y_train.shape)

X_train shape (10, 13)
y_train shape (10,)


In [240]:
# Chinese display names for the 13 wine features, used as column headers by the
# optional Excel exports (presumably listed in the dataset's column order — confirm).
feature_name = ['酒精','苹果酸','灰','灰的碱性','镁','总酚','类黄酮','非黄烷类酚类','花青素','颜色强度','色调','od280/od315稀释葡萄酒','脯氨酸']
# Optional export of the sampled features and labels (disabled):
# pd.DataFrame(X_train, columns=feature_name).to_excel('wine-train.xlsx')
# pd.DataFrame(y_train).to_excel('wine-result.xlsx')

## 2. 评价标准

因为红酒数据有13个特征，每个特征代表一种成分，因此使用13个特征来评价每一瓶红酒

## 3. 数据规格化

去量纲化，对数据进行归一化（Min-Max 缩放）处理

In [207]:
from sklearn.preprocessing import MinMaxScaler

In [208]:
# Min-max normalisation of the sampled training subset.
X_01 = MinMaxScaler().fit_transform(X_train)

In [241]:
# pd.DataFrame(X_01, columns=feature_name).to_excel('wine-train_01.xlsx')

## 4. 构造模糊相似矩阵

准备使用的方式：

    1. 余弦相似度法
    2. 相关系数法

In [209]:
# 余弦相似度法
from sklearn.metrics.pairwise import cosine_similarity

In [210]:
# Pairwise cosine similarity between the normalised samples; this serves as
# the fuzzy similarity matrix for the following steps.
matrix_cos = cosine_similarity(X_01)
print('matrix_cos shape', matrix_cos.shape)
# Count negative similarities (the recorded run reports 0).
print('less than 0:', np.sum(matrix_cos < 0))

matrix_cos shape (10, 10)
less than 0: 0


In [243]:
# pd.DataFrame(matrix_cos, columns=range(10)).to_excel('matrix_cos.xlsx')

In [211]:
# 相关系数法
# TODO：

## 5. 构造模糊等价矩阵

使用 R2 = compound(R)（即 R 与自身的最大-最小合成 R∘R），

    如果R2 == R，则R为模糊等价矩阵
    
    如果R2 != R, 则继续计算R4，R8...

In [212]:
def compound(r):
    """Return the max-min composition of a fuzzy similarity matrix with itself.

    For every pair ``i < j`` the result holds ``max_k min(r[i, k], r[k, j])``.
    Only the upper triangle is computed and then mirrored, and the diagonal is
    fixed at 1, so the result is always symmetric with a unit diagonal.

    Args:
        r: Square 2-D NumPy array; expected to be a symmetric similarity matrix.

    Returns:
        A new array with the same shape and dtype as ``r``. ``r`` is not modified.
    """
    r2 = np.ones_like(r)
    n = len(r)
    for i in range(n - 1):
        # Handle row i against every later column in one vectorised step:
        # element [k, c] of the broadcast result is min(r[i, k], r[k, i + 1 + c]).
        pairwise_min = np.minimum(r[i, :, np.newaxis], r[:, i + 1:])
        best = pairwise_min.max(axis=0)
        r2[i, i + 1:] = best
        # Mirror into the lower triangle to keep the result symmetric.
        r2[i + 1:, i] = best
    return r2

def is_equal(r, _r):
    """Return True when both matrices have the same shape and identical elements.

    The comparison is exact (no tolerance). Shapes are checked first, so a
    matrix is never reported equal to a differently-shaped array merely
    because the two happen to broadcast against each other.

    Args:
        r: First matrix (array-like).
        _r: Second matrix (array-like).

    Returns:
        ``True`` if shapes match and every element is equal, else ``False``.
    """
    return np.array_equal(r, _r)

def get_equal_matrix(r):
    """Build a fuzzy equivalence matrix from a fuzzy similarity matrix.

    The matrix is repeatedly composed with itself via ``compound`` until one
    more composition leaves it unchanged according to ``is_equal``.

    Args:
        r: Fuzzy similarity matrix accepted by ``compound``.

    Returns:
        A copy of the first matrix in the sequence that ``compound`` leaves unchanged.
    """
    current = r
    squared = compound(current)
    while not is_equal(current, squared):
        current = squared
        squared = compound(current)
    # Copy so the caller never receives an alias of the input matrix.
    return current.copy()

In [213]:
# r = [[1.0, 0.4, 0.7, 0.0, 0.9],
#     [0.4, 1.0, 0.8, 0.0, 0.0],
#     [0.7, 0.8, 1.0, 0.6, 0.0],
#     [0.0, 0.0, 0.6, 1.0, 0.0],
#     [0.9, 0.0, 0.0, 0.0, 1.0]]
# r = np.array(r)


# Build the fuzzy equivalence matrix from the cosine-similarity matrix.
matrix_equal = get_equal_matrix(matrix_cos)

In [244]:
# pd.DataFrame(matrix_equal, columns=range(10)).to_excel('matrix_equal.xlsx')

## 6. 聚类 -- 调整lambda-截值

In [214]:
# Every distinct value in the fuzzy equivalence matrix, in ascending order;
# each one is a candidate lambda cut level.
lmd_vals = sorted(set(matrix_equal.ravel().tolist()))

In [249]:
# Print every candidate lambda cut level together with its index. enumerate
# covers the whole list instead of silently stopping at a hard-coded count.
for item in enumerate(lmd_vals):
    print(item)
    


(0, 0.6356017260014928)
(1, 0.7532866532215748)
(2, 0.7822094376522073)
(3, 0.7863244705649027)
(4, 0.8002796327226721)
(5, 0.881413375016439)
(6, 0.8853555412247797)
(7, 0.8894221554251438)
(8, 0.8909947890888853)
(9, 1.0)


In [225]:
def classify(matrix, threshold):
    """Assign a cluster label to each row of ``matrix`` at the given cut level.

    The matrix is binarised with ``matrix >= threshold``. Rows whose boolean
    patterns are identical receive the same label. Labels are consecutive
    integers starting at 0, handed out in order of first appearance.

    Args:
        matrix: 2-D NumPy array (for example a fuzzy equivalence matrix).
        threshold: Cut level compared against every element.

    Returns:
        1-D ``int32`` array with one label per row of ``matrix``.
    """
    cut = matrix >= threshold

    labels = np.zeros(len(matrix)).astype(np.int32)
    next_label = 0
    for row_idx in range(1, len(matrix)):
        # Earliest previous row with exactly the same cut pattern, if any.
        earlier = next(
            (
                prev
                for prev in range(row_idx)
                if (cut[row_idx, :] == cut[prev, :]).all()
            ),
            None,
        )
        if earlier is None:
            # Unseen pattern: open a new cluster.
            next_label += 1
            labels[row_idx] = next_label
        else:
            labels[row_idx] = labels[earlier]
    return labels

In [254]:
# print('聚类结果\t原来分类')
# for item in zip(classify(matrix_equal, lmd_vals[2]), y_train):
#     print('%s\t\t%s' % (item[0], item[1]))

聚类结果	原来分类
0		2
1		1
1		1
2		2
1		1
2		2
1		2
1		1
1		2
1		0
