In [None]:
import pandas as pd
from sklearn.cluster import KMeans 
import numpy as np
# Input data set and destination file for the processed cluster boundaries
datafile = '../data/data.csv'
processedfile = '../tmp/data_processed.csv'

# Maps every coefficient column of the data set to a one-letter feature label
typelabel = dict([
    ('肝氣鬱結症型係數', 'A'),
    ('熱毒蘊結症型係數', 'B'),
    ('沖任失調症型係數', 'C'),
    ('氣血兩虛症型係數', 'D'),
    ('脾胃虛弱症型係數', 'E'),
    ('肝腎陰虛症型係數', 'F'),
])

# Number of clusters built for each feature
k = 4

# Load the data set; `keys` holds the coefficient column names in label order
data = pd.read_csv(datafile)
keys = [column for column in typelabel]

Total scores
---
1. 肝氣鬱結症型 (A): 40,
-  熱毒蘊結症型 (B): 44,
-  沖任失調症型 (C): 41,
-  氣血兩虛症型 (D): 43,
-  脾胃虛弱症型 (E): 43,
-  肝腎陰虛症型 (F): 38.

症型係數 = <font size=+1.5>$\frac{\text{score}}{\text{total}}$</font>

Steps:
---
1. convert each score into its 症型係數, <i>score/(total score)</i>;
2. (k-means) partition each feature (A ~ F) into 4 clusters;
3. compute the centers of the clusters;
4. determine which cluster each data point belongs to;
5. normalize the range of each feature to [0,1] according to the data frequencies.

In [None]:

# Cluster every coefficient column on its own with k-means and turn the sorted
# cluster centres into discretisation boundaries.
# `result` ends up with two rows per feature: '<label>' (lower boundary of each
# cluster) and '<label>n' (number of samples per cluster), columns 1..k.
boundary_frames = []

for key in keys:
    label = typelabel[key]
    print('正在進行 “%s” 的聚類...' % key)

    # `n_jobs` is no longer accepted by KMeans (removed in scikit-learn 1.0).
    # n_init is pinned to the historical default of 10 and random_state is
    # fixed so that Restart & Run All reproduces the same boundaries.
    kmodel = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmodel.fit(data[[key]].values)  # `.as_matrix()` was removed in pandas 1.0

    # centers: cluster centres; counts: number of samples per cluster.
    centers = pd.DataFrame(kmodel.cluster_centers_, columns=[label])
    # rename() keeps the counts whatever name value_counts() gives the Series;
    # DataFrame(series, columns=[...]) would select a missing column -> NaN.
    counts = pd.Series(kmodel.labels_).value_counts().rename(label + 'n')
    r = pd.concat([centers, counts], axis=1).sort_values(label)
    r.index = range(1, k + 1)  # clusters numbered 1..k by ascending centre

    # The mean of two adjacent centres is used as the boundary between them;
    # the first boundary (NaN after the rolling mean) is set to 0.0.
    r[label] = r[label].rolling(2).mean()
    r.loc[1, label] = 0.0  # .loc avoids chained assignment
    boundary_frames.append(r.T)

# Concatenate once instead of DataFrame.append in the loop (removed in pandas 2.0)
result = pd.concat(boundary_frames) if boundary_frames else pd.DataFrame()
#result= result.sort_values()
result.to_csv(processedfile)

In [None]:
# Largest observed value of every feature; it closes the last cluster interval
B = {typelabel[key]: max(data[key]) for key in keys}

Features = list(typelabel.values())  # ['A','B','C','D','E','F']

# Print the (lower, upper] interval of each of the 4 clusters of every feature
print("Ranges of Clusters\n===")
for f in Features:
    lower_bounds = result.loc[f].values
    for i in range(4):
        f_val0 = lower_bounds[i]
        # the last cluster is bounded above by the feature's maximum value
        f_val1 = B[f] if i == 3 else lower_bounds[i + 1]
        print("%s%d: (%.3f,%.3f]" % (f, i + 1, f_val0, f_val1))
    print('---\n')

In [None]:
# Build the array of upper cluster boundaries:
# f_range[row, col] is the upper bound of cluster col+1 of feature Features[row]

f_range = np.zeros([6, 4])
for i, f in enumerate(Features):
    # upper bounds of clusters 1..3 are the lower bounds of clusters 2..4
    f_range[i, :3] = result.loc[f].values[1:4]
    # the last cluster is bounded by the feature's maximum value
    f_range[i, 3] = B[f]

In [None]:
def feature_class1(fval,fname=Features[0],range_arr=f_range[0]):
    """Return the cluster label of ``fval``: ``fname`` followed by '1'..'4'.

    ``fval`` is compared with a strict ``<`` against ``range_arr[0]``,
    ``range_arr[1]`` and ``range_arr[2]`` in that order; the first boundary
    it is below decides the label, and anything else is labelled '4'.

    The defaults are evaluated once, when the function is defined, from the
    module-level ``Features`` and ``f_range``.
    """
    for position in range(3):
        if fval < range_arr[position]:
            return fname + str(position + 1)
    return fname + '4'

In [None]:
# Work on an independent copy: a plain `data1 = data` only aliases the frame,
# so every column added to data1 afterwards would also appear on `data`.
data1 = data.copy()

<big>**Pre-defined variables**</big>

<pre style="font-family:chalkboard;font-weight:bold;font-size:1.2em;">
keys = ['肝氣鬱結症型係數','熱毒蘊結症型係數', '沖任失調症型係數', '氣血兩虛症型係數', '脾胃虛弱症型係數', '肝腎陰虛症型係數']

Features = ['A', 'B', 'C', 'D', 'E', 'F']
f_range = array([
                  [0.17869759, 0.25772406, 0.35184318, 0.504 ],
                  [0.15354254, 0.29821671, 0.48995396, 0.78  ],
                  [0.20214873, 0.28906114, 0.42353655, 0.61  ],
                  [0.17228126, 0.25168344, 0.35935347, 0.552 ],
                  [0.1526978 , 0.25787347, 0.37606164, 0.526 ],
                  [0.17914338, 0.26138639, 0.35464267, 0.607 ]
                ])
</pre>

In [None]:
# Create one cardinal (categorical) column per feature: every coefficient is
# replaced by its cluster label, e.g. 'A1'..'A4' for feature 'A'.
# Features, keys and the rows of f_range are parallel sequences, so they are
# zipped instead of being indexed with a hard-coded range(6).
for feature, key, bounds in zip(Features, keys, f_range):
    # iterate the column itself so feature_class1 receives scalars, not the
    # 1-element arrays produced by data1[[key]].values
    data1[feature] = [feature_class1(value, fname=feature, range_arr=bounds)
                      for value in data1[key]]

In [None]:
# Copy the 'TNM分期' column into a new column 'H'.
# Plain single-column assignment is used because assigning a DataFrame to a
# list of not-yet-existing columns is not supported by every pandas version.
data1['H'] = data['TNM分期']

In [None]:
# Preview the first rows of data1 (head() shows 5 rows by default)
data1.head()

In [None]:
# Save the converted cardinal columns as a CSV without index and without a
# header row. `header=False` is the documented way to suppress the header
# (the original passed None), and the redundant `columns=None` is dropped.
apriori_columns = ['A','B','C','D','E','F','H']
data1[apriori_columns].to_csv("../data/apriori.csv", index=False, header=False)