# 第2章 試著利用機器學習進行分析
本章將透過實際執行程式的流程，學習機器學習的基礎。

In [None]:
# Colaboratory environment setup: mount Google Drive into the runtime.
from google.colab import drive
drive.mount('/content/drive')
# IPython line magic: switch the working directory to the chapter folder so
# the relative paths used by later cells (requirements.txt, the CSV) resolve.
%cd /content/drive/MyDrive/MathProgramming/Chapter2

In [None]:
# Library setup: shell escape that quietly (-q) installs the packages
# listed in ./requirements.txt (relative to the current working directory).
!pip install -q -r ./requirements.txt

## 2-1 計算顧客行為模式的相似度

### 載入資料

In [None]:
import pandas as pd
# Load the CSV; column 0 becomes the index and is parsed as dates, which is
# what lets later cells call resample() on this frame.
df_info = pd.read_csv("accomodation_info.csv", index_col=0, parse_dates=[0])
# Bare expression as the last line of the cell: the notebook renders the frame.
df_info

### 可視化特徵向量（特徵向量為具有時序的使用次數資料）

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Build an index-only frame: resample the whole data set by month, then drop
# every column so only the complete monthly index remains.
x_0 = df_info.resample('M').count()
x_0 = x_0.drop(x_0.columns.values,axis=1)
# 0-based positions, in the usage ranking, of the two customers to compare
i_rank = 1
j_rank = 2
# Rank customer IDs by record count once and reuse the ranking for both
# look-ups (the ranking is identical for every look-up).
id_ranking = df_info['顧客ID'].value_counts().index
i_id = id_ranking[i_rank]
j_id = id_ranking[j_rank]
# Feature vector = number of records per month for each customer
x_i = df_info[df_info['顧客ID']==i_id].resample('M').count()
x_j = df_info[df_info['顧客ID']==j_id].resample('M').count()
# Align to the complete monthly index; months without any record for the
# customer come out as NaN after the concat and are filled with 0.
x_i = pd.concat([x_0, x_i], axis=1).fillna(0)
x_j = pd.concat([x_0, x_j], axis=1).fillna(0)
# Plot both customers on the same axes
plt.plot(x_i)
plt.plot(x_j)
plt.xticks(rotation=60)
plt.show()

### 計算相似度

In [None]:
import pandas as pd
import numpy as np
# Element-wise difference between the two customers' monthly usage vectors
# (first column of each aligned frame).
dx = np.subtract(x_i.iloc[:,0].values, x_j.iloc[:,0].values)
# Length of the difference vector (np.linalg.norm defaults to the 2-norm
# for a 1-D input).
n = np.linalg.norm(dx)
# Normalise by the number of dimensions (rows of the aligned frame)
num_dim = x_i.shape[0]
d = n / num_dim
print("相似度:",d)

## 2-3 透過主成分分析確認大顧客的相似程度

### 篩選出特徵向量

In [None]:
import pandas as pd
# Index-only frame: the complete monthly index of the data set, no columns
x_0 = df_info.resample('M').count()
x_0 = x_0.drop(x_0.columns.values,axis=1)
# One feature vector (list of monthly record counts) per customer
list_vector = []
# Number of top-ranked customers to include
num = 100
# Rank the customer IDs once, outside the loop: the ranking does not change
# between iterations. Slicing with [:num] also means a data set with fewer
# than `num` customers yields fewer vectors instead of raising IndexError.
id_ranking = df_info['顧客ID'].value_counts().index
for i_rank, i_id in enumerate(id_ranking[:num]):
    # Monthly record counts for this customer
    x_i = df_info[df_info['顧客ID']==i_id].resample('M').count()
    # Align to the complete monthly index; missing months become 0
    x_i = pd.concat([x_0, x_i], axis=1).fillna(0)
    # Keep the first column's counts as this customer's feature vector
    list_vector.append(x_i.iloc[:,0].values.tolist())

### 利用主成分分析(PCA)進行可視化處理

In [None]:
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
# Stack the per-customer lists into a 2-D array (one row per customer)
features = np.array(list_vector)
# Fit the PCA and project the features in one step; a separate fit() before
# fit_transform() would only repeat the same fitting work.
pca = PCA()
transformed = pca.fit_transform(features)
# Plot every customer on the first two principal components with a single
# scatter call, then label each point with its row index.
plt.scatter(transformed[:,0],transformed[:,1],color="k")
for i, point in enumerate(transformed):
    plt.text(point[0],point[1],str(i))
plt.show()

## 2-4. 根據時間軸確認大顧客的行為模式

In [None]:
import pandas as pd
# Imported here so the cell does not depend on an earlier cell having
# brought `plt` into scope.
import matplotlib.pyplot as plt
# Index-only frame: the complete monthly index of the data set, no columns
x_0 = df_info.resample('M').count()
x_0 = x_0.drop(x_0.columns.values,axis=1)

# 0-based ranking positions of the customers to plot
list_rank = [0,1,2]
# Rank the customer IDs once; the ranking is the same for every iteration.
id_ranking = df_info['顧客ID'].value_counts().index
for i_rank in list_rank:
    i_id = id_ranking[i_rank]
    # Monthly record counts for this customer
    x_i = df_info[df_info['顧客ID']==i_id].resample('M').count()
    # Align to the complete monthly index; missing months become 0
    x_i = pd.concat([x_0, x_i], axis=1).fillna(0)
    # Add this customer's series to the shared figure
    plt.plot(x_i)
# Tick rotation applies to the whole axes, so set it once after all plots
plt.xticks(rotation=60)
plt.show()

## 2-5. 透過集群分析可視化大顧客的行為模式有何差異

### 利用k-means法進行集群分析

In [None]:
from sklearn.cluster import KMeans
# Number of clusters to form
num_of_cluster = 4
# fit() returns the estimator itself, so construction and fitting are chained;
# random_state is fixed so the cluster assignment is repeatable.
model = KMeans(n_clusters=num_of_cluster, random_state=0).fit(features)
# Cluster label assigned to each row of `features`
pred_class = model.labels_
print(pred_class)

### 利用主成分分析（PCA）可視化

In [None]:
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

# Fit the PCA and project the features in one step; a separate fit() before
# fit_transform() would only repeat the same fitting work.
pca = PCA()
transformed = pca.fit_transform(features)
# Scatter on the first two principal components, coloured by cluster label
plt.figure(figsize=(12, 8))
plt.scatter(transformed[:,0],transformed[:,1],c=pred_class)
# Annotate each point as "row_index(cluster_label)"
for i, (point, label) in enumerate(zip(transformed, pred_class)):
    text = f"{i}({label})"
    plt.text(point[0],point[1],text)
plt.show()

## 2-6. 利用決策樹推測行為模式的原因

### 設定目標變數

In [None]:
import numpy as np
# Cluster label to analyse
target_class = 1
# Number of samples (one per clustered customer)
num = len(pred_class)
# Target variable: 1.0 where the sample belongs to target_class, else 0.0.
# A vectorised comparison replaces the element-by-element loop; astype(float)
# keeps the float64 dtype that np.zeros(num) produced.
data_o = (pred_class == target_class).astype(float)
print(data_o)

### 設定解釋變數

In [None]:
# Explanatory variables: the per-customer monthly feature matrix.
# Note this binds a second name to the same array; no copy is made.
data_e = features
print(data_e)

### 建立模型

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Build and train a shallow (depth <= 2) decision tree in one expression;
# fit() returns the fitted estimator.
clf = DecisionTreeClassifier(max_depth=2).fit(data_e, data_o)

### 輸出結果

In [None]:
from dtreeviz.trees import dtreeviz

# Rebuild the index-only frame to obtain the monthly index, which is used
# below to name the feature columns.
x_0 = df_info.resample('M').count()
x_0 = x_0.drop(x_0.columns.values,axis=1)
time_index = x_0.index
print(time_index)

# Render the fitted tree with dtreeviz.
# clf: the fitted classifier; data_e / data_o: the data it was trained on.
# feature_names: one monthly timestamp per feature column.
# NOTE(review): assumes len(time_index) equals the number of columns in
# data_e - confirm.
# class_names: display names for target values 0 and 1, in that order.
viz = dtreeviz(
    clf,
    data_e, 
    data_o,
    target_name='Class',
    feature_names=time_index,
    class_names=['False','True'],
) 
# Bare expression as the last line of the cell: the notebook renders it.
viz

## 2-7. 可視化決策樹的分類結果，評估分類的精確度

### 可視化分類

In [None]:
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as pat

# 進行分類
pred_tree = clf.predict(data_e)

# 執行主成分分析
pca = PCA()
pca.fit(features)
# 將特徵向量轉換成主成分
transformed = pca.fit_transform(features)
# 可視化
plt.figure(figsize=(12, 8))
plt.scatter(transformed[:,0],transformed[:,1],c=pred_class)
for i in range(len(transformed)):
    if pred_tree[i]==1:
        if pred_class[i]==1:
            temp_color = "k"
            temp_lw = 1.0
        else:
            temp_color = "b"
            temp_lw = 3.0
        circle = pat.Circle(xy=(transformed[i,0],transformed[i,1]), radius=1.0, ec=temp_color ,fill=False, linewidth = temp_lw)
        plt.axes().add_artist(circle)
    else:
        if pred_class[i]==1:
            temp_color = "r"
            temp_lw = 3.0
            circle = pat.Circle(xy=(transformed[i,0],transformed[i,1]), radius=1.0, ec=temp_color ,fill=False, linewidth = temp_lw)
            plt.axes().add_artist(circle)
    text = str(i) + "(" + str(pred_class[i]) + ")"
    plt.text(transformed[i,0],transformed[i,1],text)
plt.show()
%matplotlib inline

### 輸出混淆矩陣

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the true target (data_o) against the tree's predictions
# (pred_tree): rows are true classes, columns are predicted classes.
cm = confusion_matrix(data_o, pred_tree)
print(cm)

## 2-8. 了解評估預測精確度的流程

### 將資料集分割成訓練資料與評估資料

In [None]:
from sklearn.model_selection import train_test_split
# Split into training and evaluation data (default proportions). Fixing
# random_state makes the split, and therefore the reported score, repeatable
# from run to run.
x_train, x_test, y_train, y_test = train_test_split(features,data_o,random_state=0)

### 利用訓練資料建構模型

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Train a depth-limited decision tree on the training split only;
# fit() returns the fitted estimator.
clf = DecisionTreeClassifier(max_depth=2).fit(x_train, y_train)

### 利用評估資料進行評估

In [None]:
from sklearn.metrics import confusion_matrix

# Compute everything first, then report.
# Score of the classifier on the evaluation split
score = clf.score(x_test, y_test)
# Predictions on the evaluation split and the resulting confusion matrix
pred_tree = clf.predict(x_test)
cm = confusion_matrix(y_test, pred_tree)

print("分數:",score)
print("混淆矩陣")
print(cm)

## 2-9. 比較各種分類演算法

### 與隨機森林演算法比較

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Split into training and evaluation data. random_state is fixed so the
# split is repeatable; without it every run (and every algorithm being
# compared) would be scored on a different random evaluation set.
x_train, x_test, y_train, y_test = train_test_split(features,data_o,random_state=0)

# Train a random forest (10 trees, unlimited depth, bootstrap sampling) on
# the training split
model = RandomForestClassifier(bootstrap=True, n_estimators=10, max_depth=None, random_state=1)
clf = model.fit(x_train, y_train)

# Evaluate on the evaluation split
# Score
score = clf.score(x_test, y_test)
print("分數:",score)

# Confusion matrix of true labels against predictions
pred_tree = clf.predict(x_test)
cm = confusion_matrix(y_test, pred_tree)
print("混淆矩陣")
print(cm)

### 與SVM比較

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

# Split into training and evaluation data. random_state is fixed so the
# split is repeatable; without it every run (and every algorithm being
# compared) would be scored on a different random evaluation set.
x_train, x_test, y_train, y_test = train_test_split(features,data_o,random_state=0)

# Train a support vector classifier with an RBF kernel on the training split
model = SVC(kernel='rbf')
clf = model.fit(x_train, y_train)

# Evaluate on the evaluation split
# Score
score = clf.score(x_test, y_test)
print("分數:",score)

# Confusion matrix of true labels against predictions
pred_tree = clf.predict(x_test)
cm = confusion_matrix(y_test, pred_tree)
print("混淆矩陣")
print(cm)

## 2-10. 試著利用支援向量機迴歸法預測具有時序的資料

In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split
# Imported here so the cell does not depend on earlier cells having brought
# `np` and `plt` into scope.
import numpy as np
import matplotlib.pyplot as plt

# Build the regression data from the customers in the target class:
# y = the monthly counts, x = the month position (0, 1, 2, ...) of each count.
data_target = data_e[data_o==1]
data_y = data_target
data_x = np.stack([np.arange(0,len(data_target[0])) for _ in range(len(data_target))], axis=0)
# Flatten both to 1-D so each (month position, count) pair is one sample
data_y = np.ravel(data_y)
data_x = np.ravel(data_x)

# Split into training and evaluation data; random_state is fixed so the
# split and the reported R^2 are repeatable.
x_train, x_test, y_train, y_test = train_test_split(data_x,data_y,random_state=0)

# Train support vector regression (RBF kernel). X must be 2-D, hence the
# reshape; y is passed as the 1-D array it already is, because a column
# vector makes scikit-learn emit a DataConversionWarning.
model = svm.SVR(kernel='rbf', C=1)
reg = model.fit(x_train.reshape(-1, 1),y_train)

# Plot the raw samples and the predicted curve over every month position
x_pred = np.arange(len(data_target[0])).reshape(-1, 1)
y_pred = model.predict(x_pred)
plt.plot(data_x,data_y,"k.")
plt.plot(x_pred,y_pred,"r.-")
plt.show()

# Coefficient of determination R^2 on the evaluation split (bare expression
# as the last line of the cell: the notebook displays the value)
reg.score(x_test.reshape(-1, 1),y_test)