# 第1章 取得資料之後的第一件事
接著要學習執行程式的流程，以便學習機率、統計與機器學習的基礎。  

In [None]:
# Colaboratory environment setup.
# Mount Google Drive at /content/drive, then make the chapter folder the working
# directory so the relative paths used in later cells resolve against it.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/MathProgramming/Chapter1

In [None]:
# Library setup: install the packages listed in ./requirements.txt
# (-q keeps pip's output short).
!pip install -q -r ./requirements.txt

## 1-1 試著載入資料

In [None]:
import pandas as pd

# Read the CSV; its first column is parsed as dates and becomes the index,
# which is what the resample() calls in later cells operate on.
df_info = pd.read_csv(
    "accomodation_info.csv",
    parse_dates=[0],
    index_col=0,
)
df_info

## 1-2 試著讓時間軸資料可視化

### 每月業績

In [None]:
import matplotlib.pyplot as plt

# Sum of the "金額" column for each month, drawn as a black line.
# NOTE(review): pandas 2.2+ deprecates the 'M' alias in favour of 'ME' —
# revisit if requirements.txt moves to a newer pandas.
monthly_amount = df_info["金額"].resample('M').sum()
plt.plot(monthly_amount, color="k")
plt.xticks(rotation=60)
plt.show()

### 每月使用者人數

In [None]:
import matplotlib.pyplot as plt

# Non-null entries per month, computed for every column of the table;
# each column becomes one black line.
monthly_counts = df_info.resample('M').count()
plt.plot(monthly_counts, color="k")
plt.xticks(rotation=60)
plt.show()

## 1-3. 試著輸出平均值、中位數、最小值與最大值

In [None]:
# Number of records per customer ID; the statistics below summarise these counts.
records_per_customer = df_info['顧客ID'].value_counts()

x_mean = records_per_customer.mean()
x_median = records_per_customer.median()
x_min = records_per_customer.min()
x_max = records_per_customer.max()

# Print each statistic with its label, in the order mean, median, min, max.
for label, value in (
    ("平均値:", x_mean),
    ("中位數:", x_median),
    ("最小值", x_min),
    ("最大值", x_max),
):
    print(label, value)

## 1-4. 觀察分佈的形狀

In [None]:
import matplotlib.pyplot as plt

# Histogram (21 bins) of the number of records per customer ID.
# plt.hist also returns the per-bin counts (x_hist) and the bin edges (t_hist),
# which later cells reuse.
x = df_info['顧客ID'].value_counts()
x_hist, t_hist, _ = plt.hist(x, bins=21, color="k")
plt.show()

## 1-5. 試著計算分佈的近似曲線

### 算出近似曲線的參數

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Parameters
epsiron = 1  # added to the counts before taking the log for plotting, so empty bins do not give log(0)
num = 15     # the fit uses bins 1 .. num-1 (bin 0 and the bins from `num` onwards are left out)

# Bin centres: midpoint of each pair of neighbouring histogram edges
t = (t_hist[:-1] + t_hist[1:]) / 2

# The counts in the fitted range double as the fit weights
weight = x_hist[1:num]

# Fit log(count) = a*t + b by weighted least squares.
# Empty bins are skipped: log(0) is -inf and would turn the whole fit into NaN,
# while their weight of 0 means they were never meant to influence the result.
has_data = weight > 0
a, b = np.polyfit(t[1:num][has_data], np.log(weight[has_data]), 1, w=weight[has_data])

# Fitted straight line in log space, drawn over the data.
# Both curves use the bin centres so the data points sit where the fit saw them.
xt = a * t + b
plt.plot(t, np.log(x_hist + epsiron), marker=".", color="k")
plt.plot(t, xt, color="r")
plt.show()

### 繪製近似曲線

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import math

# Bin centres and widths taken from the histogram edges, so the bars reproduce the
# histogram instead of sitting on the right-hand edges with a hard-coded width.
t = (t_hist[:-1] + t_hist[1:]) / 2
bin_width = np.diff(t_hist)

# Back-transform the line fitted in log space: count ≈ exp(a*t + b)
xt = np.exp(a * t + b)

plt.bar(t, x_hist, width=bin_width, color="k")
plt.plot(t, xt, color="r")
plt.show()

## 1-6. 試著篩選每種方案的資料

### 篩選各種方案的資料

In [None]:
# Show only the rows whose "方案" column equals "A".
plan_a_rows = df_info[df_info["方案"] == "A"]
print(plan_a_rows)

### 根據各方案的資料繪製直方圖

In [None]:
# Records per customer ID, restricted to rows whose "方案" is "A",
# shown as a 21-bin histogram.
is_plan_a = df_info["方案"] == "A"
df_a = df_info[is_plan_a]
x_a = df_a['顧客ID'].value_counts()
xa_hist, ta_hist, _ = plt.hist(x_a, bins=21, color="k")
plt.show()

### 各方案的每月使用者人數

In [None]:
import matplotlib.pyplot as plt

# Monthly per-column counts for each plan, one colour per plan:
# A blue, B green, C red, D black.
for plan, line_color in (("A", "b"), ("B", "g"), ("C", "r"), ("D", "k")):
    plan_rows = df_info[df_info["方案"] == plan]
    plt.plot(plan_rows.resample('M').count(), color=line_color)
plt.xticks(rotation=60)
plt.show()

## 1-7. 分析大顧客的行為模式

### 輸出使用頻率前10名的資訊

In [None]:
# Print every record of each of the 10 most frequent customer IDs.
# value_counts() sorts by frequency (descending), so the first 10 index entries are
# the top 10. It is computed once, and slicing copes with fewer than 10 customers.
top_customer_ids = df_info['顧客ID'].value_counts().index[:10]
for customer_id in top_customer_ids:
    print(df_info[df_info['顧客ID'] == customer_id])

### 依照時間順序整理前10名的使用頻率

In [None]:
import matplotlib.pyplot as plt

# Monthly number of records for each of the 10 most frequent customer IDs.
# value_counts() sorts by frequency (descending); it is computed once, and slicing
# copes with fewer than 10 customers.
top_customer_ids = df_info['顧客ID'].value_counts().index[:10]
for customer_id in top_customer_ids:
    customer_rows = df_info[df_info['顧客ID'] == customer_id]
    # Count a single column so each customer contributes exactly one line (and one
    # colour); counting the whole frame would draw one overlapping line per column.
    plt.plot(customer_rows['顧客ID'].resample('M').count())
plt.xticks(rotation=60)
plt.show()

### 依照時間順序整理11～20名的每月使用頻率

In [None]:
import matplotlib.pyplot as plt

# Monthly number of records for the customer IDs ranked 11th to 20th by frequency.
# value_counts() sorts by frequency (descending); it is computed once, and slicing
# copes with fewer than 20 customers.
rank_11_to_20_ids = df_info['顧客ID'].value_counts().index[10:20]
for customer_id in rank_11_to_20_ids:
    customer_rows = df_info[df_info['顧客ID'] == customer_id]
    # Count a single column so each customer contributes exactly one line (and one
    # colour); counting the whole frame would draw one overlapping line per column.
    plt.plot(customer_rows['顧客ID'].resample('M').count())
plt.xticks(rotation=60)
plt.show()

## 1-8. 試著分析疫情爆發前後的顧客行為模式

### 重設索引值

In [None]:
# Move the current index into an ordinary column and replace it with a default
# integer index; the following cells filter on df_info["日期"] as a column.
# NOTE(review): presumably the index is the date column named "日期" — confirm.
# NOTE(review): not idempotent — running this cell again adds another column, and
# the resample() cells above need the date index, so they must run before this one.
df_info = df_info.reset_index()

### 將資料分為疫情爆發前與爆發後

In [None]:
import datetime as dt

# Split the records at 2020-03-01: rows dated earlier go to the "pre" table,
# rows dated on or after it go to the "post" table.
target_date = dt.datetime(year=2020, month=3, day=1)
record_dates = df_info["日期"]
df_info_pre = df_info[record_dates < target_date]
df_info_post = df_info[record_dates >= target_date]

print(df_info_pre)
# Sanity check: both parts together should have as many rows as the full table.
print(len(df_info_pre) + len(df_info_post), len(df_info))

### 二維配對疫情爆發前後的資訊

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Number of top customers (by total record count) to plot
num = 200

# value_counts() sorts by frequency (descending), so the first `num` index entries are
# the most frequent customers. Slicing also copes with fewer than `num` customers.
top_ids = df_info['顧客ID'].value_counts().index[:num]

# Count each period once and look the top customers up; a customer with no record in
# a period gets 0. This replaces a per-customer filter plus `.count()[0]`, which
# relied on positional Series indexing (deprecated in recent pandas).
pre_counts = df_info_pre['顧客ID'].value_counts().reindex(top_ids, fill_value=0)
post_counts = df_info_post['顧客ID'].value_counts().reindex(top_ids, fill_value=0)

# Column 0: records in df_info_pre, column 1: records in df_info_post
count_pre_and_post = np.zeros((len(top_ids), 2))
count_pre_and_post[:, 0] = pre_counts.to_numpy()
count_pre_and_post[:, 1] = post_counts.to_numpy()

plt.scatter(count_pre_and_post[:, 0], count_pre_and_post[:, 1], color="k")

# Label every point as "<customer id>(<rank>)"
for i_rank, customer_id in enumerate(top_ids):
    n_pre, n_post = count_pre_and_post[i_rank]
    plt.text(n_pre, n_post, f"{customer_id}({i_rank})", color="k")
plt.xlabel("pre epidemic")
plt.ylabel("post epidemic")
plt.show()

## 1-9. 試著根據條件分類顧客

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Parameters
num = 200            # number of top customers (by total record count) to plot
threshold_post = 50  # customers with more "post" records than this are drawn in red

# value_counts() sorts by frequency (descending), so the first `num` index entries are
# the most frequent customers. Slicing also copes with fewer than `num` customers.
top_ids = df_info['顧客ID'].value_counts().index[:num]

# Count each period once and look the top customers up; a customer with no record in
# a period gets 0. This replaces a per-customer filter plus `.count()[0]`, which
# relied on positional Series indexing (deprecated in recent pandas).
pre_counts = df_info_pre['顧客ID'].value_counts().reindex(top_ids, fill_value=0)
post_counts = df_info_post['顧客ID'].value_counts().reindex(top_ids, fill_value=0)

# Column 0: records in df_info_pre, column 1: records in df_info_post
count_pre_and_post = np.zeros((len(top_ids), 2))
count_pre_and_post[:, 0] = pre_counts.to_numpy()
count_pre_and_post[:, 1] = post_counts.to_numpy()

# Visualise pre/post counts: red above the threshold, black otherwise.
above_threshold = count_pre_and_post[:, 1] > threshold_post
for mask, point_color in ((above_threshold, "r"), (~above_threshold, "k")):
    plt.scatter(count_pre_and_post[mask, 0], count_pre_and_post[mask, 1], color=point_color)

# Label every point as "<customer id>(<rank>)" in the colour of its marker
for i_rank, customer_id in enumerate(top_ids):
    n_pre, n_post = count_pre_and_post[i_rank]
    temp_color = "r" if above_threshold[i_rank] else "k"
    plt.text(n_pre, n_post, f"{customer_id}({i_rank})", color=temp_color)
plt.xlabel("pre epidemic")
plt.ylabel("post epidemic")
plt.show()

## 1-10. 列出符合條件的顧客

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Parameters
num = 200            # number of top customers (by total record count) to examine
threshold_post = 50  # keep customers with more "post" records than this

# value_counts() sorts by frequency (descending), so the first `num` index entries are
# the most frequent customers. Slicing also copes with fewer than `num` customers.
top_ids = df_info['顧客ID'].value_counts().index[:num]

# Count each period once and look the top customers up; a customer with no record in
# a period gets 0. This replaces a per-customer filter plus `.count()[0]`, which
# relied on positional Series indexing (deprecated in recent pandas).
pre_counts = df_info_pre['顧客ID'].value_counts().reindex(top_ids, fill_value=0)
post_counts = df_info_post['顧客ID'].value_counts().reindex(top_ids, fill_value=0)

# Column 0: records in df_info_pre, column 1: records in df_info_post
count_pre_and_post = np.zeros((len(top_ids), 2))
count_pre_and_post[:, 0] = pre_counts.to_numpy()
count_pre_and_post[:, 1] = post_counts.to_numpy()

# Build the customer list: one entry per customer above the threshold
list_id = []
list_name = []
list_date_pre = []
list_date_post = []
for customer_id, (n_pre, n_post) in zip(top_ids, count_pre_and_post):
    if n_post > threshold_post:
        list_id.append(customer_id)
        # Name taken from the first record found for this customer ID
        list_name.append(df_info['住宿者姓名'][df_info['顧客ID'] == customer_id].iloc[0])
        list_date_pre.append(n_pre)
        list_date_post.append(n_post)

# Convert the lists into a DataFrame, one column per list
df = pd.DataFrame({
    '顧客ID': list_id,
    '住宿者姓名': list_name,
    '住宿天數（爆發前）': list_date_pre,
    '住宿天數（爆發後）': list_date_post,
})
print(df)