# 第3章 推測必需的資料筆數
本章將學習推測統計的基礎，並說明執行所需程式的流程。  

In [None]:
# Set up the Colaboratory environment: mount Google Drive, then move into the
# chapter directory so the relative file paths used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')
# IPython magic: change the notebook's working directory.
%cd /content/drive/MyDrive/MathProgramming/Chapter3

In [None]:
# Install the libraries listed in ./requirements.txt (shell escape; -q asks pip
# for quiet output).
!pip install -q -r ./requirements.txt

## 3-1 試著模擬統計值

### 產生常態分佈

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 設定母群體的大小
num = 365*2

# 設定亂數的平均值與標準差
ave = 0.0
std = 1.0

# 產生亂數（亂數種子固定）
np.random.seed(seed=0)
x = np.random.normal(ave,std,num)
#x = np.random.exponential(0.5, num)

# 計算平均值與標準差
x_ave = np.average(x)
x_std = np.std(x)
print("平均值:",x_ave)
print("標準差:",x_std)

# 繪製圖表
num_bin = 21
plt.hist(x, num_bin,color="k")
plt.xlim([-5,5])
plt.show()
%matplotlib inline

### 繪製隨機採樣的樣本的平均值分佈狀況

In [None]:
import numpy as np
# 設定樣本數（樣本集合的大小）
num_sample = 30

# 設定模擬次數
num_trial = 10000
x_trial = np.zeros(num_trial)

# 試算樣本平均值
for i in range(num_trial):
    # 抽樣
    x_sample = np.random.choice(x,num_sample)
    # 計算平均值
    x_ave = np.average(x_sample)
    # 儲存平均值
    x_trial[i] = x_ave
    
# 計算樣本的平均值與標準差
x_trial_ave = np.average(x_trial)
x_trial_std = np.std(x_trial)
print("平均值:",x_trial_ave)
print("標準差:",x_trial_std)

# 繪製圖表
num_bin = 21
plt.hist(x_trial, num_bin,color="k")
plt.xlim([-5,5])
plt.show()
%matplotlib inline

## 3-2 了解中央極限定理

In [None]:
import numpy as np
# Standard deviation of the population (a standard deviation, not a variance).
org_std = 1.0
# Number of values in each sample.
num_sample = 30
# Central limit theorem: the standard deviation of the sample mean equals the
# population standard deviation divided by sqrt(sample size).
root_num_sample = np.sqrt(num_sample)
sample_std = org_std / root_num_sample
print("樣本集合的平均值的標準差:",sample_std)

## 3-3 正確取得一個月份的資料

### 載入資料（失竊記錄）

In [None]:
import pandas as pd
# Load the theft records from theft_list_201811.csv; the first CSV column is
# used as the index and parsed as dates.
df_theft_201811 = pd.read_csv("theft_list_201811.csv", index_col=0, parse_dates=[0])
# Bare expression: the notebook shows the DataFrame as the cell output.
df_theft_201811

### 載入資料（備品金額）

In [None]:
import pandas as pd
# Load the amenity price table; the first CSV column becomes the index.
# NOTE(review): parse_dates=[0] also asks pandas to parse that index column as
# dates, yet later cells use this index as column labels of the theft tables.
# Presumably the values are amenity names and the option is unnecessary; confirm.
df_amenity_price = pd.read_csv("amenity_price.csv", index_col=0, parse_dates=[0])
# Bare expression: the notebook shows the DataFrame as the cell output.
df_amenity_price

### 計算一個月份的失竊總金額

In [None]:
# Accumulate the total theft amount and the total number of stolen items over
# every (day, amenity) cell. Theft-table columns are matched to price-table
# rows by position, not by label.
total_amount = 0
total_theft = 0
unit_prices = df_amenity_price["金額"]
for day_pos, day in enumerate(df_theft_201811.index):
    for item_pos, item in enumerate(df_theft_201811.columns):
        stolen = df_theft_201811.iloc[day_pos, item_pos]
        total_amount += stolen * unit_prices.iloc[item_pos]
        total_theft += stolen
        # List every cell in which something was stolen.
        # NOTE(review): the unit label printed here is "点", while the two-year
        # cell prints "件"; confirm which one is intended.
        if stolen > 0:
            print(day, item, stolen, "点")
print("失竊總金額",total_amount,"元")
print("失竊件數",total_theft,"件")

## 3-4 根據一個月份的資料推算二年份資料的平均值與標準差

### 列出每天的失竊金額

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Theft amount per day: for each row, add count * unit price over all amenity
# columns. Columns are matched to price rows by position.
num_days, num_items = df_theft_201811.shape
unit_prices = df_amenity_price["金額"]
list_amount = np.zeros(num_days)
for day_pos in range(num_days):
    for item_pos in range(num_items):
        list_amount[day_pos] += df_theft_201811.iloc[day_pos, item_pos] * unit_prices.iloc[item_pos]
# Line plot of the daily amounts in row order.
plt.plot(list_amount,color="k")
plt.show()

### 隨機抽出10天份的資料，再計算平均值的分佈情況

In [None]:
import numpy as np
# 設定樣本數（樣本集合的規模）
num_sample = 10

# 設定模擬次數
num_trial = 10000
x_trial = np.zeros(num_trial)

# 試算樣本平均值
for i in range(num_trial):
    # 抽樣
    x = list_amount
    x_sample = np.random.choice(x,num_sample)
    # 計算平均值
    x_ave = np.average(x_sample)
    # 儲存平均值
    x_trial[i] = x_ave
    
# 計算樣本平均值的平均值與標準差
x_trial_ave = np.average(x_trial)
x_trial_std = np.std(x_trial)
print("平均值:",x_trial_ave)
print("標準差:",x_trial_std)

# 繪製圖表
num_bin = 21
plt.hist(x_trial, num_bin,color="k")
plt.xlim([-50000,50000])
plt.show()
%matplotlib inline

### 根據中央極限定理逆推母群體的標準差

In [None]:
import numpy as np
# Standard deviation of the sample mean (hard-coded input).
sample_std = 5649
# Number of values in each sample.
num_sample = 10
# Invert the central limit theorem: population standard deviation equals
# sqrt(sample size) times the standard deviation of the sample mean.
# The result is a standard deviation, not a variance.
root_num_sample = np.sqrt(num_sample)
org_std = root_num_sample * sample_std
print("母群體的標準差:",org_std)

## 3-5 了解標準差與信賴度的相關性  

### 產生常態分佈 (與3-1的程式碼相同)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 設定母群體的規模
num = 365*2

# 設定亂數的平均值與標準差
ave = 0.0
std = 1.0

# 產生亂數（亂數種子固定）
np.random.seed(seed=0)
x = np.random.normal(ave,std,num)
#x = np.random.exponential(0.5, num)

# 計算平均值與標準差
x_ave = np.average(x)
x_std = np.std(x)
print("平均值:",x_ave)
print("標準差:",x_std)

# 繪製圖表
num_bin = 21
plt.hist(x, num_bin,color="k")
plt.xlim([-5,5])
plt.show()
%matplotlib inline

### 繪製隨機採樣的樣本的平均值分佈狀況 (與3-1的程式碼相同)

In [None]:
import numpy as np
# 設定樣本數（樣本集合的規模）
num_sample = 30

# 設定模擬次數
num_trial = 10000
x_trial = np.zeros(num_trial)

# 試算樣本平均值
for i in range(num_trial):
    # 抽樣
    x_sample = np.random.choice(x,num_sample)
    # 計算平均值
    x_ave = np.average(x_sample)
    # 儲存平均值
    x_trial[i] = x_ave
    
# 計算樣本平均值的平均值與標準差
x_trial_ave = np.average(x_trial)
x_trial_std = np.std(x_trial)
print("平均值:",x_trial_ave)
print("標準差:",x_trial_std)

# 繪製圖表
num_bin = 21
plt.hist(x_trial, num_bin,color="k")
plt.xlim([-5,5])
plt.show()
%matplotlib inline

### 計算信賴度

In [None]:
# Half-width of the interval, in multiples of the standard deviation.
ratio = 1.0
upper_bound = x_trial_ave + ratio*x_trial_std
lower_bound = x_trial_ave - ratio*x_trial_std
# Sample means above the upper bound (outside the interval on the right).
x_trial_out1 = x_trial[x_trial > upper_bound]
# Sample means below the lower bound (outside the interval on the left).
x_trial_out2 = x_trial[x_trial < lower_bound]
# Reliability = 1 - (share above the interval + share below the interval).
num_total = len(x_trial)
reliability = 1 - (len(x_trial_out1)/num_total + len(x_trial_out2)/num_total)
print("信頼度:",reliability)

## 3-6 假設與住宿者人數的相關性，推測失竊總金額的趨勢  

### 計算每位住宿者造成的平均失竊金額

In [None]:
import pandas as pd
import datetime as dt

# Average theft amount per day (hard-coded input to this estimate).
theft_per_day = 5880

# Load the accommodation records; the first CSV column becomes a date index.
df_info = pd.read_csv("accomodation_info.csv", index_col=0, parse_dates=[0])

# Count the records per calendar day (a count, not an average) and keep the
# first column's counts as the number of guests per day.
x = df_info.resample('D').count()
df_num = x.iloc[:,0]

# Keep the days up to and including 2018-11-30.
# NOTE(review): there is no lower bound, so this is one month only if the data
# starts on 2018-11-01; confirm against the CSV.
target_date = dt.datetime(2018,11,30)
in_target_month = df_num.index <= target_date
df_num_201811 = df_num[in_target_month]
guests_in_month = sum(df_num_201811)
print("一個月份的住宿者人數:",guests_in_month)

# Average number of guests per day over the selected days.
num_per_day = guests_in_month/len(df_num_201811)
print("每日平均住宿者人數:",num_per_day)

# Average theft amount attributed to one guest.
theft_per_person = theft_per_day/num_per_day
print("每位住宿者造成的平均失竊金額:",theft_per_person)

### 預測兩年內的失竊金額

In [None]:
import numpy as np
# Estimated theft amount per day: that day's guest count times the per-guest
# theft amount, computed element-wise into a float array.
estimated_theft = np.zeros(len(df_num))
estimated_theft[:] = df_num.to_numpy() * theft_per_person
df_estimated_theft = pd.DataFrame(estimated_theft,index=df_num.index,columns=["預估失竊金額"])
# Total over every day in df_num.
estimated_total = sum(df_estimated_theft["預估失竊金額"])
print("兩年內的預估失竊總金額:",estimated_total)
# Time series of the daily estimate, with rotated date tick labels.
plt.plot(df_estimated_theft,color="k")
plt.xticks(rotation=60)
plt.show()

## 3-7  推測年度失竊總金額與對應的信賴區間

In [None]:
import matplotlib.pyplot as plt
# Per-day standard deviation (hard-coded input), converted to a per-guest
# value by dividing by the average number of guests per day.
theft_std_per_day = 17864
theft_std_per_person = theft_std_per_day/num_per_day
print("每位住宿者造成的平均失竊金額的標準差:",theft_std_per_person)

# For each day build [mean - std, mean, mean + std], with both the mean and
# the standard deviation scaled by that day's guest count.
list_estimated_theft = []
for guests in df_num.to_numpy():
    center = guests*theft_per_person
    spread = guests*theft_std_per_person
    list_estimated_theft.append([center - spread, center, center + spread])

# One box per day; the tick labels are drawn with no colour.
plt.boxplot(list_estimated_theft)
plt.xticks(color="None")
plt.show()

## 3-8 根據平價備品，重新推測兩年份資料的平均值與標準差

### 篩選出平價備品的相關資料

In [None]:
# Amenities priced strictly below this threshold count as low-price.
threshold_price = 10000
is_low_price = df_amenity_price["金額"] < threshold_price
df_amenity_price_low = df_amenity_price[is_low_price]
# Keep only the theft-table columns labelled with those amenities.
df_theft_201811_low = df_theft_201811[df_amenity_price_low.index]
print(df_amenity_price_low)

### 列出每日失竊金額

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Theft amount per day for the low-price amenities only: for each row, add
# count * unit price over all columns. Columns are matched to price rows by
# position.
num_days, num_items = df_theft_201811_low.shape
low_prices = df_amenity_price_low["金額"]
list_amount = np.zeros(num_days)
for day_pos in range(num_days):
    for item_pos in range(num_items):
        list_amount[day_pos] += df_theft_201811_low.iloc[day_pos, item_pos] * low_prices.iloc[item_pos]
# Line plot of the daily amounts in row order.
plt.plot(list_amount,color="k")
plt.show()

### 隨機抽樣10天份的資料，算出平均值的分佈情況

In [None]:
import numpy as np
# 設定樣本數（樣本集合的規模）
num_sample = 10

# 設定模擬次數
num_trial = 10000
x_trial = np.zeros(num_trial)

# 試算樣本平均值
for i in range(num_trial):
    # 抽樣
    x = list_amount
    x_sample = np.random.choice(x,num_sample)
    # 計算平均值
    x_ave = np.average(x_sample)
    # 儲存平均值
    x_trial[i] = x_ave
    
# 計算樣本平均值的平均值與標準差
x_trial_ave = np.average(x_trial)
x_trial_std = np.std(x_trial)
print("平均值:",x_trial_ave)
print("標準差:",x_trial_std)

# 繪製圖表
num_bin = 21
plt.hist(x_trial, num_bin,color="k")
plt.xlim([-50000,50000])
plt.show()
%matplotlib inline

### 根據中央極限定理逆推母群體的標準差

In [None]:
import numpy as np
# Standard deviation of the sample mean (hard-coded input).
sample_std = 553
# Number of values in each sample.
num_sample = 10
# Invert the central limit theorem: population standard deviation equals
# sqrt(sample size) times the standard deviation of the sample mean.
# The result is a standard deviation, not a variance.
root_num_sample = np.sqrt(num_sample)
org_std = root_num_sample * sample_std
print("母群體的標準差:",org_std)

## 3-9 針對平價備品的二年內失竊金額趨勢設定信賴區間

### 計算每位住宿者造成的平均失竊金額

In [None]:
import pandas as pd
import datetime as dt

# Average theft amount per day (hard-coded input to this estimate).
theft_per_day = 2595

# Load the accommodation records; the first CSV column becomes a date index.
df_info = pd.read_csv("accomodation_info.csv", index_col=0, parse_dates=[0])

# Count the records per calendar day (a count, not an average) and keep the
# first column's counts as the number of guests per day.
x = df_info.resample('D').count()
df_num = x.iloc[:,0]

# Keep the days up to and including 2018-11-30.
# NOTE(review): there is no lower bound, so this is one month only if the data
# starts on 2018-11-01; confirm against the CSV.
target_date = dt.datetime(2018,11,30)
in_target_month = df_num.index <= target_date
df_num_201811 = df_num[in_target_month]
guests_in_month = sum(df_num_201811)
print("一個月份的住宿者人數:",guests_in_month)

# Average number of guests per day over the selected days.
num_per_day = guests_in_month/len(df_num_201811)
print("每日平均住宿者人數:",num_per_day)

# Average theft amount attributed to one guest.
theft_per_person = theft_per_day/num_per_day
print("每位住宿者造成的平均失竊金額:",theft_per_person)

### 推測兩年內失竊總金額與失竊金額的趨勢

In [None]:
import matplotlib.pyplot as plt
# 設定標準差
theft_std_per_day = 1748
theft_std_per_person = theft_std_per_day/num_per_day
print("每位住宿者造成的平均失竊金額的標準差:",theft_std_per_person)

# 設定信賴區間
list_estimated_theft = []
for i in range(len(df_num)):
    temp_ave = df_num.iloc[i]*theft_per_person
    temp_std = df_num.iloc[i]*theft_std_per_person
    temp = [temp_ave-temp_std,temp_ave,temp_ave+temp_std]
    list_estimated_theft.append(temp)

# 繪製圖表
plt.boxplot(list_estimated_theft)
plt.xticks(color="None")
plt.show()
%matplotlib inline

## 3-10  根據兩年份的資料「驗算」

### 載入資料

In [None]:
import pandas as pd
# Load the theft records from theft_list_2y.csv; the first CSV column is used
# as the index and parsed as dates.
df_theft_2y = pd.read_csv("theft_list_2y.csv", index_col=0, parse_dates=[0])
# Bare expression: the notebook shows the DataFrame as the cell output.
df_theft_2y

### 兩年內所有備品的失竊金額趨勢

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Daily theft amount over the two-year data set, for every amenity.
# Columns of df_theft_2y are matched to rows of df_amenity_price by position.
list_amount = np.zeros(len(df_theft_2y.index))
threshold_price = 10000
unit_prices = df_amenity_price["金額"]
for i_index in range(len(df_theft_2y.index)):
    for i_column in range(len(df_theft_2y.columns)):
        # Read each cell once instead of repeating the .iloc lookups.
        count = df_theft_2y.iloc[i_index,i_column]
        price = unit_prices.iloc[i_column]
        amount = count*price
        list_amount[i_index] += amount
        # Report every theft of an amenity that is not low-price. The low-price
        # cells select price < threshold_price, so the complement is >=. With
        # the former strict > an amenity priced exactly at the threshold was
        # neither analysed as low-price nor reported here.
        if count > 0 and price >= threshold_price:
            print(df_theft_2y.index[i_index],df_theft_2y.columns[i_column],count,"件",amount,"元")
print("失竊總金額:",sum(list_amount))
# Line plot of the daily amounts in row order.
plt.plot(list_amount,color="k")
plt.show()

### 平價備品

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Select the low-price amenities (price strictly below the threshold) and the
# theft-table columns labelled with them.
threshold_price = 10000
is_low_price = df_amenity_price["金額"] < threshold_price
df_amenity_price_low = df_amenity_price[is_low_price]
df_theft_2y_low = df_theft_2y[df_amenity_price_low.index]

# Theft amount per day: for each row, add count * unit price over all
# low-price columns. Columns are matched to price rows by position.
num_days, num_items = df_theft_2y_low.shape
low_prices = df_amenity_price_low["金額"]
list_amount = np.zeros(num_days)
for day_pos in range(num_days):
    for item_pos in range(num_items):
        list_amount[day_pos] += df_theft_2y_low.iloc[day_pos, item_pos] * low_prices.iloc[item_pos]
print("失竊總金額:",sum(list_amount))
# Line plot of the daily amounts in row order.
plt.plot(list_amount,color="k")
plt.show()