# 第1章 データを手にしてまず行うべきこと
ここでは、確率統計・機械学習の基礎を学ぶうえで必要なプログラムを実行していく流れを学んでいきます。  

In [None]:
# Colaboratory environment setup: mount Google Drive and change into this
# chapter's folder so that the relative paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/MathProgramming/Chapter1

In [None]:
# Library setup: install the packages listed in requirements.txt
# (-q keeps pip's output short).
!pip install -q -r ./requirements.txt

## 1-1. データを読み込んでみよう

In [None]:
import pandas as pd

# Load the accommodation records. The first CSV column is parsed as dates
# and becomes the index, which the resample() calls in later cells rely on.
df_info = pd.read_csv(
    "accomodation_info.csv",
    parse_dates=[0],
    index_col=0,
)
# Bare expression: the notebook renders the frame as the cell output.
df_info

## 1-2. 時系列データを可視化してみよう

### 月ごとの売り上げ

In [None]:
import matplotlib.pyplot as plt

# Sum the "金額" (amount) column per month and draw it as a black line.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
# 'ME' - confirm against the pinned pandas version before changing it.
monthly_sales = df_info["金額"].resample('M').sum()
plt.plot(monthly_sales, color="k")
plt.xticks(rotation=60)  # tilt the date labels so they do not overlap
plt.show()

### 月ごとの利用者数

In [None]:
import matplotlib.pyplot as plt

# Count the records in each month. count() works column by column, so the
# plot receives one series per column of df_info, all drawn in black.
monthly_counts = df_info.resample('M').count()
plt.plot(monthly_counts, color="k")
plt.xticks(rotation=60)  # tilt the date labels so they do not overlap
plt.show()

## 1-3. 平均値、中央値、最小値、最大値を出力してみよう

In [None]:
# Number of stays per customer, computed once and shared by all four
# statistics instead of rebuilding the same value_counts() for each of them.
stay_counts = df_info['顧客ID'].value_counts()

x_mean = stay_counts.mean()
x_median = stay_counts.median()
x_min = stay_counts.min()
x_max = stay_counts.max()

# Report mean, median, minimum and maximum stays per customer.
print("平均値:", x_mean)
print("中央値:", x_median)
print("最小値", x_min)
print("最大値", x_max)

## 1-4. 分布の形を見てみよう

In [None]:
import matplotlib.pyplot as plt

# Histogram (21 bins) of the number of stays per customer. The bin counts
# (x_hist) and bin edges (t_hist) are kept for the curve fitting that follows.
x = df_info['顧客ID'].value_counts()
hist_result = plt.hist(x, bins=21, color="k")
x_hist = hist_result[0]
t_hist = hist_result[1]
plt.show()

## 1-5. 分布の近似曲線を求めてみよう

### 近似曲線のパラメータの算出

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Parameter settings
epsiron = 1  # added to the counts before log() in the plot below
num = 15     # the fit uses histogram bins 1 .. num-1

# Variable settings
weight = x_hist[1:num]
# Midpoint of every histogram bin (mean of its left and right edge).
t = (t_hist[:-1] + t_hist[1:]) / 2

# Fitting: weighted least-squares straight line through log(count).
# NOTE(review): the fit takes log(x_hist) without the epsiron offset that the
# plot uses, so an empty bin inside 1..num-1 would give log(0) - confirm the
# data never produces one.
log_counts = np.log(x_hist[1:num])
a, b = np.polyfit(t[1:num], log_counts, 1, w=weight)

# Draw the fitted straight line (red) over the log-scaled histogram (black).
xt = a * t + b
plt.plot(t_hist[1:], np.log(x_hist + epsiron), marker=".", color="k")
plt.plot(t, xt, color="r")
plt.show()

### 近似曲線の描画

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import math

# Evaluate the fitted curve exp(a*t + b) at the right edge of every bin.
# np.exp handles the whole array in one call and returns inf on overflow,
# where the previous per-element math.exp loop would raise OverflowError.
t = t_hist[1:]
xt = np.exp(a * t + b)

# Histogram counts as black bars with the fitted curve in red on top.
plt.bar(t_hist[1:], x_hist, width=8, color="k")
plt.plot(t, xt, color="r")
plt.show()

## 1-6. プランごとにデータを抽出してみよう

### プランごとのデータ抽出

In [None]:
# Show only the records whose "プラン" (plan) column equals "A".
is_plan_a = df_info["プラン"] == "A"
print(df_info[is_plan_a])

### プランごとのヒストグラム表示

In [None]:
# Histogram (21 bins) of how many plan "A" records each customer has.
df_a = df_info.loc[df_info["プラン"] == "A"]
x_a = df_a['顧客ID'].value_counts()
hist_a = plt.hist(x_a, bins=21, color="k")
xa_hist = hist_a[0]  # bin counts
ta_hist = hist_a[1]  # bin edges
plt.show()

### プランごとの毎月の利用者数

In [None]:
import matplotlib.pyplot as plt

# Monthly record counts for each plan, drawn in the colour assigned to it.
plan_colors = {"A": "b", "B": "g", "C": "r", "D": "k"}
for plan, line_color in plan_colors.items():
    monthly_counts = df_info[df_info["プラン"] == plan].resample('M').count()
    plt.plot(monthly_counts, color=line_color)
plt.xticks(rotation=60)  # tilt the date labels so they do not overlap
plt.show()

## 1-7. 大口顧客の行動を分析してみよう

### 顧客ごとの利用回数（上位10名）

In [None]:
# Print every record of each of the 10 customers with the most stays.
# The ranking is built once rather than on every iteration, and slicing the
# index copes with data that contains fewer than 10 customers.
top_ids = df_info['顧客ID'].value_counts().index[:10]
for customer_id in top_ids:  # avoids shadowing the builtin id()
    print(df_info[df_info['顧客ID'] == customer_id])

### 上位10名の月ごとの利用回数

In [None]:
import matplotlib.pyplot as plt

# Monthly record counts for each of the 10 customers with the most stays.
# The ranking is built once rather than on every iteration, and slicing the
# index copes with data that contains fewer than 10 customers.
top_ids = df_info['顧客ID'].value_counts().index[:10]
for customer_id in top_ids:  # avoids shadowing the builtin id()
    plt.plot(df_info[df_info['顧客ID'] == customer_id].resample('M').count())
# Tick rotation only needs to be applied once, after all lines are drawn.
plt.xticks(rotation=60)
plt.show()

### 11位～20位の月ごとの利用回数

In [None]:
import matplotlib.pyplot as plt

# Monthly record counts for the customers ranked 11th to 20th by stays.
# The ranking is built once rather than on every iteration, and slicing the
# index copes with data that contains fewer than 20 customers.
ranked_ids = df_info['顧客ID'].value_counts().index[10:20]
for customer_id in ranked_ids:  # avoids shadowing the builtin id()
    plt.plot(df_info[df_info['顧客ID'] == customer_id].resample('M').count())
# Tick rotation only needs to be applied once, after all lines are drawn.
plt.xticks(rotation=60)
plt.show()

## 1-8. 感染症流行前後の顧客の行動を分析してみよう

### インデックスのリセット

In [None]:
# Move the current index back into an ordinary column and give the rows a
# default integer index. Presumably this is the date index set at load time,
# which the following cells filter on as df_info["日時"] - verify.
df_info = df_info.reset_index()

### 感染症前後のデータを分離

In [None]:
import datetime as dt

# Records dated before 2020-03-01 form the "pre" set; records on or after
# that date form the "post" set.
target_date = dt.datetime(2020, 3, 1)
is_pre = df_info["日時"] < target_date
is_post = df_info["日時"] >= target_date
df_info_pre = df_info[is_pre]
df_info_post = df_info[is_post]
print(df_info_pre)
# Sanity check: print the combined size of both sets next to the total.
print(len(df_info_pre) + len(df_info_post), len(df_info))

### 感染症流行前後の関係を二次元にマッピング

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Number of top customers (ranked by total stays) to plot.
num = 200

# Rank the customers once. Slicing, rather than indexing rank by rank, also
# copes with data that contains fewer than `num` customers.
top_ids = df_info['顧客ID'].value_counts().index[:num]

# Stays per customer before and after the target date. reindex() aligns the
# counts with the ranking and fills 0 for customers absent from a period.
# This replaces one DataFrame filter per customer and the positional
# `.count()[0]` lookup on a label-indexed Series.
pre_counts = df_info_pre['顧客ID'].value_counts().reindex(top_ids, fill_value=0)
post_counts = df_info_post['顧客ID'].value_counts().reindex(top_ids, fill_value=0)

# One row per ranked customer: column 0 = stays before, column 1 = stays after.
count_pre_and_post = np.zeros((len(top_ids), 2))
count_pre_and_post[:, 0] = pre_counts.to_numpy()
count_pre_and_post[:, 1] = post_counts.to_numpy()

plt.scatter(count_pre_and_post[:, 0], count_pre_and_post[:, 1], color="k")
# Label every point as "<customer ID>(<rank>)".
for i_rank, customer_id in enumerate(top_ids):
    n_pre, n_post = count_pre_and_post[i_rank]
    plt.text(n_pre, n_post, f"{customer_id}({i_rank})", color="k")
plt.xlabel("pre epidemic")
plt.ylabel("post epidemic")
plt.show()

## 1-9. 条件による顧客の分類をしてみよう

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Parameter settings
num = 200            # number of top customers (ranked by total stays) to plot
threshold_post = 50  # more stays than this after the target date -> red

# Rank the customers once. Slicing, rather than indexing rank by rank, also
# copes with data that contains fewer than `num` customers.
top_ids = df_info['顧客ID'].value_counts().index[:num]

# Stays per customer before and after the target date. reindex() aligns the
# counts with the ranking and fills 0 for customers absent from a period.
# This replaces one DataFrame filter per customer and the positional
# `.count()[0]` lookup on a label-indexed Series.
pre_counts = df_info_pre['顧客ID'].value_counts().reindex(top_ids, fill_value=0)
post_counts = df_info_post['顧客ID'].value_counts().reindex(top_ids, fill_value=0)

# One row per ranked customer: column 0 = stays before, column 1 = stays after.
count_pre_and_post = np.zeros((len(top_ids), 2))
count_pre_and_post[:, 0] = pre_counts.to_numpy()
count_pre_and_post[:, 1] = post_counts.to_numpy()

# Plot each customer as a labelled point "<customer ID>(<rank>)", in red when
# the post-period count exceeds the threshold and in black otherwise.
for i_rank, customer_id in enumerate(top_ids):
    n_pre, n_post = count_pre_and_post[i_rank]
    temp_color = "r" if n_post > threshold_post else "k"
    plt.scatter(n_pre, n_post, color=temp_color)
    plt.text(n_pre, n_post, f"{customer_id}({i_rank})", color=temp_color)
plt.xlabel("pre epidemic")
plt.ylabel("post epidemic")
plt.show()

## 1-10. 条件にあった顧客をリストアップしよう

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd  # used for the result table; makes the cell self-contained

# Parameter settings
num = 200            # number of top customers (ranked by total stays) to check
threshold_post = 50  # keep customers with more stays than this after the target date

# Rank the customers once. Slicing, rather than indexing rank by rank, also
# copes with data that contains fewer than `num` customers.
top_ids = df_info['顧客ID'].value_counts().index[:num]

# Stays per customer before and after the target date. reindex() aligns the
# counts with the ranking and fills 0 for customers absent from a period.
# This replaces one DataFrame filter per customer and the positional
# `.count()[0]` lookup on a label-indexed Series.
pre_counts = df_info_pre['顧客ID'].value_counts().reindex(top_ids, fill_value=0)
post_counts = df_info_post['顧客ID'].value_counts().reindex(top_ids, fill_value=0)

# One row per ranked customer: column 0 = stays before, column 1 = stays after.
count_pre_and_post = np.zeros((len(top_ids), 2))
count_pre_and_post[:, 0] = pre_counts.to_numpy()
count_pre_and_post[:, 1] = post_counts.to_numpy()

# Build the customer lists: keep everyone above the post-period threshold.
list_id = []
list_name = []
list_date_pre = []
list_date_post = []
for customer_id, (n_pre, n_post) in zip(top_ids, count_pre_and_post):
    if n_post > threshold_post:
        list_id.append(customer_id)
        # Guest name taken from the customer's first record.
        list_name.append(df_info.loc[df_info['顧客ID'] == customer_id, '宿泊者名'].iloc[0])
        list_date_pre.append(n_pre)
        list_date_post.append(n_post)

# Convert the lists into a DataFrame, one column per list.
df = pd.DataFrame({
    '顧客ID': list_id,
    '宿泊者名': list_name,
    '宿泊日数（流行前）': list_date_pre,
    '宿泊日数（流行後）': list_date_post,
})
print(df)