In [None]:
import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

In [None]:
df = pd.read_csv('train_1.csv')
df.head(1)

In [None]:
df = df.fillna(0)
df.info()

In [None]:
data_start_date = df.columns[1]
data_end_date = df.columns[-1]
print('Data ranges from %s to %s' % (data_start_date, data_end_date))

In [None]:
def plot_random_series(df, n_series):
    
    sample = df.sample(n_series, random_state=8)
    page_labels = sample['Page'].tolist()
    series_samples = sample.loc[:,data_start_date:data_end_date]
    
    plt.figure(figsize=(10,6))
    
    for i in range(series_samples.shape[0]):
        np.log1p(pd.Series(series_samples.iloc[i]).astype(np.float64)).plot(linewidth=1.5)
    
    plt.title('Randomly Selected Wikipedia Page Daily Views Over Time (Log(views) + 1)')
    plt.legend(page_labels)
    
plot_random_series(df, 1)

## 整理資料格式

In [None]:
from datetime import timedelta

pred_steps = 14
pred_length=timedelta(pred_steps)

first_day = pd.to_datetime(data_start_date) 
last_day = pd.to_datetime(data_end_date)

val_pred_start = last_day - pred_length + timedelta(1)
val_pred_end = last_day

train_pred_start = val_pred_start - pred_length
train_pred_end = val_pred_start - timedelta(days=1)

In [None]:
enc_length = train_pred_start - first_day

train_enc_start = first_day
train_enc_end = train_enc_start + enc_length - timedelta(1)

val_enc_start = train_enc_start + pred_length
val_enc_end = val_enc_start + enc_length - timedelta(1)

## 分割訓練、驗證樣本

In [None]:
print('Train encoding:', train_enc_start, '-', train_enc_end)
print('Train prediction:', train_pred_start, '-', train_pred_end, '\n')
print('Val encoding:', val_enc_start, '-', val_enc_end)
print('Val prediction:', val_pred_start, '-', val_pred_end)

print('\nEncoding interval:', enc_length.days)
print('Prediction interval:', pred_length.days)

In [None]:
date_to_index = pd.Series(index=pd.Index([pd.to_datetime(c) for c in df.columns[1:]]),
                          data=[i for i in range(len(df.columns[1:]))])

series_array = df[df.columns[1:]].values

def get_time_block_series(series_array, date_to_index, start_date, end_date):
    
    inds = date_to_index[start_date:end_date]
    return series_array[:,inds]

def transform_series_encode(series_array):
    
    series_array = np.log1p(series_array)
    series_array = series_array.reshape((series_array.shape[0],series_array.shape[1], 1))
    
    return series_array

def transform_series_decode(series_array):
    
    series_array = np.log1p(series_array)
    series_array = series_array.reshape((series_array.shape[0],series_array.shape[1], 1))
    
    return series_array

## 建立模型

In [None]:
first_n_samples = 50000

# 取前50000個樣本，並且限制其包含的時間範圍  
encoder_input_data = get_time_block_series(series_array, date_to_index, 
                                           train_enc_start, train_enc_end)[:first_n_samples]
encoder_input_data = transform_series_encode(encoder_input_data)

# 取前50000個樣本，並且限制其包含的時間範圍 
decoder_target_data = get_time_block_series(series_array, date_to_index, 
                                            train_pred_start, train_pred_end)[:first_n_samples]
decoder_target_data = transform_series_decode(decoder_target_data)

# lagged target series for teacher forcing
decoder_input_data = np.zeros(decoder_target_data.shape)
decoder_input_data[:,1:,0] = decoder_target_data[:,:-1,0]
decoder_input_data[:,0,0] = encoder_input_data[:,-1,0]

In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.optimizers import Adam

latent_dim = 50 # LSTM hidden units
#Encoder 
encoder_inputs = Input(shape=(None, 1)) 
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# 只保留最後時刻的hidden state 和 cell state
encoder_states = [state_h, state_c]

#Decoder
decoder_inputs = Input(shape=(None, 1))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)  #decoder的cell state用 encoder_states初始化
decoder_dense = Dense(1)
decoder_outputs = decoder_dense(decoder_outputs)

#定義訓練時的模型
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

In [None]:
#定義預測時的模型
# Encoder
encoder_model = Model(encoder_inputs, encoder_states)
# Decoder
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs] + decoder_states)

In [None]:
encoder_model.summary()

In [None]:
decoder_model.summary()

In [None]:
batch_size = 64
epochs = 2

In [None]:
model.compile(optimizer='adam', loss='mae')
history = model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
                     batch_size=batch_size,
                     epochs=epochs,
                     validation_split=0.2)

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.xlabel('Epoch')
plt.ylabel('Mean Absolute Error Loss')
plt.title('Loss Over Time')
plt.legend(['Train','Valid'])

In [None]:
def decode_sequence(input_seq):    
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, 1))
    
    # Populate the first target sequence with end of encoding series pageviews
    target_seq[0, 0, 0] = input_seq[0, -1, 0]

    # Sampling loop for a batch of sequences - we will fill decoded_seq with predictions
    # (to simplify, here we assume a batch of size 1).

    decoded_seq = np.zeros((1,pred_steps,1))
    
    for i in range(pred_steps):
        
        output, h, c = decoder_model.predict([target_seq] + states_value)
        
        decoded_seq[0,i,0] = output[0,0,0]
                # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, 1))
        target_seq[0, 0, 0] = output[0,0,0]

        # Update states
        states_value = [h, c]

    return decoded_seq

In [None]:
#驗證樣本轉換資料格式
encoder_input_data = get_time_block_series(series_array, date_to_index, val_enc_start, val_enc_end)
encoder_input_data = transform_series_encode(encoder_input_data)

decoder_target_data = get_time_block_series(series_array, date_to_index, val_pred_start, val_pred_end)
decoder_target_data = transform_series_decode(decoder_target_data)

In [None]:
def predict_and_plot(encoder_input_data, decoder_target_data, sample_ind, enc_tail_len=50):

    encode_series = encoder_input_data[sample_ind:sample_ind+1,:,:] 
    pred_series = decode_sequence(encode_series)
    
    encode_series = encode_series.reshape(-1,1)
    pred_series = pred_series.reshape(-1,1)   
    target_series = decoder_target_data[sample_ind,:,:1].reshape(-1,1) 
    
    encode_series_tail = np.concatenate([encode_series[-enc_tail_len:],target_series[:1]])
    x_encode = encode_series_tail.shape[0]
    
    plt.figure(figsize=(10,6))   
    
    plt.plot(range(1,x_encode+1),encode_series_tail)
    plt.plot(range(x_encode,x_encode+pred_steps),target_series,color='orange')
    plt.plot(range(x_encode,x_encode+pred_steps),pred_series,color='teal',linestyle='--')
    
    plt.title('Encoder Series Tail of Length %d, Target Series, and Predictions' % enc_tail_len)
    plt.legend(['Encoding Series','Target Series','Predictions'])

In [None]:
predict_and_plot(encoder_input_data, decoder_target_data, 89)

## 讀取大資料

In [6]:
import pandas as pd
data = pd.read_csv('spindetail.tsv', sep='\t',names=col, 
                 usecols=['PLAYERID','LOCAL_TIME'],parse_dates=['LOCAL_TIME']
                )

In [5]:
col_data = pd.read_table('NM_SLOT_Statistic_spinDetail.txt', names=['column_name'])
new = col_data["column_name"].str.split(" ", expand = True) 
col_data["new Name"]= new[2]  
col_data = col_data[['new Name']]
col = col_data['new Name'].values

In [8]:
from tqdm import tqdm

processed = pd.DataFrame()
chunksize = 10**5
with tqdm(total=19447727) as pbar:
    for chunk in pd.read_csv('spindetail.tsv', sep='\t', chunksize=chunksize, names=col, usecols=['PLAYERID','LOCAL_TIME'],parse_dates=['LOCAL_TIME']):
        chunk['LOCAL_TIME'] = chunk['LOCAL_TIME'].dt.floor('D') #修正時間格式為年月日
        chunk = chunk.drop_duplicates('PLAYERID',keep='first')  #去除重複的玩家id
        processed = processed.append(chunk)                     #將結果保存在變數內
        pbar.update(chunksize)

19500000it [02:14, 195002.70it/s]                                                                                      


In [9]:
processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150787 entries, 0 to 19447725
Data columns (total 2 columns):
PLAYERID      150787 non-null object
LOCAL_TIME    150787 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 3.5+ MB


In [13]:
import dask
import dask.dataframe as dd

In [14]:
data_dask = dd.read_csv('spindetail.tsv', sep='\t',names=col,parse_dates=['LOCAL_TIME'], assume_missing=True)

In [15]:
data_dask.head()

Unnamed: 0,GAMESEQNO,SESSIONNO,PLAYERID,PARTNERID,WEBID,MNUM,MTYPE,LOCATION,ACCDENOM,PLAYDENOM,...,SERVER_ID,SETTLE_TYPE,HAS_BONUS,TRANSFER_ID,REPORT_DATE,GAME_TYPE,WIN_TYPE_COMBINATION,DOMAIN,HOUSEID,LOCAL_TIME
0,7288066000000.0,4d1bb7d4fa8f6221be317ef9396fb7b2ccff6c35,jack8263170@GP,,,,8049.0,,1.0,1.0,...,,0.0,0.0,0.0,2018-11-01 00:00:00.000000,8.0,1.0,GP,gphse,2018-11-02 04:31:07.806
1,7288066000000.0,eec6d07a34eae701479a9af7c4891966818a4cea,a9am2261017@GP,,,,8049.0,,1.0,5.0,...,,0.0,0.0,0.0,2018-11-01 00:00:00.000000,8.0,1.0,GP,gphse,2018-11-02 04:31:08.082
2,7288066000000.0,5329e5b7b80661eea3b0b18ad899ac7bf14086c0,thbtbank2425@GH,,,,8049.0,,1.0,1.0,...,,0.0,0.0,0.0,2018-11-01 00:00:00.000000,8.0,1.0,GH,ghhse,2018-11-02 04:31:08.869
3,7288066000000.0,e1f011dd97a36afbb8c85a1a7be0f704caa4bb61,xpj88216279@GP,,,,8049.0,,1.0,1.0,...,,0.0,0.0,0.0,2018-11-01 00:00:00.000000,8.0,1.0,GP,gphse,2018-11-02 04:31:08.874
4,7288066000000.0,fc36fd61d40b56befc497ec4094a7de681c3c415,xin781594712@GP,,,,8049.0,,1.0,1.0,...,,0.0,0.0,0.0,2018-11-01 00:00:00.000000,8.0,1.0,GP,gphse,2018-11-02 04:31:08.899


In [16]:
ip = data_dask.PLAYERID.unique().compute()

In [19]:
len(ip)

44857