## Import the needed libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns

from solution_guidance.cslib import fetch_data, convert_to_ts

## Import the data

In [None]:
def return_all_data(path):
    """Load every JSON file found in ``path`` into a single DataFrame.

    Files may use different spellings for some columns; ``StreamID``,
    ``TimesViewed`` and ``total_price`` are renamed to ``stream_id``,
    ``times_viewed`` and ``price`` so all frames share one schema. An
    ``invoice_date`` column is then built from the ``year``, ``month`` and
    ``day`` columns.

    Parameters
    ----------
    path : str
        Directory that holds the ``.json`` files.

    Returns
    -------
    pandas.DataFrame
        The rows of all files concatenated in file-name order (each file
        keeps its own index labels), with the extra ``invoice_date`` column.

    Raises
    ------
    ValueError
        If ``path`` contains no ``.json`` file.
    """
    column_aliases = {
        'StreamID': 'stream_id',
        'TimesViewed': 'times_viewed',
        'total_price': 'price',
    }

    # os.listdir() returns entries in arbitrary order: sort them so the row
    # order of the result is deterministic, and skip anything that is not a
    # JSON file (e.g. notebook checkpoint folders).
    json_names = sorted(
        fname for fname in os.listdir(path) if fname.lower().endswith('.json')
    )
    if not json_names:
        raise ValueError("No .json files found in '{}'".format(path))

    # rename() ignores aliases that are absent from a given file.
    list_frame = [
        pd.read_json(os.path.join(path, fname)).rename(columns=column_aliases)
        for fname in json_names
    ]
    df = pd.concat(list_frame)

    # Build ISO "YYYY-MM-DD" strings and let numpy parse them as day-precision
    # dates; assigning an array is positional, so duplicate index labels left
    # by concat() do not matter.
    dates = [
        "{}-{}-{}".format(year, str(month).zfill(2), str(day).zfill(2))
        for year, month, day in zip(
            df['year'].values, df['month'].values, df['day'].values
        )
    ]
    df['invoice_date'] = np.array(dates, dtype='datetime64[D]')

    return df

# Load all invoice records from the local 'cs-train' directory.
df = return_all_data('cs-train')

In [None]:
# Display the combined DataFrame.
df

In [None]:
# List the column names of the combined DataFrame.
df.columns

In [None]:
# Distinct values found in the 'country' column.
df['country'].unique()

In [None]:
# Peek at the first row.
df.head(1)

In [None]:
# Number of rows recorded for each invoice id.
df['invoice'].value_counts()

In [None]:
# Total views and total price per stream, restricted to United Kingdom rows.
# The two numeric columns are selected *before* summing so that non-numeric
# columns (strings, invoice_date) are never pushed through sum().
df_grouped_by_country = (
    df[df['country'] == 'United Kingdom']
    .groupby(['stream_id'])[['times_viewed', 'price']]
    .sum()
)
df_grouped_by_country

In [None]:
# Scatter of per-stream total price against total views (United Kingdom only).
plt.figure(figsize=(15,10))
plt.title('Scatterplot: Price X Times Viewed. Only United Kingdom')

plt.scatter(df_grouped_by_country['price'], df_grouped_by_country['times_viewed'], s=6, alpha=0.1)
plt.xlabel('Price')
plt.ylabel('Times Viewed')
print('Max Value PRICE: {}'.format(df_grouped_by_country['price'].max()))
# The views maximum must come from the 'times_viewed' column.
print('Max Value Times Viewed: {}'.format(df_grouped_by_country['times_viewed'].max()))

In [None]:
# Keep streams whose total price lies strictly between 0 and 10000, dropping
# the visible outliers and non-positive totals. An explicit copy is taken
# because a 'cluster' column is added to new_df later on.
new_df = df_grouped_by_country[
    (df_grouped_by_country['price'] > 0) & (df_grouped_by_country['price'] < 10000)
].copy()

In [None]:
# Same scatter as before, drawn from the outlier-filtered frame.
plt.figure(figsize=(15,10))
plt.title('Scatter Plot: Price X Times Viewed. Only United Kingdom. Removed visible outlier')

plt.scatter(new_df['price'], new_df['times_viewed'], s=6, alpha=0.3)
plt.xlabel('Price')
plt.ylabel('Times Viewed')
plt.plot()
print('Max Value PRICE: {}'.format(new_df['price'].max()))
# The views maximum must come from the 'times_viewed' column.
print('Max Value Times Viewed: {}'.format(new_df['times_viewed'].max()))

In [None]:
from sklearn.cluster import KMeans

# Cluster on the two numeric features only. Selecting them by name keeps this
# cell re-runnable: once the 'cluster' column exists, new_df.values would
# otherwise feed the previous labels back in as a third feature.
X = new_df[['times_viewed', 'price']].values

# A fixed seed makes the cluster labels reproducible between runs.
kmeans = KMeans(n_clusters=7, random_state=42)

# Fit once and keep the labels (a second fit would only repeat the work).
new_df['cluster'] = kmeans.fit_predict(X)
new_df['cluster'].unique()

In [None]:
# Scatter of price against views, one colour and legend entry per cluster.
plt.figure(figsize=(15, 10))
plt.title('Scatterplot: Price X Times Viewed. Only United Kingdom. Removed visible outlier')

for cluster_id in new_df['cluster'].unique():
    # Rows that were assigned to this cluster label.
    members = new_df[new_df['cluster'] == cluster_id]
    plt.scatter(
        members['price'],
        members['times_viewed'],
        s=6,
        alpha=0.6,
        label='Cluster {}'.format(cluster_id),
    )

plt.xlabel('Price')
plt.ylabel('Times Viewed')
plt.legend()
plt.plot()

# Part 2

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

def create_ts_by_df(ds):
    """Aggregate invoice rows into one record per calendar day.

    The day range starts on the first day of the month of the FIRST row and
    stops before the first day of the month of the LAST row (half-open range),
    so rows are expected to be in chronological order and the last month is
    not included. Days without any row get zero counts and sums.

    Parameters
    ----------
    ds : pandas.DataFrame
        Must provide the columns ``country``, ``year``, ``month``,
        ``invoice_date``, ``invoice``, ``stream_id``, ``times_viewed`` and
        ``price``.

    Returns
    -------
    pandas.DataFrame
        One row per day with the columns ``date``, ``total_invoice`` (distinct
        invoices), ``purchase`` (row count), ``total_streams`` (distinct
        streams), ``total_views`` and ``revenue`` (sum of ``price``).

    Side effects
    ------------
    Prints the distinct values of ``ds.country``.
    """
    print(ds.country.unique())

    start_month = '{}-{}'.format(ds['year'].values[0], str(ds['month'].values[0]).zfill(2))
    stop_month = '{}-{}'.format(ds['year'].values[-1], str(ds['month'].values[-1]).zfill(2))
    all_days_ts = np.arange(start_month, stop_month, dtype='datetime64[D]')

    # Dates are taken from the ``ds`` argument itself so the boolean masks
    # below always line up row-for-row with the frame being aggregated.
    dates = ds['invoice_date'].values.astype('datetime64[D]')

    list_ts = []

    for day in all_days_ts:
        # Select the day's rows once and derive every aggregate from them.
        day_rows = ds[dates == day]

        list_ts.append({
            'date': day,
            'total_invoice': np.unique(day_rows['invoice'].values).size,
            'purchase': len(day_rows),
            'total_streams': np.unique(day_rows['stream_id'].values).size,
            'total_views': day_rows['times_viewed'].values.sum(),
            'revenue': day_rows['price'].values.sum(),
        })

    return pd.DataFrame(data=list_ts)

def plot_df(x, y, title="", xlabel='Date', ylabel='Value', dpi=100):
    """Draw ``y`` against ``x`` as a thin red line on a new 16x5 figure.

    ``title``, ``xlabel`` and ``ylabel`` label the axes; ``dpi`` sets the
    figure resolution. The figure is shown immediately.
    """
    plt.figure(figsize=(16, 5), dpi=dpi)
    axes = plt.gca()
    axes.plot(x, y, color='tab:red', linewidth=1)
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    plt.show()

def plot_decomposed_timeserie(ts, freq = 12, title=''):
    """Plot a series together with its seasonal decomposition.

    Four stacked panels are drawn: the original series, then the trend,
    seasonal and residual components returned by ``seasonal_decompose``.

    Parameters
    ----------
    ts : array-like or pandas.Series
        The series to decompose.
    freq : int, default 12
        Number of observations per seasonal cycle.
    title : str, default ''
        Title placed above the whole figure.
    """
    import inspect

    # statsmodels renamed the ``freq`` keyword to ``period``; use whichever
    # name the installed version of seasonal_decompose accepts.
    accepted = inspect.signature(seasonal_decompose).parameters
    period_kw = 'period' if 'period' in accepted else 'freq'
    decomposed = seasonal_decompose(ts, **{period_kw: freq})

    fig, axs = plt.subplots(4, 1, figsize=(15, 15))
    fig.suptitle(title)

    panels = [
        ('Original', ts),
        ('Trend', decomposed.trend),
        ('Seasonality', decomposed.seasonal),
        ('Random', decomposed.resid),
    ]
    for ax, (name, series) in zip(axs, panels):
        ax.set_title(name)
        ax.plot(series, label=name)

    plt.tight_layout()
    plt.show()

In [None]:
# Build the per-day time series from the invoice rows.
# NOTE(review): the variable is named ts_uk but the unfiltered df is passed in;
# confirm whether a United Kingdom filter was intended here.
ts_uk = create_ts_by_df(df)

In [None]:
# The frame built by create_ts_by_df holds only date, total_invoice, purchase,
# total_streams, total_views and revenue, so there is no 'country' column to
# print; plot the daily revenue and display the frame.
plot_df(ts_uk['date'],ts_uk['revenue'], 'Revenue Arround Time','Date', 'Revenue')
ts_uk

# Time Series Analysis

In [None]:
# Decompose the revenue series using the function's default period (freq=12).
# NOTE(review): the series holds one value per day; confirm that 12 is the
# intended seasonal period before relying on the conclusion printed below.
plot_decomposed_timeserie(ts_uk['revenue'])
print('The revenue series has strong trend, but there is no seasonality')

# Creating the LSTM Model

In [None]:
from keras.models import Sequential, Model
from keras.layers import LSTM, Input, Activation, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from keras.preprocessing.sequence import TimeseriesGenerator
from statsmodels.tools.eval_measures import rmse, mse, meanabs

def get_y_from_generator(gen):
    """Collect every target value produced by a batch generator.

    Parameters
    ----------
    gen : sequence of ``(x_batch, y_batch)`` pairs
        Any object supporting ``len(gen)`` and ``gen[i]``, such as a
        ``TimeseriesGenerator`` instance.

    Returns
    -------
    numpy.ndarray
        All targets, in batch order, as a column of shape ``(n, 1)``. An
        empty generator yields an array of shape ``(0, 1)``.

    Side effects
    ------------
    Prints the shape of the returned array.
    """
    # Index-based access on purpose: only __len__/__getitem__ are assumed.
    # Batches are flattened and joined in one concatenate call rather than
    # re-copying the growing array once per batch.
    batches = [np.ravel(gen[i][1]) for i in range(len(gen))]
    if batches:
        y = np.concatenate(batches).reshape((-1, 1))
    else:
        y = np.empty((0, 1))
    print(y.shape)
    return y


# input_shape=x_train.shape[-2:]

def create_model_learning(input_shape):
    """Build the (uncompiled) two-layer LSTM regression network.

    Layer order: LSTM(512, returning sequences) -> ReLU -> LSTM(256) -> ReLU
    -> Dropout(0.2) -> Dense(50) -> ReLU -> Dropout(0.1) -> Dense(1) named
    'output'.

    Parameters
    ----------
    input_shape : tuple
        Shape handed to the Keras ``Input`` layer.

    Returns
    -------
    Model
        Keras functional model named 'TimeSerieModel'; compiling it is left
        to the caller.
    """
    series_in = Input(input_shape)

    layer_stack = [
        LSTM(512, return_sequences=True),
        Activation('relu'),
        LSTM(256, return_sequences=False),
        Activation('relu'),
        Dropout(0.2),
        Dense(50),
        Activation('relu'),
        Dropout(0.1),
        Dense(1, name='output'),
    ]

    # Thread the tensor through the layers in the order listed above.
    tensor = series_in
    for layer in layer_stack:
        tensor = layer(tensor)

    return Model(inputs=series_in, outputs=tensor, name='TimeSerieModel')

In [None]:
# Model the revenue column only, kept as a one-column DataFrame.
ds = ts_uk[['revenue']]

# Chronological split: the first 80% of the rows train, the remainder tests.
len_train = int(len(ds) * 0.8)
train, test = ds.iloc[:len_train], ds.iloc[len_train:]

In [None]:
# Fit the scaler on the training rows only, then apply that same scaling to
# both splits.
scaler = MinMaxScaler().fit(train)
train, test = scaler.transform(train), scaler.transform(test)

In [None]:
# Windows of 12 consecutive scaled values predict the value that follows them;
# the model input shape (12 steps, 1 feature) matches that window.
train_generator = TimeseriesGenerator(
    data=train,
    targets=train,
    length=12,
    batch_size=6,
)
model = create_model_learning(input_shape=(12, 1))
model.compile(loss='mse', optimizer='adam')

In [None]:
# Train for 70 epochs on the batches produced by train_generator.
# NOTE(review): fit_generator is deprecated in newer Keras/TensorFlow releases
# in favour of Model.fit; check the installed version before changing it.
model.fit_generator(train_generator, epochs=70, verbose=1)

In [None]:
# Recursive multi-step forecast: start from the last 12 scaled training
# values and feed each prediction back in as the newest observation.
batch = train[-12:].reshape((1, 12, 1))
pred_list = []

for _ in range(test.shape[0]):
    next_step = model.predict(batch)[0]
    pred_list.append(next_step)
    # Slide the window: drop the oldest value, append the new prediction.
    batch = np.append(batch[:, 1:, :], [[next_step]], axis=1)

pred_list = np.asarray(pred_list)

In [None]:
# Bring the forecasts back to the original revenue scale and align them with
# the index labels of the last len(pred_list) rows of ds.
df_predict = pd.DataFrame(
    data=scaler.inverse_transform(pred_list),
    columns=['Prediction'],
    index=ds.index[-len(pred_list):],
)

# Actual revenue and forecast side by side ('Prediction' is NaN wherever no
# forecast exists).
df_test = pd.concat([ds, df_predict], axis=1)

In [None]:
# Display actual revenue next to the forecast column.
df_test

In [None]:
# Actual revenue with the forecast overlaid in red. Both lines carry a label
# so that plt.legend() has entries to display.
plt.figure(figsize=(20, 5))
plt.plot(df_test.index, df_test['revenue'], label='Revenue')
plt.plot(df_test.index, df_test['Prediction'], color='r', label='Prediction')
plt.legend(loc='best', fontsize='xx-large')
plt.xticks(fontsize=18)
plt.yticks(fontsize=16)
plt.show()

In [None]:
def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Observations whose true value is exactly zero are left out of the mean,
    because their percentage error is undefined (division by zero) and would
    otherwise turn the whole result into inf/nan.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values; shapes must be broadcast-compatible.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the entries with a
        non-zero ``y_true``, or ``nan`` when every true value is zero.
    """
    y_true, y_pred = np.broadcast_arrays(
        np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
    )
    nonzero = y_true != 0
    if not nonzero.any():
        return float('nan')
    return np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100

# Hold-out rows: the last len(test) rows of df_test, where column 0 holds the
# actual revenue and column 1 the forecast.
y_test = df_test.iloc[-test.shape[0]:, [0]]
y_predicted = df_test.iloc[-test.shape[0]:, [1]]

# Flatten to 1-D so every metric is a plain scalar rather than a one-element
# array (which "%f" would have to convert implicitly).
actual = y_test.values.ravel()
forecast = y_predicted.values.ravel()

pred_actual_rmse = rmse(actual, forecast)
pred_actual_mse = mse(actual, forecast)
pred_actual_mae = meanabs(actual, forecast)
pred_actual_mape = MAPE(actual, forecast)


print("RMSE: %f" % pred_actual_rmse)
print("MSE: %f" % pred_actual_mse)
print("MAE: %f" % pred_actual_mae)
print("MAPE: %f" % pred_actual_mape)