In [28]:
from statistics import mean
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
import warnings
from math import sqrt

In [8]:
# Read the three CSV inputs (same read order as before: test week 1, train, test week 2).
df2 = pd.read_csv("MCIRD_aaic2021_test_week1_with_target(1).csv")
df1 = pd.read_csv("MCIRD_aaic2021_train.csv")
df3 = pd.read_csv("MCIRD_aaic2021_test_week2_with_target.csv")

# Only the subscriber key and the usage column are needed from each file.
usage_columns = ['subscriber_ecid', 'data_usage_volume']
df1 = df1[usage_columns]
df2 = df2[usage_columns]
df3 = df3[usage_columns]

# De-duplicate the subscriber ids of each file while keeping first-seen order
# (dict keys preserve insertion order).
unique_sub_id_1 = list(dict.fromkeys(df1['subscriber_ecid'].values))
unique_sub_id_2 = list(dict.fromkeys(df2['subscriber_ecid'].values))
unique_sub_id_3 = list(dict.fromkeys(df3['subscriber_ecid'].values))

# Ids present in the train file but absent from test week 1. The value is
# discarded here: a bare expression in the middle of a cell displays nothing.
list(set(unique_sub_id_1) - set(unique_sub_id_2))

# Hard-coded subscribers excluded from modelling.
# NOTE(review): presumably these are the ids reported by the set difference
# above -- confirm, and consider deriving them instead of hard-coding.
# list.remove raises ValueError if an id is not in the list.
for excluded_id in (
    '28gWxNYMU_2dg',
    '1EN04BS-9nKgc',
    '37v4v4PPObMC_',
    '-gjfIaG2oxwzj',
    '32ez6CX89v6KZ',
    '-XU6p4P-782mp',
):
    unique_sub_id_1.remove(excluded_id)

# One array per remaining subscriber: its rows from the train file, then test
# week 1, then test week 2, stacked in that order (columns: id, usage).
data_list = []
for sub_id in unique_sub_id_1:
    rows_per_file = [
        frame[frame['subscriber_ecid'] == sub_id].values
        for frame in (df1, df2, df3)
    ]
    data_list.append(np.concatenate(rows_per_file, axis=0))


def evaluate_arima_model(data_list_one, arima_order, train_fraction=0.66, jitter=0.00001):
    """Score one ARIMA order on a single series with walk-forward validation.

    The series is split chronologically; the model is re-fitted before every
    test point on all values seen so far, a one-step forecast is recorded, and
    the true (jittered) observation is then appended to the history.

    Parameters
    ----------
    data_list_one : array-like of numbers
        The series to evaluate, oldest value first.
    arima_order : tuple
        The ``(p, d, q)`` order passed to ``ARIMA``.
    train_fraction : float, optional
        Share of the series used as the initial training history
        (default 0.66, the previously hard-coded value).
    jitter : float, optional
        Scale of the uniform ``np.random.rand`` noise added to every value
        (default 0.00001, the previously hard-coded value). Pass 0.0 to
        evaluate on the unmodified series.

    Returns
    -------
    The value of ``mean_squared_error`` between the jittered test values and
    the one-step forecasts.

    Raises
    ------
    ValueError
        If the split leaves the training or the test part empty. Any exception
        raised by the model fit or forecast propagates unchanged.
    """
    series = data_list_one
    split_at = int(len(series) * train_fraction)
    train, test = series[0:split_at], series[split_at:len(series)]
    if len(train) == 0 or len(test) == 0:
        raise ValueError(
            "series of length %d cannot be split into non-empty train/test parts"
            % len(series)
        )

    # Noise is drawn for the test part first and the train part second, the
    # same order as before, so a seeded run consumes the RNG identically.
    test = list(test + jitter * np.random.rand(len(test)))
    history = list(train + jitter * np.random.rand(len(train)))

    predictions = []
    for obs in test:
        # Re-fit on everything observed so far, then forecast one step ahead.
        model_fit = ARIMA(history, order=arima_order).fit(trend='nc', disp=1)
        # Keep the first element of whatever forecast() returns.
        predictions.append(model_fit.forecast()[0])
        # Walk forward with the true observation, not the prediction.
        history.append(obs)

    return mean_squared_error(np.array(test), np.array(predictions))


# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(data_list_one, p_values, d_values, q_values):
    """Grid-search ARIMA orders for one series and return the best one.

    Every ``(p, d, q)`` combination is scored with ``evaluate_arima_model``.
    Orders whose evaluation raises are skipped (best effort), but they are
    counted so that a search in which nothing worked is reported explicitly
    instead of only showing up as ``Best ARIMANone MSE=inf``.

    Parameters
    ----------
    data_list_one : the series handed to ``evaluate_arima_model``.
    p_values, d_values, q_values : iterables of candidate values for each
        component of the order.

    Returns
    -------
    (best_score, best_cfg) : the lowest MSE and its order. When no order
        produced a comparable score this is ``(float("inf"), None)``.
    """
    best_score, best_cfg = float("inf"), None
    tried = 0
    failed = 0
    last_error = None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                tried += 1
                try:
                    mse = evaluate_arima_model(data_list_one, order)
                    # Strict '<' keeps the first order seen on ties; a NaN
                    # score never compares smaller and is therefore skipped.
                    if mse < best_score:
                        best_score, best_cfg = mse, order
                except Exception as exc:
                    # Many orders legitimately fail to fit; remember the
                    # failure and keep searching.
                    failed += 1
                    last_error = exc
                    continue

    if best_cfg is None:
        print('No ARIMA order produced a usable MSE (%d of %d candidates raised; last error: %r)'
              % (failed, tried, last_error))
    print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))
    return best_score, best_cfg


# Candidate ARIMA orders: every (p, d, q) combination drawn from these ranges
# is scored for each subscriber.
p_values = range(0,5)
d_values = range(0, 3)
q_values = range(0, 3)
# Order used when the selected order cannot be fitted, or when the grid
# search selected nothing at all.
fallback_order = (0, 1, 1)
# Number of chained one-step forecasts produced per subscriber.
forecast_steps = 7
final_predict_all_data = []
final_predict_all_ids = []
mse_all = []
# NOTE: this silences every warning for the rest of the session, not only the
# ones emitted while fitting ARIMA models.
warnings.filterwarnings("ignore")
# Seed the global NumPy RNG so the jitter added to each series, and therefore
# the selected orders and forecasts, is reproducible on Restart & Run All.
np.random.seed(42)
for j in range(len(data_list)):
    data_list_one = data_list[j]
    # Column 0 holds the subscriber id, column 1 the usage values.
    subscriber_id = data_list_one[0,0]
    data_list_one = data_list_one[:, 1]

    print('\n==============================')
    print("user number ",j)
    best_score, best_order = evaluate_models(data_list_one, p_values, d_values, q_values)
    mse_all.append(best_score)
    if best_order is None:
        # Previously this case only worked because ARIMA(order=None) raised
        # inside a bare except on every forecast step; handle it once, openly.
        print('no order selected, using fallback order', fallback_order)
        best_order = fallback_order

    train = data_list_one
    history = list(train + 0.00001 * np.random.rand(len(train)))
    predictions = list()
    ids = []
    # NOTE(review): evaluate_arima_model scores orders with fit(trend='nc'),
    # while the fits below use the fit() defaults -- confirm this is intended.
    for t in range(forecast_steps):
        try:
            model = ARIMA(history, order=best_order)
            model_fit = model.fit()
            output = model_fit.forecast()
        except Exception:
            # 'except Exception' (not a bare except) so that Ctrl-C /
            # KeyboardInterrupt still stops the run.
            print('except')
            model = ARIMA(history, order=fallback_order)
            model_fit = model.fit()
            output = model_fit.forecast()
        yhat = output[0]
        predictions.append(yhat[0])
        # Feed the forecast back in so the next step predicts one day further.
        history.append(yhat[0])
        ids.append(subscriber_id)
    final_predict_all_data.append(predictions)
    final_predict_all_ids.append(ids)

print("======= final result =======")
# A subscriber for which no order could be scored contributes inf (or NaN);
# averaging that in would make the overall figure meaningless, so report such
# subscribers separately and average the usable scores only.
finite_mse = [score for score in mse_all if np.isfinite(score)]
if len(finite_mse) < len(mse_all):
    print("subscribers without a usable validation MSE (excluded from rmse): ",
          len(mse_all) - len(finite_mse))
rmse = sqrt(mean(finite_mse)) if finite_mse else float("nan")
print("final rmse model : ", rmse)
print(final_predict_all_data)



user number  0
Best ARIMA(0, 1, 2) MSE=0.832

user number  1
Best ARIMA(0, 0, 2) MSE=0.886

user number  2
Best ARIMA(1, 1, 0) MSE=0.028

user number  3
Best ARIMA(0, 0, 1) MSE=0.018

user number  4
Best ARIMA(1, 1, 0) MSE=0.000

user number  5
Best ARIMA(1, 0, 0) MSE=0.003

user number  6
Best ARIMA(0, 1, 2) MSE=3.544

user number  7
Best ARIMA(0, 1, 1) MSE=1.958

user number  8
Best ARIMA(2, 0, 1) MSE=0.423

user number  9
Best ARIMA(3, 0, 1) MSE=0.071

user number  10
Best ARIMA(1, 0, 1) MSE=20.656

user number  11
Best ARIMA(4, 1, 1) MSE=0.014

user number  12
Best ARIMA(0, 1, 1) MSE=0.093

user number  13
Best ARIMA(0, 1, 1) MSE=1.009

user number  14
Best ARIMA(1, 0, 1) MSE=1.979

user number  15
Best ARIMA(0, 1, 1) MSE=1.425

user number  16
Best ARIMA(2, 1, 1) MSE=0.000

user number  17
Best ARIMA(2, 1, 1) MSE=0.000

user number  18
Best ARIMA(2, 0, 1) MSE=0.118

user number  19
Best ARIMA(0, 1, 1) MSE=0.492

user number  20
Best ARIMA(4, 1, 1) MSE=0.722

user number  21
Best 

In [9]:
# Turn the per-subscriber forecast lists into one array and flatten it to 1-D
# (row-major: all values of the first subscriber, then the second, ...).
final_predict_all_data_export = np.array(final_predict_all_data).flatten()

In [10]:
# Flatten the per-subscriber id lists the same way as the forecasts, so that
# position k of the ids lines up with position k of the forecast values.
final_predict_all_data_ids = np.array(final_predict_all_ids).flatten()

In [22]:
# Labels written to the 'day' column, one per forecast step of a subscriber.
days = ['(n+2)22','(n+2)23','(n+2)24','(n+2)25','(n+2)26','(n+2)27','(n+2)28']
# Repeat the labels once per forecasted subscriber. The count is taken from
# the forecast results instead of a hard-coded 94, so the label column always
# has the same length as the id and value columns it is stacked with.
day_list = [days for _ in range(len(final_predict_all_ids))]

final_predict_all_data_days = np.array(day_list)
final_predict_all_data_days = final_predict_all_data_days.flatten()

In [23]:
# Uninitialised 658 x 3 float buffer (658 = 94 * 7, matching the label count
# built for the export). NOTE(review): not referenced by any other visible
# cell -- confirm it is still needed.
dataset = np.empty(shape=(658, 3), dtype=float)

In [24]:
# Two-row array: day labels on row 0, subscriber ids on row 1.
b = np.concatenate(
    (np.atleast_2d(final_predict_all_data_days), np.atleast_2d(final_predict_all_data_ids)),
    axis=0,
)

In [25]:
# Append the forecast values as a further row below the rows already in b.
# NumPy promotes the rows to one common dtype when concatenating.
c = np.concatenate(
    (np.atleast_2d(b), np.atleast_2d(final_predict_all_data_export)),
    axis=0,
)

In [26]:
# Swap the axes of c so each former row becomes a column.
finalized = np.transpose(c)

In [27]:
# Column headers of the exported file, in the column order of `finalized`.
cols = ['day','subscriber_id','data_usage_volume']
# Build the frame first, then write it without the pandas index column.
submission = pd.DataFrame(data=finalized, columns=cols)
submission.to_csv("AiOrenda-mci-traffic-week3.csv", index=False)