In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from prophet import Prophet
from tqdm.notebook import tqdm, trange
import datetime
from multiprocessing import Pool, cpu_count
from hyperopt import STATUS_OK, hp, pyll, tpe, fmin, Trials
import csv
import functools

### EDA

In [None]:
# Load the training table from train.csv in the working directory.
# Later cells read a 'Region_Name' column from it and treat the remaining
# columns as the monthly values.
data = pd.read_csv('train.csv')
# Month 1 is Dec 2013, Month 72 is Nov 2019

In [None]:
# Keep the region labels in their own Series so that `data` holds only the
# monthly values. The label column is dropped by name (not by position) so the
# column removed is always the one that was just read into `regions`.
regions = data['Region_Name']
data = data.drop(columns='Region_Name')

In [None]:
# Remove outliers region by region (row by row) with Tukey's fences: a monthly
# value outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of its own row becomes NaN.
#
# The quartiles are computed with DataFrame.quantile, which ignores NaN. With
# np.percentile a row that already contained a missing value produced NaN
# fences, so nothing in that row was ever flagged.
#
# NOTE: run this cell once per load of `data`; running it again recomputes the
# fences on the already-filtered values.
values = data.astype(float)  # NaN markers need a float dtype
q1 = values.quantile(0.25, axis=1)
q3 = values.quantile(0.75, axis=1)
iqr = q3 - q1
upperFence = q3 + 1.5 * iqr
lowerFence = q1 - 1.5 * iqr
# axis=0 aligns the per-row fences with the row index of `values`.
is_outlier = values.gt(upperFence, axis=0) | values.lt(lowerFence, axis=0)
data = values.mask(is_outlier)

In [None]:
# Disabled diagnostic, kept as a string literal so that it does not execute:
# a box plot and a line plot of the monthly values of the single region
# selected by `i`.
'''i = 12
fig, (ax1,ax2) = plt.subplots(1,2)
fig.set_figheight(4.8)
fig.set_figwidth(15)
ax1.boxplot(data.iloc[i])
ax1.set(xlabel=regions[i])
xdata = np.arange(1,73,1)
ax2.plot(xdata,data.iloc[i])
ax2.set(xlabel='Month')
ax2.set(ylabel=regions[i])
plt.show()'''

Note: it is still not feasible to visually check every region for outliers; the plot above inspects only one region at a time.

### Data pre-processing & Training

Prophet requires its input DataFrame to have a `ds` column containing dates (YYYY-MM-DD format) and a `y` column containing the numeric measurements.

### With parallel processing

In [None]:
# Date stamps for the training months, generated at monthly ('M') frequency
# between 2013-12 and 2019-04; they become the `ds` column fed to Prophet.
train_ds = pd.date_range(start='2013-12', end='2019-04', freq='M')

# Placeholder frame at notebook scope.
predicted_data = pd.DataFrame()

# Hold-out split by column position: the first 64 monthly columns are used for
# fitting, everything after them is kept back for validation.
train_data, validate_data = data.iloc[:, :64], data.iloc[:, 64:]

In [None]:
def run_prophet(timeseries, changepoint_prior_scale=0.05, seasonality_mode='additive', seasonality_prior_scale=10, holidays_prior_scale=10, changepoint_range=0.8, periods=8, first_forecast_month=65):
    """Fit a Prophet model to one series and forecast the months that follow it.

    Parameters
    ----------
    timeseries : pandas.DataFrame
        Prophet input frame with a `ds` (date) column and a `y` (value) column.
    changepoint_prior_scale, seasonality_mode, seasonality_prior_scale,
    holidays_prior_scale, changepoint_range
        Passed unchanged to the `Prophet` constructor.
    periods : int, default 8
        Number of monthly steps to forecast beyond the end of `timeseries`.
    first_forecast_month : int, default 65
        Month number used in the label of the first forecast column; the
        following columns count up from it.

    Returns
    -------
    pandas.DataFrame
        A single-row frame (the transposed `yhat` column) with one column per
        forecast month, labelled 'Month <n>'. With the defaults the columns are
        'Month 65' through 'Month 72'.
    """
    m = Prophet(
        changepoint_prior_scale=changepoint_prior_scale,
        seasonality_mode=seasonality_mode,
        seasonality_prior_scale=seasonality_prior_scale,
        holidays_prior_scale=holidays_prior_scale,
        changepoint_range=changepoint_range,
    )
    m.add_country_holidays(country_name='IN')
    m.fit(timeseries)

    # The future frame contains the history followed by `periods` new monthly
    # rows; only those trailing rows are the forecast that is returned.
    future = m.make_future_dataframe(periods=periods, freq='M')
    forecast = m.predict(future)[['yhat']].tail(periods).transpose()
    forecast.columns = [
        f'Month {month}'
        for month in range(first_forecast_month, first_forecast_month + periods)
    ]
    return forecast

In [None]:
# Build one Prophet input frame per region: `ds` holds the training dates and
# `y` that region's monthly training values.
#
# Each row is taken as a whole, by position, with `.to_numpy()` instead of
# element by element with `row[j]`: integer keys on a label-indexed Series rely
# on a deprecated positional fallback in pandas.
timeseries = [
    pd.DataFrame({'ds': train_ds, 'y': train_data.iloc[i].to_numpy()})
    for i in trange(len(train_data))
]

In [None]:
# Disabled one-off run, kept as a string literal so that it does not execute:
# fits `run_prophet` with its default arguments to every frame in `timeseries`
# using a process pool.
'''p = Pool(cpu_count())
forecast = list(tqdm(p.imap(run_prophet, timeseries), total=len(timeseries), desc='Training Dataset'))
p.close()
p.join()'''

In [None]:
# Disabled companion of the disabled pool run (string literal, not executed):
# stacks the per-region frames in `forecast` into one `predicted_data` table.
'''predicted_data = pd.DataFrame()
for i in range(len(forecast)):
    predicted_data = pd.concat([predicted_data,forecast[i]])
predicted_data = predicted_data.reset_index(drop=True)'''

In [None]:
def mape(predicted_data, validate_data):
    """Score forecasts with a summed absolute-percentage-error metric.

    For every region (row), the absolute percentage errors
    ``|actual - predicted| / actual * 100`` of all months (columns) are added
    up. The returned score is the mean of those per-region totals.

    Cells are compared by position, not by label. A cell is left out of the
    total when either value is NaN or when either value is exactly 0.

    Every column is scored, starting with the first one: both frames hold only
    monthly values (there is no leading label column to skip).

    Parameters
    ----------
    predicted_data : pandas.DataFrame
        Forecasts, one row per region and one column per month.
    validate_data : pandas.DataFrame
        Actual values laid out like `predicted_data`. Rows and columns beyond
        the shape of `predicted_data` are ignored.

    Returns
    -------
    float
        Sum of all usable absolute percentage errors divided by the number of
        rows of `predicted_data`.

    Raises
    ------
    IndexError
        If `validate_data` has fewer rows or columns than `predicted_data`.
    ZeroDivisionError
        If `predicted_data` has no rows.
    """
    predicted = np.asarray(predicted_data, dtype=float)
    n_regions, n_months = predicted.shape

    actual = np.asarray(validate_data, dtype=float)[:n_regions, :n_months]
    if actual.shape != predicted.shape:
        raise IndexError(
            f'validate_data covers {actual.shape} cells but predicted_data has shape {predicted.shape}'
        )

    # Only cells where both values are present and non-zero contribute.
    usable = ~(np.isnan(actual) | np.isnan(predicted)) & (actual != 0) & (predicted != 0)
    abs_pct_error = np.abs((actual[usable] - predicted[usable]) / actual[usable]) * 100

    overall_mape_sum = float(abs_pct_error.sum())
    # Plain Python division so that an empty input still raises ZeroDivisionError.
    return overall_mape_sum / n_regions

In [None]:
out_file = 'prophet_trials.csv'

# Trial results are appended to this file, so the header row is written only
# when the file is new or empty; otherwise every re-run of this cell would
# insert another header row in the middle of the log.
# newline='' leaves line-ending handling to the csv module.
with open(out_file, 'a', newline='') as of_connection:
    writer = csv.writer(of_connection)
    # A file opened for appending is positioned at its end, so 0 means empty.
    if of_connection.tell() == 0:
        # Write the headers to the file
        writer.writerow(['iteration', 'loss', 'params'])

In [None]:
def objective(params):
    """Hyperopt objective: fit every region with `params` and return the loss.

    Increments the notebook-level `ITERATION` counter, fits `run_prophet` to
    every frame in `timeseries` in a process pool, scores the stacked forecasts
    against `validate_data` with `mape`, and appends one
    (iteration, loss, params) row to `out_file`.

    Parameters
    ----------
    params : dict
        One sample from the search space, with the keys
        'changepoint_prior_scale', 'seasonality_mode',
        'seasonality_prior_scale', 'holidays_prior_scale' and
        'changepoint_range'.

    Returns
    -------
    dict
        Result dictionary with the keys 'iteration', 'loss', 'params' and
        'status' (always STATUS_OK).
    """
    global ITERATION
    ITERATION += 1

    # No overall "Hyperopt" bar is created here: a bar built inside the
    # objective starts from zero on every evaluation and cannot be closed by
    # the caller. The per-evaluation bar below already shows the iteration.
    prophet_kwargs = {
        name: params[name]
        for name in (
            'changepoint_prior_scale',
            'seasonality_mode',
            'seasonality_prior_scale',
            'holidays_prior_scale',
            'changepoint_range',
        )
    }
    fit_region = functools.partial(run_prophet, **prophet_kwargs)

    # The context manager releases the worker processes even if a fit raises.
    with Pool(cpu_count()) as p:
        forecast = list(tqdm(p.imap(fit_region, timeseries), total=len(timeseries), desc=f'Training Dataset {ITERATION}'))

    # One concat over the whole list instead of growing a frame row by row.
    predicted_data = pd.concat(forecast, ignore_index=True)

    loss = mape(predicted_data, validate_data)

    # newline='' leaves line-ending handling to the csv module.
    with open(out_file, 'a', newline='') as of_connection:
        csv.writer(of_connection).writerow([ITERATION, loss, params])

    return {'iteration':ITERATION,'loss':loss, 'params':params, 'status':STATUS_OK}

In [None]:
def _log_uniform(label, low, high):
    """Log-uniform hyperopt prior for `label` with bounds given on the natural scale."""
    return hp.loguniform(label, np.log(low), np.log(high))


# Search space handed to hyperopt; the keys match the keyword arguments of
# `run_prophet` that are tuned.
space = dict(
    changepoint_prior_scale=_log_uniform('changepoint_prior_scale', 0.001, 0.5),
    seasonality_mode=hp.choice('seasonality_mode', ['additive', 'multiplicative']),
    seasonality_prior_scale=_log_uniform('seasonality_prior_scale', 0.01, 10),
    holidays_prior_scale=_log_uniform('holidays_prior_scale', 0.01, 10),
    changepoint_range=hp.uniform('changepoint_range', 0.80, 0.95),
)
# params = pyll.stochastic.sample(space)

In [None]:
# Suggestion function of hyperopt's `tpe` module, passed to `fmin` as `algo`.
tpe_algorithm = tpe.suggest

In [None]:
# Start with an empty Trials object to record every evaluation of the search.
# Uncomment the %store line to restore a previously stored `bayes_trials`
# instead (the search cell saves it with `%store bayes_trials`).
# %store -r bayes_trials
bayes_trials = Trials()

In [None]:
# Evaluation counter; `objective` increments it on every call.
# (A `global` statement has no effect at module level, so none is needed here.)
ITERATION = 0

In [None]:
MAX_EVALS = 5
best = fmin(fn = objective, space = space, algo = tpe_algorithm, max_evals = MAX_EVALS, trials = bayes_trials, show_progressbar=False)
pbar.close()
%store bayes_trials

In [None]:
# Disabled diagnostic, kept as a string literal so that it does not execute:
# overlays the full series of region `i` with its training values followed by
# the forecasts.
# NOTE(review): at notebook scope `predicted_data` is the empty frame created in
# the split cell; the forecasts assembled inside `objective` are local to that
# function, so this plot needs them to be exposed first.
'''final_data = pd.concat([train_data, predicted_data], axis=1)
i = 9
xdata = np.arange(1,73,1)
plt.plot(xdata,data.iloc[i])
plt.plot(xdata,final_data.iloc[i])
plt.xlabel('Month')
plt.ylabel(regions[i])
plt.show()'''