In [82]:
# Prophet with seperate models for seg1 and seg2 data

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from sklearn.preprocessing import Imputer,LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as sm

In [7]:
# Importing dataset to a dataframe
df = pd.read_csv('train_file.csv')

In [8]:
# Checking if application_date, segment columns have any null values
null_columns=df.columns[df.isnull().any()]
null_columns

Index(['branch_id', 'zone'], dtype='object')

In [9]:
# Changing 'application date' to date format
df['application_date'] = df['application_date'].apply(pd.to_datetime)

In [10]:
# Creating new dataframe df_processing removing unwanted columns. We are removing these because test file doesn't have it
df_processing = df[['application_date','segment','case_count']]

In [11]:
df_processing.head(2)

Unnamed: 0,application_date,segment,case_count
0,2017-04-01,1,40.0
1,2017-04-03,1,5.0


In [12]:
# Verifying dtypes are as expected
df_processing.dtypes

application_date    datetime64[ns]
segment                      int64
case_count                 float64
dtype: object

In [13]:
# Sorting based on application_date
df_processing = df_processing.sort_values('application_date')
df_processing.head(3)

Unnamed: 0,application_date,segment,case_count
0,2017-04-01,1,40.0
20150,2017-04-01,1,2.0
20956,2017-04-01,1,6.0


In [14]:
# Summing up case_count based on date and segment
df_processing_copy = df_processing.groupby(['application_date','segment'],as_index = False).sum()

In [15]:
df_processing_copy.head(3)

Unnamed: 0,application_date,segment,case_count
0,2017-04-01,1,299.0
1,2017-04-01,2,897.0
2,2017-04-02,2,605.0


In [16]:
# Seperating out data based on segments. We will merge the data after test-file prediction before submission
df_seg1 = df_processing_copy[df_processing_copy['segment'] == 1]
df_seg2 = df_processing_copy[df_processing_copy['segment'] == 2]
print('Seg 1 count: ',df_seg1['application_date'].count())
print('Seg 2 count: ',df_seg2['application_date'].count())

Seg 1 count:  806
Seg 2 count:  844


In [17]:
# Preparing train and validation datasets for both segments
X_train_seg1 = df_seg1[:601]
X_train_seg2 = df_seg2[:601]
X_val_seg1 = df_seg1[601:]
X_val_seg2 = df_seg2[601:]

In [18]:
# Ensuring there is no data leak after transformations
print('Original row count before grouping: ',df['application_date'].count())
print('Row count after removing unwanted columns: ',df_processing['application_date'].count())
print('Row count after grouping & sum based on date and segment: ',df_processing_copy['application_date'].count())
print('X_train seg1 count: ',X_train_seg1.shape[0])
print('X_test seg1 count: ', X_val_seg1.shape[0])
print('X_train seg2 count: ',X_train_seg2.shape[0])
print('X_test seg2 count: ', X_val_seg2.shape[0])

Original row count before grouping:  80402
Row count after removing unwanted columns:  80402
Row count after grouping & sum based on date and segment:  1650
X_train seg1 count:  601
X_test seg1 count:  205
X_train seg2 count:  601
X_test seg2 count:  243


In [19]:
# We are using prophet for prediction. Using 2 instances of model because we are giving only date as input feature
from fbprophet import Prophet
model_seg1 = Prophet()
model_seg2 = Prophet()

In [20]:
# Removing 'Segment column' and retaining only 'application_date' & 'case_count'. We are trying to predict 'case_count'
# based on 'application_date'
X_train_seg1 = X_train_seg1[['application_date','case_count']]
X_val_seg1 = X_val_seg1[['application_date','case_count']]
X_train_seg2 = X_train_seg2[['application_date','case_count']]
X_val_seg2 = X_val_seg2[['application_date','case_count']]

In [21]:
# Prophet needs independent variable column named as 'ds' and target variable column named as 'y'. Renaming logic below.
X_train_seg1.rename(columns={'case_count': 'y', 'application_date': 'ds'}, inplace=True)
X_val_seg1.rename(columns={'case_count': 'y', 'application_date': 'ds'}, inplace=True)
X_train_seg2.rename(columns={'case_count': 'y', 'application_date': 'ds'}, inplace=True)
X_val_seg2.rename(columns={'case_count': 'y', 'application_date': 'ds'}, inplace=True)

In [22]:
X_train_seg1.head(2)

Unnamed: 0,ds,y
0,2017-04-01,299.0
3,2017-04-03,42.0


In [23]:
# Fitting both the models using seg1 and seg 2 data
model_seg1.fit(X_train_seg1)
model_seg2.fit(X_train_seg2)

INFO:numexpr.utils:NumExpr defaulting to 8 threads.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


<fbprophet.forecaster.Prophet at 0x1d76240f748>

In [24]:
#Predicting on validation data for both seg1 and seg2. Made history false to avoid train data from getting predicted again
temp_val_seg1 = model_seg1.make_future_dataframe(periods=len(X_val_seg1),include_history=False)
X_val_seg1_pred = model_seg1.predict(temp_val_seg1)
temp_val_seg2 = model_seg2.make_future_dataframe(periods=len(X_val_seg2),include_history=False)
X_val_seg2_pred = model_seg1.predict(temp_val_seg2)

In [25]:
X_val_seg1_pred.head(2)

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2018-12-13,3924.883151,1241.730741,6835.291391,3924.883151,3924.883151,59.008427,59.008427,59.008427,59.008427,59.008427,59.008427,0.0,0.0,0.0,3983.891578
1,2018-12-14,3930.085217,1275.726169,6911.341749,3930.085217,3930.085217,39.752349,39.752349,39.752349,39.752349,39.752349,39.752349,0.0,0.0,0.0,3969.837565


In [65]:
y_pred_seg1 = X_val_seg1_pred['yhat'].astype(int).to_numpy()
y_val_seg1  = df_seg1[601:]['case_count'].to_numpy()
y_pred_seg2 = X_val_seg2_pred['yhat'].astype(int).to_numpy()
y_val_seg2  = df_seg2[601:]['case_count'].to_numpy()
#from sklearn import metrics
#from sklearn.metrics import accuracy_score
#print('Accuracy of Seg1 :', metrics.accuracy_score(y_val_seg1, y_pred_seg1))
#print('Accuracy of Seg2 :', metrics.accuracy_score(y_val_seg2, y_pred_seg2))

In [58]:
# Defining MAPE function
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [59]:
print('MAPE for seg1: ',mean_absolute_percentage_error(y_val_seg1,y_pred_seg1))
print('MAPE for seg2: ',mean_absolute_percentage_error(y_val_seg2,y_pred_seg2))

MAPE for seg1:  277.4434929242776
MAPE for seg2:  65.91440718493378


In [66]:
df_test = pd.read_csv('test_file.csv')
df_test.head(1)

Unnamed: 0,id,application_date,segment
0,1,2019-07-06,1


In [67]:
# Changing 'application date' to date format
df_test['application_date'] = df_test['application_date'].apply(pd.to_datetime)
df_test.head(1)           

Unnamed: 0,id,application_date,segment
0,1,2019-07-06,1


In [68]:
# Seperating both segments
df_test_seg1 = df_test[df_test['segment'] == 1]
df_test_seg2 = df_test[df_test['segment'] == 2]

In [70]:
# Keeping only 'application_date' column and sorting based on it 
df_test_seg1_p = df_test_seg1[['application_date']].sort_values('application_date')
df_test_seg2_p = df_test_seg2[['application_date']].sort_values('application_date')

In [71]:
# Changing the name of independent variable to 'ds'
df_test_seg1_p.rename(columns={'application_date': 'ds'}, inplace=True)
df_test_seg2_p.rename(columns={'application_date': 'ds'}, inplace=True)

In [72]:
X_test_pred_seg1 = model_seg1.predict(df_test_seg1_p)
X_test_pred_seg2 = model_seg2.predict(df_test_seg2_p)

In [74]:
X_test_pred_seg2.head(2)

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2019-07-24,24377.943027,13873.696906,36829.686282,23913.696415,24864.437964,976.015854,976.015854,976.015854,976.015854,976.015854,976.015854,0.0,0.0,0.0,25353.958882
1,2019-07-25,24394.69377,14493.196013,36542.243523,23927.298374,24884.255037,1558.91945,1558.91945,1558.91945,1558.91945,1558.91945,1558.91945,0.0,0.0,0.0,25953.613219


In [75]:
# Selecting only 'ds' & 'yhat' columns and renaming them back
X_test_pred_seg1 = X_test_pred_seg1[['ds','yhat']]
X_test_pred_seg1.rename(columns={'ds':'application_date','yhat': 'case_count'}, inplace=True)
X_test_pred_seg2 = X_test_pred_seg2[['ds','yhat']]
X_test_pred_seg2.rename(columns={'ds':'application_date','yhat': 'case_count'}, inplace=True)

In [76]:
# Merging the predicted case_count columns(yhat) with remaining columns from original test file 
test_seg1_final = pd.merge(df_test_seg1, X_test_pred_seg1, on='application_date')
test_seg2_final = pd.merge(df_test_seg2, X_test_pred_seg2, on='application_date')

In [77]:
# Concatenating both segments to prepare submission file
test_final_pred = pd.concat([test_seg1_final, test_seg2_final], ignore_index=True, sort =False)

In [80]:
# Changing the case_count from float to int
test_final_pred['case_count'] = test_final_pred['case_count'].astype(int)

In [81]:
# Exporting the dataframe to CSV file which will be submitted
pd.DataFrame(test_final_pred).to_csv("file3.csv",index=False)