In [None]:
# Prophet with seg1 data only used for model fitting

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from sklearn.preprocessing import Imputer,LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as sm

In [2]:
# Importing dataset to a dataframe
df = pd.read_csv('train_file.csv')

In [3]:
# Checking if application_date, segment columns have any null values
null_columns=df.columns[df.isnull().any()]
null_columns

Index(['branch_id', 'zone'], dtype='object')

In [4]:
# Changing 'application date' to date format
df['application_date'] = df['application_date'].apply(pd.to_datetime)

In [5]:
# Creating new dataframe df_processing removing unwanted columns. We are removing these because test file doesn't have it
df_processing = df[['application_date','segment','case_count']]

In [6]:
# Verifying dtypes are as expected
df_processing.dtypes

application_date    datetime64[ns]
segment                      int64
case_count                 float64
dtype: object

In [7]:
# Sorting based on application_date
df_processing = df_processing.sort_values('application_date')

In [8]:
# Summing up case_count based on date and segment
df_processing_copy = df_processing.groupby(['application_date','segment'],as_index = False).sum()

In [9]:
# Seperating out data based on segments. We will merge the data after test-file prediction before submission
df_seg1 = df_processing_copy[df_processing_copy['segment'] == 1]
df_seg2 = df_processing_copy[df_processing_copy['segment'] == 2]

In [10]:
df_seg1['application_date'] = df_seg1['application_date'].apply(pd.to_datetime)
df_seg2['application_date'] = df_seg2['application_date'].apply(pd.to_datetime)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
X_seg1 = df_seg1
X_seg2 = df_seg2

In [12]:
# Preparing train and validation datasets for both segments
X_train_seg1 = X_seg1[:601]
X_train_seg2 = X_seg2[:601]
X_test_seg1 = X_seg1[601:]
X_test_seg2 = X_seg2[601:]

In [13]:
# Ensuring there is no data leak after transformation
print('Original row count before grouping: ',df['application_date'].count())
print('Row count after removing unwanted columns: ',df_processing['application_date'].count())
print('Row count after grouping & sum based on date and segment: ',df_processing_copy['application_date'].count())
print('X_train seg1 count: ',X_train_seg1.shape[0])
print('X_test seg1 count: ', X_test_seg1.shape[0])
print('X_train seg2 count: ',X_train_seg2.shape[0])
print('X_test seg2 count: ', X_test_seg2.shape[0])

Original row count before grouping:  80402
Row count after removing unwanted columns:  80402
Row count after grouping & sum based on date and segment:  1650
X_train seg1 count:  601
X_test seg1 count:  205
X_train seg2 count:  601
X_test seg2 count:  243


In [14]:
# We are using prophet for prediction.
from fbprophet import Prophet
model = Prophet()

ERROR:fbprophet:Importing plotly failed. Interactive plots will not work.


In [15]:
# Removing 'Segment column' and retaining only 'application_date' & 'case_count'. We are trying to predict 'case_count'
# based on 'application_date'
X_train_seg1 = X_train_seg1[['application_date','case_count']]
X_test_seg1 = X_test_seg1[['application_date','case_count']]

In [16]:
# Prophet needs independent variable column named as 'ds' and target variable column named as 'y'. Renaming logic below.
X_train_seg1.rename(columns={'case_count': 'y', 'application_date': 'ds'}, inplace=True)
X_test_seg1.rename(columns={'case_count': 'y', 'application_date': 'ds'}, inplace=True)

In [17]:
# Fitting the model using seg1 data alone
model.fit(X_train_seg1)

INFO:numexpr.utils:NumExpr defaulting to 8 threads.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


<fbprophet.forecaster.Prophet at 0x1dc551d5888>

In [18]:
#Predicting on validation data for seg 1
case_count_pred = model.make_future_dataframe(periods=len(X_test_seg1),include_history=False)
X_test_pred = model.predict(case_count_pred)

In [21]:
y_pred_seg1 = X_test_pred['yhat'].astype(int).to_numpy()
y_val_seg1  = df_seg1[601:]['case_count'].to_numpy()

In [22]:
#print('Accuracy of Seg1 :', metrics.accuracy_score(y_val_seg1, y_pred_seg1))
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [23]:
mean_absolute_percentage_error(y_val_seg1,y_pred_seg1)

277.4434929242776

In [24]:
df_test = pd.read_csv('test_file.csv')
df_test.head(1)

Unnamed: 0,id,application_date,segment
0,1,2019-07-06,1


In [25]:
# Changing 'application date' to date format
df_test['application_date'] = df_test['application_date'].apply(pd.to_datetime)
df_test.head(1)           

Unnamed: 0,id,application_date,segment
0,1,2019-07-06,1


In [26]:
# Seperating both segments
df_test_seg1 = df_test[df_test['segment'] == 1]
df_test_seg2 = df_test[df_test['segment'] == 2]

In [28]:
# Keeping only 'application_date' column and sorting based on it 
df_test_seg1_p = df_test_seg1[['application_date']].sort_values('application_date')
df_test_seg2_p = df_test_seg2[['application_date']].sort_values('application_date')

In [29]:
# Changing the name of independent variable to 'ds'
df_test_seg1_p.rename(columns={'application_date': 'ds'}, inplace=True)
df_test_seg2_p.rename(columns={'application_date': 'ds'}, inplace=True)

In [30]:
X_test_pred_seg1 = model.predict(df_test_seg1_p)
X_test_pred_seg2 = model.predict(df_test_seg2_p)

In [31]:
X_test_pred_seg1.head(2)

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2019-07-06,4991.306528,2161.242821,7593.607679,4934.186014,5043.721561,-80.557046,-80.557046,-80.557046,-80.557046,-80.557046,-80.557046,0.0,0.0,0.0,4910.749482
1,2019-07-07,4996.508594,1313.166815,6676.963981,4939.145683,5049.372791,-1067.950724,-1067.950724,-1067.950724,-1067.950724,-1067.950724,-1067.950724,0.0,0.0,0.0,3928.55787


In [32]:
# Selecting only 'ds' & 'yhat' columns and renaming them back
X_test_pred_seg1 = X_test_pred_seg1[['ds','yhat']]
X_test_pred_seg1.rename(columns={'ds':'application_date','yhat': 'case_count'}, inplace=True)
X_test_pred_seg2 = X_test_pred_seg2[['ds','yhat']]
X_test_pred_seg2.rename(columns={'ds':'application_date','yhat': 'case_count'}, inplace=True)

In [33]:
# Merging the predicted case_count columns(yhat) with remaining columns from original test file 
test_seg1_final = pd.merge(df_test_seg1, X_test_pred_seg1, on='application_date')
test_seg2_final = pd.merge(df_test_seg2, X_test_pred_seg2, on='application_date')

In [34]:
# Concatenating both segments to prepare submission file
test_final_pred = pd.concat([test_seg1_final, test_seg2_final], ignore_index=True, sort =False)

In [35]:
# Changing the case_count from float to int
test_final_pred['case_count'] = test_final_pred['case_count'].astype(int)

In [36]:
# Exporting the dataframe to CSV file which will be submitted
pd.DataFrame(test_final_pred).to_csv("file3.csv",index=False)