In [1]:
from utils import *
pd.set_option('max_rows', 1000)
pd.set_option('max_columns', 1000)
warnings.filterwarnings('ignore')
%matplotlib inline
jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)
pd.plotting.register_matplotlib_converters()

In [2]:
# Load the competition files. application_date is parsed to datetime for the
# train and test frames, which are inspected below.
train = pd.read_csv("../input/train_fwYjLYX.csv", parse_dates=['application_date'])
test = pd.read_csv("../input/test_1eLl9Yf.csv", parse_dates=['application_date'])
sample_submission = pd.read_csv("../input/sample_submission_IIzFVsf.csv")

# Shape of each input frame.
for label, frame in (("train", train), ("test", test), ("sample submission", sample_submission)):
    print(f"{label} shape: {frame.shape}")
print("\n")

# Overall date coverage of train and test.
for label, frame in (("train", train), ("test", test)):
    dates = frame['application_date']
    print(f"{label} date min: {dates.min()}")
    print(f"{label} date max: {dates.max()}")
print("\n")

# Date coverage per segment in train ("segment1" label spelling is kept as is).
for seg_id in (1, 2):
    dates = train.query(f'segment == {seg_id}')['application_date']
    print(f"train date min (segment{seg_id}): {dates.min()}")
    print(f"train date max (segment{seg_id}): {dates.max()}")
print("\n")

# Date coverage per segment in test ("segment 1" label spelling is kept as is).
for seg_id in (1, 2):
    dates = test.query(f'segment == {seg_id}')['application_date']
    print(f"test date min (segment {seg_id}): {dates.min()}")
    print(f"test date max (segment {seg_id}): {dates.max()}")

INFO:numexpr.utils:Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


train shape: (80402, 6)
test shape: (180, 3)
sample submission shape: (180, 4)


train date min: 2017-04-01 00:00:00
train date max: 2019-07-23 00:00:00
test date min: 2019-07-06 00:00:00
test date max: 2019-10-24 00:00:00


train date min (segment1): 2017-04-01 00:00:00
train date max (segment1): 2019-07-05 00:00:00
train date min (segment2): 2017-04-01 00:00:00
train date max (segment2): 2019-07-23 00:00:00


test date min (segment 1): 2019-07-06 00:00:00
test date max (segment 1): 2019-09-30 00:00:00
test date min (segment 2): 2019-07-24 00:00:00
test date max (segment 2): 2019-10-24 00:00:00


In [3]:
# Daily total of case_count for segment 2, laid out as ds / y columns.
seg2 = (
    train.query('segment == 2')
    .groupby(['application_date'])['case_count']
    .sum()
    .reset_index()
    .rename(columns={'application_date': 'ds', 'case_count': 'y'})
)
# The target is modelled on the log1p scale; it is mapped back with expm1
# after prediction.
seg2['y'] = np.log1p(seg2['y'])
print(seg2.shape)

(844, 2)


In [4]:
# Calendar attributes of ds, stored as categoricals so that get_dummies
# expands each of them into one indicator column per observed value.
seg2 = seg2.assign(
    day=lambda d: pd.Categorical(d["ds"].dt.day),
    month=lambda d: pd.Categorical(d["ds"].dt.month),
    quarter=lambda d: pd.Categorical((d["ds"].dt.month - 1) // 3),  # zero-based quarter index
    weekday=lambda d: pd.Categorical(d["ds"].dt.weekday),
)
onehot = pd.get_dummies(seg2[["day", "month", "quarter", "weekday"]])
# Append the indicator columns to the right of the existing ones.
seg2 = pd.concat([seg2, onehot], axis=1)

In [5]:
# Holiday table from the project scraper, with the 'Buddha Purnima/Vesak'
# rows removed.
holiday_df = scrape_national_holidays()
holiday_df = holiday_df.loc[holiday_df['holiday'].ne('Buddha Purnima/Vesak')]

In [6]:
# Segment 2 model configuration, handed to the project helper
# build_aggregated_model together with the holiday table.
# NOTE(review): the 'seasonal' tuples look like
# (period, fourier order, prior scale, mode) - confirm against
# build_aggregated_model.
params = {
    'holiday': True,
    'holidays_prior_scale': 0.01,
    'changepoint_prior_scale': 0.03,
    'seasonal': {
        'monthly': (30, 1, 0.1, 'additive'),
        'weekly': (7, 1, 0.55, 'additive'),
    }
}
m = build_aggregated_model(params = params, verbose = True, holiday_df=holiday_df)

# Register the one-hot calendar columns as additive extra regressors.
# Dispatch is on the column prefix, and 'weekday_' is checked explicitly:
# a substring test for 'day' also matches 'weekday_N', which previously sent
# the weekday indicators down the day-of-month path (prior_scale 0.1) and left
# the weekday branch (prior_scale 1) unreachable.
# The raw 'day'/'month'/'quarter'/'weekday' columns and the 'month_*' /
# 'quarter_*' indicators match neither prefix and are not registered.
for col in seg2.columns:
    if col.startswith('weekday_'):
        m.add_regressor(col, prior_scale = 1, mode = 'additive')
    elif col.startswith('day_'):
        m.add_regressor(col, prior_scale = 0.1, mode = 'additive')

using monthly seasonality
using weekly seasonality


In [7]:
# Forecast frame for segment 2: the test dates renamed to ds, plus the same
# calendar one-hot columns that the training frame carries.
seg2_test = test.query('segment == 2')[['application_date']].rename(columns = {'application_date': 'ds'})
# The category sets are pinned explicitly so get_dummies always emits the full
# day_1..day_31 / month_1..month_12 / quarter_0..quarter_3 / weekday_0..weekday_6
# column set. Without this the columns depend on which values happen to occur
# in the forecast window, and a short window would lack an indicator column
# that was registered as a regressor at fit time.
seg2_test["day"] = pd.Categorical(seg2_test["ds"].dt.day, categories=list(range(1, 32)))
seg2_test["month"] = pd.Categorical(seg2_test["ds"].dt.month, categories=list(range(1, 13)))
seg2_test["quarter"] = pd.Categorical((seg2_test["ds"].dt.month - 1) // 3, categories=list(range(4)))
seg2_test["weekday"] = pd.Categorical(seg2_test["ds"].dt.weekday, categories=list(range(7)))
onehot = pd.get_dummies(seg2_test[["day", "month", "quarter", "weekday"]])
seg2_test = pd.concat([seg2_test, onehot], axis = 1)

In [8]:
# Fit on the segment 2 history and forecast the segment 2 test dates.
m.fit(seg2)
fc_seg2 = m.predict(seg2_test)
# Undo the log1p applied to the training target, then bound the forecast
# to the interval [0, its own maximum].
fc_seg2['yhat'] = np.expm1(fc_seg2['yhat']).pipe(lambda s: np.clip(s, 0., s.max()))

In [9]:
# Daily total of case_count for segment 1, laid out as ds / y columns.
# Unlike segment 2, y stays on its original scale here.
seg1 = (
    train.query('segment == 1')
    .groupby(['application_date'])['case_count']
    .sum()
    .reset_index()
    .rename(columns={'application_date': 'ds', 'case_count': 'y'})
)
print(seg1.shape)

(806, 2)


In [10]:
# Calendar attributes of ds as categoricals, followed by their one-hot
# expansion appended as extra columns.
seg1 = seg1.assign(
    day=pd.Categorical(seg1["ds"].dt.day),
    month=pd.Categorical(seg1["ds"].dt.month),
    quarter=pd.Categorical((seg1["ds"].dt.month - 1) // 3),  # zero-based quarter index
    weekday=pd.Categorical(seg1["ds"].dt.weekday),
)
onehot = pd.get_dummies(seg1[["day", "month", "quarter", "weekday"]])
seg1 = pd.concat([seg1, onehot], axis=1)

In [11]:
# Segment 1 model configuration: holidays enabled and a single weekly
# seasonal component. No extra regressors are registered for this model.
# NOTE(review): the 'seasonal' tuple looks like
# (period, fourier order, prior scale, mode) - confirm against
# build_aggregated_model.
params = dict(
    holiday=True,
    holidays_prior_scale=0.1,
    changepoint_prior_scale=0.6,
    seasonal=dict(
        weekly=(7, 5, 0.7, 'additive'),
    ),
)
m = build_aggregated_model(params=params, verbose=True, holiday_df=holiday_df)

using weekly seasonality


In [12]:
# Forecast frame for segment 1: the test dates renamed to ds, the calendar
# categoricals, and their one-hot expansion.
seg1_test = (
    test.query('segment == 1')[['application_date']]
    .rename(columns={'application_date': 'ds'})
    .assign(
        day=lambda d: pd.Categorical(d["ds"].dt.day),
        month=lambda d: pd.Categorical(d["ds"].dt.month),
        quarter=lambda d: pd.Categorical((d["ds"].dt.month - 1) // 3),
        weekday=lambda d: pd.Categorical(d["ds"].dt.weekday),
    )
)
onehot = pd.get_dummies(seg1_test[["day", "month", "quarter", "weekday"]])
seg1_test = pd.concat([seg1_test, onehot], axis=1)

In [13]:
# Fit on the segment 1 history and forecast the segment 1 test dates.
m.fit(seg1)
fc_seg1 = m.predict(seg1_test)
# y was not log-transformed for this segment, so the forecast is only
# bounded to the interval [0, its own maximum].
fc_seg1['yhat'] = fc_seg1['yhat'].pipe(lambda s: np.clip(s, 0., s.max()))

In [14]:
# Keep only the date and point forecast of each segment, tag the rows with
# their segment id, and stack the two forecasts.
# .assign returns a new frame, so the segment column is never written onto a
# column selection of the forecast frame (the pattern that triggers
# SettingWithCopyWarning).
fc_seg1 = fc_seg1[["ds", "yhat"]].assign(segment=1)
fc_seg2 = fc_seg2[["ds", "yhat"]].assign(segment=2)
fc = pd.concat([fc_seg1, fc_seg2])

In [15]:
# Switch from the ds / yhat names to the column names used by the
# submission file.
fc = fc.rename(columns={'ds': 'application_date', 'yhat': 'case_count'})

In [16]:
# Re-read the submission template with parsed dates, discard its case_count
# column, and attach the forecasts by (segment, application_date).
# The left join keeps every template row; rows without a matching forecast
# end up with a missing case_count.
sample_submission = (
    pd.read_csv("../input/sample_submission_IIzFVsf.csv", parse_dates=['application_date'])
    .drop(columns="case_count")
    .merge(fc, how='left', on=['segment', 'application_date'])
)

In [18]:
# Write the completed submission without the DataFrame index column.
sample_submission.to_csv(path_or_buf="../output/submission3.csv", index=False)