In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from sklearn.preprocessing import Imputer,LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as sm

In [2]:
# Load the raw training data; the path is relative to the notebook's working directory.
df = pd.read_csv('train_file.csv')

In [4]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,application_date,segment,branch_id,state,zone,case_count
0,2017-04-01,1,1.0,WEST BENGAL,EAST,40.0
1,2017-04-03,1,1.0,WEST BENGAL,EAST,5.0
2,2017-04-04,1,1.0,WEST BENGAL,EAST,4.0
3,2017-04-05,1,1.0,WEST BENGAL,EAST,113.0
4,2017-04-07,1,1.0,WEST BENGAL,EAST,76.0


In [8]:
# Collect the labels of every column in the raw frame that holds at least one
# missing value (the per-column check runs down the rows, i.e. axis=0).
null_columns = df.columns[df.isnull().any(axis=0)]
null_columns

Index(['branch_id', 'zone'], dtype='object')

In [11]:
# Count missing values in application_date.
df['application_date'].isnull().sum()

0

In [12]:
# Count missing values in segment.
df['segment'].isnull().sum()

0

In [14]:
# Inspect the dtype of every column before any conversion.
df.dtypes

application_date     object
segment               int64
branch_id           float64
state                object
zone                 object
case_count          float64
dtype: object

In [18]:
# Parse application_date into a datetime column.
# pd.to_datetime is called once on the whole Series (vectorized) instead of
# once per row through .apply, which yields the same values far more cheaply.
df['application_date'] = pd.to_datetime(df['application_date'])

In [19]:
# Confirm that application_date now has a datetime dtype.
df.dtypes

application_date    datetime64[ns]
segment                      int64
branch_id                  float64
state                       object
zone                        object
case_count                 float64
dtype: object

In [20]:
# Preview the frame again after the date conversion.
df.head()

Unnamed: 0,application_date,segment,branch_id,state,zone,case_count
0,2017-04-01,1,1.0,WEST BENGAL,EAST,40.0
1,2017-04-03,1,1.0,WEST BENGAL,EAST,5.0
2,2017-04-04,1,1.0,WEST BENGAL,EAST,4.0
3,2017-04-05,1,1.0,WEST BENGAL,EAST,113.0
4,2017-04-07,1,1.0,WEST BENGAL,EAST,76.0


In [95]:
# Keep only the columns used for modelling: the date, the segment and the target.
df_processing = df.loc[:, ['application_date', 'segment', 'case_count']]

In [96]:
# Peek at the first two rows of the reduced frame.
df_processing.head(2)

Unnamed: 0,application_date,segment,case_count
0,2017-04-01,1,40.0
1,2017-04-03,1,5.0


In [97]:
# Check the dtypes of the reduced frame.
df_processing.dtypes

application_date    datetime64[ns]
segment                      int64
case_count                 float64
dtype: object

In [98]:
# Re-run the missing-value check on the reduced frame: collect the labels of
# the columns that still contain at least one null.
null_columns = df_processing.columns[df_processing.isnull().any(axis=0)]
null_columns

Index([], dtype='object')

In [99]:
# Order the rows chronologically, then show the three earliest entries.
df_processing = df_processing.sort_values(by='application_date')
df_processing.head(n=3)

Unnamed: 0,application_date,segment,case_count
0,2017-04-01,1,40.0
20150,2017-04-01,1,2.0
20956,2017-04-01,1,6.0


In [100]:
# Total case_count for every (application_date, segment) pair.
# as_index=False keeps the grouping keys as ordinary columns in the result.
df_processing_copy = (
    df_processing
    .groupby(['application_date', 'segment'], as_index=False)
    .sum()
)

In [101]:
# Preview the aggregated totals.
df_processing_copy.head(3)

Unnamed: 0,application_date,segment,case_count
0,2017-04-01,1,299.0
1,2017-04-01,2,897.0
2,2017-04-02,2,605.0


In [102]:
import datetime as dt

# Replace each application_date timestamp with its proleptic Gregorian ordinal
# (an integer day number) so the model receives a purely numeric feature.
# The dtype guard makes this cell safe to re-run: after the first run the
# column holds integers, and calling toordinal on them again would raise.
if pd.api.types.is_datetime64_any_dtype(df_processing_copy['application_date']):
    df_processing_copy['application_date'] = df_processing_copy['application_date'].map(dt.datetime.toordinal)

In [104]:
# Check that the aggregated object is still a DataFrame.
type(df_processing_copy)

pandas.core.frame.DataFrame

In [105]:
# Preview the frame now that application_date holds integer ordinals.
df_processing_copy.head(3)

Unnamed: 0,application_date,segment,case_count
0,736420,1,299.0
1,736420,2,897.0
2,736421,2,605.0


In [106]:
# Feature matrix: the date ordinal and the segment id.
# Columns are selected by name rather than by position so that a change in
# column order (or an added column) cannot silently alter the feature set.
X = df_processing_copy[['application_date', 'segment']].values
X

array([[736420,      1],
       [736420,      2],
       [736421,      2],
       ...,
       [737261,      2],
       [737262,      2],
       [737263,      2]], dtype=int64)

In [107]:
# Target vector: the summed case_count per (date, segment) row.
# Selected by name rather than "last column" so the target cannot silently
# change if the frame's column order changes.
y = df_processing_copy['case_count'].values
y

array([  299.,   897.,   605., ..., 12267., 27292., 29183.])

In [108]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the split is repeatable.
# NOTE(review): train_test_split shuffles by default, so held-out dates are interleaved
# with training dates. If the goal is forecasting future dates, a chronological split
# would presumably give a more honest error estimate — confirm the intent.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [109]:
# Report the row count at each preparation stage and the size of each split.
# Every entry is rendered as "<label> <count>" on its own line.
print(*(
    '{} {}'.format(label, count)
    for label, count in (
        ('Original row count before grouping: ', df['application_date'].count()),
        ('Row count after removing unwanted columns: ', df_processing['application_date'].count()),
        ('Row count after grouping & sum based on date and segment: ', df_processing_copy['application_date'].count()),
        ('X_train count: ', X_train.shape[0]),
        ('y_train count: ', y_train.shape[0]),
        ('X_test count: ', X_test.shape[0]),
        ('y_test count: ', y_test.shape[0]),
    )
), sep='\n')

Original row count before grouping:  80402
Row count after removing unwanted columns:  80402
Row count after grouping & sum based on date and segment:  1650
X_train count:  1320
y_train count:  1320
X_test count:  330
y_test count:  330


In [110]:
from sklearn.tree import DecisionTreeRegressor

In [111]:
# Regression tree with library-default hyperparameters; random_state is fixed so repeated fits match.
decision_tree_reg = DecisionTreeRegressor(random_state=0)

In [112]:
# Fit the tree on the training split.
decision_tree_reg.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

In [114]:
# Predict case counts for the held-out split.
y_pred = decision_tree_reg.predict(X_test)

In [116]:
# Show the first ten predictions.
y_pred[:10]

array([ 1586., 24401.,  6302.,  3154.,  3944., 18900., 22072., 31937.,
        3493.,  7667.])

In [117]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100`` over the entries
    whose actual value is non-zero. The percentage error is undefined where
    ``y_true == 0`` (division by zero), so those entries are skipped instead
    of turning the whole result into ``inf``/``nan``.

    Parameters
    ----------
    y_true : array-like
        Actual values.
    y_pred : array-like
        Predicted values; must be broadcastable against ``y_true``.

    Returns
    -------
    float
        The MAPE in percent, or ``nan`` when no entry of ``y_true`` is
        non-zero (including empty input).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Broadcast first so a scalar (or otherwise broadcastable) prediction can
    # be masked with the same boolean index as the targets.
    y_true, y_pred = np.broadcast_arrays(y_true, y_pred)

    valid = y_true != 0
    if not valid.any():
        return np.nan

    relative_error = (y_true[valid] - y_pred[valid]) / y_true[valid]
    return np.mean(np.abs(relative_error)) * 100

In [123]:
# Score the held-out predictions with MAPE (in percent).
# NOTE(review): the recorded value (~707%) is far larger than the first few actual/predicted
# pairs suggest; presumably a handful of very small actual values dominate the mean — verify.
mean_absolute_percentage_error(y_test,y_pred)

707.2351986403431

In [126]:
# First eight actual values, for side-by-side comparison with the predictions.
y_test[:8]

array([ 1502., 26953.,  1445.,  3443.,  3058., 15801., 20863., 30480.])

In [127]:
# First eight predicted values, matching the actual values shown above.
y_pred[:8]

array([ 1586., 24401.,  6302.,  3154.,  3944., 18900., 22072., 31937.])