In [244]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from sklearn.preprocessing import Imputer,LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as sm

In [245]:
df = pd.read_csv('train_file.csv')

In [246]:
df.head()

Unnamed: 0,application_date,segment,branch_id,state,zone,case_count
0,2017-04-01,1,1.0,WEST BENGAL,EAST,40.0
1,2017-04-03,1,1.0,WEST BENGAL,EAST,5.0
2,2017-04-04,1,1.0,WEST BENGAL,EAST,4.0
3,2017-04-05,1,1.0,WEST BENGAL,EAST,113.0
4,2017-04-07,1,1.0,WEST BENGAL,EAST,76.0


In [247]:
null_columns=df.columns[df.isnull().any()]
null_columns

Index(['branch_id', 'zone'], dtype='object')

In [248]:
df['application_date'].isnull().sum()

0

In [249]:
df['segment'].isnull().sum()

0

In [250]:
df.dtypes

application_date     object
segment               int64
branch_id           float64
state                object
zone                 object
case_count          float64
dtype: object

In [251]:
df['application_date'] = df['application_date'].apply(pd.to_datetime)

In [252]:
df.dtypes

application_date    datetime64[ns]
segment                      int64
branch_id                  float64
state                       object
zone                        object
case_count                 float64
dtype: object

In [253]:
df.head()

Unnamed: 0,application_date,segment,branch_id,state,zone,case_count
0,2017-04-01,1,1.0,WEST BENGAL,EAST,40.0
1,2017-04-03,1,1.0,WEST BENGAL,EAST,5.0
2,2017-04-04,1,1.0,WEST BENGAL,EAST,4.0
3,2017-04-05,1,1.0,WEST BENGAL,EAST,113.0
4,2017-04-07,1,1.0,WEST BENGAL,EAST,76.0


In [254]:
df_processing = df[['application_date','segment','case_count']]

In [255]:
df_processing.head(2)

Unnamed: 0,application_date,segment,case_count
0,2017-04-01,1,40.0
1,2017-04-03,1,5.0


In [256]:
df_processing.dtypes

application_date    datetime64[ns]
segment                      int64
case_count                 float64
dtype: object

In [257]:
null_columns=df_processing.columns[df_processing.isnull().any()]
null_columns

Index([], dtype='object')

In [211]:
df_processing = df_processing.sort_values('application_date')
df_processing.head(3)

Unnamed: 0,application_date,segment,case_count
0,2017-04-01,1,40.0
20150,2017-04-01,1,2.0
20956,2017-04-01,1,6.0


In [258]:
df_processing.describe()

Unnamed: 0,segment,case_count
count,80402.0,80402.0
mean,1.167956,185.481841
std,0.37383,749.602925
min,1.0,0.0
25%,1.0,0.0
50%,1.0,18.0
75%,1.0,60.0
max,2.0,13787.0


In [259]:
df_processing_copy = df_processing.groupby(['application_date','segment'],as_index = False).sum()

In [260]:
df_processing_copy.head(3)

Unnamed: 0,application_date,segment,case_count
0,2017-04-01,1,299.0
1,2017-04-01,2,897.0
2,2017-04-02,2,605.0


In [261]:
df_processing_copy.sort_index(by=['segment','application_date'],inplace=True)
df_processing_copy.head(10)

  """Entry point for launching an IPython kernel.


Unnamed: 0,application_date,segment,case_count
0,2017-04-01,1,299.0
3,2017-04-03,1,42.0
5,2017-04-04,1,23.0
7,2017-04-05,1,1530.0
10,2017-04-07,1,1341.0
16,2017-04-12,1,1468.0
18,2017-04-13,1,1340.0
20,2017-04-14,1,1330.0
22,2017-04-15,1,981.0
24,2017-04-16,1,409.0


In [262]:
df_processing_copy.loc[df_processing_copy['application_date'] == '2017-04-07']

Unnamed: 0,application_date,segment,case_count
10,2017-04-07,1,1341.0
11,2017-04-07,2,2249.0


In [263]:
import datetime as dt
df_processing_copy['application_date']=df_processing_copy['application_date'].map(dt.datetime.toordinal)

In [264]:
type(df_processing_copy)

pandas.core.frame.DataFrame

In [265]:
df_processing_copy.head(3)

Unnamed: 0,application_date,segment,case_count
0,736420,1,299.0
3,736422,1,42.0
5,736423,1,23.0


In [266]:
X = df_processing_copy.iloc[:,:-1].values
X

array([[736420,      1],
       [736422,      1],
       [736423,      1],
       ...,
       [737261,      2],
       [737262,      2],
       [737263,      2]], dtype=int64)

In [267]:
y = df_processing_copy.iloc[:,-1].values
y

array([2.9900e+02, 4.2000e+01, 2.3000e+01, ..., 1.2267e+04, 2.7292e+04,
       2.9183e+04])

In [268]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [269]:
X_train = X[:1320]
X_test  = X[1320:]
y_train = y[:1320]
y_test  = y[1320:]

In [270]:
print('Original row count before grouping: ',df['application_date'].count())
print('Row count after removing unwanted columns: ',df_processing['application_date'].count())
print('Row count after grouping & sum based on date and segment: ',df_processing_copy['application_date'].count())
print('X_train count: ',X_train.shape[0])
print('y_train count: ', y_train.shape[0])
print('X_test count: ', X_test.shape[0])
print('y_test count: ', y_test.shape[0])

Original row count before grouping:  80402
Row count after removing unwanted columns:  80402
Row count after grouping & sum based on date and segment:  1650
X_train count:  1320
y_train count:  1320
X_test count:  330
y_test count:  330


In [271]:
from sklearn.tree import DecisionTreeRegressor

In [272]:
decision_tree_reg = DecisionTreeRegressor(random_state=0)

In [273]:
decision_tree_reg.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

In [274]:
y_pred = decision_tree_reg.predict(X_test)

In [275]:
y_pred[:10]

array([15361., 15361., 15361., 15361., 15361., 15361., 15361., 15361.,
       15361., 15361.])

In [277]:
y_pred.shape

(330,)

In [184]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [185]:
mean_absolute_percentage_error(y_test,y_pred)

144.4945211503955

In [186]:
y_test[:8]

array([5981., 2177.,  223., 9130., 2407., 8096., 1590., 4482.])

In [187]:
y_pred[:8]

array([3415., 7627., 3415., 7627., 3415., 7627., 3415., 7627.])

In [233]:
df_test = pd.read_csv('test_file.csv')
df_test.head(4)

Unnamed: 0,id,application_date,segment
0,1,2019-07-06,1
1,2,2019-07-07,1
2,3,2019-07-08,1
3,4,2019-07-09,1


In [234]:
df_test = df_test.groupby(['application_date','segment'],as_index = False).sum()
df_test[15:25]

Unnamed: 0,application_date,segment,id
15,2019-07-21,1,16
16,2019-07-22,1,17
17,2019-07-23,1,18
18,2019-07-24,1,19
19,2019-07-24,2,88
20,2019-07-25,1,20
21,2019-07-25,2,89
22,2019-07-26,1,21
23,2019-07-26,2,90
24,2019-07-27,1,22


In [235]:
df_test['application_date'] = df_test['application_date'].apply(pd.to_datetime)
df_test.head(3)           

Unnamed: 0,application_date,segment,id
0,2019-07-06,1,1
1,2019-07-07,1,2
2,2019-07-08,1,3


In [236]:
df_test['application_date'] = df_test['application_date'].map(dt.datetime.toordinal)
df_test[15:22]

Unnamed: 0,application_date,segment,id
15,737261,1,16
16,737262,1,17
17,737263,1,18
18,737264,1,19
19,737264,2,88
20,737265,1,20
21,737265,2,89


In [237]:
df_test = df_test[['application_date','segment']]
df_test.head(3) 

Unnamed: 0,application_date,segment
0,737246,1
1,737247,1
2,737248,1


In [238]:
X_final = df_test.iloc[:,:].values

In [239]:
type(X_final)

numpy.ndarray

In [240]:
X_final[:10]

array([[737246,      1],
       [737247,      1],
       [737248,      1],
       [737249,      1],
       [737250,      1],
       [737251,      1],
       [737252,      1],
       [737253,      1],
       [737254,      1],
       [737255,      1]], dtype=int64)

In [241]:
y_final_pred = decision_tree_reg.predict(X_final)

In [242]:
y_final_pred.shape

(180,)

In [243]:
y_final_pred

array([3415., 3415., 3415., 3415., 3415., 3415., 3415., 3415., 3415.,
       3415., 3415., 3415., 3415., 3415., 3415., 3415., 3415., 3415.,
       3415., 7627., 3415., 7627., 3415., 7627., 3415., 7627., 3415.,
       7627., 3415., 7627., 3415., 7627., 3415., 7627., 3415., 7627.,
       3415., 7627., 3415., 7627., 3415., 7627., 3415., 7627., 3415.,
       7627., 3415., 7627., 3415., 7627., 3415., 7627., 3415., 7627.,
       3415., 7627., 3415., 7627., 3415., 7627., 3415., 7627., 3415.,
       7627., 3415., 7627., 3415., 7627., 3415., 7627., 3415., 7627.,
       3415., 7627., 3415., 7627., 3415., 7627., 3415., 7627., 3415.,
       7627., 3415., 7627., 3415., 7627., 3415., 7627., 3415., 7627.,
       3415., 7627., 3415., 7627., 3415., 7627., 3415., 7627., 3415.,
       7627., 3415., 7627., 3415., 7627., 3415., 7627., 3415., 7627.,
       3415., 7627., 3415., 7627., 3415., 7627., 3415., 7627., 3415.,
       7627., 3415., 7627., 3415., 7627., 3415., 7627., 3415., 7627.,
       3415., 7627.,

In [155]:
type(y_final_pred)

numpy.ndarray

In [157]:
pd.DataFrame(y_final_pred).to_csv("file1.csv")