In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timezone, timedelta
import time
import os

In [2]:
# Default settings
pd.options.display.max_columns = None
%matplotlib inline
sns.set(style="darkgrid")
random_seed = 42

In [4]:
# Read the object time series; the first CSV column becomes the row index.
df = pd.read_csv(
    'data/output_objects_timeseries_1.csv',
    index_col=0,
)
df.head()

Unnamed: 0,DATE,index,OBJECT_TYPE_NAME,POSTCODE,AVG_DAY
0,2009-12-22,0.0,GZB,0.0,171.916667
1,2009-12-23,0.0,GZB,0.0,171.916667
2,2009-12-24,0.0,GZB,0.0,171.916667
3,2009-12-25,0.0,GZB,0.0,171.916667
4,2009-12-26,0.0,GZB,0.0,171.916667


In [88]:
# (rows, columns) of df at the time this cell is run.
# NOTE(review): the saved output reports 8 columns, which matches df only after the
# Segment, YEAR and MONTH columns are added by later cells — re-run in order to confirm.
df.shape

(12869616, 8)

In [23]:
# Segment name -> [lower, upper] bound, each bound divided by 365.
# NOTE(review): presumably the raw figures are yearly totals turned into a
# per-day value so they can be compared with AVG_DAY — confirm.
segments = {
    label: np.array(raw_bounds) / 365
    for label, raw_bounds in {
        'Mega': (10e4, 10e10),
        'Large': (10e3, 10e4),
        'Medium': (10e2, 10e3),
        'Small': (0, 10e2),
    }.items()
}
segments

{'Mega': array([2.73972603e+02, 2.73972603e+08]),
 'Large': array([ 27.39726027, 273.97260274]),
 'Medium': array([ 2.73972603, 27.39726027]),
 'Small': array([0.        , 2.73972603])}

In [35]:
# Segment lookup table: one row per segment with its lower bound (column 0) and
# upper bound (column 1), indexed by the interval those bounds span.
df_segments = pd.DataFrame.from_dict(segments, orient='index').reset_index()
# Neighbouring segments share a bound (one segment's upper bound is the next one's
# lower bound). With closed='both' that shared value belonged to two intervals, so a
# lookup of a boundary value returned two segments. Left-closed intervals keep 0
# inside the lowest segment and give every boundary value to exactly one segment
# (the higher one).
df_segments.index = pd.IntervalIndex.from_arrays(
    left=df_segments[0],
    right=df_segments[1],
    closed='left',
)
assert not df_segments.index.is_overlapping, "segment intervals must not overlap"
df_segments

Unnamed: 0,index,0,1
"[273.972602739726, 273972602.739726]",Mega,273.972603,273972600.0
"[27.397260273972602, 273.972602739726]",Large,27.39726,273.9726
"[2.73972602739726, 27.397260273972602]",Medium,2.739726,27.39726
"[0.0, 2.73972602739726]",Small,0.0,2.739726


In [47]:
# Label every row with the segment whose bounds contain |AVG_DAY|.
#
# The previous version looked each value up with df_segments.loc[...]. Because the
# segment intervals are closed on both sides, a value sitting exactly on a shared
# bound matched two rows, so the lookup could return more labels than df has rows and
# the assignment would fail. Binning with pd.cut on half-open bins [lower, upper)
# gives exactly one label per row in a single vectorised pass.
segment_table = df_segments.sort_values(0)  # ascending by lower bound (column 0)
# Edges = every lower bound plus the upper bound of the highest segment; this relies
# on the segments being contiguous, as they are in the table built from `segments`.
bin_edges = np.append(segment_table[0].to_numpy(), segment_table[1].iloc[-1])
# Sign is ignored; infinite values are mapped to 0 and therefore land in the lowest segment.
daily_magnitude = df.AVG_DAY.abs().replace(np.inf, 0)
df['Segment'] = pd.cut(
    daily_magnitude,
    bins=bin_edges,
    labels=segment_table['index'].tolist(),
    right=False,
).astype(object)  # plain string labels (not categorical), as the label lookup produced
# Rows whose AVG_DAY is NaN or lies at/above the top edge get NaN in Segment
# (the label lookup raised KeyError for those instead).
df['DATE'] = pd.to_datetime(df.DATE)
df.head()

Unnamed: 0,DATE,index,OBJECT_TYPE_NAME,POSTCODE,AVG_DAY,Segment
0,2009-12-22,0.0,GZB,0.0,171.916667,Large
1,2009-12-23,0.0,GZB,0.0,171.916667,Large
2,2009-12-24,0.0,GZB,0.0,171.916667,Large
3,2009-12-25,0.0,GZB,0.0,171.916667,Large
4,2009-12-26,0.0,GZB,0.0,171.916667,Large


In [86]:
# Add calendar year and month columns derived from DATE (DATE must already be a
# datetime column for the .dt accessor to work).
# Bracket assignment is required here: `df.YEAR = ...` only creates a column when one
# with that name already exists; on a fresh kernel it merely sets a Python attribute
# on the DataFrame object, leaving no YEAR/MONTH columns for the groupby that follows.
df['YEAR'] = df.DATE.dt.year
df['MONTH'] = df.DATE.dt.month
df.head()

Unnamed: 0,DATE,index,OBJECT_TYPE_NAME,POSTCODE,AVG_DAY,Segment,YEAR,MONTH
0,2009-12-22,0.0,GZB,0.0,171.916667,Large,2009,12
1,2009-12-23,0.0,GZB,0.0,171.916667,Large,2009,12
2,2009-12-24,0.0,GZB,0.0,171.916667,Large,2009,12
3,2009-12-25,0.0,GZB,0.0,171.916667,Large,2009,12
4,2009-12-26,0.0,GZB,0.0,171.916667,Large,2009,12


In [87]:
# Sum AVG_DAY within each (object type, segment, year, month) group and flatten
# the group keys back into ordinary columns.
monthly_keys = ['OBJECT_TYPE_NAME', 'Segment', 'YEAR', 'MONTH']
monthly_totals = df.groupby(monthly_keys)['AVG_DAY'].sum()
df_monthly = monthly_totals.reset_index()
df_monthly

Unnamed: 0,OBJECT_TYPE_NAME,Segment,YEAR,MONTH,AVG_DAY
0,GZB,Large,2009,10,399.972222
1,GZB,Large,2009,11,2226.622807
2,GZB,Large,2009,12,39276.580306
3,GZB,Large,2010,1,144259.491799
4,GZB,Large,2010,2,152692.732045
...,...,...,...,...,...
949,HHB,Small,2019,8,7124.414954
950,HHB,Small,2019,9,5314.543037
951,HHB,Small,2019,10,4160.292063
952,HHB,Small,2019,11,2163.500889
