#### Dependencies
_____

In [1]:
import numpy as np
import pandas as pd
import dateinfer

#### Load Datasource
_____

In [158]:
dataset = "../resources/household_power_consumption.csv"
master = pd.read_csv(dataset, parse_dates=True, sep=";", low_memory=False)

master.head(10)

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0
5,16/12/2006,17:29:00,3.52,0.522,235.02,15.0,0.0,2.0,17.0
6,16/12/2006,17:30:00,3.702,0.52,235.09,15.8,0.0,1.0,17.0
7,16/12/2006,17:31:00,3.7,0.52,235.22,15.8,0.0,1.0,17.0
8,16/12/2006,17:32:00,3.668,0.51,233.99,15.8,0.0,1.0,17.0
9,16/12/2006,17:33:00,3.662,0.51,233.86,15.8,0.0,2.0,16.0


### Limiting dataset size (temporary approach)
_____


In [196]:
###df = master.head(30000).copy(deep=True)
#df = master.head(30).copy(deep=True)
df = master.copy(deep=True)

df.shape

(2075259, 9)

#### Create timestamp attribute
____

In [197]:
df['timestamp'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])

#### Check all possible timestamp frequencies
_______

In [118]:
previous_ts = None
ts_format = set()

for index in range(df.shape[0]):
    if (previous_ts is None):
        previous_ts = df.loc[index]['timestamp']
        ts_format.add(dateinfer.infer(previous_ts.__str__()))
       
    else:
        ts_diff = (df.loc[index]['timestamp'] - previous_ts)
        
        if ( ts_diff.seconds != 60):
            print (f"Frequency error found - index {index}")
        
        ts_format.add(dateinfer.infer(df.loc[index]['timestamp'].__str__()))
        previous_ts = None

print (ts_format)
print('done')



{'%H'}
done


#### Change Index to Timestamp
_____

In [198]:
df.set_index('timestamp', inplace=True)
##df.index.freq = 'T'

In [186]:
def add_freq(idx, freq=None):
    """Add a frequency attribute to idx, through inference or directly.

    Returns a copy.  If `freq` is None, it is inferred.
    """

    idx = idx.copy()
    if freq is None:
        if idx.freq is None:
            freq = pd.infer_freq(idx)
        else:
            return idx
    idx.freq = pd.tseries.frequencies.to_offset(freq)
    if idx.freq is None:
        raise AttributeError(f'no discernible frequency found to `idx`. Specify'
                             ' a frequency string with `freq`.')
    return idx

idx=pd.to_datetime(['2003-01-02', '2003-01-03', '2003-01-06'])  # freq=None

df.index


DatetimeIndex(['2006-12-16 17:24:00', '2006-12-16 17:25:00',
               '2006-12-16 17:26:00', '2006-12-16 17:27:00',
               '2006-12-16 17:28:00', '2006-12-16 17:29:00',
               '2006-12-16 17:30:00', '2006-12-16 17:31:00',
               '2006-12-16 17:32:00', '2006-12-16 17:33:00',
               ...
               '2007-06-01 13:14:00', '2007-06-01 13:15:00',
               '2007-06-01 13:16:00', '2007-06-01 13:17:00',
               '2007-06-01 13:18:00', '2007-06-01 13:19:00',
               '2007-06-01 13:20:00', '2007-06-01 13:21:00',
               '2007-06-01 13:22:00', '2007-06-01 13:23:00'],
              dtype='datetime64[ns]', name='timestamp', length=30000, freq=None)

In [178]:
idx=pd.to_datetime(['2003-01-02 17:24:00', '2003-01-02 17:25:00', '2003-01-02 17:26:00'])  # freq=None
idx

DatetimeIndex(['2003-01-02 17:24:00', '2003-01-02 17:25:00',
               '2003-01-02 17:26:00'],
              dtype='datetime64[ns]', freq=None)

In [184]:
print(add_freq(idx))

DatetimeIndex(['2003-01-02', '2003-01-03', '2003-01-06'], dtype='datetime64[ns]', freq='B')


In [187]:
print(add_freq(df.index))

AttributeError: no discernible frequency found to `idx`. Specify a frequency string with `freq`.

In [199]:
df.asfreq('T').index

DatetimeIndex(['2006-12-16 17:24:00', '2006-12-16 17:25:00',
               '2006-12-16 17:26:00', '2006-12-16 17:27:00',
               '2006-12-16 17:28:00', '2006-12-16 17:29:00',
               '2006-12-16 17:30:00', '2006-12-16 17:31:00',
               '2006-12-16 17:32:00', '2006-12-16 17:33:00',
               ...
               '2010-11-26 20:53:00', '2010-11-26 20:54:00',
               '2010-11-26 20:55:00', '2010-11-26 20:56:00',
               '2010-11-26 20:57:00', '2010-11-26 20:58:00',
               '2010-11-26 20:59:00', '2010-11-26 21:00:00',
               '2010-11-26 21:01:00', '2010-11-26 21:02:00'],
              dtype='datetime64[ns]', name='timestamp', length=2075259, freq='T')

In [200]:
df = df.asfreq('T')  ## worked??

In [201]:
df

Unnamed: 0_level_0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2006-12-16 17:24:00,16/12/2006,17:24:00,4.216,0.418,234.840,18.400,0.000,1.000,17.0
2006-12-16 17:25:00,16/12/2006,17:25:00,5.360,0.436,233.630,23.000,0.000,1.000,16.0
2006-12-16 17:26:00,16/12/2006,17:26:00,5.374,0.498,233.290,23.000,0.000,2.000,17.0
2006-12-16 17:27:00,16/12/2006,17:27:00,5.388,0.502,233.740,23.000,0.000,1.000,17.0
2006-12-16 17:28:00,16/12/2006,17:28:00,3.666,0.528,235.680,15.800,0.000,1.000,17.0
...,...,...,...,...,...,...,...,...,...
2010-11-26 20:58:00,26/11/2010,20:58:00,0.946,0.000,240.430,4.000,0.000,0.000,0.0
2010-11-26 20:59:00,26/11/2010,20:59:00,0.944,0.000,240.000,4.000,0.000,0.000,0.0
2010-11-26 21:00:00,26/11/2010,21:00:00,0.938,0.000,239.820,3.800,0.000,0.000,0.0
2010-11-26 21:01:00,26/11/2010,21:01:00,0.934,0.000,239.700,3.800,0.000,0.000,0.0


In [146]:
temp_df = pd.isnull(df).any(axis=1)

temp_df[temp_df == True]





timestamp
2006-12-21 11:23:00    True
2006-12-21 11:24:00    True
2006-12-30 10:08:00    True
2006-12-30 10:09:00    True
dtype: bool

In [154]:
temp_df.asfreq('T')

timestamp
2006-12-16 17:24:00    False
2006-12-16 17:25:00    False
2006-12-16 17:26:00    False
2006-12-16 17:27:00    False
2006-12-16 17:28:00    False
                       ...  
2007-06-01 13:19:00    False
2007-06-01 13:20:00    False
2007-06-01 13:21:00    False
2007-06-01 13:22:00    False
2007-06-01 13:23:00    False
Freq: T, Length: 240240, dtype: object

In [156]:
pd.infer_freq(df.index)

In [157]:
df.index

DatetimeIndex(['2006-12-16 17:24:00', '2006-12-16 17:25:00',
               '2006-12-16 17:26:00', '2006-12-16 17:27:00',
               '2006-12-16 17:28:00', '2006-12-16 17:29:00',
               '2006-12-16 17:30:00', '2006-12-16 17:31:00',
               '2006-12-16 17:32:00', '2006-12-16 17:33:00',
               ...
               '2007-06-01 13:14:00', '2007-06-01 13:15:00',
               '2007-06-01 13:16:00', '2007-06-01 13:17:00',
               '2007-06-01 13:18:00', '2007-06-01 13:19:00',
               '2007-06-01 13:20:00', '2007-06-01 13:21:00',
               '2007-06-01 13:22:00', '2007-06-01 13:23:00'],
              dtype='datetime64[ns]', name='timestamp', length=30000, freq=None)

In [6]:
ts_list = ['2006-12-16 17:24:00', '2006-12-16 17:25:00', '2006-12-16 17:27:00']

In [7]:
dateinfer.infer(ts_list)

'%Y-%m-%d %H:%M:%S'

In [10]:
pd.date_range('2006-12-16 17:24:00', periods=7, freq='T')

DatetimeIndex(['2006-12-16 17:24:00', '2006-12-16 17:25:00',
               '2006-12-16 17:26:00', '2006-12-16 17:27:00',
               '2006-12-16 17:28:00', '2006-12-16 17:29:00',
               '2006-12-16 17:30:00'],
              dtype='datetime64[ns]', freq='T')

In [None]:
### difference between two dates 
#### less than 1 min, error


In [24]:
ts_diff = (df.loc[2]['timestamp'] - df.loc[0]['timestamp'])

(df.loc[2]['timestamp'] - df.loc[0]['timestamp'])

Timedelta('0 days 00:02:00')

In [26]:
ts_diff.seconds

120

In [27]:
## if difference is not = 60 seconds, error

In [None]:
for index in range(df.shape[0]):
    ts_diff = (df.loc[index + 1]['timestamp'] - df.loc[index]['timestamp'])
    
    if ts_diff.seconds != 60:
        print ("Frequency error found")
        print (df.loc[index + 1])
        print (df.loc[index])
        break

print('done')



In [None]:
#dateinfer.infer(df['timestamp'].head(5)

ts_list = set()
for ts in df['timestamp']:
    ts_list.add(dateinfer.infer(str(ts)))

    #print (dateinfer.infer(str(ts)))

    
print(ts_list)
#ts_list = []
#for item in df['timestamp'].head(5).tolist():
#    ts_list.append(item.to_pydatetime())

#dateinfer.infer(ts_list)

##print(dir(ts_list[0]))

#ts_list[0].to_datetime64()
#ts_list[0].to_pydatetime()

#print(ts_list)

#print(ts_list[0].)


In [55]:
#dateinfer.infer(ts_list)

In [None]:
df['timestamp'].head(10)