# Time-related variables in python
## Basic stuff related to time, and time analysis

This workbook is the material for this article: http://ondata.blog/articles/nine-circles-of-hell-time-in-python/

In [161]:
import pandas as pd, numpy as np
import os
import matplotlib.pyplot as plt
import sys

# a utility
def show(x, comment = None):
    if (comment is not None): 
        print(comment)
    print('variable content: {}'.format(x))
    print('variable type: {}\n'.format(type(x)))

In [168]:
np.__version__

'1.19.2'

In [169]:
pd.__version__

'1.2.3'

# 1. Five ways of storing time information

### string

In [170]:
# string
stamp = '2021-04-13 15:33'
show(stamp)

variable content: 2021-04-13 15:33
variable type: <class 'str'>



### python native datetime

In [205]:
# python native: datetime
from datetime import datetime
stamp_datetime = datetime(year=2021, month=4, day=13)
show(stamp_datetime)

variable content: 2021-04-13 00:00:00
variable type: <class 'datetime.datetime'>



### numpy datetime64

In [172]:
# numpy: datetime64
def show_np(x, comment = None):
    if (comment is not None): 
        print(comment)
    print('variable content: {}'.format(x))
    print('variable type: {}'.format(type(x)))
    print('variable dtype: {}\n'.format(x.dtype))
    
import numpy as np
stamp_np = np.datetime64('2021-04-13 15:33')
show_np(stamp_np)
print('Enforcing time unit:\n')
show_np(np.datetime64('2021-04-13 15:33', 'ns'), 'When unit is nanosecond:')
show_np(np.datetime64('2021-04-13 15:33', 'h'), 'When unit is hour:')
show_np(np.datetime64('2021-04-13 15:33', 'D'), 'When unit is day:')


variable content: 2021-04-13T15:33
variable type: <class 'numpy.datetime64'>
variable dtype: datetime64[m]

Enforcing time unit:

When unit is nanosecond:
variable content: 2021-04-13T15:33:00.000000000
variable type: <class 'numpy.datetime64'>
variable dtype: datetime64[ns]

When unit is hour:
variable content: 2021-04-13T15
variable type: <class 'numpy.datetime64'>
variable dtype: datetime64[h]

When unit is day:
variable content: 2021-04-13
variable type: <class 'numpy.datetime64'>
variable dtype: datetime64[D]



In [173]:
print(stamp_np.dtype)
stamp_np.dtype

datetime64[m]


dtype('<M8[m]')

In [72]:
# then why the same function returns two different types, depending on situation?

# as explained: https://stackoverflow.com/questions/29206612/difference-between-data-type-datetime64ns-and-m8ns
# datetime64[ns] is a general dtype, while <M8[ns] is a specific dtype.
# On a machine whose byte order is little endian, there is no difference between np.dtype('datetime64[ns]') and np.dtype('<M8[ns]')
# However, on a big endian machine, np.dtype('datetime64[ns]') would equal np.dtype('>M8[ns]')
# So datetime64[ns] maps to either <M8[ns] or >M8[ns] depending on the endian-ness of the machine.

In [73]:
# indeed:
np.dtype('datetime64[ns]') == np.dtype('<M8[ns]')

True

### pandas Timestamp

In [175]:
import pandas as pd
stamp_pandas = pd.to_datetime('2021-04-13 15:33')
stamp_pandas, type(stamp_pandas)

(Timestamp('2021-04-13 15:33:00'), pandas._libs.tslibs.timestamps.Timestamp)

In [176]:
show(stamp_pandas)

variable content: 2021-04-13 15:33:00
variable type: <class 'pandas._libs.tslibs.timestamps.Timestamp'>



In [177]:
# converting back to native
stamp_pandas.to_pydatetime()

# note: in previous version of Pandas, this was called Timestamp.to_datetime(), causing confusion. Now it is py_datetime()

datetime.datetime(2021, 4, 13, 15, 33)

### Unix (POSIX) time

In [178]:
# the simplest but least readable form: integer number of seconds from the Epoch.

In [179]:
# let's calculate POSIX time for 1st January 2021
stamp_posix = (2021 - 1970) * 365 * 24 * 60 * 60
stamp_posix

1608336000

In [180]:
show(stamp_posix)

variable content: 1608336000
variable type: <class 'int'>



# 2. converting to Unix time

### Pandas Timestamp to Unix time (POSIX time)

In [181]:
# to confuse you, the function to convert time to an integer is also called. timestamp()
# This is referred to as 'POSIX timestamp'
# So: pd.Timestamp = Pandas replacement for python datetime.datetime object.
# pd.Timestamp.timestamp() = POSIX timestamp 

In [182]:
posix_from_pandas = stamp_pandas.timestamp()
posix_from_pandas

1618327980.0

### numpy timedate64 to Unix time (POSIX time)

In [183]:
stamp_np

numpy.datetime64('2021-04-13T15:33')

In [184]:
# converting from timedate64 to posix time implies checking the unit and then simple mathematics
stamp_np.dtype

dtype('<M8[m]')

In [185]:
  print(stamp_np.dtype)

datetime64[m]


In [186]:
# since the unit was minutes, then multiplying this by 60 gives seconds from Epoch
posix_from_numpy_minutes = stamp_np.astype('uint64') * np.uint64(60)
show_np(posix_from_numpy_minutes)

variable content: 1618327980
variable type: <class 'numpy.uint64'>
variable dtype: uint64



In [192]:
assert posix_from_pandas == posix_from_numpy_minutes

In [193]:
# then it is better to enforce the unit as seconds upfront.
# This results in dtype that stores directly the seconds since Epoch
stamp_in_seconds = np.datetime64(stamp, 's')
stamp_in_seconds

numpy.datetime64('2021-04-13T15:33:00')

In [194]:
# now conversion is easy
posix_timestamp = stamp_in_seconds.astype('uint64')
show_np(posix_timestamp)

variable content: 1618327980
variable type: <class 'numpy.uint64'>
variable dtype: uint64



In [195]:
assert posix_from_pandas == posix_from_numpy_seconds

# 3. working with lists of time stamps

In [259]:
# regular list of strings
l = ['2020-09-01 00:37:07', '2020-10-02 00:47:17', '2020-11-22 01:25:10']

In [260]:
# convert this to numpy will reveal time unit as dtype
to64 = np.vectorize(lambda x: np.datetime64(x))
to64(l)

array(['2020-09-01T00:37:07', '2020-10-02T00:47:17',
       '2020-11-22T01:25:10'], dtype='datetime64[s]')

In [261]:
# for example, enforcing the unit ns in each datetime object
# will effect in creation of an array of dtype = 'datetime64[ns]'
to64 = np.vectorize(lambda x: np.datetime64(x, 'ns'))
to64(l)

array(['2020-09-01T00:37:07.000000000', '2020-10-02T00:47:17.000000000',
       '2020-11-22T01:25:10.000000000'], dtype='datetime64[ns]')

In [262]:
#now, how to convert this to pandas?

# aparently, pandas to_datetime behaves differently, depending on type of argument:
# following documentation: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html
#If parsing succeeded. Return type depends on input:
#    list-like: DatetimeIndex
#    Series: Series of datetime64 dtype
#    scalar: Timestamp


# therefore, pd.to_datetime applied to list will return DatetimeIndex with dtype='datetime64[ns]'
dti = pd.to_datetime(l, format='%Y-%m-%d %H:%M:%S')
dti

DatetimeIndex(['2020-09-01 00:37:07', '2020-10-02 00:47:17',
               '2020-11-22 01:25:10'],
              dtype='datetime64[ns]', freq=None)

In [264]:
type(dti)

pandas.core.indexes.datetimes.DatetimeIndex

In [283]:
dti.dtype

dtype('<M8[ns]')

In [286]:
type(dti[0])

pandas._libs.tslibs.timestamps.Timestamp

In [290]:
# now let's verify how the same function behaves when results are stored as DataFrame column
df = pd.DataFrame()
df['stamps'] = pd.to_datetime(l, format='%Y-%m-%d %H:%M:%S')
s = df['stamps']
s

0   2020-09-01 00:37:07
1   2020-10-02 00:47:17
2   2020-11-22 01:25:10
Name: stamps, dtype: datetime64[ns]

In [291]:
type(s)

pandas.core.series.Series

In [292]:
s.dtype

dtype('<M8[ns]')

In [293]:
type(s[0])

pandas._libs.tslibs.timestamps.Timestamp

In [305]:
# to confuse you even more
df.dtypes

stamps    datetime64[ns]
dtype: object

In [309]:
a = df.dtypes
a.values

array([dtype('<M8[ns]')], dtype=object)

In [311]:
print(np.dtype('<M8[ns]'))

datetime64[ns]


In [301]:
# are the two results equal?
# the content is...
dti[0] == s[0], dti[1] == s[1], dti[2] == s[2]

(True, True, True)

In [303]:
# ...but the overall data structures aren't because of the type
dti.equals(s)

False

In [324]:
# but if you convert the type, they will be equal
# here is how to convert Series of datetimes to DateTimeIndex
dti_from_s = pd.to_datetime(s.values)
dti_from_s

DatetimeIndex(['2020-09-01 00:37:07', '2020-10-02 00:47:17',
               '2020-11-22 01:25:10'],
              dtype='datetime64[ns]', freq=None)

In [325]:
# now both are equal
dti.equals(dti_from_s)

True

In [222]:
# and how to convert this to Python native datetime?
# converting this to python native datetime format will return np.ndarray of datetime.datetime
# (just for completeness, because I rarely use datetime.datetime)
a = dti.to_pydatetime()
a

array([datetime.datetime(2020, 9, 1, 0, 37, 7),
       datetime.datetime(2020, 10, 2, 0, 47, 17),
       datetime.datetime(2020, 11, 22, 1, 25, 10)], dtype=object)

In [223]:
type(a), a.dtype

(numpy.ndarray, dtype('O'))

In [224]:
# Now prepare for a shock: how to convert the list of timestamps to Unix time?

# A bit of a nightmare...
### some of the futile attempts to convert series of timestamps to numbers

In [225]:
# let's prepare some raw data first

source_short = ['2020-10-15 09:56:56', '2020-11-02 22:18:00',
       '2020-10-28 04:23:36', '2020-10-16 21:38:27',
       '2020-11-07 14:04:26', '2020-10-22 02:22:48',
       '2020-10-08 15:43:56', '2020-10-16 00:59:35',
       '2020-10-13 22:27:10', '2020-10-07 08:57:40',
       '2020-11-05 22:30:13', '2020-10-08 04:14:31',
       '2020-10-13 22:40:27', '2020-11-19 08:22:59',
       '2020-11-20 23:27:20', '2020-11-22 07:02:46',
       '2020-10-20 22:08:36', '2020-11-13 03:46:37',
       '2020-10-25 12:56:39', '2020-11-11 08:15:13',
       '2020-11-25 03:45:36', '2020-10-25 02:25:24',
       '2020-10-27 03:35:26', '2020-10-13 03:43:37',
       '2020-12-01 17:10:23', '2020-10-29 09:13:03',
       '2020-10-27 02:03:44', '2020-11-02 02:09:14',
       '2020-10-23 17:08:41', '2020-10-13 22:16:19',
       '2020-11-28 06:55:57', '2020-10-31 17:34:50',
       '2020-10-26 10:56:33', '2020-10-25 13:38:49',
       '2020-11-07 08:01:35', '2020-10-08 04:22:14',
       '2020-10-30 19:52:58', '2020-11-20 08:09:29',
       '2020-10-08 08:42:26', '2020-11-14 11:46:25',
       '2020-10-18 22:30:01', '2020-10-28 08:25:36',
       '2020-10-19 03:44:04', '2020-11-12 19:41:42',
       '2020-11-12 02:36:31', '2020-11-26 13:46:52',
       '2020-10-08 16:03:06', '2020-12-01 06:46:28',
       '2020-10-16 10:51:53', '2020-11-16 10:44:17']

In [357]:
d = source_short[0]
type(d)

str

In [364]:
np.arange(100)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

array([2.07425951e+01, 5.00143321e+01, 5.98726312e+01, ...,
       1.00069904e+05, 1.00091702e+05, 1.00026018e+05])

In [402]:
# aternatively, here is an artificially generated longer source of 100K elements
# starts at the same time, and each will be increased by 1 minute with random noise of up to 100 minutes

import random

size = 1000000
steps = np.arange(size)
noise = [random.random() * 100 for i in range(size)]
numbers = steps + noise

start_date = pd.to_datetime(source_short[0])
source_long = start_date + pd.to_timedelta(numbers, 'm')
# cast to list of strings
source_long = [str(i) for i in source_long]


In [403]:
# pick the source you like

#source = source_short
source = source_long

In [404]:
# this is our data
df = pd.DataFrame()
format='%Y-%m-%d %H:%M:%S'
df['stamps'] = pd.to_datetime(source, format=format)
data = df['stamps']
data1 = data

In [405]:
data.head(), type(data), type(data[0])

(0   2020-10-15 10:39:27.013349240
 1   2020-10-15 10:35:37.616540600
 2   2020-10-15 10:50:38.049376520
 3   2020-10-15 11:22:58.646504600
 4   2020-10-15 10:27:35.218191140
 Name: stamps, dtype: datetime64[ns],
 pandas.core.series.Series,
 pandas._libs.tslibs.timestamps.Timestamp)

In [406]:
# let's try various attempts to convert oure series to POSIX...

In [407]:
# how about this?
to_timestamp = np.vectorize(lambda x: x.timestamp())
ts_data = to_timestamp(data)

# the reason this won't work is that vectorize does not see our data as not pd.Timestamp, but as np.datetime64
# throws AttributeError: 'numpy.datetime64' object has no attribute 'timestamp'

AttributeError: 'numpy.datetime64' object has no attribute 'timestamp'

In [232]:
# so how about this?
to_posix = np.vectorize(lambda x: x.astype('uint64'))
to_posix(data)
# Also not good! Now numpy thinks this is pd.Timestamp, rather than np.datetime64!
# throws AttributeError: 'Timestamp' object has no attribute 'astype'

AttributeError: 'Timestamp' object has no attribute 'astype'

In [234]:
# then how about this?
# what if we explicitly convert to numpy.datetime64, without pd.Timestamp magic?
v = data.values
to_posix = np.vectorize(lambda x: x.astype('uint64'))
to_posix(v)
# Also not good! Now numpy thinks this is integer, rather than np.datetime64!
# throws AttributeError: 'int' object has no attribute 'astype'

AttributeError: 'int' object has no attribute 'astype'

In [235]:
# I found that I could convert to numpy native int64
v = data.values
million = int(1e9)
to_posix = np.vectorize(lambda x: np.int64(x) // million)
to_posix(v)
# finally... bingo

array([1602755816, 1604355480, 1603859016, 1602884307, 1604757866,
       1603333368, 1602171836, 1602809975, 1602628030, 1602061060,
       1604615413, 1602130471, 1602628827, 1605774179, 1605914840,
       1606028566, 1603231716, 1605239197, 1603630599, 1605082513,
       1606275936, 1603592724, 1603769726, 1602560617, 1606842623,
       1603962783, 1603764224, 1604282954, 1603472921, 1602627379,
       1606546557, 1604165690, 1603709793, 1603633129, 1604736095,
       1602130934, 1604087578, 1605859769, 1602146546, 1605354385,
       1603060201, 1603873536, 1603079044, 1605210102, 1605148591,
       1606398412, 1602172986, 1606805188, 1602845513, 1605523457],
      dtype=int64)

# How to convert series of pd.Timestamps to integers
Now that we know how to do it, we can improve it for efficiency

In [408]:
# option 1:
# if data is stored in DataFrame

df = pd.DataFrame()
format='%Y-%m-%d %H:%M:%S'
# pd.to_datetime converts strings to Datetimeindex, but because the result is kept as Dataframe column, it actually
# gets stored as Series of datetime64 (disguised as pd.Timestamps)
# Utterly confusing.
df['stamps'] = pd.to_datetime(source, format=format)

In [410]:
%%time 
# for 1M records: Wall time: 900 ms

# exctract datetime64
v = df['stamps'].values
million = int(1e9)
assert v.dtype == np.dtype('datetime64[ns]')
# convert to seconds from Epoch
to_posix = np.vectorize(lambda x: np.int64(x) // million)
to_posix(v)

Wall time: 888 ms


array([1602758367, 1602758137, 1602759038, ..., 1662757816, 1662755941,
       1662761417], dtype=int64)

In [411]:
# option 2:
# if the original data is stored as DateTimeIndex, we need less code and less CPU!

# store strings as DateTimeIndex. Just remember to not store the result as DataFrame column 
data = pd.to_datetime(source, format=format)

In [412]:
type(data)

pandas.core.indexes.datetimes.DatetimeIndex

In [413]:
type(source)

list

In [415]:
%%time
# for 150,000 records: Wall time: 50 ms

# ensure the unit is nanoseconds
assert data.dtype == np.dtype('datetime64[ns]')
# convert to seconds from Epoch
# because we use DatetimeIndex, we now CAN use astype
million = int(1e9)
data.astype('int64') // million

Wall time: 37.8 ms


Int64Index([1602758367, 1602758137, 1602759038, 1602760978, 1602757655,
            1602760664, 1602758620, 1602757821, 1602760088, 1602761424,
            ...
            1662757225, 1662755461, 1662756297, 1662757438, 1662756997,
            1662756833, 1662755963, 1662757816, 1662755941, 1662761417],
           dtype='int64', length=1000000)

In [417]:
%%time
# for 150,000 records: Wall time: 60 ms

# option 3

# use Pandas native arithmetics
epoch = pd.Timestamp("1970-01-01")
second = pd.Timedelta("1s")
(df.stamps - epoch) // second

Wall time: 49 ms


0         1602758367
1         1602758137
2         1602759038
3         1602760978
4         1602757655
             ...    
999995    1662756833
999996    1662755963
999997    1662757816
999998    1662755941
999999    1662761417
Name: stamps, Length: 1000000, dtype: int64

# To summarize
The last two ways of converting are much more efficient time-wise. Use (df.stamps - epoch) // pd.Timedelta("1s")