In [1]:
%matplotlib inline
import numpy as np 
import pandas as pd
import matplotlib.pylab

In [28]:
# Load the fixed-width text file. header=None: the first line of the file is
# already data, so pandas should number the columns instead of reading names.
data = pd.read_fwf('ao_monthly.txt', header=None)
data.head()

Unnamed: 0,0,1,2
0,1950,1,-0.06031
1,1950,2,0.62681
2,1950,3,-0.008128
3,1950,4,0.5551
4,1950,5,0.071577


In [31]:
# Let pandas build a proper datetime index while reading the file.
#
# parse_dates=[[0, 1]] combines column 0 (year) and column 1 (month) into one
# date column, index_col=0 uses that combined column as the index, and
# infer_datetime_format=True asks pandas to infer the date format from the
# columns named in parse_dates.
CleanerData = pd.read_fwf(
    'ao_monthly.txt',
    header=None,
    index_col=0,
    parse_dates=[[0, 1]],
    infer_datetime_format=True,
)

10 loops, best of 3: 33.7 ms per loop


In [32]:
# Give the value column and the index human-readable labels.
CleanerData.rename(columns={1: 'Measurement'}, inplace=True)
CleanerData.index.name = 'Monthly Data'
CleanerData.head()

Unnamed: 0_level_0,Measurement
Monthly Data,Unnamed: 1_level_1
1950-01-01,-0.06031
1950-02-01,0.62681
1950-03-01,-0.008128
1950-04-01,0.5551
1950-05-01,0.071577


In [33]:
# With a datetime index in place we can start extracting useful information.

# First, the time span covered by the data. Index.min()/Index.max() are the
# vectorised pandas reductions; the builtin min()/max() would instead iterate
# over the index one element at a time in Python.
print(CleanerData.index.min())
print(CleanerData.index.max())

1950-01-01 00:00:00
2016-06-01 00:00:00


In [34]:
# The bounds printed above are full timestamps, but each row describes a whole
# month, so a period index represents this data better than a timestamp index.
CleanerData.to_period().head()

# The index now holds year-month periods (e.g. 1950-01) with no day component.

Unnamed: 0_level_0,Measurement
Monthly Data,Unnamed: 1_level_1
1950-01,-0.06031
1950-02,0.62681
1950-03,-0.008128
1950-04,0.5551
1950-05,0.071577


In [43]:
import timeit
from datetime import datetime


def dateparse(x, y):
    """Build a datetime from separate year and month values.

    Used as the ``date_parser`` callback when reading the file: the year and
    month values are joined as ``'<x>-<y>'`` and parsed with ``'%Y-%m'``.

    Parameters
    ----------
    x : str or int
        The year (e.g. ``'1950'``).
    y : str or int
        The month number (e.g. ``'1'``).

    Returns
    -------
    datetime.datetime
        The parsed date; no day is supplied, so it falls on the first of the month.

    Raises
    ------
    ValueError
        If the joined text does not match the ``'%Y-%m'`` format.
    """
    # Use the standard-library datetime class directly: `pd.datetime` was only
    # a deprecated alias for it and is no longer available in newer pandas.
    return datetime.strptime('%s-%s' % (x, y), '%Y-%m')

# Time different ways of reading the fixed-width file.
# Variant 1: combine year and month with the custom `dateparse` callback.
%timeit data = pd.read_fwf('ao_monthly.txt', header=None, index_col = 0, parse_dates=[[0,1]], date_parser=dateparse)

10 loops, best of 3: 80.9 ms per loop


In [50]:
# Variant 2: no custom parser; pandas parses the combined columns itself.
# Lesson: a hand-written date parser is not automatically an improvement -
# compare this timing with the one recorded for the `dateparse` variant.
%timeit data = pd.read_fwf('ao_monthly.txt', header=None, index_col = 0, parse_dates=[[0,1]])

10 loops, best of 3: 29.6 ms per loop


In [51]:
# Variant 3: additionally ask pandas to infer the datetime format.
# Lesson: try a few variants on your own data and measure which one is fastest,
# because differences in read time become significant for larger datasets.
%timeit data = pd.read_fwf('ao_monthly.txt', header=None, index_col = 0, parse_dates=[[0,1]], infer_datetime_format=True)

10 loops, best of 3: 28.9 ms per loop


In [52]:
# Suppose the data has already been read in (maybe it was pickled or something)
# and you now want to build a timestamp out of several columns of the frame.
# Enter pd.to_datetime. Start with a small frame that has one column per
# date/time component.
sampleData = pd.DataFrame({
    'year': [2015, 2016],
    'month': [2, 3],
    'day': [25, 26],
    'hour': [12, 13],
})
sampleData

Unnamed: 0,day,hour,month,year
0,25,12,2,2015
1,26,13,3,2016


In [54]:
# pd.to_datetime works out from the column names (year, month, day, hour) which
# time element each column holds, so no explicit assembly code is needed.
#
# Caveat: if the dataframe also had a column whose name is not time related,
# the call below would throw an error.
pd.to_datetime(sampleData)

0   2015-02-25 12:00:00
1   2016-03-26 13:00:00
dtype: datetime64[ns]

In [55]:
# Truncating values: first build a small month-end series to experiment with.
series = pd.Series(
    range(10),
    index=pd.date_range(start='4/22/2019', periods=10, freq='M'),
)
series.head()

2019-04-30    0
2019-05-31    1
2019-06-30    2
2019-07-31    3
2019-08-31    4
Freq: M, dtype: int32

In [57]:
# truncate() is a way of cutting away the dates you don't want: rows dated
# before `before` or after `after` are dropped.
series.truncate(before='4/22/2019', after='7/15/2019')

# Truncating preserves the frequency (the result still reports Freq: M).

2019-04-30    0
2019-05-31    1
2019-06-30    2
Freq: M, dtype: int32

In [58]:
# Not every selection preserves the frequency: picking arbitrary rows (here
# positions 1, 6 and 7) gives dates that are no longer evenly spaced, so the
# resulting index has freq=None.
# .iloc states explicitly that the integers are positions, not index labels.
series.iloc[[1, 6, 7]].index

DatetimeIndex(['2019-05-31', '2019-10-31', '2019-11-30'], dtype='datetime64[ns]', freq=None)

In [59]:
# pandas still preserves a frequency whenever it can: taking every second row
# of the monthly series yields a regular index again (see Freq: 2M in the output).
series.iloc[0:10:2]

2019-04-30    0
2019-06-30    2
2019-08-31    4
2019-10-31    6
2019-12-31    8
Freq: 2M, dtype: int32