# Advanced Pandas - Handling Time Series
**Abid Ali**

Skype: Abd.Soft

Email: [abdsoftfsd@gmail.com](mailto:abdsoftfsd@gmail.com)

In [2]:
import pandas as pd
import os

In [6]:
master = pd.read_pickle(os.path.join('data','modified', 'master.pickle'))

In [7]:
master.head()


Unnamed: 0_level_0,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
aaltoan01,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta
abdelju01,Justin,Abdelkader,L,1987.0,2.0,25.0,USA,MI,Muskegon
abidra01,Ramzi,Abid,L,1980.0,3.0,24.0,Canada,QC,Montreal
abrahth01,Thommy,Abrahamsson,D,1947.0,4.0,12.0,Sweden,,Leksand
actonke01,Keith,Acton,C,1958.0,4.0,15.0,Canada,ON,Stouffville


In [15]:
birthInfo = master.loc[:, ['birthYear', 'birthMon', 'birthDay']]
birthInfo.head()

Unnamed: 0_level_0,birthYear,birthMon,birthDay
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aaltoan01,1975.0,3.0,4.0
abdelju01,1987.0,2.0,25.0
abidra01,1980.0,3.0,24.0
abrahth01,1947.0,4.0,12.0
actonke01,1958.0,4.0,15.0


In [24]:
(birthInfo.loc[birthInfo['birthYear'] == 1970]
    .sample(3)
    .groupby(['birthYear', 'birthMon'])
    .count()
 )

Unnamed: 0_level_0,Unnamed: 1_level_0,birthDay
birthYear,birthMon,Unnamed: 2_level_1
1970.0,6.0,2
1970.0,9.0,1


In [25]:
master = pd.read_pickle(os.path.join('data', 'modified', 'master.pickle'))
team_splits = pd.read_pickle(os.path.join('data', 'modified', 'team_splits_mi.pickle'))


In [26]:
team_splits.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,L,OL,T,W
name,year,month,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Anaheim Ducks,2006,Apr,0.0,2.0,,1.0
Anaheim Ducks,2006,Dec,5.0,0.0,,9.0
Anaheim Ducks,2006,Feb,5.0,2.0,,5.0
Anaheim Ducks,2006,Jan,5.0,2.0,,4.0
Anaheim Ducks,2006,Mar,3.0,2.0,,10.0


In [27]:
master.head()


Unnamed: 0_level_0,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
aaltoan01,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta
abdelju01,Justin,Abdelkader,L,1987.0,2.0,25.0,USA,MI,Muskegon
abidra01,Ramzi,Abid,L,1980.0,3.0,24.0,Canada,QC,Montreal
abrahth01,Thommy,Abrahamsson,D,1947.0,4.0,12.0,Sweden,,Leksand
actonke01,Keith,Acton,C,1958.0,4.0,15.0,Canada,ON,Stouffville


In [28]:
master = master.assign(birthDate = pd.to_datetime({
    'year': master.birthYear,
    'month': master.birthMon,
    'day': master.birthDay
}))
master = master.drop(columns=['birthYear', 'birthMon', 'birthDay'])
master.head()


Unnamed: 0_level_0,firstName,lastName,pos,birthCountry,birthState,birthCity,birthDate
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
aaltoan01,Antti,Aalto,C,Finland,,Lappeenranta,1975-03-04
abdelju01,Justin,Abdelkader,L,USA,MI,Muskegon,1987-02-25
abidra01,Ramzi,Abid,L,Canada,QC,Montreal,1980-03-24
abrahth01,Thommy,Abrahamsson,D,Sweden,,Leksand,1947-04-12
actonke01,Keith,Acton,C,Canada,ON,Stouffville,1958-04-15


In [29]:
master.birthDate.head(2)


playerID
aaltoan01   1975-03-04
abdelju01   1987-02-25
Name: birthDate, dtype: datetime64[ns]

In [30]:
master.birthDate[0]


Timestamp('1975-03-04 00:00:00')

In [31]:
ts = master.birthDate[0]


In [32]:
print(u"{}, {} {} {}".format(ts.day_name(), ts.day, ts.month_name(), ts.year))


Tuesday, 4 March 1975


In [34]:
tsz = ts.tz_localize('America/Toronto')
tsz


Timestamp('1975-03-04 00:00:00-0500', tz='America/Toronto')

In [35]:
tsz.tz_convert('Europe/London')


Timestamp('1975-03-04 05:00:00+0000', tz='Europe/London')

In [37]:
years = master.birthDate.dt.year
years.head()

playerID
aaltoan01    1975
abdelju01    1987
abidra01     1980
abrahth01    1947
actonke01    1958
Name: birthDate, dtype: int64

In [43]:
strings = master.birthDate.dt.strftime("%a-%d-%m-%Y")
strings.head()


playerID
aaltoan01    Tue-04-03-1975
abdelju01    Wed-25-02-1987
abidra01     Mon-24-03-1980
abrahth01    Sat-12-04-1947
actonke01    Tue-15-04-1958
Name: birthDate, dtype: object

In [44]:
strings = master.birthDate.dt.strftime('%Y-%m-%d')
strings.head()


playerID
aaltoan01    1975-03-04
abdelju01    1987-02-25
abidra01     1980-03-24
abrahth01    1947-04-12
actonke01    1958-04-15
Name: birthDate, dtype: object

In [45]:
dates = pd.to_datetime(strings)
dates.head()


playerID
aaltoan01   1975-03-04
abdelju01   1987-02-25
abidra01    1980-03-24
abrahth01   1947-04-12
actonke01   1958-04-15
Name: birthDate, dtype: datetime64[ns]

In [51]:
date = dates[0]
date.month_name()


'March'

In [52]:
date.year

1975

In [53]:
date.month


3

In [55]:
date.day_name()


'Tuesday'

In [56]:
strings = strings.str.replace('-', 'xx')
strings.head()


playerID
aaltoan01    1975xx03xx04
abdelju01    1987xx02xx25
abidra01     1980xx03xx24
abrahth01    1947xx04xx12
actonke01    1958xx04xx15
Name: birthDate, dtype: object

In [57]:
try:
    pd.to_datetime(strings)
except Exception as e:
    print(e)


Unknown string format: 1975xx03xx04


In [58]:
parsed_dates = pd.to_datetime(strings, format="%Yxx%mxx%d")
parsed_dates.head()


playerID
aaltoan01   1975-03-04
abdelju01   1987-02-25
abidra01    1980-03-24
abrahth01   1947-04-12
actonke01   1958-04-15
Name: birthDate, dtype: datetime64[ns]

In [60]:
strings_error = strings.copy()
strings_error.iloc[34] = "xx123"

try:
    pd.to_datetime(strings_error, format="%Yxx%mxx%d")
except Exception as e:
    print(e)


time data 'xx123' does not match format '%Yxx%mxx%d' (match)


In [61]:
parsed = pd.to_datetime(strings_error, format="%Yxx%mxx%d", errors="coerce")
parsed.iloc[34]


NaT

In [62]:
parsed.head()


playerID
aaltoan01   1975-03-04
abdelju01   1987-02-25
abidra01    1980-03-24
abrahth01   1947-04-12
actonke01   1958-04-15
Name: birthDate, dtype: datetime64[ns]

In [63]:
parsed = pd.to_datetime(strings_error, format="%Yxx%mxx%d", errors="ignore")
parsed.head()

playerID
aaltoan01    1975xx03xx04
abdelju01    1987xx02xx25
abidra01     1980xx03xx24
abrahth01    1947xx04xx12
actonke01    1958xx04xx15
Name: birthDate, dtype: object

In [64]:
birth_dates = pd.Series(data=master.index, index=master.birthDate)
birth_dates.head(2)


birthDate
1975-03-04    aaltoan01
1987-02-25    abdelju01
Name: playerID, dtype: object

In [65]:
birth_dates.index


DatetimeIndex(['1975-03-04', '1987-02-25', '1980-03-24', '1947-04-12',
               '1958-04-15', '1990-06-18', '1961-05-05', '1977-03-20',
               '1977-04-26', '1960-05-31',
               ...
               '1987-09-01', '1963-05-08', '1987-03-03', '1987-02-14',
               '1970-07-22', '1978-06-16', '1987-09-01', '1992-01-16',
               '1954-04-16', '1978-01-21'],
              dtype='datetime64[ns]', name='birthDate', length=4627, freq=None)

In [66]:
birth_dates = birth_dates.sort_index()
birth_dates.head(3)


birthDate
1940-01-27    harpete01
1940-03-22     keonda01
1940-10-03    ratelje01
Name: playerID, dtype: object

In [74]:
born_1980 = birth_dates['1980']
born_1980.head()


birthDate
1980-01-02    zalesmi01
1980-01-10    stanara01
1980-01-13    bouckty01
1980-01-13     ruppmi01
1980-01-15    papinju01
Name: playerID, dtype: object

In [76]:
birth_dates['1980-7']

birthDate
1980-07-03    miettan01
1980-07-05    huntetr01
1980-07-08    chouier01
1980-07-15    cheecjo01
1980-07-17    millery01
1980-07-19    tanabda01
1980-07-20    arsende01
1980-07-22    kalindm01
1980-07-24    jillsje01
1980-07-24    printda01
1980-07-31    fischji01
Name: playerID, dtype: object

In [79]:
birth_dates['1980-7-24']


birthDate
1980-07-24    jillsje01
1980-07-24    printda01
Name: playerID, dtype: object

In [86]:
birth_dates['1980-7-20': '1980-7-31']


birthDate
1980-07-20    arsende01
1980-07-22    kalindm01
1980-07-24    jillsje01
1980-07-24    printda01
1980-07-31    fischji01
Name: playerID, dtype: object

In [87]:
diff = birth_dates.index[1] - birth_dates.index[0]
diff


Timedelta('55 days 00:00:00')

In [88]:
birth_dates.index[10]


Timestamp('1945-09-08 00:00:00')

In [89]:
birth_dates.index[10] + diff


Timestamp('1945-11-02 00:00:00')

In [94]:
birth_dates.index - birth_dates.index[0]


TimedeltaIndex([    '0 days',    '55 days',   '250 days',   '755 days',
                 '1182 days',  '1508 days',  '1808 days',  '1829 days',
                 '1887 days',  '1975 days',
                ...
                '19179 days', '19218 days', '19245 days', '19267 days',
                '19283 days', '19294 days', '19308 days', '19406 days',
                '19434 days', '19440 days'],
               dtype='timedelta64[ns]', name='birthDate', length=4627, freq=None)

In [95]:
diff.components


Components(days=55, hours=0, minutes=0, seconds=0, milliseconds=0, microseconds=0, nanoseconds=0)

In [116]:
diff = birth_dates.index[1] - birth_dates.index[0]
diff = diff + '5 hours' - '10 minutes'
diff

Timedelta('55 days 04:50:00')

In [127]:
diff.round('10 D')


Timedelta('60 days 00:00:00')

In [131]:
diff.round('1H')


Timedelta('55 days 05:00:00')

In [132]:
birth_dates.head()
# birth_dates = birth_dates.to_period(freq="D")

birthDate
1940-01-27    harpete01
1940-03-22     keonda01
1940-10-03    ratelje01
1942-02-20    esposph01
1943-04-23    esposto01
Name: playerID, dtype: object

In [134]:
birth_dates = birth_dates.to_period(freq="D")

TypeError: unsupported Type PeriodIndex

In [137]:
birth_dates.head()

birthDate
1940-01-27    harpete01
1940-03-22     keonda01
1940-10-03    ratelje01
1942-02-20    esposph01
1943-04-23    esposto01
Freq: D, Name: playerID, dtype: object

In [138]:
birth_dates.index


PeriodIndex(['1940-01-27', '1940-03-22', '1940-10-03', '1942-02-20',
             '1943-04-23', '1944-03-14', '1945-01-08', '1945-01-29',
             '1945-03-28', '1945-06-24',
             ...
             '1992-07-31', '1992-09-08', '1992-10-05', '1992-10-27',
             '1992-11-12', '1992-11-23', '1992-12-07', '1993-03-15',
             '1993-04-12', '1993-04-18'],
            dtype='period[D]', name='birthDate', length=4627)

In [139]:
birth_dates.index[0]


Period('1940-01-27', 'D')

In [141]:
start = birth_dates.index[0].start_time
start

Timestamp('1940-01-27 00:00:00')

In [143]:
end = birth_dates.index[0].end_time
end

Timestamp('1940-01-27 23:59:59.999999999')

In [144]:
start < (start + pd.Timedelta('5H')) < end


True

In [151]:
start < (start + pd.Timedelta('23H59T59S')) < end


True

In [152]:
start < (start + pd.Timedelta('23H59T60S')) < end

False

In [156]:
birth_dates.index.year


Int64Index([1940, 1940, 1940, 1942, 1943, 1944, 1945, 1945, 1945, 1945,
            ...
            1992, 1992, 1992, 1992, 1992, 1992, 1992, 1993, 1993, 1993],
           dtype='int64', name='birthDate', length=4627)

In [157]:
birth_dates['1975'].head(2)


birthDate
1975-01-02    mroziri01
1975-01-03    vyshese01
Freq: D, Name: playerID, dtype: object

# Advanced Strategies


In [161]:
birth_dates.resample('1M')


<pandas.core.resample.PeriodIndexResampler object at 0x0000026A354041C0>

In [162]:
birth_dates.resample('1M').count().head()


birthDate
1940-01    1
1940-02    0
1940-03    1
1940-04    0
1940-05    0
Freq: M, Name: playerID, dtype: int64

In [164]:
birth_dates.resample('2M').count().head()


birthDate
1940-01    1
1940-03    1
1940-05    0
1940-07    0
1940-09    1
Freq: 2M, Name: playerID, dtype: int64

In [165]:
birth_dates.resample('1Q').count().head(2)


birthDate
1940Q1    2
1940Q2    0
Freq: Q-DEC, Name: playerID, dtype: int64

In [166]:
birth_dates.resample('W').count().head()


birthDate
1940-01-22/1940-01-28    1
1940-01-29/1940-02-04    0
1940-02-05/1940-02-11    0
1940-02-12/1940-02-18    0
1940-02-19/1940-02-25    0
Freq: W-SUN, Name: playerID, dtype: int64

In [167]:
from pandas.tseries.offsets import BDay
p = birth_dates.index[2]
print(p.to_timestamp())
print(p.to_timestamp() + BDay(7))


1940-10-03 00:00:00
1940-10-14 00:00:00


In [168]:
# shifting index by 1 day
birth_dates.shift(1, freq="D").head()


birthDate
1940-01-28    harpete01
1940-03-23     keonda01
1940-10-04    ratelje01
1942-02-21    esposph01
1943-04-24    esposto01
Freq: D, Name: playerID, dtype: object

In [169]:
# if we don't add freq, it will shift values instead of index.
birth_dates.shift(1).head()


birthDate
1940-01-27          NaN
1940-03-22    harpete01
1940-10-03     keonda01
1942-02-20    ratelje01
1943-04-23    esposph01
Freq: D, Name: playerID, dtype: object

In [175]:
team_splits = team_splits.sort_index(level='year')
team_splits.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,L,OL,T,W
name,year,month,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Boston Bruins,1980,Apr,2.0,,0.0,1.0
Boston Bruins,1980,Dec,6.0,,1.0,6.0
Boston Bruins,1980,Feb,4.0,,2.0,6.0
Boston Bruins,1980,Jan,4.0,,1.0,9.0
Boston Bruins,1980,Mar,4.0,,3.0,8.0


In [176]:
team_splits = team_splits.reset_index()
team_splits.head()


Unnamed: 0,name,year,month,L,OL,T,W
0,Boston Bruins,1980,Apr,2.0,,0.0,1.0
1,Boston Bruins,1980,Dec,6.0,,1.0,6.0
2,Boston Bruins,1980,Feb,4.0,,2.0,6.0
3,Boston Bruins,1980,Jan,4.0,,1.0,9.0
4,Boston Bruins,1980,Mar,4.0,,3.0,8.0


In [178]:
string_date = team_splits.apply(lambda x:
                                pd.Period(f"{x.year}-{x.month}"), axis=1)
string_date[0]


Period('1980-04', 'M')

In [179]:
team_splits = team_splits.assign(month = string_date)
team_splits = team_splits.drop('year', axis=1)
team_splits.head()


Unnamed: 0,name,month,L,OL,T,W
0,Boston Bruins,1980-04,2.0,,0.0,1.0
1,Boston Bruins,1980-12,6.0,,1.0,6.0
2,Boston Bruins,1980-02,4.0,,2.0,6.0
3,Boston Bruins,1980-01,4.0,,1.0,9.0
4,Boston Bruins,1980-03,4.0,,3.0,8.0


In [180]:
team_splits = team_splits.set_index(['month'])


In [181]:
team_splits.head()


Unnamed: 0_level_0,name,L,OL,T,W
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1980-04,Boston Bruins,2.0,,0.0,1.0
1980-12,Boston Bruins,6.0,,1.0,6.0
1980-02,Boston Bruins,4.0,,2.0,6.0
1980-01,Boston Bruins,4.0,,1.0,9.0
1980-03,Boston Bruins,4.0,,3.0,8.0


In [182]:
team_splits.index


PeriodIndex(['1980-04', '1980-12', '1980-02', '1980-01', '1980-03', '1980-11',
             '1980-10', '1980-04', '1980-12', '1980-02',
             ...
             '2011-03', '2011-11', '2011-10', '2011-04', '2011-12', '2011-02',
             '2011-01', '2011-03', '2011-11', '2011-10'],
            dtype='period[M]', name='month', length=5435)

In [183]:
quarter_sums = team_splits.groupby('name').resample("Q").sum()
quarter_sums.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,L,OL,T,W
name,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Anaheim Ducks,2006Q1,13.0,6.0,0.0,19.0
Anaheim Ducks,2006Q2,0.0,2.0,0.0,1.0
Anaheim Ducks,2006Q3,0.0,0.0,0.0,0.0
Anaheim Ducks,2006Q4,7.0,6.0,0.0,28.0
Anaheim Ducks,2007Q1,10.0,3.0,0.0,26.0


In [184]:
quarter_means = team_splits.groupby('name').resample("Q").mean()
quarter_means.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,L,OL,T,W
name,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Anaheim Ducks,2006Q1,4.333333,2.0,,6.333333
Anaheim Ducks,2006Q2,0.0,2.0,,1.0
Anaheim Ducks,2006Q3,,,,
Anaheim Ducks,2006Q4,2.333333,2.0,,9.333333
Anaheim Ducks,2007Q1,3.333333,1.0,,8.666667


In [185]:
team_splits.to_pickle(os.path.join('data', 'modified', 'team_splits_periods.pickle'))
birth_dates.to_pickle(os.path.join('data', 'modified', 'birth_dates.pickle'))

team_splits.to_csv(os.path.join('data', 'modified', 'team_splits_periods.csv'))
birth_dates.to_csv(os.path.join('data', 'modified', 'birth_dates.csv'))
