## A Worked Example: Assembling a Time Series Data Collection

In [3]:
import pandas as pd

path = "~/MLProjects/practical-timeseries-analysis/ch02/data/"
YearJoined = pd.read_csv(path+"year-joined.csv")
emails = pd.read_csv(path+"emails.csv")

In [4]:
YearJoined

Unnamed: 0,user,userStats,yearJoined
0,0,silver,2014
1,1,silver,2015
2,2,silver,2016
3,3,bronze,2018
4,4,silver,2018
...,...,...,...
995,995,bronze,2016
996,996,bronze,2018
997,997,bronze,2018
998,998,bronze,2017


In [5]:
YearJoined.groupby("user", as_index=False).count().groupby("userStats", as_index=False).count()

Unnamed: 0,userStats,user,yearJoined
0,1,1000,1000


In [6]:
emails

Unnamed: 0,emailsOpened,user,week
0,3.0,1.0,2015-06-29 00:00:00
1,2.0,1.0,2015-07-13 00:00:00
2,2.0,1.0,2015-07-20 00:00:00
3,3.0,1.0,2015-07-27 00:00:00
4,1.0,1.0,2015-08-03 00:00:00
...,...,...,...
25483,3.0,998.0,2018-04-30 00:00:00
25484,3.0,998.0,2018-05-07 00:00:00
25485,3.0,998.0,2018-05-14 00:00:00
25486,3.0,998.0,2018-05-21 00:00:00


In [7]:
emails[emails.emailsOpened < 1]

Unnamed: 0,emailsOpened,user,week


In [8]:
emails[emails.user==998]

Unnamed: 0,emailsOpened,user,week
25464,1.0,998.0,2017-12-04 00:00:00
25465,3.0,998.0,2017-12-11 00:00:00
25466,3.0,998.0,2017-12-18 00:00:00
25467,3.0,998.0,2018-01-01 00:00:00
25468,3.0,998.0,2018-01-08 00:00:00
25469,2.0,998.0,2018-01-15 00:00:00
25470,3.0,998.0,2018-01-22 00:00:00
25471,2.0,998.0,2018-01-29 00:00:00
25472,3.0,998.0,2018-02-05 00:00:00
25473,3.0,998.0,2018-02-12 00:00:00


In [9]:
(max(pd.to_datetime(emails[emails.user==998].week)) - min(pd.to_datetime(emails[emails.user==998].week))).days / 7

25.0

In [10]:
emails[emails.user==998].shape

(24, 3)

In [11]:
complete_idx = pd.MultiIndex.from_product(
    (
        set(emails.week), set(emails.user)
    )
)

all_email = emails.set_index(["week", "user"]).reindex(complete_idx, fill_value=0).reset_index()
all_email.columns = ["week", "user", "emailsOpened"]

In [12]:
all_email[all_email.user==998].sort_values("week")

Unnamed: 0,week,user,emailsOpened
59828,2015-02-09 00:00:00,998.0,0.0
73303,2015-02-16 00:00:00,998.0,0.0
54438,2015-02-23 00:00:00,998.0,0.0
69530,2015-03-02 00:00:00,998.0,0.0
17786,2015-03-09 00:00:00,998.0,0.0
...,...,...,...
37729,2018-04-30 00:00:00,998.0,3.0
77615,2018-05-07 00:00:00,998.0,3.0
16708,2018-05-14 00:00:00,998.0,3.0
17247,2018-05-21 00:00:00,998.0,3.0


In [13]:
cutoff_dates = emails.groupby("user").week.agg(["min", "max"]).reset_index()

In [14]:
for _, row in cutoff_dates.iterrows():
    usr = row["user"]
    start_date = row["min"]
    end_date = row["max"]
    
all_email.drop(
    all_email[all_email.user==usr][
        all_email.week<start_date
    ].index, inplace=True
)
all_email.drop(
    all_email[all_email.user==usr][
        all_email.week>end_date
    ].index, inplace=True
)

  all_email[all_email.user==usr][
  all_email[all_email.user==usr][


In [15]:
all_email

Unnamed: 0,week,user,emailsOpened
0,2015-03-30 00:00:00,1.0,0.0
1,2015-03-30 00:00:00,3.0,0.0
2,2015-03-30 00:00:00,5.0,0.0
3,2015-03-30 00:00:00,6.0,0.0
4,2015-03-30 00:00:00,9.0,0.0
...,...,...,...
93241,2015-04-13 00:00:00,987.0,0.0
93242,2015-04-13 00:00:00,991.0,0.0
93243,2015-04-13 00:00:00,992.0,0.0
93244,2015-04-13 00:00:00,993.0,0.0


## Constructing a Found Time Series

In [25]:
donations = pd.read_csv(path+"donations.csv")

In [26]:
donations["timestamp"] = pd.to_datetime(donations["timestamp"])
donations.set_index("timestamp", inplace=True)

agg_donations = donations.groupby("user", as_index=False).apply(lambda df: df["amount"].resample("W-MON").sum().dropna())

In [28]:
#TODO debug "user"
for usr, usr_email in all_email.groupby("user"):
    usr_donations = agg_donations[agg_donations["user"]==usr]
    usr_donations.set_index("timestamp", inplace=True)
    
    usr_email.set_index("week", inplace=True)
    usr_email = all_email[all_email["user"]==usr]
    usr_email.sort_values("week").set_index("week")
    
    df = pd.merge(
        usr_email, usr_donations,
        how="left", left_index=True, right_index=True
    )
    df.fillna(0)
    
    df["user"] = df["user_x"]
    merged_df = merged_df.append(df.reset_index(),
                                 [["user", "week", "emailsOpened", "amount"]])

KeyError: 'user'