# GOALS
Create a GBQ table containing the daily state of each user's engagement.
- generate the dates index
- generate the users × dates index
- fill last_login for each date/user key
- fill last_start_date (latest consumption start) for each date/user key
- fill last_completion_date (latest consumption completion) for each date/user key
- derive user_status from the time elapsed since each of these dates

# PACKAGES

In [31]:
import pandas as pd
from google.oauth2 import service_account
import pandas_gbq
import logging

# PARAMETERS

In [32]:
# Show pandas_gbq's debug output (query progress, job ids) in the notebook.
logger = logging.getLogger("pandas_gbq")
logger.setLevel(logging.DEBUG)
# Re-running this cell must not stack another StreamHandler on the same
# logger, otherwise every log line is printed once per execution of the cell.
if not any(
    isinstance(handler, logging.StreamHandler) for handler in logger.handlers
):
    logger.addHandler(logging.StreamHandler())

# Google Cloud project that is passed to every BigQuery read and write below.
project_id = "analytics-dev-308300"

# Service-account credentials loaded from a key file kept outside the notebook
# directory.
credentials = service_account.Credentials.from_service_account_file(
    "../keys/gcp_key.json",
)

# Notebook display: show up to 200 rows, and never truncate columns, line
# width or cell contents when a frame is rendered.
pd.options.display.max_rows = 200
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = None

# FUNCTIONS

## get data

In [33]:
def _read_gbq(query):
    """Run ``query`` on BigQuery with the module-level project and credentials."""
    # pandas.read_gbq is deprecated (pandas >= 2.2); call the pandas_gbq
    # function it delegated to instead.
    return pandas_gbq.read_gbq(
        query, project_id=project_id, credentials=credentials
    )


def get_data(group_id=1818):
    """
    (int)-->(list, df, df, df)
    Load the raw inputs of the engagement table for one group from BigQuery.

    Parameters
    ----------
    group_id : int, default 1818
        Group whose users are loaded. It is coerced with ``int`` before being
        interpolated into the SQL text, so only a number can reach the query.

    Returns
    -------
    users_lst : list
        Distinct user ids found in dtm_engagement.dim_users, in first-seen
        order.
    creation_df : DataFrame
        One row per distinct (user_id, created_at); created_at is converted to
        UTC datetimes.
    login_df : DataFrame
        user_id, last_login, extraction_date from dtm_engagement.hist_users,
        with one row kept per (user_id, extraction_date).
    consumption_df : DataFrame
        user_id, started_at, completed_at from
        dtm_engagement.ft_content_consumption; both timestamps are converted
        to UTC datetimes.
    """
    group_id = int(group_id)

    query = f"""
        SELECT DISTINCT
            user_id, created_at, extraction_date
        FROM
            dtm_engagement.dim_users
        WHERE
            group_id={group_id}
    """
    users_df = _read_gbq(query)

    # DISTINCT also covers extraction_date, so the same user can come back on
    # several rows; deduplicate so that each user yields one row per date
    # downstream.
    users_lst = users_df['user_id'].drop_duplicates().tolist()

    creation_df = users_df[['user_id', 'created_at']].drop_duplicates(
        ignore_index=True
    )
    creation_df['created_at'] = pd.to_datetime(
        creation_df['created_at'], utc=True
    )

    query = f"""
        SELECT DISTINCT
            user_id, last_login, extraction_date
        FROM
            dtm_engagement.hist_users
        WHERE
            group_id={group_id}
        ORDER BY
            extraction_date DESC
        """
    login_df = _read_gbq(query)

    # NOTE(review): when a user has several last_login values for the same
    # extraction_date, the row kept is simply the last one in the order the
    # query returned them; confirm that this is the intended one.
    login_df = login_df.drop_duplicates(
        subset=['user_id', 'extraction_date'], keep='last', ignore_index=True
    )

    query = f"""
        SELECT DISTINCT
            user_id,
            started_at,
            completed_at
        FROM
            dtm_engagement.ft_content_consumption
        WHERE
            group_id={group_id}
        """
    consumption_df = _read_gbq(query)

    for column in ("started_at", "completed_at"):
        consumption_df[column] = pd.to_datetime(
            consumption_df[column], utc=True
        )

    return users_lst, creation_df, login_df, consumption_df

## generate base (users and dates) data frame

In [34]:
def create_base_df(
    ls_users,
    creation_df,
    start_date="2021-08-16",
    end_date=None,
):
    """
    (iterable, df, date-like, date-like) --> df
    Create a dataframe with one row for each combination of user and date.

    Parameters
    ----------
    ls_users : iterable
        User ids to include.
    creation_df : DataFrame
        Must hold the columns user_id and created_at; it is left-joined on
        user_id.
    start_date : date-like, default "2021-08-16"
        First day of the range (included).
    end_date : date-like, optional
        Last day of the range (included, as pandas.date_range includes both
        bounds). When omitted, today's date is used; it is resolved on every
        call rather than once when the function is defined.

    Returns
    -------
    DataFrame
        Columns action_date ("YYYY-MM-DD" string), user_id and created_at.
        Rows whose created_at is later than midnight UTC of action_date are
        removed; rows without a created_at are kept.
    """
    if end_date is None:
        end_date = pd.Timestamp.today().strftime("%Y-%m-%d")

    dates_index = (
        pd.date_range(start=start_date, end=end_date, name="action_date")
        .strftime("%Y-%m-%d")
        .to_list()
    )

    # Cartesian product dates x users, with dates as the outer (slowest) level.
    base_df = pd.MultiIndex.from_product(
        [dates_index, ls_users], names=["action_date", "user_id"]
    ).to_frame(index=False)
    base_df = base_df.merge(creation_df, how='left', on='user_id')

    # Compare datetimes with datetimes instead of relying on pandas parsing
    # the action_date strings implicitly inside the comparison.
    day_start = pd.to_datetime(base_df['action_date'], utc=True)
    created_at = pd.to_datetime(base_df['created_at'], utc=True)
    created_later = created_at > day_start

    return base_df[~created_later]

## fill last_login_date for each date/user key

In [35]:
def update_last_login(base_df, logins_df):
    """
    (df,df)-->df
    Attach last_login to every date/user row of base_df.

    A login row matches when its extraction_date equals the row's action_date
    for the same user_id. Rows without a match are kept (left join) and get
    NaT as last_login. The extraction_date helper column is not returned.
    """
    merged = pd.merge(
        base_df,
        logins_df,
        how="left",
        left_on=["action_date", "user_id"],
        right_on=["extraction_date", "user_id"],
    )
    actions_df = merged.drop("extraction_date", axis=1)

    # Rows with no login record come back empty from the join; mark them NaT.
    actions_df["last_login"] = actions_df["last_login"].fillna(pd.NaT)

    return actions_df

## fill last_consumption_start_date for each date/user key

In [36]:
def max_date(consumption_df, reporting_date, user_id, date_of_interest):
    """
    (df, date-like str, user id, column name) --> timestamp
    Return the latest value of the date_of_interest column recorded for
    user_id that is not later than reporting_date at 23:59:59 UTC.
    The result is NaT when the user has no such value.
    """
    belongs_to_user = consumption_df["user_id"] == user_id
    end_of_day = pd.Timestamp(reporting_date + " 23:59:59", tz="UTC")
    on_or_before = consumption_df[date_of_interest] <= end_of_day

    candidates = consumption_df.loc[belongs_to_user & on_or_before, date_of_interest]
    return candidates.max()

## update consumption dates

In [37]:
def update_consumption_dates(actions_df,consumption_df):
    """
    (df,df)-->df
    Return a copy of actions_df enriched with consumption dates and status.

    Added columns, in this order:
    - last_start_date / last_completion_date: latest started_at / completed_at
      of the user up to the end of action_date (see max_date);
    - timedelta_since_last_login / _start / _completion: time elapsed between
      each of those dates and action_date at 23:59:59 UTC;
    - user_status: label returned by user_status for the three timedeltas.

    The input frame is not modified.
    """
    enriched = actions_df.copy()

    # Look up, row by row, the latest consumption event of each kind.
    for target_col, source_col in (
        ('last_start_date', 'started_at'),
        ('last_completion_date', 'completed_at'),
    ):
        enriched[target_col] = enriched.apply(
            lambda row, source_col=source_col: max_date(
                consumption_df, row['action_date'], row['user_id'], source_col
            ),
            axis=1,
        )

    # End of the reporting day, used as the reference point of every timedelta.
    day_end = pd.to_datetime(enriched['action_date'] + ' 23:59:59', utc=True)

    enriched['timedelta_since_last_login'] = day_end - pd.to_datetime(
        enriched['last_login'], utc=True
    )
    enriched['timedelta_since_last_start'] = day_end - enriched['last_start_date']
    enriched['timedelta_since_last_completion'] = (
        day_end - enriched['last_completion_date']
    )

    def _status_of(row):
        return user_status(
            row['timedelta_since_last_login'],
            row['timedelta_since_last_start'],
            row['timedelta_since_last_completion'],
        )

    enriched['user_status'] = enriched.apply(_status_of, axis=1)

    return enriched
    



## Set user status based on dates

In [38]:
def user_status(timedelta_since_last_login, timedelta_since_last_start,timedelta_since_last_completion):
    """
    (timedelta,timedelta,timedelta)--> str
    Return the engagement label of a user from the time elapsed since each
    kind of activity. The first activity that happened within the last
    7 days (inclusive) wins, checked in this order:
    completion -> '4.learner', start -> '3.consumer', login -> '2.curious'.
    If none qualifies the label is '1.missing'. A NaT timedelta never
    qualifies, because comparing NaT yields False.
    """
    recency_window = pd.Timedelta(days=7)
    ranked_signals = (
        ('4.learner', timedelta_since_last_completion),
        ('3.consumer', timedelta_since_last_start),
        ('2.curious', timedelta_since_last_login),
    )

    for label, elapsed in ranked_signals:
        if elapsed <= recency_window:
            return label

    return '1.missing'

# Inline sanity checks for user_status, one tuple per scenario:
# (since last login, since last start, since last completion, expected label).
_no_activity = pd.Timedelta(pd.NaT)
_status_cases = (
    # completion within 7 days wins over everything else
    (_no_activity, pd.Timedelta('2 days 13:41:36'), pd.Timedelta('1 days 11:29:23'), '4.learner'),
    # completion older than 7 days, start within 7 days
    (_no_activity, pd.Timedelta('1 days 13:41:36'), pd.Timedelta('7 days 11:29:23'), '3.consumer'),
    # no completion at all, start within 7 days
    (_no_activity, pd.Timedelta('1 days 13:41:36'), _no_activity, '3.consumer'),
    # only the login is within 7 days
    (pd.Timedelta('1 days 13:41:36'), pd.Timedelta('8 days 13:41:36'), pd.Timedelta('7 days 11:29:23'), '2.curious'),
    # nothing within 7 days
    (pd.Timedelta('7 days 13:41:36'), pd.Timedelta('8 days 13:41:36'), pd.Timedelta('10 days 11:29:23'), '1.missing'),
)

for _login, _start, _completion, _expected in _status_cases:
    assert user_status(
        timedelta_since_last_login=_login,
        timedelta_since_last_start=_start,
        timedelta_since_last_completion=_completion)==_expected

# DATA WRANGLING

In [28]:
# Pull the user list, the creation dates, the login history and the content
# consumption records from BigQuery (three queries, see get_data).
ls_users, df_creation, df_logins, df_consumption = get_data()

Requesting query... 
Query running...
Job ID: 2f4cf58e-d49e-4cc9-8dfc-4a0bdc79c335
Query done.
Processed: 163.8 KB Billed: 10.0 MB
Standard price: $0.00 USD

Got 45 rows.

Requesting query... 
Query running...
Job ID: 3d67a4c5-479f-4bec-994d-9b7fd16c65b9
Query done.
Processed: 42.9 KB Billed: 10.0 MB
Standard price: $0.00 USD

Got 220 rows.

Requesting query... 
Query running...
Job ID: 830914f2-6980-47a4-9a86-64790b5296df
Query done.
Cache hit.

Got 54 rows.



In [39]:
# Build the date x user grid first, then attach the last_login known for each
# (action_date, user_id) pair.
df_base = create_base_df(ls_users, df_creation)
df_actions = update_last_login(df_base, df_logins)

In [40]:
# Add the latest consumption dates, the elapsed-time columns and user_status.
df_actions_final=update_consumption_dates(df_actions,df_consumption)
# Bare expression: displays the resulting frame as the cell output.
df_actions_final

Unnamed: 0,action_date,user_id,created_at,last_login,last_start_date,last_completion_date,timedelta_since_last_login,timedelta_since_last_start,timedelta_since_last_completion,user_status
0,2021-08-16,20029,2021-07-25 00:11:04+00:00,NaT,2021-08-15 10:18:23+00:00,2021-08-15 12:30:36+00:00,NaT,1 days 13:41:36,1 days 11:29:23,4.learner
1,2021-08-16,20030,2021-07-25 02:42:03+00:00,NaT,NaT,NaT,NaT,NaT,NaT,1.missing
2,2021-08-16,20032,2021-07-25 21:00:47+00:00,NaT,2021-08-16 22:11:30+00:00,2021-08-13 17:43:00+00:00,NaT,0 days 01:48:29,3 days 06:16:59,4.learner
3,2021-08-16,20082,2021-07-31 23:54:33+00:00,NaT,NaT,NaT,NaT,NaT,NaT,1.missing
4,2021-08-16,20100,2021-08-03 11:05:51+00:00,NaT,2021-08-14 16:02:44+00:00,2021-08-14 16:04:27+00:00,NaT,2 days 07:57:15,2 days 07:55:32,4.learner
...,...,...,...,...,...,...,...,...,...,...
474,2021-08-27,20161,2021-08-12 11:30:06+00:00,2021-08-18T12:28:59+00:00,NaT,NaT,9 days 11:31:00,NaT,NaT,1.missing
475,2021-08-27,20165,2021-08-12 11:33:06+00:00,2021-08-18T13:56:50+00:00,NaT,NaT,9 days 10:03:09,NaT,NaT,1.missing
476,2021-08-27,20118,2021-08-06 12:29:27+00:00,2021-08-20T20:01:05+00:00,2021-08-23 21:14:03+00:00,2021-08-23 21:14:07+00:00,7 days 03:58:54,4 days 02:45:56,4 days 02:45:52,4.learner
477,2021-08-27,20293,2021-08-25 13:15:49+00:00,2021-08-25T20:38:20+00:00,NaT,NaT,2 days 03:21:39,NaT,NaT,2.curious


In [41]:
# Overwrite the engagement table with the freshly computed frame
# (if_exists='replace' drops the previous contents).
# pandas_gbq.to_gbq is called directly because DataFrame.to_gbq is deprecated
# in pandas >= 2.2.
pandas_gbq.to_gbq(
    df_actions_final,
    'dtm_engagement.ft_users_engagement',
    project_id=project_id,
    if_exists='replace',
    credentials=credentials,
)

479 out of 479 rows loaded.
479 out of 479 rows loaded.
1it [00:03,  3.90s/it]
