# GOALS
Create a GBQ view containing a daily snapshot of each user's engagement.
- generate dates index
- generate users and dates index
- fill last_login_date for each date/user key
- fill last_consumption_start_date for each date/user key
- fill last_consumption_completion_date for each date/user key

# PACKAGES

In [1]:
import pandas as pd
from google.oauth2 import service_account
import pandas_gbq
import logging

# PARAMETERS

In [2]:
# Surface pandas_gbq's progress messages (query status, row counts) in the notebook output.
logger = logging.getLogger("pandas_gbq")
logger.setLevel(logging.DEBUG)
# Re-running this cell must not stack another handler on the same logger,
# otherwise every message would be printed once per execution of the cell.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

# Project passed to every BigQuery read and write below.
project_id = "analytics-dev-308300"

# Service-account credentials, loaded from a key file relative to the notebook.
credentials = service_account.Credentials.from_service_account_file(
    "../keys/gcp_key.json",
)

# Display settings: show up to 200 rows and never truncate columns or cell content.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

# FUNCTIONS

## get data

In [62]:
def get_data(group_id=1818):
    """
    (int)-->(list, df, df, df)
    Load users, login history and content consumption of one group from BigQuery.

    group_id: identifier of the group to extract. Defaults to 1818, the value
        every query used before it became a parameter.

    Returns a tuple (users_lst, creation_df, login_df, consumption_df):
    - users_lst: user ids of the group, each listed once, in query order
    - creation_df: distinct (user_id, user_name, created_at) rows, created_at
      parsed as a UTC datetime
    - login_df: one row per (user_id, extraction_date), last_login parsed as a
      UTC datetime
    - consumption_df: distinct (user_id, set_id, started_at, completed_at) rows,
      both dates parsed as UTC datetimes

    Reads the notebook-level `credentials` and `project_id`.
    """
    # Only an integer may be interpolated into the SQL text below.
    group_id = int(group_id)

    def run_query(sql):
        # pandas_gbq.read_gbq replaces pd.read_gbq, which pandas deprecated in 2.2.
        return pandas_gbq.read_gbq(
            sql, project_id=project_id, credentials=credentials
        )

    users_df = run_query(f"""
        SELECT DISTINCT
            user_id, created_at, extraction_date,
            CONCAT(first_name, ' ', last_name) as user_name
        FROM
            dtm_engagement.dim_users
        WHERE
            group_id={group_id}
    """)

    # The DISTINCT above includes extraction_date, so the same user_id may come
    # back on several rows; keep each id once so downstream frames do not
    # contain repeated user/date keys.
    users_lst = users_df['user_id'].drop_duplicates().tolist()

    creation_df = users_df[['user_id', 'user_name', 'created_at']].drop_duplicates(
        ignore_index=True
    )
    creation_df['created_at'] = pd.to_datetime(creation_df['created_at'], utc=True)

    login_df = run_query(f"""
        SELECT DISTINCT
            user_id, last_login, extraction_date
        FROM
            dtm_engagement.hist_users
        WHERE
            group_id={group_id}
        ORDER BY
            extraction_date DESC
        """)

    # Keep a single row per user and extraction date.
    # NOTE(review): when several last_login values share the same key, which
    # one survives depends on the row order returned by BigQuery - confirm
    # whether the greatest last_login should be kept instead.
    login_df = login_df.drop_duplicates(
        subset=['user_id', 'extraction_date'], keep='last', ignore_index=True
    )
    login_df["last_login"] = pd.to_datetime(login_df["last_login"], utc=True)

    consumption_df = run_query(f"""
        SELECT DISTINCT
            user_id,
            set_id,
            started_at,
            completed_at
        FROM
            dtm_engagement.ft_content_consumption
        WHERE
            group_id={group_id}
        """)

    for date_column in ("started_at", "completed_at"):
        consumption_df[date_column] = pd.to_datetime(
            consumption_df[date_column], utc=True
        )

    return users_lst, creation_df, login_df, consumption_df

## generate base (users and dates) data frame

In [63]:
def create_base_df(
    ls_users,
    creation_df,
    start_date="2021-08-16",
    end_date=None,
):
    """
    (list, df, date-like, date-like) --> df
    Create a dataframe with one row for each combination of user and date.

    ls_users: user ids to report on.
    creation_df: frame with a `user_id` and a `created_at` column (plus any
        other user attributes); it is left-joined on `user_id`.
    start_date: first reporting date (included).
    end_date: last reporting date (included). When None, today's date is used;
        it is resolved on every call rather than once at definition time.

    Returns the user/date grid with `action_date` as a 'YYYY-MM-DD' string.
    Rows whose `created_at` is later than the start (00:00 UTC) of
    `action_date` are removed; rows without a `created_at` are kept.
    """
    if end_date is None:
        end_date = pd.Timestamp.today().strftime("%Y-%m-%d")

    dates_index = (
        pd.date_range(start=start_date, end=end_date, name="action_date")
        .strftime("%Y-%m-%d")
        .to_list()
    )

    # Naming the columns explicitly keeps the frame well-formed (and mergeable)
    # even when there is no user or no date to combine.
    base_df = pd.DataFrame(
        [(action_date, user) for action_date in dates_index for user in ls_users],
        columns=["action_date", "user_id"],
    )
    base_df = base_df.merge(creation_df, how='left', on='user_id')

    # Compare real timestamps instead of relying on pandas coercing the date
    # strings implicitly. action_date is taken at 00:00 UTC.
    # NOTE(review): this drops a user's own creation day unless the account was
    # created exactly at midnight - confirm this is intended.
    created_at = pd.to_datetime(base_df['created_at'], utc=True)
    day_start = pd.to_datetime(base_df['action_date'], utc=True)
    base_df = base_df.drop(index=base_df[created_at > day_start].index)

    return base_df

## fill last_consumption_start_date for each date/user key

In [64]:
def max_date(consumption_df, reporting_date, user_id, date_of_interest):
    """
    (df, date like str, int, str) --> timestamp
    Return the latest value of the `date_of_interest` column recorded for
    `user_id` on or before the end of `reporting_date` (23:59:59 UTC).
    The result is NaT when the user has no such value.
    """
    # The reporting day is considered up to its last second, in UTC.
    end_of_day = pd.Timestamp(reporting_date + " 23:59:59", tz="UTC")

    belongs_to_user = consumption_df["user_id"] == user_id
    not_after_cutoff = consumption_df[date_of_interest] <= end_of_day

    candidates = consumption_df.loc[
        belongs_to_user & not_after_cutoff, date_of_interest
    ]
    return candidates.max()

## Fill the number of completed sets by period

In [65]:
def calculate_nb_of_sets_of_interest(consumption_df,reporting_date,user_id,date_of_interest,nb_of_days):
    """
    (df, date like str, int, int, date like str)-->number
    For the user_id, count the number of set_ids where completed at is between reporting_date-number_of_days and reporting_date.
    """

    number_of_sets = consumption_df[(consumption_df['user_id']==user_id) & 
                                    (consumption_df[date_of_interest].between(pd.Timestamp(reporting_date + " 23:59:59", tz="UTC")-pd.Timedelta(nb_of_days,'days'),pd.Timestamp(reporting_date + " 23:59:59", tz="UTC")))]['set_id'].count()
    
    return number_of_sets

## Set user status based on dates

In [66]:
def user_status(timedelta_since_last_login, timedelta_since_last_start,timedelta_since_last_completion,
                activity_window=pd.Timedelta(7,'D')):
    """
    (timedelta,timedelta,timedelta[,timedelta])--> str
    Classify a user from the time elapsed since their last login, last content
    start and last content completion.

    The most engaged status whose elapsed time is within `activity_window`
    (bound included) wins:
    - '4.learner'  : a completion within the window
    - '3.consumer' : otherwise, a start within the window
    - '2.curious'  : otherwise, a login within the window
    - '1.missing'  : none of the above

    activity_window: maximum elapsed time for an event to count as recent.
        Defaults to 7 days, the threshold that used to be hard-coded.

    A missing elapsed time (NaT) never counts as recent.
    """
    # Ordered from the most to the least engaged status.
    ranked_signals = (
        (timedelta_since_last_completion, '4.learner'),
        (timedelta_since_last_start, '3.consumer'),
        (timedelta_since_last_login, '2.curious'),
    )
    for elapsed, status in ranked_signals:
        # Comparing NaT with a timedelta is False, so missing values fall through.
        if elapsed <= activity_window:
            return status
    return '1.missing'

# Sanity checks for user_status; arguments are (login, start, completion) deltas.

# A completion within 7 days makes a learner, even with an unknown login delta.
assert '4.learner' == user_status(
    pd.NaT,
    pd.Timedelta('2 days 13:41:36'),
    pd.Timedelta('1 days 11:29:23'),
)

# A completion just over 7 days old no longer counts; the recent start does.
assert '3.consumer' == user_status(
    pd.NaT,
    pd.Timedelta('1 days 13:41:36'),
    pd.Timedelta('7 days 11:29:23'),
)

# An unknown completion delta falls through to the recent start.
assert '3.consumer' == user_status(
    pd.NaT,
    pd.Timedelta('1 days 13:41:36'),
    pd.NaT,
)

# Only the login is recent.
assert '2.curious' == user_status(
    pd.Timedelta('1 days 13:41:36'),
    pd.Timedelta('8 days 13:41:36'),
    pd.Timedelta('7 days 11:29:23'),
)

# Nothing happened within the last 7 days.
assert '1.missing' == user_status(
    pd.Timedelta('7 days 13:41:36'),
    pd.Timedelta('8 days 13:41:36'),
    pd.Timedelta('10 days 11:29:23'),
)

## create engagement df by completing base_df with calculated fields

In [67]:
def generate_engagement_df(base_df,consumption_df,login_df):
    """
    (df,df,df)-->df
    Complete a copy of base_df (one row per user and action_date) with
    engagement fields. Every reporting day is extended to 23:59:59 UTC so that
    it covers the entire day.

    For each event kind - login (login_df.last_login), start
    (consumption_df.started_at) and completion (consumption_df.completed_at) -
    three columns are added:
    - last_<kind>_date: latest event of the user on or before the end of the day
    - timedelta_since_last_<kind>: end of the day minus that date
    - days_since_last_<kind>: whole days of that timedelta

    Then:
    - nb_of_completed_sets: completions counted over the 7 days up to the end
      of the day
    - user_status: label returned by user_status() from the three timedeltas

    base_df itself is not modified.
    """
    engagement_df = base_df.copy()

    # End of each reporting day, computed once and shared by the three deltas.
    day_end = pd.to_datetime(engagement_df['action_date'] + ' 23:59:59', utc=True)

    # (column label, source frame, source date column), in output column order.
    event_sources = (
        ('login', login_df, 'last_login'),
        ('start', consumption_df, 'started_at'),
        ('completion', consumption_df, 'completed_at'),
    )
    for label, events_df, event_col in event_sources:
        last_event = engagement_df.apply(
            lambda row, events_df=events_df, event_col=event_col: max_date(
                events_df, row['action_date'], row['user_id'], event_col
            ),
            axis=1,
        )
        # Coerce to UTC for all three kinds (previously only done for logins):
        # the subtraction below requires a tz-aware column, and a column made
        # only of NaT may not come back tz-aware from apply.
        last_event = pd.to_datetime(last_event, utc=True)
        elapsed = day_end - last_event

        engagement_df[f'last_{label}_date'] = last_event
        engagement_df[f'timedelta_since_last_{label}'] = elapsed
        engagement_df[f'days_since_last_{label}'] = elapsed.dt.days

    engagement_df['nb_of_completed_sets'] = engagement_df.apply(
        lambda row: calculate_nb_of_sets_of_interest(
            consumption_df=consumption_df,
            reporting_date=row['action_date'],
            user_id=row['user_id'],
            date_of_interest='completed_at',
            nb_of_days=7,
        ),
        axis=1,
    )

    engagement_df['user_status'] = engagement_df.apply(
        lambda row: user_status(
            row['timedelta_since_last_login'],
            row['timedelta_since_last_start'],
            row['timedelta_since_last_completion'],
        ),
        axis=1,
    )

    return engagement_df

# DATA WRANGLING

In [68]:
# Pull the raw inputs from BigQuery: the user id list plus the creation,
# login and consumption frames returned by get_data().
ls_users, df_creation, df_login, df_consumption = get_data()

Requesting query... 
Query running...
Job ID: 15ce3b3b-5c7d-4535-a840-e6f95ed91592
Query done.
Processed: 188.2 KB Billed: 10.0 MB
Standard price: $0.00 USD

Got 45 rows.

Requesting query... 
Query running...
Job ID: 9259bd06-8e1e-4057-b757-0e10c3ab808b
Query done.
Cache hit.

Got 265 rows.

Requesting query... 
Query running...
Job ID: 999df03e-344b-4c66-92ca-008f396b9d76
Query done.
Cache hit.

Got 360 rows.



In [69]:
# Build the user/date base frame (default date range), then complete it with
# the engagement fields computed from the consumption and login frames.
df_engagement = generate_engagement_df(create_base_df(ls_users,df_creation),df_consumption,df_login)
# Bare expression so that the notebook displays the resulting frame.
df_engagement

Unnamed: 0,action_date,user_id,user_name,created_at,last_login_date,timedelta_since_last_login,days_since_last_login,last_start_date,timedelta_since_last_start,days_since_last_start,last_completion_date,timedelta_since_last_completion,days_since_last_completion,nb_of_completed_sets,user_status
0,2021-08-16,20029,JM Benedetto,2021-07-25 00:11:04+00:00,NaT,NaT,,2021-08-15 10:18:23+00:00,1 days 13:41:36,1.0,2021-08-15 12:30:36+00:00,1 days 11:29:23,1.0,3,4.learner
1,2021-08-16,20030,Aurelien Jacomy,2021-07-25 02:42:03+00:00,NaT,NaT,,NaT,NaT,,NaT,NaT,,0,1.missing
2,2021-08-16,20032,Martin Ciriani,2021-07-25 21:00:47+00:00,NaT,NaT,,2021-08-16 22:11:30+00:00,0 days 01:48:29,0.0,2021-08-13 17:43:00+00:00,3 days 06:16:59,3.0,2,4.learner
8,2021-08-16,20082,H Aluno,2021-07-31 23:54:33+00:00,NaT,NaT,,NaT,NaT,,NaT,NaT,,0,1.missing
9,2021-08-16,20100,Luana Amarante,2021-08-03 11:05:51+00:00,NaT,NaT,,2021-08-14 16:02:44+00:00,2 days 07:57:15,2.0,2021-08-14 16:04:27+00:00,2 days 07:55:32,2.0,2,4.learner
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580,2021-08-28,20161,Lucas Santos Ribeiro,2021-08-12 11:30:06+00:00,2021-08-18 12:28:59+00:00,10 days 11:31:00,10.0,2021-08-26 19:11:59+00:00,2 days 04:48:00,2.0,2021-08-26 19:12:08+00:00,2 days 04:47:51,2.0,4,4.learner
581,2021-08-28,20165,Jose Carlos Pasquini Catozich,2021-08-12 11:33:06+00:00,2021-08-18 13:56:50+00:00,10 days 10:03:09,10.0,2021-08-27 12:19:06+00:00,1 days 11:40:53,1.0,2021-08-27 12:19:33+00:00,1 days 11:40:26,1.0,3,4.learner
582,2021-08-28,20118,Martin Ciriani,2021-08-06 12:29:27+00:00,2021-08-20 20:01:05+00:00,8 days 03:58:54,8.0,2021-08-28 01:04:47+00:00,0 days 22:55:12,0.0,2021-08-23 21:14:07+00:00,5 days 02:45:52,5.0,1,4.learner
583,2021-08-28,20293,Ricardo CRKS,2021-08-25 13:15:49+00:00,2021-08-25 20:38:20+00:00,3 days 03:21:39,3.0,2021-08-26 13:44:41+00:00,2 days 10:15:18,2.0,2021-08-26 13:45:07+00:00,2 days 10:14:52,2.0,8,4.learner


In [70]:
# Publish the engagement frame to BigQuery, replacing the table when it exists.
# pandas_gbq.to_gbq is called directly because DataFrame.to_gbq, which only
# forwarded to it, is deprecated since pandas 2.2.
pandas_gbq.to_gbq(
    df_engagement,
    'raw_engagement.users_engagement',
    project_id=project_id,
    if_exists='replace',
    credentials=credentials,
)

524 out of 524 rows loaded.
1it [00:05,  5.90s/it]
