# GOALS
Create a GBQ view containing a daily snapshot of each user's engagement.
- generate dates index
- generate users and dates index
- fill last_login_date for each date/user key
- fill last_consumption_start_date for each date/user key
- fill last_consumption_completion_date for each date/user key

# PACKAGES

In [1]:
import pandas as pd
from google.oauth2 import service_account
import pandas_gbq
import logging

# PARAMETERS

In [2]:
# Surface the pandas_gbq log messages (job id, bytes processed, row counts)
# in the notebook output.
logger = logging.getLogger("pandas_gbq")
logger.setLevel(logging.DEBUG)
# Re-running this cell must not stack another StreamHandler on the same
# logger, otherwise every message is printed once per execution of the cell.
if not any(
    isinstance(handler, logging.StreamHandler) for handler in logger.handlers
):
    logger.addHandler(logging.StreamHandler())

# Google Cloud project passed to every BigQuery read and write in this notebook.
project_id = "analytics-dev-308300"

# Service-account credentials read from a local key file; the path is relative
# to the directory the notebook is run from.
credentials = service_account.Credentials.from_service_account_file(
    "../keys/gcp_key.json",
)

# Display settings: show up to 200 rows and never truncate columns, line width
# or cell content when a data frame is rendered.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

# FUNCTIONS

## get data

In [3]:
def get_data(group_id=1818):
    """
    (int)-->list, df, df, df
    Load the users, login history and content consumption of one group from BigQuery.

    group_id: value matched against the group_id column in the three queries
    (defaults to 1818, the group that was previously hard-coded).

    Returns, in this order:
    - users_lst: distinct user_id values of dtm_engagement.dim_users, in first-seen order.
    - creation_df: distinct (user_id, user_name, created_at) rows, created_at as UTC datetimes.
    - login_df: (user_id, last_login, extraction_date) rows of dtm_engagement.hist_users,
      one row per user_id/extraction_date, last_login as UTC datetimes.
    - consumption_df: (user_id, set_id, started_at, completed_at) rows of
      dtm_engagement.ft_content_consumption, both dates as UTC datetimes.

    Relies on the notebook-level project_id and credentials.
    """
    # Coerce to int so that nothing but a number can end up in the SQL text.
    group_filter = int(group_id)

    def run_query(sql):
        # pandas_gbq.read_gbq is what pd.read_gbq delegates to; pd.read_gbq
        # itself is deprecated in recent pandas releases.
        return pandas_gbq.read_gbq(
            sql, project_id=project_id, credentials=credentials)

    users_df = run_query(f"""
        SELECT DISTINCT
            user_id, created_at, extraction_date,
            CONCAT(first_name, ' ', last_name) as user_name
        FROM
            dtm_engagement.dim_users
        WHERE
            group_id={group_filter}
    """)

    # extraction_date is part of the SELECT DISTINCT, so the same user can come
    # back on several rows; keep each id once so that the user/date grid built
    # from this list does not contain duplicated keys.
    users_lst = users_df['user_id'].drop_duplicates().tolist()

    creation_df = users_df[['user_id', 'user_name', 'created_at']].drop_duplicates(
        ignore_index=True)
    creation_df['created_at'] = pd.to_datetime(
        creation_df['created_at'], utc=True
    )

    login_df = run_query(f"""
        SELECT DISTINCT
            user_id, last_login, extraction_date
        FROM
            dtm_engagement.hist_users
        WHERE
            group_id={group_filter}
        ORDER BY
            extraction_date DESC
        """)

    # Several last_login values may exist for one user/extraction_date pair;
    # keep a single row per pair (the last one in the query result).
    login_df = login_df.drop_duplicates(
        subset=['user_id', 'extraction_date'], keep='last', ignore_index=True)
    login_df["last_login"] = pd.to_datetime(
        login_df["last_login"], utc=True
    )

    consumption_df = run_query(f"""
        SELECT DISTINCT
            user_id,
            set_id,
            started_at,
            completed_at
        FROM
            dtm_engagement.ft_content_consumption
        WHERE
            group_id={group_filter}
        """)

    # Same UTC normalisation for both consumption dates.
    for date_column in ("started_at", "completed_at"):
        consumption_df[date_column] = pd.to_datetime(
            consumption_df[date_column], utc=True
        )

    return users_lst, creation_df, login_df, consumption_df

## generate base (users and dates) data frame

In [4]:
def create_base_df(
    ls_users,
    creation_df,
    start_date="2021-08-16",
    end_date=None,
):
    """
    (list-like, df, date-like, date-like) --> df
    Create a dataframe with one row for each combination of user and date.

    ls_users: user ids; duplicated ids are used once.
    creation_df: dataframe with a user_id column and a created_at column; all
    of its columns are attached to the result with a left merge on user_id.
    start_date, end_date: bounds of the daily date range, both INCLUDED.
    end_date defaults to the day of the call.

    Rows whose created_at is later than the action date (taken at 00:00:00 UTC)
    are removed, so a user first appears on the day after the account creation.
    The index labels of the removed rows are not re-used (the index has gaps).
    """
    # Resolve "today" when the function is called: a default computed in the
    # signature is frozen when the cell is executed and goes stale in a
    # long-lived kernel.
    if end_date is None:
        end_date = pd.Timestamp.today().strftime("%Y-%m-%d")

    dates_index = (
        pd.date_range(start=start_date, end=end_date, name="action_date")
        .strftime("%Y-%m-%d")
        .to_list()
    )

    # One key per user: drop repeated ids while keeping their original order.
    users = list(dict.fromkeys(ls_users))

    # Naming the columns keeps the merge key available when there is no user.
    base_df = pd.DataFrame(
        [
            {"action_date": action_date, "user_id": user}
            for action_date in dates_index
            for user in users
        ],
        columns=["action_date", "user_id"],
    )
    base_df = base_df.merge(creation_df, how="left", on="user_id")

    # Compare real timestamps instead of relying on pandas coercing the
    # action_date strings during the comparison.
    day_start = pd.to_datetime(base_df["action_date"], utc=True)
    created_at = pd.to_datetime(base_df["created_at"], utc=True)
    not_created_yet = (created_at > day_start).to_numpy()

    return base_df.drop(index=base_df.index[not_created_yet])

## fill max date of interest for each date/user key

In [5]:
def max_date(consumption_df, reporting_date, user_id, date_of_interest):
    """
    (df, date like str, int, str) --> timestamp
    Return the latest value of the date_of_interest column for user_id that is not
    later than reporting_date at 23:59:59 UTC. The result is NaT when no row qualifies.
    """
    # End of the reporting day, so that the whole day is taken into account.
    cutoff = pd.Timestamp(reporting_date + " 23:59:59", tz="UTC")

    dates = consumption_df[date_of_interest]
    is_user = consumption_df["user_id"] == user_id
    is_not_after_cutoff = dates <= cutoff

    return dates[is_user & is_not_after_cutoff].max()

## Fill the number of completed sets by period

In [6]:
def calculate_nb_of_sets_of_interest(consumption_df,reporting_date,user_id,date_of_interest,nb_of_days):
    """
    (df, date like str, int, str, number)-->number
    For the user_id, count the rows with a set_id whose date_of_interest lies between
    reporting_date 23:59:59 UTC minus nb_of_days days and reporting_date 23:59:59 UTC
    (both bounds included).
    """
    window_end = pd.Timestamp(reporting_date + " 23:59:59", tz="UTC")
    window_start = window_end - pd.Timedelta(nb_of_days,'days')

    belongs_to_user = consumption_df['user_id'] == user_id
    falls_in_window = consumption_df[date_of_interest].between(window_start, window_end)

    # count() skips the rows where set_id is missing.
    return consumption_df.loc[belongs_to_user & falls_in_window, 'set_id'].count()

## Set user status based on dates

In [7]:
def user_status(timedelta_since_last_login, timedelta_since_last_start,timedelta_since_last_completion):
    """
    (timedelta,timedelta,timedelta)--> str
    Return the highest status whose elapsed time is at most 7 days:
    completion --> '4.learner', start --> '3.consumer', login --> '2.curious'.
    '1.missing' is returned when none qualifies; a NaT elapsed time never qualifies.
    """
    one_week = pd.Timedelta(7,'D')

    # Ordered from the most to the least engaged status: the first match wins.
    ranked_signals = (
        (timedelta_since_last_completion, '4.learner'),
        (timedelta_since_last_start, '3.consumer'),
        (timedelta_since_last_login, '2.curious'),
    )
    for elapsed, status in ranked_signals:
        if elapsed <= one_week:
            return status

    return '1.missing'

# Self-checks for user_status, evaluated as soon as the cell is run.
# Arguments are given in signature order: since last login, start, completion.
_no_record = pd.Timedelta(pd.NaT)

# A completion within 7 days gives the top status, even without a known login.
assert '4.learner' == user_status(
    _no_record,
    pd.Timedelta('2 days 13:41:36'),
    pd.Timedelta('1 days 11:29:23'),
)

# Completion older than 7 days: the recent start decides.
assert '3.consumer' == user_status(
    _no_record,
    pd.Timedelta('1 days 13:41:36'),
    pd.Timedelta('7 days 11:29:23'),
)

# No completion at all: the recent start still decides.
assert '3.consumer' == user_status(
    _no_record,
    pd.Timedelta('1 days 13:41:36'),
    _no_record,
)

# Start and completion older than 7 days: the recent login decides.
assert '2.curious' == user_status(
    pd.Timedelta('1 days 13:41:36'),
    pd.Timedelta('8 days 13:41:36'),
    pd.Timedelta('7 days 11:29:23'),
)

# Everything older than 7 days.
assert '1.missing' == user_status(
    pd.Timedelta('7 days 13:41:36'),
    pd.Timedelta('8 days 13:41:36'),
    pd.Timedelta('10 days 11:29:23'),
)

## create engagement df by completing base_df with calculated fields

In [8]:
def _add_recency_columns(engagement_df, events_df, event_column, label, end_of_day):
    """Add last_<label>_date, timedelta_since_last_<label> and days_since_last_<label> in place."""
    last_column = f'last_{label}_date'
    delta_column = f'timedelta_since_last_{label}'

    engagement_df[last_column] = engagement_df.apply(
        lambda row: max_date(events_df, row['action_date'], row['user_id'], event_column),
        axis=1,
    )
    # Normalise to UTC before subtracting: a column made only of NaT is expected
    # to be inferred as timezone-naive, which cannot be subtracted from the
    # timezone-aware end of day.
    engagement_df[delta_column] = end_of_day - pd.to_datetime(engagement_df[last_column], utc=True)
    engagement_df[f'days_since_last_{label}'] = engagement_df[delta_column].dt.days


def generate_engagement_df(base_df,consumption_df,login_df):
    """
    (df,df,df)-->df
    Return a copy of base_df completed, for each action_date/user_id row, with:
    - last_login_date, last_start_date, last_completion_date: latest last_login (login_df),
      started_at and completed_at (consumption_df) not later than the action date,
    - timedelta_since_last_* and days_since_last_*: time elapsed since those dates,
    - nb_of_completed_sets: sets completed during the 7 days up to the action date,
    - user_status: status derived from the three elapsed times.
    Action date is extended with 23:59:59 (UTC) to encompass the entire day.
    """
    engagement_df=base_df.copy()

    # Reference instant of every row, computed once and shared by the three signals.
    end_of_day = pd.to_datetime(engagement_df['action_date']+' 23:59:59',utc=True)

    _add_recency_columns(engagement_df, login_df, 'last_login', 'login', end_of_day)
    _add_recency_columns(engagement_df, consumption_df, 'started_at', 'start', end_of_day)
    _add_recency_columns(engagement_df, consumption_df, 'completed_at', 'completion', end_of_day)

    engagement_df['nb_of_completed_sets']=engagement_df.apply(
        lambda row: calculate_nb_of_sets_of_interest(
            consumption_df=consumption_df,
            reporting_date=row['action_date'],
            user_id=row['user_id'],
            date_of_interest='completed_at',
            nb_of_days=7,
        ),
        axis=1,
    )

    engagement_df['user_status']=engagement_df.apply(
        lambda row: user_status(
            row['timedelta_since_last_login'],
            row['timedelta_since_last_start'],
            row['timedelta_since_last_completion'],
        ),
        axis=1,
    )

    return engagement_df

# DATA WRANGLING

In [9]:
# Run the three BigQuery queries: user ids, creation dates, login history and
# content consumption.
ls_users, df_creation, df_login, df_consumption = get_data()

Requesting query... 
Query running...
Job ID: 38d0e4bd-b5e4-443f-9961-98ba59cb36ae
Query done.
Processed: 237.1 KB Billed: 10.0 MB
Standard price: $0.00 USD

Got 45 rows.

Requesting query... 
Query running...
Job ID: a4bdc547-a10e-4229-af0d-5392117ad28b
Query done.
Processed: 62.9 KB Billed: 10.0 MB
Standard price: $0.00 USD

Got 355 rows.

Requesting query... 
Query running...
Job ID: fe1c8887-588b-4a3b-ae05-a51f655d7953
Query done.
Processed: 19.2 KB Billed: 20.0 MB
Standard price: $0.00 USD

Got 264 rows.



In [10]:
# Build the user/date grid first, then complete it with the engagement fields.
df_base = create_base_df(ls_users, df_creation)
df_engagement = generate_engagement_df(df_base, df_consumption, df_login)

# Last expression of the cell: displays the resulting data frame.
df_engagement

Unnamed: 0,action_date,user_id,user_name,created_at,last_login_date,timedelta_since_last_login,days_since_last_login,last_start_date,timedelta_since_last_start,days_since_last_start,last_completion_date,timedelta_since_last_completion,days_since_last_completion,nb_of_completed_sets,user_status
0,2021-08-16,20029,JM Benedetto,2021-07-25 00:11:04+00:00,NaT,NaT,,NaT,NaT,,NaT,NaT,,0,1.missing
1,2021-08-16,20030,Aurelien Jacomy,2021-07-25 02:42:03+00:00,NaT,NaT,,NaT,NaT,,NaT,NaT,,0,1.missing
2,2021-08-16,20032,Martin Ciriani,2021-07-25 21:00:47+00:00,NaT,NaT,,NaT,NaT,,NaT,NaT,,0,1.missing
9,2021-08-16,20082,H Aluno,2021-07-31 23:54:33+00:00,NaT,NaT,,NaT,NaT,,NaT,NaT,,0,1.missing
10,2021-08-16,20100,Luana Amarante,2021-08-03 11:05:51+00:00,NaT,NaT,,NaT,NaT,,NaT,NaT,,0,1.missing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670,2021-08-30,20214,Jemerson Da Silva Claro,2021-08-13 12:59:59+00:00,2021-08-18 09:23:47+00:00,12 days 14:36:12,12.0,2021-08-23 15:39:26+00:00,7 days 08:20:33,7.0,2021-08-23 15:41:04+00:00,7 days 08:18:55,7.0,0,1.missing
671,2021-08-30,20215,Rafael Andrade de Jesus,2021-08-13 13:01:02+00:00,2021-08-16 10:30:41+00:00,14 days 13:29:18,14.0,2021-08-27 12:15:59+00:00,3 days 11:44:00,3.0,2021-08-27 12:16:21+00:00,3 days 11:43:38,3.0,1,4.learner
672,2021-08-30,20216,Paulo Sergio Massato Oshiro,2021-08-13 13:01:57+00:00,2021-08-16 11:46:31+00:00,14 days 12:13:28,14.0,2021-08-23 18:33:04+00:00,7 days 05:26:55,7.0,2021-08-23 18:36:01+00:00,7 days 05:23:58,7.0,0,1.missing
673,2021-08-30,20217,Marcelo Rezk da Silva,2021-08-13 13:02:57+00:00,2021-08-16 13:42:08+00:00,14 days 10:17:51,14.0,2021-08-16 13:53:39+00:00,14 days 10:06:20,14.0,2021-08-16 14:00:26+00:00,14 days 09:59:33,14.0,0,1.missing


In [11]:
# DataFrame.to_gbq only forwards to pandas_gbq.to_gbq and is deprecated in
# recent pandas releases, so call the library directly.
# if_exists='replace' overwrites the destination table on every run.
pandas_gbq.to_gbq(
    df_engagement,
    'raw_engagement.users_engagement',
    project_id=project_id,
    if_exists='replace',
    credentials=credentials,
)

614 out of 614 rows loaded.
1it [00:06,  6.95s/it]
