# GOALS
Create a GBQ table containing the daily state of each user's engagement:
- generate dates index
- generate users and dates index
- fill last_login_date for each date/user key
- fill last_consumption_start_date for each date/user key
- fill last_consumption_completion_date for each date/user key

# PACKAGES

In [1]:
import pandas as pd
from google.oauth2 import service_account
import pandas_gbq
import logging

# PARAMETERS

In [2]:
# Surface pandas-gbq's debug output in the notebook.
logger = logging.getLogger("pandas_gbq")
logger.setLevel(logging.DEBUG)
# Re-running this cell would otherwise stack one more StreamHandler on the
# same logger each time and duplicate every log line, so attach it only once.
if not any(isinstance(handler, logging.StreamHandler) for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

# GCP project id passed to the BigQuery upload at the end of the notebook.
project_id = "analytics-dev-308300"

# Service-account credentials read from a local JSON key file; the path is
# relative to the notebook's working directory.
credentials = service_account.Credentials.from_service_account_file(
    "../keys/gcp_key.json",
)


# Relax pandas' display limits: up to 200 rows, and no limit on the number of
# columns, the output width or the width of a cell.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Sample user id used by the ad-hoc max_date() check in the data-wrangling section.
user_id = 20129

# FUNCTIONS

## get data

In [4]:
def get_data():
    """
    () --> (list, df, df, df)
    Read the users and reports Excel extracts and return, in this order:
    - the list of distinct user ids,
    - a de-duplicated (user_id, created_at) frame,
    - an (extraction_date, user_id, last_login) frame,
    - the consumption reports frame.
    Every date column converted here is parsed as a UTC-aware datetime.
    """
    users = pd.read_excel('../data/in/users.xlsx')
    users['last_login'] = users['last_login'].fillna(pd.NaT)

    distinct_users = users['user_id'].unique().tolist()

    # One row per distinct (user_id, created_at) pair.
    creations = users[['user_id', 'created_at']].drop_duplicates(ignore_index=True)
    creations['created_at'] = pd.to_datetime(creations['created_at'], utc=True)

    # date_str is the day of the extract; expose it as extraction_date.
    logins = users[['date_str', 'user_id', 'last_login']].rename(
        columns={'date_str': 'extraction_date'}
    )
    logins['last_login'] = pd.to_datetime(logins['last_login'], utc=True)

    reports = pd.read_excel('../data/in/reports.xlsx')
    for column in ('started_at', 'completed_at'):
        reports[column] = pd.to_datetime(reports[column], utc=True)

    return distinct_users, creations, logins, reports

## generate base (users and dates) data frame

In [5]:
def create_base_df(
    ls_users,
    creation_df,
    start_date="2021-08-16",
    end_date=None,
):
    """
    (list, df, date-like, date-like) --> df
    Create a dataframe with one row for each combination of user and date.

    ls_users: iterable of user ids.
    creation_df: frame with 'user_id' and 'created_at' columns.
    start_date: first day of the range (included).
    end_date: last day of the range (included, as pd.date_range includes both
        endpoints). None means "today", resolved at call time.

    Returns a frame with columns action_date ("YYYY-MM-DD" string), user_id and
    created_at. Rows whose created_at is later than midnight UTC of
    action_date are removed.
    """
    # Resolve "today" on every call: a default computed in the signature is
    # frozen when the function is defined and goes stale in a long-lived kernel.
    if end_date is None:
        end_date = pd.Timestamp.today().strftime("%Y-%m-%d")

    dates_index = (
        pd.date_range(start=start_date, end=end_date, name="action_date")
        .strftime("%Y-%m-%d")
        .to_list()
    )

    # Explicit column names keep the frame mergeable even when there are no
    # users or no dates.
    base_df = pd.DataFrame(
        [(action_date, user) for action_date in dates_index for user in ls_users],
        columns=["action_date", "user_id"],
    )
    base_df = base_df.merge(creation_df, how="left", on="user_id")

    # Compare datetimes with datetimes instead of relying on implicit string
    # coercion. Users missing from creation_df get NaT, which compares False,
    # so their rows are kept.
    # NOTE(review): action_date is taken at 00:00 UTC here, so a user created
    # during a given day only appears from the following day - confirm this is
    # intended, since the other steps treat action_date as the end of the day.
    created_at = pd.to_datetime(base_df["created_at"], utc=True)
    action_day = pd.to_datetime(base_df["action_date"], utc=True)
    not_yet_created = (created_at > action_day).to_numpy()

    return base_df.drop(index=base_df.index[not_yet_created])

## fill last_login_date for each date/user key

In [6]:
def update_last_login(base_df, logins_df):
    """
    (df, df) --> df
    Left-join the login extract onto the user/date base: each
    (action_date, user_id) row receives the last_login of the extract taken on
    that same date. Rows without a matching extract keep a missing last_login.
    """
    joined = pd.merge(
        base_df,
        logins_df,
        how="left",
        left_on=["action_date", "user_id"],
        right_on=["extraction_date", "user_id"],
    )
    # extraction_date was only needed as a join key.
    joined = joined.drop(columns=["extraction_date"])

    joined["last_login"] = joined["last_login"].fillna(pd.NaT)

    return joined

## fill last_consumption_date for each date/user key

In [7]:
def max_date(consumption_df, reporting_date, user_id, date_of_interest):
    """
    (df, date-like, int, str) --> Timestamp
    Return the latest value of the date_of_interest column for user_id that
    falls on or before reporting_date, the whole reporting day (UTC) included.

    consumption_df: frame with a 'user_id' column and a UTC-aware
        date_of_interest column.
    reporting_date: day to report on, e.g. "2021-08-14".
    user_id: user whose rows are considered.
    date_of_interest: name of the datetime column to look at.

    Returns NaT when the user has no matching row.
    """
    # Use an exclusive bound at the next midnight rather than "<= 23:59:59":
    # events in the final second of the day (e.g. 23:59:59.500) would
    # otherwise be missed.
    next_midnight = pd.Timestamp(reporting_date, tz="UTC").normalize() + pd.Timedelta(days=1)

    is_user = consumption_df["user_id"] == user_id
    in_window = consumption_df[date_of_interest] < next_midnight

    return consumption_df.loc[is_user & in_window, date_of_interest].max()

In [8]:
def user_status(timedelta_since_last_login, timedelta_since_last_start,timedelta_since_last_completion):
    """
    (timedelta,timedelta,timedelta)--> str
    Rank a user by the most valuable action seen in the last 7 days:
    completion ('4.learner'), then start ('3.consumer'), then login
    ('2.curious'). A missing (NaT) delta never satisfies the comparison, so a
    user with no recent action at all is '1.missing'.
    """
    window = pd.Timedelta(7,'D')
    # Ordered from highest to lowest status; the first match wins.
    ladder = (
        (timedelta_since_last_completion, '4.learner'),
        (timedelta_since_last_start, '3.consumer'),
        (timedelta_since_last_login, '2.curious'),
    )
    for elapsed, label in ladder:
        if elapsed <= window:
            return label
    return '1.missing'

# Sanity checks for user_status; arguments are (login, start, completion).

# A completion within 7 days wins, whatever the other deltas are.
assert '4.learner' == user_status(
    pd.Timedelta(pd.NaT),
    pd.Timedelta('2 days 13:41:36'),
    pd.Timedelta('1 days 11:29:23'),
)

# Completion older than 7 days, start within 7 days.
assert '3.consumer' == user_status(
    pd.Timedelta(pd.NaT),
    pd.Timedelta('1 days 13:41:36'),
    pd.Timedelta('7 days 11:29:23'),
)

# No completion at all (NaT), start within 7 days.
assert '3.consumer' == user_status(
    pd.Timedelta(pd.NaT),
    pd.Timedelta('1 days 13:41:36'),
    pd.Timedelta(pd.NaT),
)

# Only the login is within 7 days.
assert '2.curious' == user_status(
    pd.Timedelta('1 days 13:41:36'),
    pd.Timedelta('8 days 13:41:36'),
    pd.Timedelta('7 days 11:29:23'),
)

# Every delta is older than 7 days.
assert '1.missing' == user_status(
    pd.Timedelta('7 days 13:41:36'),
    pd.Timedelta('8 days 13:41:36'),
    pd.Timedelta('10 days 11:29:23'),
)

In [9]:
def update_consumption_dates(actions_df,consumption_df):
    """
    (df,df)-->df
    Return a copy of actions_df enriched with, for every row:
    - last_start_date / last_completion_date: latest started_at / completed_at
      of the user, as returned by max_date for the row's action_date,
    - timedelta_since_last_login / _start / _completion: time elapsed between
      those dates and action_date at 23:59:59 UTC,
    - user_status: the label computed by user_status from the three deltas.
    """
    enriched = actions_df.copy()

    def latest(row, column):
        # Most recent value of `column` for this row's user and date.
        return max_date(consumption_df, row['action_date'], row['user_id'], column)

    enriched['last_start_date'] = enriched.apply(latest, axis=1, args=('started_at',))
    enriched['last_completion_date'] = enriched.apply(latest, axis=1, args=('completed_at',))

    # End of the action day, computed once and shared by the three deltas.
    day_end = pd.to_datetime(enriched['action_date'] + ' 23:59:59', utc=True)
    enriched['timedelta_since_last_login'] = day_end - pd.to_datetime(enriched['last_login'], utc=True)
    enriched['timedelta_since_last_start'] = day_end - enriched['last_start_date']
    enriched['timedelta_since_last_completion'] = day_end - enriched['last_completion_date']

    def classify(row):
        return user_status(
            row['timedelta_since_last_login'],
            row['timedelta_since_last_start'],
            row['timedelta_since_last_completion'],
        )

    enriched['user_status'] = enriched.apply(classify, axis=1)

    return enriched
    

# DATA WRANGLING

In [11]:
# Load the user id list and the creation, login and consumption frames from the Excel extracts.
ls_users, df_creation, df_logins, df_consumption = get_data()

In [12]:
# Build the user/date base over the default date range, then attach last_login to each row.
df_actions = update_last_login(create_base_df(ls_users,df_creation), df_logins)

In [13]:
# Ad-hoc check: latest started_at of the sample user as of a single reporting date.
reporting_date = "2021-08-14"
max_date(consumption_df=df_consumption, reporting_date=reporting_date, user_id=user_id, date_of_interest='started_at')

NaT

In [14]:
# Add the last start/completion dates, the elapsed-time columns and user_status, then display the result.
df_actions_final=update_consumption_dates(df_actions,df_consumption)
df_actions_final

Unnamed: 0,action_date,user_id,created_at,last_login,last_start_date,last_completion_date,timedelta_since_last_login,timedelta_since_last_start,timedelta_since_last_completion,user_status
0,2021-08-16,20029,2021-07-25 00:11:04+00:00,NaT,2021-08-15 10:18:23+00:00,2021-08-15 12:30:36+00:00,NaT,1 days 13:41:36,1 days 11:29:23,4.learner
1,2021-08-16,20030,2021-07-25 02:42:03+00:00,NaT,NaT,NaT,NaT,NaT,NaT,1.missing
2,2021-08-16,20032,2021-07-25 21:00:47+00:00,NaT,2021-08-16 22:11:30+00:00,2021-08-13 17:43:00+00:00,NaT,0 days 01:48:29,3 days 06:16:59,4.learner
3,2021-08-16,20082,2021-07-31 23:54:33+00:00,NaT,NaT,NaT,NaT,NaT,NaT,1.missing
4,2021-08-16,20100,2021-08-03 11:05:51+00:00,NaT,2021-08-14 16:02:44+00:00,2021-08-14 16:04:27+00:00,NaT,2 days 07:57:15,2 days 07:55:32,4.learner
...,...,...,...,...,...,...,...,...,...,...
429,2021-08-26,20293,2021-08-25 13:15:49+00:00,2021-08-25 20:38:20+00:00,2021-08-26 13:44:41+00:00,2021-08-26 13:45:07+00:00,1 days 03:21:39,0 days 10:15:18,0 days 10:14:52,4.learner
430,2021-08-26,20294,2021-08-25 13:16:33+00:00,NaT,NaT,NaT,NaT,NaT,NaT,1.missing
431,2021-08-26,20295,2021-08-25 13:17:01+00:00,NaT,NaT,NaT,NaT,NaT,NaT,1.missing
432,2021-08-26,20296,2021-08-25 13:17:25+00:00,NaT,NaT,NaT,NaT,NaT,NaT,1.missing


In [16]:
# DataFrame.to_gbq is deprecated since pandas 2.2 and only forwarded to
# pandas-gbq, so call pandas_gbq.to_gbq directly.
# if_exists='replace' overwrites the destination table on every run.
pandas_gbq.to_gbq(
    df_actions_final,
    'dtm_engagement.ft_users_engagement',
    project_id=project_id,
    if_exists='replace',
    credentials=credentials,
)

434 out of 434 rows loaded.
1it [00:06,  6.46s/it]
