# GOALS
Create a GBQ (Google BigQuery) view containing a daily snapshot of user engagement.
- generate dates index
- generate users and dates index
- fill last_login_date for each date/user key
- fill last_consumption_start_date for each date/user key
- fill last_consumption_completion_date for each date/user key

# PACKAGES

In [5]:
import pandas as pd
from google.oauth2 import service_account
import pandas_gbq
import logging

# PARAMETERS

In [6]:
# Surface pandas_gbq's log messages (query progress, job ids, row counts)
# in the notebook output.
logger = logging.getLogger("pandas_gbq")
logger.setLevel(logging.DEBUG)
# Re-running this cell must not stack handlers: every additional
# StreamHandler makes each log line print one more time.
if not any(
    isinstance(handler, logging.StreamHandler) for handler in logger.handlers
):
    logger.addHandler(logging.StreamHandler())

# BigQuery project that every query in this notebook runs against.
project_id = "analytics-dev-308300"

# Service-account credentials loaded from a key file relative to the notebook.
credentials = service_account.Credentials.from_service_account_file(
    "../keys/gcp_key.json",
)


# Pandas display settings, applied in the order listed.
for _option_name, _option_value in (
    ("display.max_rows", 200),
    ("display.max_columns", None),
    ("display.width", None),
    ("display.max_colwidth", None),
):
    pd.set_option(_option_name, _option_value)

# FUNCTIONS

## get data

In [7]:
def get_data(group_id=1818):
    """
    (int)-->(list, df, df)
    Load the raw engagement data of one group from BigQuery.

    group_id: id of the group to extract (defaults to 1818).

    Returns a tuple (users_list, login_df, consumption_df):
    - users_list: distinct user_id values found in dtm_engagement.dim_users
    - login_df: distinct (extraction_date, user_id, last_login) rows from
      dtm_engagement.dim_users
    - consumption_df: distinct (user_id, started_at, completed_at) rows from
      dtm_engagement.ft_content_consumption, with both timestamp columns
      converted to timezone-aware UTC datetimes

    Relies on the module-level `project_id` and `credentials`.
    """
    # The id is interpolated into the SQL text, so force it to be an integer.
    group_id = int(group_id)

    def run_query(sql):
        # pandas_gbq.read_gbq is what the deprecated pd.read_gbq wrapped.
        return pandas_gbq.read_gbq(
            sql, project_id=project_id, credentials=credentials
        )

    users_list = run_query(
        f"""
        SELECT DISTINCT
            user_id
        FROM
            dtm_engagement.dim_users
        WHERE
            group_id={group_id}
        """
    )["user_id"].tolist()

    login_df = run_query(
        f"""
        SELECT DISTINCT
            extraction_date, user_id, last_login
        FROM
            dtm_engagement.dim_users
        WHERE group_id={group_id}
        """
    )

    consumption_df = run_query(
        f"""
        SELECT DISTINCT user_id,started_at,completed_at
        FROM dtm_engagement.ft_content_consumption
        WHERE group_id={group_id}
        """
    )

    # Make both timestamps tz-aware (UTC) so they can be compared against
    # UTC cut-off timestamps later on.
    for column in ("started_at", "completed_at"):
        consumption_df[column] = pd.to_datetime(consumption_df[column], utc=True)

    return users_list, login_df, consumption_df

## generate base (users and dates) data frame

In [8]:
def create_base_df(
    ls_users,
    start_date="2021-08-16",
    end_date=None,
):
    """
    (list-like, date-like, date-like) --> df
    Create a dataframe with one row for each combination of date and user.

    ls_users: iterable of user ids.
    start_date: first day of the range (included).
    end_date: last day of the range (included). When None, today's date is
        used; it is resolved on every call, so a long-running kernel does not
        keep the date the function was defined on.

    Returns a dataframe with the columns "action_date" ("YYYY-MM-DD" strings)
    and "user_id", ordered by date and then by the order of ls_users. The two
    columns are present even when there is no row.
    """
    if end_date is None:
        end_date = pd.Timestamp.today().strftime("%Y-%m-%d")

    dates_index = (
        pd.date_range(start=start_date, end=end_date, name="action_date")
        .strftime("%Y-%m-%d")
        .to_list()
    )

    rows = [
        {"action_date": action_date, "user_id": user}
        for action_date in dates_index
        for user in ls_users
    ]
    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=["action_date", "user_id"])

## fill last_login_date for each date/user key

In [9]:
def update_last_login(actions_df, logins_df):
    """
    (df,df)-->df
    Left-join the login rows onto the user/date grid.

    Rows of actions_df are matched to rows of logins_df on
    (action_date, user_id) == (extraction_date, user_id). Every row of
    actions_df is kept; the remaining columns of logins_df are appended and
    the helper key "extraction_date" is removed from the result.

    NOTE(review): the keys are compared as-is, so "action_date" and
    "extraction_date" presumably need to share the same type - confirm.
    """
    merged = actions_df.merge(
        logins_df,
        left_on=["action_date", "user_id"],
        right_on=["extraction_date", "user_id"],
        how="left",
    )
    # The right-hand key duplicates action_date on matched rows.
    return merged.drop("extraction_date", axis=1)

## fill last_consumption_start_date for each date/user key

In [11]:
def max_start_date(consumption_df, reporting_date, user_id, date_of_interest):
    """
    (df, date-like str, int, str) --> timestamp
    Return the latest `date_of_interest` value of a user up to the end of
    the reporting day.

    consumption_df: dataframe with a "user_id" column and a tz-aware (UTC)
        datetime column named by `date_of_interest`.
    reporting_date: day of interest as a date-like string, e.g. "2021-08-14".
    user_id: user whose rows are considered.
    date_of_interest: name of the datetime column to look at, e.g.
        "started_at" or "completed_at".

    Returns the maximum matching timestamp, or NaT when the user has no row
    on or before the reporting day.
    """
    # Exclusive upper bound at the next midnight (UTC): unlike a
    # "23:59:59" cut-off it also keeps events in the day's last second.
    next_day = pd.Timestamp(reporting_date, tz="UTC").normalize() + pd.Timedelta(
        days=1
    )
    selected = (consumption_df["user_id"] == user_id) & (
        consumption_df[date_of_interest] < next_day
    )
    return consumption_df.loc[selected, date_of_interest].max()

In [4]:
reporting_date = "2021-08-14"
df_consumption_start = df_consumption[["user_id", "started_at"]]

# Scratch check for user 20029: consumption starts up to 23:59:59 UTC of the
# reporting day. The filter is built once and reused for both outputs.
_cutoff = pd.Timestamp(reporting_date + " 23:59:59", tz="UTC")
_is_user = df_consumption_start["user_id"] == 20029
_before_cutoff = df_consumption_start["started_at"] <= _cutoff
_user_starts = df_consumption_start[_is_user & _before_cutoff]

display(_user_starts)

# Latest start among the rows shown above (rendered as the cell's output).
_user_starts["started_at"].max()

NameError: name 'df_consumption' is not defined

user_id                             20029
started_at      2021-08-15 10:18:23+00:00
completed_at    2021-08-15 12:30:36+00:00
dtype: object

# DATA WRANGLING

In [12]:
# Run the three BigQuery extractions: list of user ids, login rows and
# content-consumption rows.
ls_users, df_logins, df_consumption = get_data()

Requesting query... 
Query running...
Job ID: 4e6eb7be-2de4-4353-b9b1-cd6765fa09f7
Query done.
Cache hit.

Got 30 rows.

Requesting query... 
Query running...
Job ID: b9317774-c5b0-41cd-be88-85940c3abd3c
Query done.
Cache hit.

Got 30 rows.

Requesting query... 
Query running...
Job ID: 8800d4e4-a8df-4a15-95ec-ce59ad919369
Query done.
Cache hit.

Got 0 rows.



In [97]:
# Build the date/user grid and attach the login columns to each row.
# The user list returned by get_data() is bound to `ls_users`.
df_actions = update_last_login(create_base_df(ls_users), df_logins)

In [180]:
    # Stand-alone re-run of the consumption query used inside get_data().
    # NOTE(review): unlike get_data(), this cell does not convert
    # started_at / completed_at to UTC datetimes afterwards.
    query = """
        SELECT DISTINCT user_id,started_at,completed_at
        FROM dtm_engagement.ft_content_consumption
        WHERE group_id=1818
        """
    # Binds the result to the notebook-level name `consumption_df`.
    consumption_df = pd.read_gbq(
        query=query, credentials=credentials, project_id=project_id
    )

Requesting query... 
Requesting query... 
Requesting query... 
Query running...
Query running...
Query running...
Job ID: 1346d194-8635-4e6f-8ac6-2f2505e43040
Job ID: 1346d194-8635-4e6f-8ac6-2f2505e43040
Job ID: 1346d194-8635-4e6f-8ac6-2f2505e43040
Query done.
Cache hit.

Query done.
Cache hit.

Query done.
Cache hit.

Got 0 rows.

Got 0 rows.

Got 0 rows.

