# TL;DR

1. The goal of the baseline is to establish a validation workflow and to have a reference point for future models.
2. The `Median` Model shows better results than the `Mean` Model. 

# Definitions

In [1]:
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, Any

import pandas as pd

from pltv.src.validate import Validator
from validate import Model

In [2]:
# Relative location of the data directory (one level up from the working directory).
data_path = Path('..') / 'data'
# Last expression is displayed: confirms the directory is reachable before loading anything.
data_path.exists()

True

In [3]:
# Names of the LTV target columns the baseline is fitted on and evaluated against.
targets = [
    'ltv_day1',
    'ltv_day7',
    'ltv_day30',
    'ltv_day180',
]

In [4]:
# Emit INFO-level records so the metrics logged during validation show up in the cell output.
logging.basicConfig(level=logging.INFO)

# Data

In [5]:
# Read the training split, print its column schema, then preview the first rows.
train = pd.read_feather(data_path / 'train.feather')
train.info(show_counts=True, verbose=True)
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6994 entries, 0 to 6993
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   user_id                 6994 non-null   object        
 1   location                6994 non-null   object        
 2   gender                  6994 non-null   object        
 3   acquisition_date        6994 non-null   datetime64[ns]
 4   acquisition_channel     6994 non-null   object        
 5   eCPM                    6994 non-null   float64       
 6   total_time_spent        6994 non-null   float64       
 7   avg_session_time        6994 non-null   float64       
 8   num_games_played        6994 non-null   int64         
 9   favorite_game_genre     6994 non-null   object        
 10  total_rewards           6994 non-null   int64         
 11  total_coins             6994 non-null   int64         
 12  cashouts                6994 non-null   float64 

Unnamed: 0,user_id,location,gender,acquisition_date,acquisition_channel,eCPM,total_time_spent,avg_session_time,num_games_played,favorite_game_genre,...,cashouts,logins_past_7_days,total_logins,avg_daily_logins,notifications_received,ads_watched,ltv_day1,ltv_day7,ltv_day30,ltv_day180
0,u000000,Australia,Female,2024-04-08,In-App Ads,1.341458,128.15037,30.236412,65,Sports,...,173.913497,12,668,7.571228,54,131,2.5,6.7,15.0,40.0
1,u000001,France,Other,2023-09-28,In-App Ads,1.203097,34.064418,9.079346,97,Puzzle,...,85.865826,8,794,7.062567,22,336,2.5,6.7,15.0,40.0
2,u000002,Brazil,Male,2024-04-24,Referral,0.395693,96.559973,31.423285,97,Puzzle,...,195.22845,13,520,4.848931,54,182,2.311357,6.7,14.578598,25.0
3,u000003,Australia,Female,2023-10-17,In-App Ads,0.560207,691.653864,19.581271,19,Puzzle,...,154.958251,14,720,2.716627,7,404,2.5,6.7,15.0,40.0
4,u000004,Germany,Female,2023-12-15,Email,3.318075,411.067262,19.680246,7,Action,...,81.093215,14,175,3.753371,13,66,2.194381,6.7,15.0,25.0


In [6]:
# Read the holdout split, print its column schema, then preview the first rows.
holdout = pd.read_feather(data_path / 'holdout.feather')
holdout.info(show_counts=True, verbose=True)
holdout.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3006 entries, 0 to 3005
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   user_id                 3006 non-null   object        
 1   location                3006 non-null   object        
 2   gender                  3006 non-null   object        
 3   acquisition_date        3006 non-null   datetime64[ns]
 4   acquisition_channel     3006 non-null   object        
 5   eCPM                    3006 non-null   float64       
 6   total_time_spent        3006 non-null   float64       
 7   avg_session_time        3006 non-null   float64       
 8   num_games_played        3006 non-null   int64         
 9   favorite_game_genre     3006 non-null   object        
 10  total_rewards           3006 non-null   int64         
 11  total_coins             3006 non-null   int64         
 12  cashouts                3006 non-null   float64 

Unnamed: 0,user_id,location,gender,acquisition_date,acquisition_channel,eCPM,total_time_spent,avg_session_time,num_games_played,favorite_game_genre,...,cashouts,logins_past_7_days,total_logins,avg_daily_logins,notifications_received,ads_watched,ltv_day1,ltv_day7,ltv_day30,ltv_day180
0,u000008,UK,Other,2024-06-09,Organic,3.314625,241.654864,36.662943,7,Strategy,...,64.641178,0,238,2.147985,11,153,2.5,6.7,15.0,40.0
1,u00000a,Australia,Other,2024-08-16,Referral,1.436349,197.26016,51.058547,67,Adventure,...,113.004473,13,392,7.490043,75,276,2.5,6.7,15.0,40.0
2,u00000b,Germany,Male,2024-07-11,Organic,3.324685,221.292736,59.881372,75,Action,...,123.754829,12,995,8.996471,61,275,2.5,6.7,15.0,40.0
3,u00000c,Germany,Female,2024-09-06,Social Media,1.884423,69.911627,32.670693,8,Adventure,...,188.125323,3,569,6.501001,38,51,2.5,6.7,15.0,40.0
4,u00000d,Brazil,Male,2024-06-05,In-App Ads,3.542221,93.525544,43.286937,90,Strategy,...,27.785314,5,712,3.789446,8,46,2.5,6.7,15.0,40.0


# Model

In [7]:
@dataclass(slots=True)
class MedianModel(Model):
    """Constant baseline that predicts the training-set median of each target.

    Attributes:
        name: Identifier of the model (defaults to ``'median_model'``).
        medians: Mapping from target column name to the median learned by
            ``fit``; empty until ``fit`` has been called.
    """

    name: str = 'median_model'

    medians: Dict[str, float] = field(default_factory=dict)

    def fit(self, df: pd.DataFrame) -> "MedianModel":
        """Learn the median of every column listed in the module-level ``targets``.

        Args:
            df: Frame that contains all ``targets`` columns.

        Returns:
            The model itself, so that construction and fitting can be chained.
        """
        self.medians = df[targets].median().to_dict()
        return self

    def predict(self, X: pd.DataFrame, target_name: str) -> pd.Series:
        """Predict the stored median of ``target_name`` for every row of ``X``.

        Args:
            X: Frame whose index is reused for the returned predictions.
            target_name: Target column to predict.

        Returns:
            A series named ``target_name``, aligned with ``X.index``, holding
            the same constant value in every position.

        Raises:
            KeyError: If no median is stored for ``target_name`` (the model
                was not fitted, or the target was not part of ``fit``).
        """
        if target_name not in self.medians:
            # Same exception type as the plain dict lookup, but with a hint
            # about the most likely cause instead of just the missing key.
            raise KeyError(
                f"no median stored for target {target_name!r}; "
                f"call fit() first (known targets: {sorted(self.medians)})"
            )
        return pd.Series(
            self.medians[target_name],
            index=X.index,
            name=target_name,
        )

    def get_params(self) -> Dict[str, Any]:
        """Return the learned medians keyed by target name.

        A shallow copy is returned so that callers mutating the result cannot
        alter the fitted state of the model.
        """
        return dict(self.medians)

In [8]:
# Fit the baseline on the training split; `fit` returns the model itself,
# so fitting in a separate statement leaves `model` bound to the same object.
# NOTE(review): the saved output of this cell shows name='avg_model', which does
# not match the 'median_model' default of MedianModel - re-run to refresh it.
model = MedianModel()
model.fit(train)
model

MedianModel(name='avg_model', medians={'ltv_day1': 2.3468810063212753, 'ltv_day7': 6.413217600618959, 'ltv_day30': 14.420333602156134, 'ltv_day180': 34.13457203148127})

# Validate

In [9]:
# Build the validator from the train and holdout frames loaded in the Data section.
validator = Validator(train, holdout)

In [10]:
# Log the model's metrics; the saved output shows train/holdout rmse, mae and wape
# per target, plus wandb login messages.
# NOTE(review): the saved log lines are labelled 'avg_model' rather than
# 'median_model' - re-run the notebook so the output matches this model.
validator.log_metrics(model)

INFO:pltv.src.validate:avg_model ltv_day1 metrics: {'train/rmse': np.float64(0.320559159885134), 'holdout/rmse': np.float64(0.31602825204750157), 'train/mae': np.float64(0.23754803879693492), 'holdout/mae': np.float64(0.2337043957604034), 'train/wape': np.float64(0.10121861234425784), 'holdout/wape': np.float64(0.09934513849499474)}
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33maapiskotin-ge[0m ([33mjustplay-case[0m). Use [1m`wandb login --relogin`[0m to force relogin


INFO:pltv.src.validate:avg_model ltv_day7 metrics: {'train/rmse': np.float64(0.6787742209520423), 'holdout/rmse': np.float64(0.6702245313668678), 'train/mae': np.float64(0.4699896753790561), 'holdout/mae': np.float64(0.4651857956821079), 'train/wape': np.float64(0.0732845358831573), 'holdout/wape': np.float64(0.07243338119095376)}


INFO:pltv.src.validate:avg_model ltv_day30 metrics: {'train/rmse': np.float64(1.12412351810831), 'holdout/rmse': np.float64(1.1083423047688106), 'train/mae': np.float64(0.8917486947961129), 'holdout/mae': np.float64(0.8786912623566866), 'train/wape': np.float64(0.06183967163303201), 'holdout/wape': np.float64(0.060832739833067806)}


INFO:pltv.src.validate:avg_model ltv_day180 metrics: {'train/rmse': np.float64(6.470518995886934), 'holdout/rmse': np.float64(6.44496398363376), 'train/mae': np.float64(6.026562757903527), 'holdout/mae': np.float64(6.006481423985642), 'train/wape': np.float64(0.176553048690501), 'holdout/wape': np.float64(0.1754211085726308)}
