In [2]:
!pip install torchsummary

In [3]:
import torch
import torch.utils.data as data
import torch.optim as optim
import torch.nn as nn
from torchsummary import summary
import time
import numpy as np
import os
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from pathlib import Path
import pandas as pd
import math

from torchmetrics import MeanAbsolutePercentageError
from torchmetrics import MeanSquaredError
from torchmetrics import MeanAbsoluteError

In [5]:
# Read the raw export, drop rows with missing values and exact duplicates,
# then persist the cleaned copy and preview it.
ltv_frame = pd.read_csv("data.csv").dropna().drop_duplicates()
ltv_frame.to_csv("clean_data.csv")
ltv_frame.head()

In [7]:
# Rebinds `ltv_frame`, discarding whatever frame an earlier cell loaded.
# NOTE(review): presumably this is the file produced by the `to_csv('encoded_data.csv')`
# cell further down in a previous session — confirm the provenance.
ltv_frame = pd.read_csv("../input/iasa-ds-test/encoded_data.csv")

In [None]:
# Display the column index of the loaded frame.
ltv_frame.columns

In [None]:
def _average_ltv_by(frame, key, target='target_full_ltv_day30'):
    """Return the mean of ``target`` for every distinct value of ``key``, largest first.

    Args:
        frame: dataframe containing both the ``key`` and the ``target`` column.
        key: name of the column to group by.
        target: name of the column to average.

    Returns:
        A Series named ``'average_ltv'``, indexed by the values of ``key`` and sorted
        in descending order.
    """
    # `.mean()` is used directly: passing `np.mean` to `groupby.agg` is deprecated in
    # recent pandas releases and emits a FutureWarning.
    averages = frame.groupby(key)[target].mean().rename('average_ltv')
    return averages.sort_values(ascending=False)


income_per_country = _average_ltv_by(ltv_frame, 'country_code')
income_per_source = _average_ltv_by(ltv_frame, 'media_source')

In [None]:
# Thresholds on the per-group average LTV; each one is drawn as a red horizontal line.
country_categories = [0.1, 0.4]
country_count = len(income_per_country)
source_categories = [0.4]
source_count = len(income_per_source)


def _plot_average_ltv(axis, averages, thresholds, title, xlabel):
    """Draw ``averages`` as bars (in their given order) and mark every threshold.

    Args:
        axis: matplotlib Axes to draw on.
        averages: Series of average LTV values, one bar per entry.
        thresholds: y-values at which a red horizontal line is drawn.
        title: title of the panel.
        xlabel: label of the x axis.
    """
    bar_count = len(averages)
    axis.bar(range(bar_count), averages.values)
    for threshold in thresholds:
        axis.plot([-1, bar_count], [threshold, threshold], color="r")
    axis.set_xlim(-1, bar_count)
    axis.yaxis.set_ticks(np.arange(0, 1.5, 0.1))
    axis.grid()
    # A figure should be readable on its own when the notebook is skimmed.
    axis.set_title(title)
    axis.set_xlabel(xlabel)
    axis.set_ylabel("average target_full_ltv_day30")


fig, ax = plt.subplots(nrows=2, figsize=(20, 20))

_plot_average_ltv(
    ax[0], income_per_country, country_categories,
    title="Average LTV per country_code",
    xlabel="country_code rank (sorted by average LTV, descending)",
)
_plot_average_ltv(
    ax[1], income_per_source, source_categories,
    title="Average LTV per media_source",
    xlabel="media_source rank (sorted by average LTV, descending)",
)

In [None]:
def _ltv_bucket(value, thresholds):
    """Return how many leading entries of ``thresholds`` are reached by ``value``.

    Thresholds are checked in the given order and counting stops at the first one
    that ``value`` does not reach (``value >= threshold`` is false).
    """
    bucket = 0
    for threshold in thresholds:
        if value >= threshold:
            bucket += 1
        else:
            break
    return bucket


# One shared mapping from raw categorical value to its numeric code.
# NOTE(review): country codes and media sources share this dict, so a media source
# spelled exactly like a country code would overwrite that country's entry.
category_dict = {}
for averages, thresholds in (
    (income_per_country, country_categories),
    (income_per_source, source_categories),
):
    # `key` rather than `id`, so the builtin `id` is not shadowed at notebook scope.
    for key, average in averages.items():
        category_dict[key] = _ltv_bucket(average, thresholds)

# Platforms get fixed codes instead of LTV-derived buckets.
category_dict['android'] = 0
category_dict['ios'] = 1
print(category_dict)

In [9]:
# Metrics that come in day0/day1/day3/day7 variants.
_engagement_metrics = [
    'chapters_finished', 'chapters_opened', 'chapters_closed',
    'diamonds_received', 'diamonds_spent', 'tickets_spent',
]
# Metrics that only come in day0/day1/day3 variants.
_revenue_metrics = ['app_sub_ltv', 'app_iap_ltv', 'ad_ltv']

# Numeric feature columns, grouped by metric and ordered by day within each metric.
columns_float = (
    ['%s_day%d' % (metric, day) for metric in _engagement_metrics for day in (0, 1, 3, 7)]
    + ['%s_day%d' % (metric, day) for metric in _revenue_metrics for day in (0, 1, 3)]
)

# Categorical feature columns (converted to numbers through the category mapping).
columns_categorical = [
    'media_source',
    'country_code',
    'platform',
]

# Regression targets, one column per revenue component.
columns_target = [
    'target_sub_ltv_day30',
    'target_iap_ltv_day30',
    'target_ad_ltv_day30',
]

In [8]:
class LTV_data(Dataset):
    """Row-wise dataset over an LTV dataframe.

    Every item is a ``(feature, target)`` pair of float64 numpy arrays. The columns
    are selected through the module-level lists ``columns_float``,
    ``columns_categorical`` and ``columns_target``, which are looked up when an item
    is fetched.
    """

    def __init__(self, df, category_dict):
        """Store the frame and the mapping used to encode categorical strings.

        Args:
            df: dataframe holding the feature and target columns.
            category_dict: mapping from a raw categorical string to its numeric code.
        """
        self.df = df
        self.dict = category_dict

    def __getitem__(self, index):
        """Return ``(feature, target)`` for the row at position ``index``.

        ``feature`` is the numeric columns followed by the categorical codes.

        Raises:
            KeyError: if a categorical string is missing from the category mapping.
        """
        # A single positional lookup is shared by all three column groups.
        row = self.df.iloc[index]
        numeric = row[columns_float].values.astype(np.float64)
        categorical = row[columns_categorical].values
        # Only the first categorical value is inspected: if it is a string the whole
        # row is treated as raw and mapped, otherwise it is taken as already encoded.
        if isinstance(categorical[0], str):
            categorical = np.array([self.dict[key] for key in categorical])
        categorical = categorical.astype(np.float64)
        feature = np.concatenate([numeric, categorical])
        target = row[columns_target].values.astype(np.float64)
        return feature, target

    def __len__(self) -> int:
        """Number of rows in the wrapped dataframe."""
        return len(self.df)

    @staticmethod
    def create_dataset(df, category_dict, transform=None):
        """Build an ``LTV_data`` instance.

        ``transform`` is accepted for interface compatibility and is not used.
        """
        return LTV_data(df, category_dict)

    @staticmethod
    def loader(dataset, batch_size, num_workers=0, shuffle=True):
        """Wrap ``dataset`` in a ``DataLoader``.

        Args:
            dataset: any map-style dataset.
            batch_size: number of samples per batch.
            num_workers: number of worker processes used for loading.
            shuffle: whether to reshuffle each epoch; defaults to ``True``. Pass
                ``False`` for validation/test loaders that need a stable order.
        """
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
        )

In [40]:
val_set_size, test_set_size = 50000, 50000

all_data = LTV_data.create_dataset(ltv_frame, category_dict)
train_set_size = len(all_data) - (val_set_size + test_set_size)

# A dedicated, seeded generator makes the split identical on every run. Without it
# the validation/test membership changes between sessions, while the notebook also
# reloads weights from './model.pt' that were fitted on an earlier split.
split_generator = torch.Generator().manual_seed(123)
train_set, val_set, test_set = torch.utils.data.random_split(
    all_data,
    [train_set_size, val_set_size, test_set_size],
    generator=split_generator,
)

train_loader = LTV_data.loader(train_set, 64)
val_loader = LTV_data.loader(val_set, 64)
test_loader = LTV_data.loader(test_set, 64)

In [13]:
# Pull one batch from the training loader; `features.shape[1]` is used further down
# to size the model input.
features, targets = next(iter(train_loader))

In [43]:
# Number of passes over the training loader made by the training loop.
EPOCHS = 2
# Width of one feature vector, taken from the sample batch; read by LTV_model.
input_size = features.shape[1]
# Step size handed to the Adam optimizer.
learning_rate = 0.001

# Seed the global torch and numpy random number generators.
# NOTE(review): in file order this cell comes after the train/val/test split cell,
# so the split itself is not governed by this seed.
seed = 123
torch.manual_seed(seed)
np.random.seed(seed)

In [75]:
class LTV_model(nn.Module):
    """Small MLP mapping a feature vector to three outputs.

    Layout: Linear(in, 64) -> ReLU -> Linear(64, 32) -> ReLU -> dropout(0.25)
    -> Linear(32, 3) -> Threshold(0.3, 0). The final threshold replaces every
    output that is not greater than 0.3 with 0.
    """

    def __init__(self, in_features=None) -> None:
        """
        Args:
            in_features: width of the input vectors. When omitted, the notebook-level
                ``input_size`` is used, so ``LTV_model()`` keeps working as before.
        """
        super().__init__()
        if in_features is None:
            in_features = input_size
        # Attribute names are kept so previously saved state dicts still load.
        self.f1 = nn.Sequential(nn.Linear(in_features, 64), nn.ReLU(), nn.Linear(64, 32), nn.ReLU())
        self.f2 = nn.Linear(32, 3)
        self.f3 = nn.Threshold(0.3, 0)

    def forward(self, X):
        """Return the thresholded predictions for a batch ``X``."""
        X = self.f1(X)
        # The functional form defaults to training=True, so the module's mode has to
        # be passed explicitly; otherwise dropout stays active after model.eval().
        X = F.dropout(X, p=0.25, training=self.training)
        X = self.f2(X)
        X = self.f3(X)
        return X

In [76]:
# Fresh model instance with randomly initialised weights.
model = LTV_model()
# Disabled checkpoint restore, kept for reference (a separate cell performs the load).
#checkpoint = torch.load('./model.pt')
#model.load_state_dict(checkpoint['model_state_dict'])
# Adam over all model parameters, using the configured learning rate.
optimizer = optim.Adam(model.parameters(), lr = learning_rate)
# Mean squared error between the three predicted and the three target values.
criterion = nn.MSELoss()

In [77]:
# Restore weights from a previous run when a checkpoint is available. On a fresh
# environment the file does not exist yet (it is written by the training loop),
# so its absence must not abort a top-to-bottom run.
checkpoint_path = Path('./model.pt')
if checkpoint_path.exists():
    # map_location keeps the load working on CPU-only machines even if the file
    # was written from a GPU session.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
else:
    print("No checkpoint found at %s; keeping freshly initialised weights" % checkpoint_path)

In [92]:
def train(model, loader, opt, criterion):
    """Run one optimisation pass over ``loader`` and return epoch-level metrics.

    Args:
        model: network to optimise; it is switched to training mode.
        loader: sized iterable yielding ``(features, targets)`` batches.
        opt: optimizer that updates ``model``'s parameters.
        criterion: loss function called as ``criterion(output, targets)``.

    Returns:
        Tuple ``(rmse, mape, mae)`` of Python floats aggregated by the torchmetrics
        objects over every batch of the epoch. RMSE is the square root of the
        accumulated mean squared error.
    """
    denominator = 100  # progress is printed once per this many batches
    print('training to', len(loader) // denominator, end=':')

    # One metric object per epoch: its state accumulates across batches, so the
    # final value covers the whole epoch. Summing per-batch MSE values, taking the
    # root of that sum and dividing by the batch count does not yield an RMSE.
    mse_metric = MeanSquaredError()
    mape_metric = MeanAbsolutePercentageError()
    mae_metric = MeanAbsoluteError()

    model.train()

    for i, (features, targets) in enumerate(loader, start=1):
        if i % denominator == 0:
            print(i // denominator, end=';')
            # Line break after every 50 progress marks.
            if i % (denominator * 50) == 0:
                print('')

        opt.zero_grad()

        features = features.float()
        targets = targets.float()

        output = model(features)
        loss = criterion(output, targets)

        loss.backward()
        opt.step()

        # Metrics only need values, not the autograd graph.
        predictions = output.detach()
        mse_metric.update(predictions, targets)
        mape_metric.update(predictions, targets)
        mae_metric.update(predictions, targets)

    rmse = math.sqrt(mse_metric.compute().item())
    mape = mape_metric.compute().item()
    mae = mae_metric.compute().item()
    return rmse, mape, mae

In [93]:
def evaluate(model, loader, criterion):
    """Score ``model`` on ``loader`` without updating it and return epoch-level metrics.

    Args:
        model: network to score; it is switched to evaluation mode.
        loader: sized iterable yielding ``(features, targets)`` batches.
        criterion: accepted for signature parity with ``train``; not used here.

    Returns:
        Tuple ``(rmse, mape, mae)`` of Python floats aggregated by the torchmetrics
        objects over every batch. RMSE is the square root of the accumulated mean
        squared error.
    """
    denominator = 100  # progress is printed once per this many batches
    print('validation to', len(loader) // denominator, end=':')

    # One metric object per pass: its state accumulates across batches, so the
    # final value covers the whole loader. Summing per-batch MSE values, taking the
    # root of that sum and dividing by the batch count does not yield an RMSE.
    mse_metric = MeanSquaredError()
    mape_metric = MeanAbsolutePercentageError()
    mae_metric = MeanAbsoluteError()

    model.eval()

    with torch.no_grad():
        for i, (features, targets) in enumerate(loader, start=1):
            if i % denominator == 0:
                print(i // denominator, end=';')
            # Line break after every 50 progress marks.
            if i % (denominator * 50) == 0:
                print('')

            features = features.float()
            targets = targets.float()

            output = model(features)

            mse_metric.update(output, targets)
            mape_metric.update(output, targets)
            mae_metric.update(output, targets)

    rmse = math.sqrt(mse_metric.compute().item())
    mape = mape_metric.compute().item()
    mae = mae_metric.compute().item()
    return rmse, mape, mae

In [79]:
# Per-epoch metric histories, one independent list per (split, metric) pair.
train_rmse_list, train_mape_list, train_mae_list = [], [], []
val_rmse_list, val_mae_list, val_mape_list = [], [], []
test_rmse_list, test_mape_list, test_mae_list = [], [], []

In [95]:
# Train for EPOCHS passes; after each pass save a checkpoint, score the validation
# and test loaders, record all metrics and print a summary.
for epoch in range(EPOCHS):
    print("Epoch-%d: " % (epoch))

    train_start_time = time.monotonic()
    train_rmse, train_mape, train_mae = train(model, train_loader, optimizer, criterion)
    train_end_time = time.monotonic()

    # Overwrites the previous checkpoint, so only the latest epoch is kept on disk.
    torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': train_rmse,
            }, './model.pt')

    val_start_time = time.monotonic()
    val_rmse, val_mape, val_mae = evaluate(model, val_loader, criterion)
    val_end_time = time.monotonic()

    test_start_time = time.monotonic()
    test_rmse, test_mape, test_mae = evaluate(model, test_loader, criterion)
    test_end_time = time.monotonic()

    train_rmse_list.append(train_rmse)
    train_mape_list.append(train_mape)
    train_mae_list.append(train_mae)
    val_rmse_list.append(val_rmse)
    val_mape_list.append(val_mape)
    val_mae_list.append(val_mae)
    test_rmse_list.append(test_rmse)
    test_mape_list.append(test_mape)
    test_mae_list.append(test_mae)

    print("Training: RMSE = %.4f, MAPE = %.4f, MAE = %.4f, Time = %.2f seconds" % (train_rmse, train_mape, train_mae, train_end_time - train_start_time))
    print("Validation: RMSE = %.4f, MAPE = %.4f, MAE = %.4f, Time = %.2f seconds" % (val_rmse, val_mape, val_mae, val_end_time - val_start_time))
    # The test line reports the test pass's own duration, not the validation one.
    print("Test: RMSE = %.4f, MAPE = %.4f, MAE = %.4f, Time = %.2f seconds" % (test_rmse, test_mape, test_mae, test_end_time - test_start_time))
    print("")

In [98]:
# Print each metric history on its own line: train (rmse, mape, mae),
# validation (rmse, mae, mape), test (rmse, mape, mae).
for history in (
    train_rmse_list, train_mape_list, train_mae_list,
    val_rmse_list, val_mae_list, val_mape_list,
    test_rmse_list, test_mape_list, test_mae_list,
):
    print(history)

In [85]:
# Keep only the rows whose full 30-day LTV target is strictly positive and expose
# them through the same dataset/loader helpers as the main data.
positive_ltv_mask = ltv_frame["target_full_ltv_day30"] > 0
no_zeros_frame = ltv_frame[positive_ltv_mask]
no_zeros_set = LTV_data.create_dataset(df=no_zeros_frame, category_dict=category_dict)
no_zeros_loader = LTV_data.loader(dataset=no_zeros_set, batch_size=64)

In [86]:
# Score the model on the rows with a positive LTV target. The loader built for that
# purpose is used here, and the results get their own names so the validation
# metrics are not overwritten by figures from a different dataset.
no_zeros_start_time = time.monotonic()
no_zeros_rmse, no_zeros_mape, no_zeros_mae = evaluate(model, no_zeros_loader, criterion)
no_zeros_end_time = time.monotonic()

print("NO_ZEROS: RMSE = %.4f, MAPE = %.4f, MAE = %.4f, Time = %.2f seconds" % (no_zeros_rmse, no_zeros_mape, no_zeros_mae, no_zeros_end_time - no_zeros_start_time))

In [94]:
# Scoring the training loader is disabled; the lines are kept for reference.
#train_start_time = time.monotonic()
#train_rmse, train_mape, train_mae = evaluate(model, train_loader, criterion)
#train_end_time = time.monotonic()

val_start_time = time.monotonic()
val_rmse, val_mape, val_mae = evaluate(model, val_loader, criterion)
val_end_time = time.monotonic()

test_start_time = time.monotonic()
test_rmse, test_mape, test_mae = evaluate(model, test_loader, criterion)
test_end_time = time.monotonic()

print("")
#print("Training: RMSE = %.4f, MAPE = %.4f, MAE = %.4f, Time = %.2f seconds" % (train_rmse, train_mape, train_mae, train_end_time - train_start_time))
print("Validation: RMSE = %.4f, MAPE = %.4f, MAE = %.4f, Time = %.2f seconds" % (val_rmse, val_mape, val_mae, val_end_time - val_start_time))
# The test line reports the test pass's own duration, not the validation one.
print("Test: RMSE = %.4f, MAPE = %.4f, MAE = %.4f, Time = %.2f seconds" % (test_rmse, test_mape, test_mae, test_end_time - test_start_time))

In [None]:
# Encode every raw category value in one call: a flat dict passed to `replace` maps
# old value -> new value across the whole frame. This avoids one full-frame
# replacement (and copy) per key, and no longer rebinds the builtin `id`.
ltv_frame = ltv_frame.replace(category_dict)

In [None]:
# Preview the first rows of the frame after the category replacement.
ltv_frame.head()

In [None]:
# Persist the frame to the working directory. The row index is written as well,
# because `index=False` is not passed.
ltv_frame.to_csv('encoded_data.csv')

In [None]:
# Manual checkpoint: stores the model and optimizer state under the same keys and
# path as the save inside the training loop.
# NOTE(review): `epoch` and `train_rmse` are leftovers from the training-loop cell,
# so this cell only works after that loop has run at least once.
torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': train_rmse,
            }, './model.pt')

In [None]:
# Reload the saved weights into the model (when the file exists) and show the
# resulting parameters.
checkpoint_path = Path('./model.pt')
if checkpoint_path.exists():
    # map_location keeps the load working on CPU-only machines even if the file
    # was written from a GPU session.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
else:
    print("No checkpoint found at %s; showing the current weights" % checkpoint_path)
print(model.state_dict())

In [None]:
# Count the rows whose full 30-day LTV target is non-zero, and their share.
money = ltv_frame[['target_full_ltv_day30']]
# Vectorised comparison instead of a Python-level loop over `iterrows()`.
count = int((money['target_full_ltv_day30'] != 0.0).sum())
print(count)
print(count / len(money))