In [1]:
import secrets
import pickle

import pandas as pd
import numpy as np

In [2]:
from tqdm.auto import tqdm

In [3]:
import xgboost as xgb

  import pandas.util.testing as tm


In [4]:
def generate_ids(size):
    """Return a list of ``size`` random identifiers.

    Each identifier comes from ``secrets.token_hex(8)``: 8 random bytes
    rendered as a 16-character lowercase hexadecimal string.

    Parameters
    ----------
    size : int
        Number of identifiers to produce. ``0`` (or a negative value)
        yields an empty list.

    Returns
    -------
    list of str
    """
    # A comprehension avoids shadowing the builtin `id` with a local name.
    return [secrets.token_hex(8) for _ in range(size)]

In [5]:
def generate_data(n, seed=None):
    """Build a synthetic per-session activity table with ``n`` rows.

    Columns
    -------
    session_long
        Random hex identifier from ``generate_ids``.
    f_sessions
        ``1000 * Exponential(scale=0.05)`` truncated to int, so it can be 0.
    f_view_sessions
        Random share of ``f_sessions``, below 80% of it, truncated to int.
    f_reply_sessions
        Random share of ``f_sessions``, below 10% of it, truncated to int.
    f_scroll_sessions
        Random share of ``f_sessions``, below 50% of it, truncated to int.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    seed : int, optional
        When given, the numeric columns are drawn from a dedicated
        ``np.random.RandomState(seed)`` and are reproducible. When ``None``
        (the default) the global ``np.random`` state is used, exactly as
        before this parameter existed. The ``session_long`` values come
        from ``secrets`` and are never reproducible.

    Returns
    -------
    pandas.DataFrame
        Frame with the five columns above, in that order.
    """
    # RandomState offers the same `exponential` / `rand` methods as the
    # module-level np.random functions, so both can be used interchangeably.
    rng = np.random if seed is None else np.random.RandomState(seed)

    df = pd.DataFrame()
    df['session_long'] = generate_ids(n)

    # Integer truncation turns every draw below 1 into 0 sessions.
    df['f_sessions'] = (1000 * rng.exponential(scale=0.05, size=n)).astype(int)

    # Draw order (view, reply, scroll) is kept so seeded output is stable.
    df['f_view_sessions'] = (0.8 * df['f_sessions'] * rng.rand(n)).astype(int)
    df['f_reply_sessions'] = (0.1 * df['f_sessions'] * rng.rand(n)).astype(int)
    df['f_scroll_sessions'] = (0.5 * df['f_sessions'] * rng.rand(n)).astype(int)

    return df

In [6]:
# Create the output directory for the generated parquet parts
# (-p: do not fail if it already exists).
!mkdir -p data

In [7]:
# Generate 25 independent synthetic parts of 100,000 rows each and write them
# as data/part-00000.parquet ... data/part-00024.parquet.
for i in tqdm(range(25)):
    df = generate_data(100000)
    # index=False: do not store the DataFrame index in the parquet file.
    df.to_parquet('data/part-%05d.parquet' % i, index=False)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




In [8]:
# Separate synthetic sample of 20,000 rows, used as the training frame.
df_train = generate_data(20000)

In [9]:
# Express each per-activity session count as a share of the total session count.
# Rows where f_sessions == 0 produce 0/0 = NaN in all three fraction columns.
# NOTE(review): XGBoost presumably treats NaN as a missing value -- confirm
# that this is the intended handling for zero-session rows.
for count_col, fraction_col in [
    ('f_view_sessions', 'f_views_fraction'),
    ('f_reply_sessions', 'f_replies_fraction'),
    ('f_scroll_sessions', 'f_scrolls_fraction'),
]:
    df_train[fraction_col] = df_train[count_col] / df_train['f_sessions']

In [10]:
# Feature matrix: the three fraction columns, in this order, as a NumPy array.
feature_columns = ['f_views_fraction', 'f_replies_fraction', 'f_scrolls_fraction']
X_train = df_train[feature_columns].values

In [11]:
# Display the feature matrix as a quick sanity check.
X_train

array([[0.        , 0.        , 0.36363636],
       [0.4       , 0.        , 0.4       ],
       [0.13513514, 0.02702703, 0.10810811],
       ...,
       [0.7       , 0.04      , 0.4       ],
       [0.52380952, 0.04761905, 0.42857143],
       [0.33333333, 0.        , 0.        ]])

In [12]:
# Synthetic binary labels, drawn independently of the features: sampling
# uniformly from [0, 1, 1] yields class 1 about two thirds of the time.
# The size is taken from X_train so labels and features cannot get out of
# sync if the number of training rows changes.
y_train = np.random.choice([0, 1, 1], size=len(X_train))

In [13]:
# Wrap the feature matrix and labels in a DMatrix for xgb.train.
dtrain = xgb.DMatrix(X_train, label=y_train)

In [14]:
# Number of boosting rounds handed to xgb.train.
n_trees = 100

# Booster configuration: only the objective is set, everything else is left
# at the library defaults.
params = {'objective': 'binary:logistic'}

In [15]:
# Fit the booster on the training DMatrix for n_trees boosting rounds.
model = xgb.train(params, dtrain, num_boost_round=n_trees)

In [16]:
# Persist the trained booster to model.pkl in the current working directory.
# NOTE(review): a pickled booster presumably needs a compatible xgboost
# version to load -- confirm with whoever consumes this file.
with open('model.pkl', 'wb') as f_out:
    pickle.dump(model, f_out)