## Imports and constants

*Ideas: k-fold, transformers, feature generation*

In [None]:
import sys
import os
import warnings
os.environ['OPENBLAS_NUM_THREADS'] = '1'
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import time
import pyarrow.parquet as pq
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import pyarrow as pa

%matplotlib inline
sns.set_style('darkgrid')

!pip install feather-format >> none
!pip install faiss-cpu --no-cache

In [None]:
# Seed intended for train/test splitting.
SPLIT_SEED = 42
# Number of iterations passed to the ALS model.
FAISS_ITERATIONS = 10
# Number of folds used by the KFold cross-validation cells.
KFOLD_SPLITS = 5
# When True, the cells that build the ALS embeddings are executed.
EMBEDDING = True

# Input locations are defined unconditionally: the cells that load the dataset
# and the targets use them regardless of EMBEDDING, so keeping them behind
# `if (EMBEDDING):` led to a NameError whenever the flag was switched off.
LOCAL_DATA_PATH = '/kaggle/input/mts-ml-cookies'
DATA_FILE = 'dataset_full.feather'
TARGET_FILE = 'target_train.feather'
SUBMISSION_FILE = 'submission.feather'

In [None]:
if EMBEDDING:
    # Read the submission file into `id_to_submit`.
    id_to_submit = pd.read_feather('/'.join((LOCAL_DATA_PATH, SUBMISSION_FILE)))

## Showing data

In [None]:
# Preview the first rows of the submission frame, then print its schema summary.
display(id_to_submit.head(3))
# `info` has to be called: the bare attribute only echoed the bound method's repr.
id_to_submit.info()

In [None]:
# Load DATA_FILE from LOCAL_DATA_PATH into a pandas DataFrame.
data = pd.read_feather(f'{LOCAL_DATA_PATH}/{DATA_FILE}')

In [None]:
# Show the first rows, then summary statistics (the cell's last expression is rendered).
display(data.head())
data.describe()

In [None]:
# Convert the pandas frame to an Arrow table; later cells call the Arrow
# `select` / `group_by` API on `data`. The isinstance guard keeps this cell
# re-runnable after `data` has already been converted.
if isinstance(data, pd.DataFrame):
    data = pa.Table.from_pandas(data)
# Flat column labels: the former nested list `[['field', 'type']]` produced a
# MultiIndex header instead of two plain columns.
pd.DataFrame([(field.name, field.type) for field in data.schema], columns = ['field', 'type'])

In [None]:
# Frequency of each distinct value in the `cpe_type_cd` column.
data.select(['cpe_type_cd']).to_pandas()['cpe_type_cd'].value_counts()

In [None]:
# Load TARGET_FILE from LOCAL_DATA_PATH and preview its first rows.
targets = pd.read_feather(f'{LOCAL_DATA_PATH}/{TARGET_FILE}')
targets.head()

In [None]:
# Convert the targets frame to an Arrow table. The isinstance guard keeps this
# cell re-runnable after `targets` has already been converted.
if isinstance(targets, pd.DataFrame):
    targets = pa.Table.from_pandas(targets)
# Flat column labels: the former nested list `[['field', 'type']]` produced a
# MultiIndex header instead of two plain columns.
pd.DataFrame([(field.name, field.type) for field in targets.schema], columns = ['field', 'type'])

## Computing user <-> item embeddings

In [None]:
%%time
if EMBEDDING:
    # Sum request_cnt for every (user_id, url_host) pair.
    data_agg = (
        data.select(['user_id', 'url_host', 'request_cnt'])
        .group_by(['user_id', 'url_host'])
        .aggregate([('request_cnt', "sum")])
    )

    # Assign each distinct url host a dense integer index.
    url_set = set(data_agg.select(['url_host']).to_pandas()['url_host'])
    print(f'{len(url_set)} urls')
    url_dict = {host: position for position, host in enumerate(url_set)}

    # Assign each distinct user a dense integer index.
    usr_set = set(data_agg.select(['user_id']).to_pandas()['user_id'])
    print(f'{len(usr_set)} users')
    usr_dict = {user: position for position, user in enumerate(usr_set)}

In [None]:
%%time
if EMBEDDING:
    # Matrix entries: summed request counts.
    values = np.array(data_agg.select(['request_cnt_sum']).to_pandas()['request_cnt_sum'])
    # Row coordinates: dense user indices; column coordinates: dense url indices.
    rows = np.array(data_agg.select(['user_id']).to_pandas()['user_id'].map(usr_dict))
    cols = np.array(data_agg.select(['url_host']).to_pandas()['url_host'].map(url_dict))

    # Sparse users x urls matrix sized by the largest index on each axis.
    mat = scipy.sparse.coo_matrix(
        (values, (rows, cols)),
        shape=(rows.max() + 1, cols.max() + 1),
    )

    # ALS model configuration; training happens in a separate cell.
    als = implicit.approximate_als.FaissAlternatingLeastSquares(
        factors=50,
        iterations=FAISS_ITERATIONS,
        use_gpu=False,
        calculate_training_loss=True,
        regularization=0.1,
    )

In [None]:
%%time
# Train the ALS model on the sparse matrix whose rows are users and columns are urls.
# NOTE(review): which of `user_factors` / `item_factors` corresponds to the rows of
# `mat` depends on the orientation that this implicit version's fit() expects — confirm.
if (EMBEDDING):
    als.fit(mat)

In [None]:
if EMBEDDING:
    # Keep the model's user-side factor matrix and report its row count.
    u_factors = als.user_factors
    print(len(u_factors))

In [None]:

if EMBEDDING:
    # Keep the model's item-side factor matrix and report its row count.
    d_factors = als.item_factors
    print(len(d_factors))

## Получим оценку по полу

In [None]:
# (Re)load the submission file so `id_to_submit` starts from its on-disk contents.
id_to_submit = pd.read_feather(f'{LOCAL_DATA_PATH}/{SUBMISSION_FILE}')

In [None]:
if (EMBEDDING):
    # Dense index -> original user id (inverse of usr_dict).
    inv_usr_map = {v: k for k, v in usr_dict.items()}
    usr_emb = pd.DataFrame(d_factors)
    # Row i is labelled below with the user whose dense index is i. That is only
    # valid when the factor matrix has exactly one row per user, so fail loudly
    # instead of silently attaching embeddings to the wrong ids.
    # NOTE(review): whether `item_factors` is the user-side matrix depends on the
    # orientation expected by this implicit version's fit() — confirm.
    assert len(usr_emb) == len(usr_dict), (
        f'factor matrix has {len(usr_emb)} rows but there are {len(usr_dict)} users'
    )
    usr_emb['user_id'] = usr_emb.index.map(inv_usr_map)
    usr_targets = targets.to_pandas()
    # Keep only users that have both a target row and an embedding.
    df = usr_targets.merge(usr_emb, how = 'inner', on = ['user_id'])
    # Drop rows whose gender is the literal string 'NA' or otherwise missing.
    df = df[df['is_male'] != 'NA']
    df = df.dropna()
    df['is_male'] = df['is_male'].map(int)
    print(df['is_male'].value_counts())

## Model variants

In [None]:
%%time
clf = CatBoostClassifier()

# Features: every column except the id and the two targets.
x = np.array(df.drop(['user_id', 'age', 'is_male'], axis = 1))
y = np.array(df['is_male'])

kf = KFold(n_splits=KFOLD_SPLITS)

# Per-fold Gini (2 * ROC AUC - 1), collected so the folds can be averaged.
fold_gini = []
for train_index, test_index in kf.split(x):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(x_train, y_train, verbose = False)
    gini = 2 * m.roc_auc_score(y_test, clf.predict_proba(x_test)[:,1]) - 1
    fold_gini.append(gini)
    print(f'GINI по полу {gini:2.3f}')

print(f'Mean GINI over {len(fold_gini)} folds: {np.mean(fold_gini):2.3f}')

# Disabled alternative: a single hold-out split instead of K-fold.
# Kept as comments; it used to be a bare string literal, which is an
# expression that the cell evaluates rather than an inert comment.
# if (EMBEDDING):
#     x_train, x_test, y_train, y_test = train_test_split(
#         df.drop(['user_id', 'age', 'is_male'], axis = 1),
#         df['is_male'],
#         test_size = 0.33,
#         random_state = SPLIT_SEED
#     )

In [None]:
# Disabled scratch example (RandomForest on synthetic data). Written as comments
# so the cell no longer evaluates and echoes a bare string literal as its output.
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.datasets import make_classification
# X, y = make_classification(n_samples=1000, n_features=4,
#                             n_informative=2, n_redundant=0,
#                            random_state=0, shuffle=False)
# clf = RandomForestClassifier(max_depth=2, random_state=0)
# clf.fit(X, y)
# print(clf.predict([[0, 0, 0, 0]]))

In [None]:
# Print the column/dtype/non-null summary of the submission frame.
id_to_submit.info()

In [None]:
# Refit the classifier on all rows of `df` (no hold-out) with `is_male` as the target;
# the id and both target columns are excluded from the features.
clf.fit(df.drop(['user_id', 'age', 'is_male'], axis = 1), df['is_male'], verbose = False)

In [None]:
# Distinct user ids in the submission frame. `unique` has to be called: the bare
# attribute only echoed the bound method's repr.
id_to_submit['user_id'].unique()

In [None]:
# Build prediction features with the same columns the classifier was trained on:
# start from the id column only (so columns added to `id_to_submit` by earlier
# runs of this or later cells are never fed to the model), attach the embeddings,
# then drop the id itself because it was not a training feature.
sex_features = (
    id_to_submit[['user_id']]
    .merge(usr_emb, how = 'left', on = ['user_id'])
    .drop(columns = ['user_id'])
)
# Second column of predict_proba: probability of class 1.
sex_prediction = clf.predict_proba(sex_features)[:,1]

print(len(sex_prediction))
id_to_submit['is_male'] = sex_prediction

## Получим оценку по возрасту

In [None]:
def age_bucket(x):
    """Return the index (0..6) of the age bucket that `x` falls into.

    The index equals the number of boundaries strictly below `x`, so a value
    equal to a boundary belongs to the lower bucket (18 -> 0, 19 -> 1).
    """
    upper_bounds = (18, 25, 35, 45, 55, 65)
    bucket_index = bisect.bisect_left(upper_bounds, x)
    return bucket_index

In [None]:
# Rebuild the modelling frame for the age target: join targets with embeddings,
# drop rows whose age is the literal 'NA' or otherwise missing, then replace
# the raw age with its bucket index.
df = (
    usr_targets
    .merge(usr_emb, how = 'inner', on = ['user_id'])
    .loc[lambda frame: frame['age'] != 'NA']
    .dropna()
    .assign(age = lambda frame: frame['age'].map(age_bucket))
)
# Distribution of the bucket indices.
sns.histplot(df['age'], bins = 7)

In [None]:
# Features: every column except the id and the two targets; target: age bucket index.
x = np.array(df.drop(['user_id', 'age', 'is_male'], axis = 1))
y = np.array(df['age'])

kf = KFold(n_splits=KFOLD_SPLITS)

# Display names for bucket indices 0..6, matching the boundaries used by
# age_bucket() (a value equal to a boundary falls into the lower bucket).
age_bucket_names = ['<=18', '19-25', '26-35', '36-45', '46-55', '56-65', '66+']
# Passing the full label list keeps the report working when a fold happens to
# contain no sample of some bucket; without it the number of observed classes
# would not match the number of target names and the report would raise.
age_bucket_ids = list(range(len(age_bucket_names)))

for train_index, test_index in kf.split(x):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(x_train, y_train, verbose = False)

    print(m.classification_report(
        y_test,
        clf.predict(x_test),
        labels = age_bucket_ids,
        target_names = age_bucket_names,
    ))

In [None]:
# Refit the classifier on all rows of `df` (no hold-out) with the age bucket as the target;
# the id and both target columns are excluded from the features.
clf.fit(df.drop(['user_id', 'age', 'is_male'], axis = 1), df['age'], verbose = False)

In [None]:
# Build prediction features with the same columns the classifier was trained on:
# attach the embeddings to the ids, then drop the id because it was not a
# training feature.
age_features = (
    id_to_submit[['user_id']]
    .merge(usr_emb, how = 'left', on = ['user_id'])
    .drop(columns = ['user_id'])
)
# np.ravel flattens the prediction so that a column-shaped (n, 1) result is
# stored as a plain 1-D column; it leaves an already 1-D result unchanged.
id_to_submit['age'] = np.ravel(clf.predict(age_features))

## Сабмит

In [None]:
# Preview the final frame and the value distribution of both predicted columns.
display(id_to_submit.head())
for predicted_column in ('is_male', 'age'):
    print(id_to_submit[predicted_column].value_counts())

# Write the submission without the index column.
id_to_submit.to_csv('submission.csv', index = False)

In [None]:
# The original f-string had an empty replacement field (`{}`), which is a
# SyntaxError. No score is computed anywhere in the notebook, so use an explicit
# placeholder to be filled in by hand.
score = None  # TODO: set to the obtained score
print(f'Score: {score}')