Метрика соревнования — ROC-AUC – для определения пола, f1 weighted – для определения возраста. Все решения высчитываются по формуле - 2 * f1_weighted(по 6 возрастным бакетам) + gini по полу. Возрастные бакеты 18-24, 25-34, 35-44, 45-54, 55-64 65+.

# Обработка данных

In [1]:
import pandas as pd
%pip install polars -q
import polars as pl
import numpy as np
import time
%pip install pyarrow -q
import pyarrow as pa
import pyarrow.parquet as pq
import scipy
%pip install implicit -q
import implicit
import bisect
import sklearn.metrics as m
%pip install catboost -q
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

%matplotlib inline
sns.set_style('darkgrid')

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
LOCAL_DATA_PATH = './context_data/'
SPLIT_SEED = 42
DATA_FILE = 'competition_data_final_pqt'
TARGET_FILE = 'public_train.pqt'
SUBMISSION_FILE = 'submit_2.pqt'

In [3]:
id_to_submit = pq.read_table(f'{LOCAL_DATA_PATH}/{SUBMISSION_FILE}').to_pandas()
id_to_submit.head()

Unnamed: 0,user_id
221301,221301
31271,31271
211594,211594
253119,253119
192578,192578


In [4]:
id_to_submit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144724 entries, 221301 to 145315
Data columns (total 1 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   user_id  144724 non-null  int64
dtypes: int64(1)
memory usage: 2.2 MB


In [5]:
data = pd.read_feather(f'{LOCAL_DATA_PATH}/dataset_full.feather')

In [6]:
data.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098


In [7]:
data.describe()

Unnamed: 0,price,request_cnt,user_id
count,316310000.0,322899400.0,322899400.0
mean,33085.1,1.724197,207098.3
std,25835.92,1.213835,120058.4
min,90.0,1.0,0.0
25%,13974.0,1.0,102788.0
50%,21990.0,1.0,206887.0
75%,49990.0,2.0,311395.0
max,195657.0,16.0,415316.0


In [8]:
data = pa.Table.from_pandas(data)

In [9]:
pd.DataFrame([(z.name, z.type) for z in data.schema], columns = [['field', 'type']])

Unnamed: 0,field,type
0,region_name,"dictionary<values=string, indices=int8, ordere..."
1,city_name,"dictionary<values=string, indices=int16, order..."
2,cpe_manufacturer_name,"dictionary<values=string, indices=int8, ordere..."
3,cpe_model_name,"dictionary<values=string, indices=int16, order..."
4,url_host,"dictionary<values=string, indices=int32, order..."
5,cpe_type_cd,"dictionary<values=string, indices=int8, ordere..."
6,cpe_model_os_type,"dictionary<values=string, indices=int8, ordere..."
7,price,float
8,date,timestamp[ns]
9,part_of_day,"dictionary<values=string, indices=int8, ordere..."


In [10]:
data.select(['cpe_type_cd']).to_pandas()['cpe_type_cd'].value_counts()

smartphone    322781599
tablet            53768
plain             36116
phablet           27952
Name: cpe_type_cd, dtype: int64

In [12]:
targets = pd.read_feather(f'{LOCAL_DATA_PATH}/target_train.feather')
targets.head()
targets.describe()

Unnamed: 0,age,user_id
count,269970.0,270000.0
mean,38.79394,207547.794719
std,11.739916,119901.87531
min,14.0,0.0
25%,30.0,103651.5
50%,37.0,207456.5
75%,47.0,311565.25
max,91.0,415313.0


In [13]:
targets = pa.Table.from_pandas(targets)
pd.DataFrame([(z.name, z.type) for z in targets.schema], columns = [['field', 'type']])

Unnamed: 0,field,type
0,age,double
1,is_male,string
2,user_id,int64


In [14]:
%%time
data_agg = data.select(['user_id', 'url_host', 'request_cnt']).\
    group_by(['user_id', 'url_host']).aggregate([('request_cnt', "sum")])

CPU times: user 6.79 s, sys: 2.05 s, total: 8.84 s
Wall time: 13.8 s


In [16]:
pd.DataFrame([(z.name, z.type) for z in data_agg.schema], columns = [['field', 'type']])

Unnamed: 0,field,type
0,request_cnt_sum,int64
1,user_id,int32
2,url_host,"dictionary<values=string, indices=int32, order..."


In [17]:
url_set = set(data_agg.select(['url_host']).to_pandas()['url_host'])
print(f'{len(url_set)} urls')
url_dict = {url: idurl for url, idurl in zip(url_set, range(len(url_set)))}
usr_set = set(data_agg.select(['user_id']).to_pandas()['user_id'])
print(f'{len(usr_set)} users')
usr_dict = {usr: user_id for usr, user_id in zip(usr_set, range(len(usr_set)))}

199683 urls
415317 users


In [21]:
%%time
%pip install annoy -q
values = np.array(data_agg.select(['request_cnt_sum']).to_pandas()['request_cnt_sum'])
rows = np.array(data_agg.select(['user_id']).to_pandas()['user_id'].map(usr_dict))
cols = np.array(data_agg.select(['url_host']).to_pandas()['url_host'].map(url_dict))
mat = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(rows.max() + 1, cols.max() + 1))
als = implicit.approximate_als.AnnoyAlternatingLeastSquares(factors = 50, iterations = 30, use_gpu = False, \
       calculate_training_loss = False, regularization = 0.1)

Note: you may need to restart the kernel to use updated packages.
CPU times: user 602 ms, sys: 559 ms, total: 1.16 s
Wall time: 3.43 s


In [22]:
%%time
als.fit(mat)



  0%|          | 0/30 [00:00<?, ?it/s]

CPU times: user 42min 22s, sys: 21min 4s, total: 1h 3min 27s
Wall time: 35min 35s


In [23]:
u_factors = als.user_factors 
len(u_factors)

AttributeError: 'AnnoyModel' object has no attribute 'user_factors'

In [24]:
d_factors = als.item_factors
len(d_factors)

AttributeError: 'AnnoyModel' object has no attribute 'item_factors'

# Оценка по полу

In [25]:
%%time
inv_usr_map = {v: k for k, v in usr_dict.items()}
usr_emb = pd.DataFrame(d_factors)
usr_emb['user_id'] = usr_emb.index.map(inv_usr_map)
usr_targets = targets.to_pandas()
df = usr_targets.merge(usr_emb, how = 'inner', on = ['user_id'])
df = df[df['is_male'] != 'NA']
df = df.dropna()
df['is_male'] = df['is_male'].map(int)
df['is_male'].value_counts()

NameError: name 'd_factors' is not defined

In [26]:
%%time
x_train, x_test, y_train, y_test = train_test_split(\
    df.drop(['user_id', 'age', 'is_male'], axis = 1), df['is_male'], test_size = 0.33, random_state = SPLIT_SEED)
clf = CatBoostClassifier()
clf.fit(x_train, y_train, verbose = False)
print(f'GINI по полу {2 * m.roc_auc_score(y_test, clf.predict_proba(x_test)[:,1]) - 1:2.3f}')

NameError: name 'df' is not defined

In [None]:
clf.fit(df.drop(['user_id', 'age', 'is_male'], axis = 1), df['is_male'], verbose = False)

In [None]:
id_to_submit['user_id'].unique

In [None]:
len(clf.predict_proba(id_to_submit.merge(usr_emb, how = 'left', on = ['user_id']))[:,1])

In [None]:
id_to_submit['is_male'] = clf.predict_proba(id_to_submit.merge(usr_emb, how = 'left', on = ['user_id']))[:,1]

# Оценка по возрасту

In [None]:
def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

In [None]:
df = usr_targets.merge(usr_emb, how = 'inner', on = ['user_id'])
df = df[df['age'] != 'NA']
df = df.dropna()
df['age'] = df['age'].map(age_bucket)
sns.histplot(df['age'], bins = 7)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(\
    df.drop(['user_id', 'age', 'is_male'], axis = 1), df['age'], test_size = 0.33, random_state = SPLIT_SEED)

clf = CatBoostClassifier()
clf.fit(x_train, y_train, verbose = False)
print(m.classification_report(y_test, clf.predict(x_test), \
                            target_names = ['18-25','25-34', '35-44', '45-54', '55-65', '65+']))

In [None]:
clf.fit(df.drop(['user_id', 'age', 'is_male'], axis = 1), df['age'], verbose = False)

In [None]:
id_to_submit['age'] = clf.predict(id_to_submit[['user_id']].merge(usr_emb, how = 'left', on = ['user_id']))

# Обработка сабмита

id_to_submit.head()

In [None]:
id_to_submit.is_male.value_counts()

In [None]:
id_to_submit.age.value_counts()

In [None]:
id_to_submit.to_csv('submission.csv', index = False)