In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import dask.dataframe as dd
import pyarrow.parquet as pq
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score, classification_report
import bisect
from sklearn.metrics import f1_score
from IPython.display import clear_output

In [2]:
# Read both parquet files through dask and materialise them as in-memory pandas
# DataFrames. Forward slashes keep the relative paths valid on Windows and POSIX alike.
data_train_age = dd.read_parquet('Dataset/age_data/data_train.parquet').compute()
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
data_train_age.info(verbose=True, show_counts=True)

data_test_age = dd.read_parquet('Dataset/age_data/data_test.parquet').compute()
data_test_age.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269999 entries, 0 to 269998
Data columns (total 3032 columns):
 #     Column                                            Non-Null Count   Dtype  
---    ------                                            --------------   -----  
 0     user_id                                           269999 non-null  int32  
 1     region_name                                       269999 non-null  object 
 2     region_count                                      269999 non-null  int8   
 3     city_name                                         269999 non-null  object 
 4     city_count                                        269999 non-null  int8   
 5     cpe_manufacturer_name                             269999 non-null  object 
 6     cpe_type_cd                                       269999 non-null  object 
 7     price                                             263050 non-null  float32
 8     holyday_fraction                                  269999 non

In [3]:
def age_bucket(x):
    """Return the ordinal age-bucket index (0..5) for age ``x``.

    The bucket upper edges are 25, 35, 45, 55 and 65, each edge inclusive:
    ``x <= 25`` -> 0, ``25 < x <= 35`` -> 1, ..., ``x > 65`` -> 5.
    """
    upper_edges = (25, 35, 45, 55, 65)
    # The number of edges strictly below x is the left insertion point of x
    # in the sorted edge sequence.
    return sum(1 for edge in upper_edges if edge < x)

# Label: ordinal age bucket for every training row.
target = data_train_age['age'].map(age_bucket)
# Model inputs: every column except the raw age and the row identifier.
features = data_train_age.drop(['age', 'user_id'], axis=1)
del data_train_age  # the raw training frame is not referenced again below

# Order the test rows by user_id with a fresh 0..n-1 index, then drop the
# identifier; a later cell assigns predictions positionally to a submission
# frame sorted the same way.
features_test = (
    data_test_age
    .sort_values(by='user_id')
    .reset_index(drop=True)
    .drop(columns='user_id')
)
del data_test_age

In [5]:
# Build the training pool from the full training set; every object-dtype
# column is declared as a categorical feature.
pool_train = Pool(
    data=features,
    label=target,
    cat_features=features.select_dtypes(include=['object']).columns.tolist(),
)

# Default hyper-parameters, trained on the GPU; verbose=True logs every iteration.
cat = CatBoostClassifier(task_type='GPU')
cat.fit(pool_train, verbose=True)

Learning rate set to 0.203367
0:	learn: 1.6656386	total: 416ms	remaining: 6m 55s
1:	learn: 1.5875874	total: 611ms	remaining: 5m 5s
2:	learn: 1.5310787	total: 798ms	remaining: 4m 25s
3:	learn: 1.4909901	total: 989ms	remaining: 4m 6s
4:	learn: 1.4611187	total: 1.18s	remaining: 3m 55s
5:	learn: 1.4388662	total: 1.36s	remaining: 3m 45s
6:	learn: 1.4209761	total: 1.55s	remaining: 3m 40s
7:	learn: 1.4053030	total: 1.73s	remaining: 3m 34s
8:	learn: 1.3932153	total: 1.91s	remaining: 3m 29s
9:	learn: 1.3832209	total: 2.1s	remaining: 3m 27s
10:	learn: 1.3749481	total: 2.28s	remaining: 3m 24s
11:	learn: 1.3679076	total: 2.43s	remaining: 3m 20s
12:	learn: 1.3618484	total: 2.62s	remaining: 3m 18s
13:	learn: 1.3564509	total: 2.78s	remaining: 3m 15s
14:	learn: 1.3521555	total: 2.95s	remaining: 3m 13s
15:	learn: 1.3487009	total: 3.1s	remaining: 3m 10s
16:	learn: 1.3447575	total: 3.25s	remaining: 3m 7s
17:	learn: 1.3409863	total: 3.41s	remaining: 3m 6s
18:	learn: 1.3379266	total: 3.56s	remaining: 3m 4s

<catboost.core.CatBoostClassifier at 0x20c41dec8b0>

In [10]:
# Weighted F1 measured on the same rows the model was fitted on (no hold-out
# set is involved here), reported doubled.
weighted_f1_train = f1_score(target, cat.predict(features), average='weighted')
print(f"2 * weightF1 train = {2*weighted_f1_train}")

2 * weightF1 train = 1.0599888872109278


In [7]:
# Predicted class label for every test row; the bare name on the last line
# echoes the array as the cell output.
pred = cat.predict(data=features_test)
pred

array([[2],
       [1],
       [1],
       ...,
       [1],
       [2],
       [2]], dtype=int64)

In [8]:
# Start from the existing submission file and replace its `age` column with
# this model's predictions.
submit = pd.read_csv('age_male.csv')
print(submit)

# features_test was built from the test frame sorted by user_id, so the
# submission has to be put in the same order before a positional assignment.
# NOTE(review): this assumes both files hold exactly the same set of user_ids —
# only the row count can be checked here because the ids were dropped from
# features_test.
submit = submit.sort_values('user_id', ignore_index=True)

# predict() returns multiclass labels as an (n, 1) column; flatten it to 1-D
# rather than relying on pandas to accept a 2-D array as a single column.
age_class_pred = np.asarray(cat.predict(features_test)).ravel()
if len(age_class_pred) != len(submit):
    raise ValueError(
        f"prediction count ({len(age_class_pred)}) does not match "
        f"submission rows ({len(submit)})"
    )

# The model's labels are the 0-based bucket indices produced by age_bucket;
# the submission stores them shifted up by one.
submit['age'] = age_class_pred + 1
submit.to_csv('submit_IIntegration.csv', index=False)
submit

        user_id   is_male  age
0             6  0.330467    2
1            11  0.725477    5
2            19  0.240190    1
3            27  0.536798    2
4            32  0.471325    3
...         ...       ...  ...
144719   161841  0.526730    3
144720   179608  0.314178    3
144721   378281  0.995775    3
144722   169278  0.089454    1
144723   135060  0.174740    2

[144724 rows x 3 columns]


Unnamed: 0,user_id,is_male,age
0,6,0.330467,3
1,7,0.604114,2
2,9,0.144574,2
3,10,0.024849,3
4,11,0.725477,5
...,...,...,...
144719,415306,0.511746,2
144720,415310,0.539346,3
144721,415314,0.517424,2
144722,415315,0.539346,3


In [12]:
# NOTE(review): this cell writes placeholder values to 'submit_IIntegration.csv',
# the same file the model-prediction cell writes; running it afterwards replaces
# the real predictions with -1. Confirm that is intended.
submit = (
    dd.read_parquet('submit_2.pqt')
    .compute()
    .sort_values('user_id', ignore_index=True)
)

# ignore_index=True leaves a 0..n-1 index, so a scalar broadcast fills every
# row exactly as an index-aligned constant Series would.
submit['age'] = -1        # integer placeholder
submit['is_male'] = -1.   # float placeholder

print(submit)
submit.to_csv('submit_IIntegration.csv', index=False)

        user_id  age  is_male
0             6   -1     -1.0
1             7   -1     -1.0
2             9   -1     -1.0
3            10   -1     -1.0
4            11   -1     -1.0
...         ...  ...      ...
144719   415306   -1     -1.0
144720   415310   -1     -1.0
144721   415314   -1     -1.0
144722   415315   -1     -1.0
144723   415316   -1     -1.0

[144724 rows x 3 columns]


In [None]:
# Hold out 33% of the training rows to evaluate the feature-selection run.
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.33, random_state=12345)

# Object-dtype columns have to be declared categorical on every Pool, the same
# way the full-data training Pool does; without this CatBoost rejects the
# string columns.
cat_feature_names = list(features.select_dtypes(include=['object']).columns)

model = CatBoostClassifier(task_type='GPU')

# select_features needs to be told which features are candidates for
# elimination; here every column takes part and the 400 strongest are kept.
summary = model.select_features(
    Pool(features_train, target_train, cat_features=cat_feature_names),
    eval_set=Pool(features_valid, target_valid, cat_features=cat_feature_names),
    features_for_select=list(features_train.columns),
    num_features_to_select=400,
    plot=True,
)