## Предсказание кликов пользователя

Одним из важнейших сигналов для рекомендательной системы является поведение пользователя. Таких данных зачастую достаточно, чтобы построить бейзлайн приемлемого качества.

В этом задании нужно построить рекомендательную систему на основе данных о действиях пользователей в персональной ленте рекомендаций Яндекс.Дзена.

Доступны 2 датасета: тренировочный (train.csv) и тестовый (test.csv). Каждая строка в датасетах соответствует взаимодействию некоторого пользователя с некоторым документом, показанным ему в ленте рекомендаций. Датасеты содержат следующие колонки:

sample_id — числовой id взаимодействия,  
item — числовой id показанного пользователю документа,  
publisher — числовой id автора документа,  
user — числовой id пользователя,  
topic_i, weight_i — числовой id i-й темы документа и степень принадлежности документа данной теме (целое число от 0 до 100), где i = 0, 1, 2, 3, 4,  
target — факт клика пользователя на документ (1 — был клик, 0 — был показ без клика). Этот столбец присутствует только в  
    тренировочном датасете.
    
Необходимо построить модель для предсказания кликов пользователя и применить её к тестовому датасету.

В качестве решения необходимо создать csv-файл, состоящий из двух колонок: sample_id и target, где sample_id — id строки из тестового датасета, а target — предсказанная вероятность клика. Количество строк в этом файле должно совпадать с количеством строк в test.csv. Строки в файле с решением должны быть отсортированы по возрастанию значений колонки sample_id (в том же порядке, что и в test.csv). Все значения вероятностей в колонке target должны быть вещественными числами от 0 до 1.

Датасеты можно скачать по ссылке: https://yadi.sk/d/pVna8ejcnQZK_A.

In [1]:
#Standard imports
import pandas as pd
import numpy as np

In [2]:
# Load the training and test interaction logs into pandas DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [3]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,sample_id,item,publisher,user,topic_0,topic_1,topic_2,topic_3,topic_4,weight_0,weight_1,weight_2,weight_3,weight_4
0,1009109,1716,349,1053,362,397,430,287,431,54,54,51,26,13
1,1009110,1707,202,254,150,73,356,212,482,29,7,5,5,4
2,1009111,1592,520,1524,397,287,356,330,281,95,46,6,5,3
3,1009112,1541,82,2994,397,287,102,323,356,93,77,25,7,4
4,1009113,52,520,936,201,283,618,249,617,35,33,30,11,9


In [4]:
# Preview the first rows of the training set.
train.head()

Unnamed: 0,sample_id,item,publisher,user,topic_0,topic_1,topic_2,topic_3,topic_4,weight_0,weight_1,weight_2,weight_3,weight_4,target
0,0,531,147,2925,411,477,618,249,460,27,18,9,8,7,0
1,1,1574,260,2981,212,287,382,302,51,27,11,2,1,0,0
2,2,940,394,1230,145,150,212,170,174,7,6,6,5,5,0
3,3,52,520,2597,201,283,618,249,617,35,33,30,11,9,1
4,4,766,55,1680,362,150,477,305,388,51,15,13,10,9,1


### Dataset preparation

In [5]:
# Is the dataset balanced? Count rows per value of `target`
# (1 = click, 0 = impression without a click).
train['target'].value_counts()

0    805481
1    203628
Name: target, dtype: int64

In [6]:
#Dataset balancing
from sklearn.utils import resample

In [7]:
# Separate the rows by class: target == 0 is the majority (no click),
# target == 1 the minority (click).
train_majority = train.loc[train["target"] == 0]
train_minority = train.loc[train["target"] == 1]

In [8]:
# Upsample the minority class with replacement until it is as large as the
# majority class. The target size is taken from the data instead of being
# hard-coded, so the cell keeps working if train.csv changes.
#
# NOTE(review): the duplicated minority rows are created before any
# train/validation split, so copies of the same row can end up in both the
# training and validation folds used later. The accuracies reported below are
# therefore likely optimistic -- consider resampling inside each fold instead.
train_minority_upsampled = resample(
    train_minority,
    replace=True,                   # sample with replacement
    n_samples=len(train_majority),  # match the majority class size
    random_state=42,                # reproducible results
)

# Combine majority class with upsampled minority class
train_balanced = pd.concat([train_majority, train_minority_upsampled])

# Display new class counts
train_balanced.target.value_counts()

1    805481
0    805481
Name: target, dtype: int64

In [9]:
# Shuffle the balanced rows so the two classes are interleaved, then renumber
# the index. The seed is fixed because a later cell splits train/validation by
# row position; without it every run would produce a different split.
train_balanced = train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [10]:
# Feature frame for training (balanced): everything except the label and the row id.
train_ds = train_balanced.drop(columns=['target', 'sample_id'])

In [11]:
# Labels for training (balanced), copied so later casts do not touch train_balanced.
train_targets = train_balanced.loc[:, 'target'].copy()

In [12]:
# Feature frame for the test set: drop the row id, which is not a feature.
test_ds = test.drop(columns=['sample_id'])

In [13]:
#Standard import for features manipulation
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [14]:
# Column groups for preprocessing: the five topic weights are numerical;
# all id-like columns (item, publisher, user, topics) are categorical.
num_attribs = ['weight_{}'.format(i) for i in range(5)]
cat_attribs = ['item', 'publisher', 'user'] + ['topic_{}'.format(i) for i in range(5)]

In [15]:
# Preprocessing: one-hot encode the categorical id columns and standardize the
# numerical weight columns. Columns not listed in either group are dropped
# (ColumnTransformer's default `remainder`).
#
# handle_unknown="ignore" makes ids that appear only at transform time
# (e.g. a user or item present in the test set but not in training) encode as
# an all-zero group instead of raising a ValueError.
data_prep = ColumnTransformer([
    ("cat", OneHotEncoder(categories='auto', handle_unknown='ignore'), cat_attribs),
    ("num", StandardScaler(), num_attribs),
])

In [16]:
# Fit the preprocessing on the training features and transform them.
# np.float64 replaces the `np.float` alias, which was deprecated in NumPy 1.20
# and removed in NumPy 1.24 (it was just the builtin float, i.e. float64).
train_ds_prepared = data_prep.fit_transform(train_ds.astype(np.float64))

In [17]:
# Apply the preprocessing fitted on the training data to the test features.
# np.float64 replaces the `np.float` alias, which was deprecated in NumPy 1.20
# and removed in NumPy 1.24 (it was just the builtin float, i.e. float64).
test_ds_prepared = data_prep.transform(test_ds.astype(np.float64))

In [18]:
# Downcast labels and both feature matrices to 32-bit floats.
train_targets = train_targets.astype('float32')
train_ds_prepared = train_ds_prepared.astype('float32')
test_ds_prepared = test_ds_prepared.astype('float32')

### Applying Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

# Baseline linear classifier using the 'sag' solver; all other settings are
# scikit-learn defaults.
log_reg = LogisticRegression(solver='sag')

In [20]:
from sklearn.model_selection import cross_val_score

In [21]:
# 3-fold cross-validated accuracy of the logistic regression baseline.
cross_val_score(
    log_reg,
    train_ds_prepared,
    train_targets,
    cv=3,
    scoring="accuracy",
)

array([0.67212861, 0.67261838, 0.67116275])

The 3-fold cross-validated accuracy of logistic regression is about 0.67.

### Applying a Dense Neural Network

In [22]:
#Standard NN (Keras) import
from keras import models
from keras import layers

Using TensorFlow backend.


In [23]:
# Model construction: two hidden ReLU layers of 16 units each and a single
# sigmoid output unit that yields the click probability.
# The input width is read from the prepared feature matrix instead of being
# hard-coded, so the network stays valid if the number of one-hot columns
# changes (different data or different encoder settings).
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(train_ds_prepared.shape[1],)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

Instructions for updating:
Colocations handled automatically by placer.


In [24]:
# Hold out the first 500000 rows for validation; train on the remainder.
x_val, partial_x_train = train_ds_prepared[:500000], train_ds_prepared[500000:]
y_val, partial_y_train = train_targets[:500000], train_targets[500000:]

In [25]:
# Configure training: RMSprop optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [26]:
# Train for 20 epochs in batches of 512, passing (x_val, y_val) as the
# validation data; the returned History object is kept in `history`.
history = model.fit(
    partial_x_train,
    partial_y_train,
    epochs=20,
    batch_size=512,
    validation_data=(x_val, y_val),
)

Instructions for updating:
Use tf.cast instead.
Train on 1110962 samples, validate on 500000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


With training stopped early at epoch 19, the accuracy score is about 0.689.

### Applying RandomForestClassifier 

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
# Base random forest; n_estimators is overridden by the grid searches that
# use this estimator.
rand_forest = RandomForestClassifier(n_estimators=10)

In [29]:
#Fine tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV

In [30]:
# First search grid: 2 values of max_features x 2 forest sizes = 4 candidates.
param_grid = dict(
    max_features=[40, 60],
    n_estimators=[5, 10],
)

In [31]:
# Exhaustive search over param_grid with 3-fold cross-validation, scored by
# accuracy, parallelised with n_jobs=-1; training-fold scores are also recorded.
grid_search = GridSearchCV(
    rand_forest,
    param_grid,
    cv=3,
    scoring="accuracy",
    verbose=2,
    n_jobs=-1,
    return_train_score=True,
)

In [32]:
# Run the grid search on the prepared (balanced) training data.
grid_search.fit(train_ds_prepared, train_targets)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 81.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 81.5min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_features': [40, 60], 'n_estimators': [5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=2)

In [33]:
# Best parameter combination found by the grid search.
grid_search.best_params_

{'max_features': 40, 'n_estimators': 10}

In [34]:
# Estimator refitted with the best parameter combination.
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=40, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [35]:
# Print the mean cross-validated test score next to each parameter combination.
cvres = grid_search.cv_results_
scores_with_params = zip(cvres["mean_test_score"], cvres["params"])
for fold_mean, candidate in scores_with_params:
    print(fold_mean, candidate)

0.8839221533468822 {'max_features': 40, 'n_estimators': 5}
0.9174778796768639 {'max_features': 40, 'n_estimators': 10}
0.8864945293557515 {'max_features': 60, 'n_estimators': 5}
0.9110848052281805 {'max_features': 60, 'n_estimators': 10}


Best accuracy score is 0.917 for 'max_features': 40, 'n_estimators': 10

In [44]:
# Second, narrower grid: keep max_features at 40 and try larger forests
# ('n_estimators': [20, 30]).
param_grid = dict(
    max_features=[40],
    n_estimators=[20, 30],
)

In [45]:
# Rebuild the search object for the refined grid: 3-fold cross-validation,
# accuracy scoring, parallelised with n_jobs=-1, training-fold scores recorded.
grid_search = GridSearchCV(
    rand_forest,
    param_grid,
    cv=3,
    scoring="accuracy",
    verbose=2,
    n_jobs=-1,
    return_train_score=True,
)

In [46]:
# Run the refined grid search on the prepared (balanced) training data.
grid_search.fit(train_ds_prepared, train_targets)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed: 71.5min remaining: 71.5min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 148.5min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_features': [40], 'n_estimators': [20, 30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=2)

In [47]:
# Print the mean cross-validated test score next to each parameter combination.
cvres = grid_search.cv_results_
scores_with_params = zip(cvres["mean_test_score"], cvres["params"])
for fold_mean, candidate in scores_with_params:
    print(fold_mean, candidate)

0.9226158034764321 {'max_features': 40, 'n_estimators': 20}
0.9239994487765695 {'max_features': 40, 'n_estimators': 30}


In [48]:
# Best parameter combination found by the refined grid search.
grid_search.best_params_

{'max_features': 40, 'n_estimators': 30}

Best accuracy score is 0.924 for 'max_features': 40, 'n_estimators': 30

In [49]:
# Best estimator: the forest refitted with the best parameter combination.
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=40, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [50]:
# Keep a handle on the refitted best estimator for making predictions.
best_grid = grid_search.best_estimator_

In [51]:
# Target probabilities prediction using the best estimator: keep only the
# second column of predict_proba's output.
# NOTE(review): presumably column 1 is the click class (target == 1) -- confirm
# against best_grid.classes_.
prediction_randforest = best_grid.predict_proba(test_ds_prepared)[:,1]

In [52]:
# Inspect the predicted probabilities.
prediction_randforest

array([0.3       , 0.33333333, 0.06666667, ..., 0.13333333, 0.23333333,
       0.33333333])

In [53]:
# Assemble the submission frame: the test sample ids next to the predicted
# probabilities rounded to 4 decimal places.
prediction_rforest = pd.DataFrame({
    "sample_id": test["sample_id"],
    "target": np.around(prediction_randforest, decimals=4),
})

In [54]:
# Write the submission to CSV without the DataFrame index, leaving only the
# sample_id and target columns.
prediction_rforest.to_csv("prediction_rforest.csv", index=False)

Best accuracy score is 0.924 for RandomForestClassifier with parameters: 'max_features': 40, 'n_estimators': 30