In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *

## Загрузка данных

In [2]:
# Read the labelled training table and the unlabelled test table from the
# working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,sample_id,item,publisher,user,topic_0,topic_1,topic_2,topic_3,topic_4,weight_0,weight_1,weight_2,weight_3,weight_4,target
0,0,531,147,2925,411,477,618,249,460,27,18,9,8,7,0
1,1,1574,260,2981,212,287,382,302,51,27,11,2,1,0,0
2,2,940,394,1230,145,150,212,170,174,7,6,6,5,5,0
3,3,52,520,2597,201,283,618,249,617,35,33,30,11,9,1
4,4,766,55,1680,362,150,477,305,388,51,15,13,10,9,1


In [4]:
# Preview the first five test rows (same columns as train, minus `target`).
test.head(n=5)

Unnamed: 0,sample_id,item,publisher,user,topic_0,topic_1,topic_2,topic_3,topic_4,weight_0,weight_1,weight_2,weight_3,weight_4
0,1009109,1716,349,1053,362,397,430,287,431,54,54,51,26,13
1,1009110,1707,202,254,150,73,356,212,482,29,7,5,5,4
2,1009111,1592,520,1524,397,287,356,330,281,95,46,6,5,3
3,1009112,1541,82,2994,397,287,102,323,356,93,77,25,7,4
4,1009113,52,520,936,201,283,618,249,617,35,33,30,11,9


## Подготовка обучающего датасета

In [5]:
# Column groups: the five topic ids of each sample and their matching weights.
topic_cols = [f'topic_{i}' for i in range(5)]
weight_cols = [f'weight_{i}' for i in range(5)]

# Every model input is derived from `train` without modifying it, so this cell
# is idempotent: re-running it cannot rescale the weights a second time or
# fail on a column that has already been removed.
target = train['target'].astype(float)
user = train['user']
publisher = train['publisher']
topics = train[topic_cols]
# Weights are stored as integers; divide by 100 to bring them to a small
# floating-point scale (presumably percentages -- TODO confirm with the data owner).
weights = train[weight_cols].astype(float) / 100

# An Embedding layer is indexed by the raw id, so its table needs
# `max id + 1` rows. Counting distinct ids is only equivalent when the ids are
# exactly 0..N-1 with no gaps; `max + 1` is safe in both cases.
# NOTE(review): ids that occur only in test.csv and exceed these bounds would
# still be out of range -- confirm that train and test share the same id space.
user_count = int(user.max()) + 1
publisher_count = int(publisher.max()) + 1
topic_count = int(topics.values.max()) + 1

user_count, publisher_count, topic_count, len(train)

(3000, 608, 822, 1009109)

In [6]:
# Pair the four feature arrays (user, publisher, topics, weights) with the
# label array; each dataset element is ((user, publisher, topics, weights), target).
train_features = (user.values, publisher.values, topics.values, weights.values)
train_labels = target.values
dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

In [7]:
# Shuffle with a buffer as large as the whole training set, then group the
# samples into batches of 2048 * 8 = 16384.
shuffle_buffer = len(train)
train_dataset = dataset.shuffle(shuffle_buffer).batch(16384)

## Создание модели

In [8]:
def _dense_stack(features):
    """Apply Dropout(0.1) -> Dense(512, relu) -> Dropout(0.2) -> Dense(256, relu).

    Every branch of the network, and the final prediction head, uses this same
    sequence of freshly created layers; nothing is shared between calls.
    """
    features = Dropout(0.1)(features)
    features = Dense(512, activation="relu")(features)
    features = Dropout(0.2)(features)
    return Dense(256, activation="relu")(features)


# One input per feature group: user id, publisher id, five topic ids and the
# five weights that accompany those topics.
user_in = Input(shape=(1,))
publisher_in = Input(shape=(1,))
topics_in = Input(shape=(5,))
weights_in = Input(shape=(5,))

# User branch: 10-dimensional embedding of the user id, flattened to a vector.
user_vec = Embedding(user_count, 10)(user_in)
user_vec = Reshape((10,))(user_vec)
user_branch = Model(inputs=user_in, outputs=_dense_stack(user_vec))

# Publisher branch: same layout as the user branch.
publisher_vec = Embedding(publisher_count, 10)(publisher_in)
publisher_vec = Reshape((10,))(publisher_vec)
publisher_branch = Model(inputs=publisher_in, outputs=_dense_stack(publisher_vec))

# Topics branch: one 10-dimensional embedding per topic slot; the five slots
# are kept as a separate axis rather than flattened.
topic_vecs = Embedding(topic_count, 10)(topics_in)
topics_branch = Model(inputs=topics_in, outputs=_dense_stack(topic_vecs))

# Weights branch: reshape to (5, 1) so each weight is processed per topic slot.
weight_slots = Reshape((5, 1))(weights_in)
weights_branch = Model(inputs=weights_in, outputs=_dense_stack(weight_slots))

# Merge: user + publisher vectors become a single row of width 512, which is
# stacked (axis=1) on top of the per-slot topic/weight features.
id_features = concatenate([user_branch.output, publisher_branch.output], axis=1)
id_features = Reshape((1, 512))(id_features)
slot_features = concatenate([topics_branch.output, weights_branch.output])
merged = concatenate([id_features, slot_features], axis=1)

# Prediction head: flatten everything and reduce to one sigmoid probability.
z = Flatten()(merged)
z = _dense_stack(z)
z = Dense(1, activation="sigmoid")(z)

model = Model(
    inputs=[
        user_branch.input,
        publisher_branch.input,
        topics_branch.input,
        weights_branch.input,
    ],
    outputs=z,
)

optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.0001,
    beta_1=0.9,
    beta_2=0.999,
    amsgrad=False,
)

# Binary target -> binary cross-entropy loss, with accuracy reported per epoch.
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

W1022 13:56:18.954611 12648 deprecation.py:506] From c:\users\astw\appdata\local\programs\python\python37\lib\site-packages\tensorflow_core\python\keras\initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1022 13:56:18.971626 12648 deprecation.py:506] From c:\users\astw\appdata\local\programs\python\python37\lib\site-packages\tensorflow_core\python\ops\resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
W1022 13:56:19.410032 12648 deprecation.py:323] From c:\users\astw\appdata\local\programs\python\python37\lib\site-packages\tensorflow_core\python\ops

In [9]:
# Print the layer-by-layer overview (output shapes, parameter counts, connections).
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 10)        30000       input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 10)        6080        input_2[0][0]                    
____________________________________________________________________________________________

## Обучение

In [10]:
# Train for five passes over the shuffled, batched training set.
model.fit(x=train_dataset, epochs=5)

Train on 62 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x25e0dbe1550>

## Подготовка тестовых данных

In [11]:
# Column groups: the five topic ids of each sample and their matching weights.
topic_cols = [f'topic_{i}' for i in range(5)]
weight_cols = [f'weight_{i}' for i in range(5)]

# Every model input is derived from `test` without modifying it, so this cell
# is idempotent: re-running it cannot rescale the weights a second time or
# fail on a column that has already been removed.
sample_id = test['sample_id']
user_test = test['user']
publisher_test = test['publisher']
topics_test = test[topic_cols]
# Same scaling as the training weights: integer values divided by 100.
weights_test = test[weight_cols].astype(float) / 100

len(test)

112124

In [12]:
# Feature arrays in the same order as the model inputs: user, publisher,
# topics, weights.
test_features = (
    user_test.values,
    publisher_test.values,
    topics_test.values,
    weights_test.values,
)
# The test set has no target, so the user ids are reused to fill the "labels"
# slot of each (inputs, labels) element; presumably this slot is ignored at
# prediction time -- verify against the Keras version in use.
placeholder_labels = user_test.values
dataset_test = tf.data.Dataset.from_tensor_slices((test_features, placeholder_labels))
test_dataset = dataset_test.batch(1000)

## Предсказание на тестовых данных обученной моделью

In [13]:
# Score every test sample; the sigmoid output is the predicted probability of
# the positive class.
test_prediction = model.predict(x=test_dataset)

In [21]:
# Two-column result table: the sample id next to its predicted probability.
prediction = sample_id.to_frame().assign(target=test_prediction)

In [22]:
# Preview the first five predictions.
prediction.head(n=5)

Unnamed: 0,sample_id,target
0,1009109,0.27518
1,1009110,0.223217
2,1009111,0.149463
3,1009112,0.02886
4,1009113,0.1465


## Сохранение результатов в файл

In [None]:
# Write the predictions to disk without the DataFrame index column.
output_path = "prediction.csv"
prediction.to_csv(output_path, index=False)