In [3]:
import csv
import numpy as np
from nltk.tokenize import word_tokenize
import itertools
import random
import tensorflow as tf
import os

In [4]:
import pandas as pd

# Load every input table in one pass. Note that the "test" frames are read
# from the *_val.csv files.
articles, history_train, behaviors_train, history_test, behaviors_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Data/articles.csv',
        'Data/history_train.csv',
        'Data/behaviors_train.csv',
        'Data/history_val.csv',
        'Data/behaviors_val.csv',
    )
)


In [5]:
# article_id -> list of lower-cased title tokens
news = {
    row['article_id']: word_tokenize(str(row['title']).lower())
    for _, row in articles.iterrows()
}

# article_id -> dense integer index. Index 0 is held by the 'NULL' entry and
# is what later lookups fall back to for padding / unknown articles.
newsindex = {'NULL': 0}
for article_key in news:
    newsindex[article_key] = len(newsindex)

# The reported total includes the reserved 'NULL' entry.
print('Total articles processed: ', len(newsindex))

Total articles processed:  11778


In [6]:
import ast
import random

# Clean data: replace every whitespace run with a comma so the ID columns can
# be parsed as Python list literals (presumably they are stored as bracketed,
# whitespace-separated lists -- confirm against the CSV).
behaviors_train['article_ids_inview'] = behaviors_train['article_ids_inview'].str.replace(r'\s+', ',', regex=True)
behaviors_train['article_ids_clicked'] = behaviors_train['article_ids_clicked'].str.replace(r'\s+', ',', regex=True)

# Parsing inview and clicked articles columns
behaviors_train['article_ids_inview'] = behaviors_train['article_ids_inview'].apply(ast.literal_eval)
behaviors_train['article_ids_clicked'] = behaviors_train['article_ids_clicked'].apply(ast.literal_eval)

npratio = 4 # Number of negative samples per positive sample
train_candidate = []   # one list of (npratio + 1) article indices per sample
train_label = []       # matching 0/1 labels, exactly one 1 per sample
train_user_his = []    # matching 50-long, zero-padded history per sample

# Build one training sample per clicked article of every impression.
for _, row in behaviors_train.iterrows():
    clicked_articles = row['article_ids_clicked']
    inview_articles = row['article_ids_inview'] # positive + negative samples
    clicked_set = set(clicked_articles)  # O(1) membership for the negative filter

    # Convert clicked articles to indices using former dict newsindex
    clickids = [newsindex.get(article, 0) for article in clicked_articles]

    # Negatives are the in-view articles that were not clicked.
    ndoc = [newsindex.get(article, 0) for article in inview_articles if article not in clicked_set]

    # Without a positive or without any negative no sample can be formed.
    # Skipping avoids the division by zero below and avoids re-emitting the
    # previous impression's candidates.
    if not clickids or not ndoc:
        continue

    if len(ndoc) < npratio:
        ndoc = ndoc * (npratio // len(ndoc) + 1)  # Replicate negatives so npratio can be drawn

    # NOTE(review): the history is built from the articles clicked in this very
    # impression, so every positive candidate is also present in its own
    # history (visible in the printed example sample). history_train is loaded
    # but never used -- confirm whether the history should come from there.
    recent_clicks = clickids[-50:]
    user_history = recent_clicks + [0] * (50 - len(recent_clicks)) # length of maximum 50 for clicked articles

    # Process each positive sample. Shuffling and appending happen inside
    # this loop so every click yields its own training sample.
    for doc in clickids:
        negd = random.sample(ndoc, npratio)
        negd.append(doc)
        candidate_label = [0] * npratio + [1]

        # Shuffle candidates and labels with the same permutation
        candidate_order = list(range(len(negd)))
        random.shuffle(candidate_order)
        candidate_shuffle = [negd[i] for i in candidate_order]
        candidate_label_shuffle = [candidate_label[i] for i in candidate_order]

        # Append to training data
        train_candidate.append(candidate_shuffle)
        train_label.append(candidate_label_shuffle)
        train_user_his.append(user_history)

# Kept for backward compatibility: holds the clicked indices of the last row.
# It is not read anywhere in the visible notebook.
pdoc = clickids


In [7]:
# Normalise and parse both article-ID columns: whitespace runs become commas,
# then the text is evaluated as a Python literal.
for id_column in ('article_ids_inview', 'article_ids_clicked'):
    behaviors_test[id_column] = (
        behaviors_test[id_column]
        .str.replace(r'\s+', ',', regex=True)
        .apply(ast.literal_eval)
    )

# Flat, per-candidate containers (one entry per in-view article) ...
test_candidate, test_label, test_user_his = [], [], []
# ... plus per-session bookkeeping.
test_index=[]          # [start, end) offsets of each session inside the flat lists
test_session_data = [] # [user_id, in-view article ids, impression time]

# Process each user session in the test set
for _, row in behaviors_test.iterrows():
    inview_ids = row['article_ids_inview']
    clicked_ids = row['article_ids_clicked']

    # NOTE(review): the history is derived from this session's own clicks,
    # which are also what the labels below mark as positive -- confirm this
    # is intended and that history_test should not be used instead.
    clickids = [newsindex.get(article, 0) for article in clicked_ids]
    recent = clickids[-50:]
    user_history = recent + [0] * (50 - len(recent))

    clicked_set = set(clicked_ids)

    # Offsets are taken before/after extending so each session can be sliced
    # back out of the flat prediction array later.
    start_index = len(test_candidate)

    # .extend adds each candidate article individually rather than as one sample
    test_candidate.extend(newsindex.get(article, 0) for article in inview_ids)
    test_label.extend(1 if article in clicked_set else 0 for article in inview_ids)
    test_user_his.extend([user_history] * len(inview_ids))

    end_index = len(test_candidate)
    test_index.append([start_index, end_index])

    test_session_data.append([row['user_id'], inview_ids, row['impression_time']])


In [8]:
# Show the first training sample: its candidates, labels and user history.
for sample_field in (train_candidate, train_label, train_user_his):
    print(sample_field[0])

[9361, 9715, 8515, 8480, 9719]
[0, 0, 1, 0, 0]
[8515, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [9]:
# Word vocabulary: token -> integer id, with id 0 reserved for padding.
word_dict = {'PADDING': 0}

# Row 0 is an all-padding title; the remaining rows follow the iteration
# order of `news`.
title_rows = [[0] * 30]

for tokens in news.values():
    # setdefault hands out the next free id the first time a token is seen.
    # Every token of the title enters the vocabulary, including those that
    # are cut off by the 30-word limit below.
    encoded = [word_dict.setdefault(word, len(word_dict)) for word in tokens]
    clipped = encoded[:30] # Max 30 words in length
    title_rows.append(clipped + [0] * (30 - len(clipped)))

news_title = np.array(title_rows, dtype='int32')

print(f'Total words in vocabulary: {len(word_dict)}')
print(f'Shape of news_title array: {news_title.shape}')

Total words in vocabulary: 16003
Shape of news_title array: (11778, 30)


In [10]:
# Sanity check on the number of candidates per training sample.
all_lengths = [len(sample) for sample in train_candidate]

len_5 = sum(1 for size in all_lengths if size == 5)
# Every sample whose length is not 5 is counted here, whatever its length,
# and reported under the "4 candidates" label below.
len_4 = len(all_lengths) - len_5

print(list(set(all_lengths)))
print('Entries with 4 candidates: ', len_4)
print('Entries with 5 candidates: ', len_5)

[5]
Entries with 4 candidates:  0
Entries with 5 candidates:  24724


In [11]:
# Turn the Python lists built above into int32 NumPy arrays so they can be
# used for array indexing.
train_candidate, train_label, train_user_his = (
    np.array(values, dtype='int32')
    for values in (train_candidate, train_label, train_user_his)
)

test_candidate, test_label, test_user_his = (
    np.array(values, dtype='int32')
    for values in (test_candidate, test_label, test_user_his)
)

In [12]:
def get_dataset(generator, batch_size, steps_per_epoch):
    """Wrap a training batch generator in a ``tf.data.Dataset``.

    Args:
        generator: Zero-argument callable returning an iterator of
            ``((candidates, history), labels)`` batches.
        batch_size: Leading dimension declared for every tensor.
        steps_per_epoch: Accepted but not used by this function.

    Returns:
        A dataset whose elements are declared as int32 tensors shaped
        ``((batch_size, 5, 30), (batch_size, 50, 30))`` for the inputs and
        ``(batch_size, 5)`` for the labels.
    """
    candidate_spec = tf.TensorSpec(shape=(batch_size, 5, 30), dtype=tf.int32)
    history_spec = tf.TensorSpec(shape=(batch_size, 50, 30), dtype=tf.int32)
    label_spec = tf.TensorSpec(shape=(batch_size, 5), dtype=tf.int32)
    return tf.data.Dataset.from_generator(
        generator,
        output_signature=((candidate_spec, history_spec), label_spec),
    )

In [13]:
def get_test_dataset(generator, batch_size, steps, title_len=30, history_len=50):
    """Wrap an inference batch generator in a ``tf.data.Dataset`` for ``predict``.

    The scoring model defined in this notebook (``modeltest``) takes ONE
    candidate title of shape ``(title_len,)`` plus a user history of shape
    ``(history_len, title_len)``, so the declared element shapes are
    ``(batch, title_len)`` and ``(batch, history_len, title_len)``.

    Args:
        generator: Zero-argument callable returning an iterator that yields
            ``(candidate_titles, user_history)`` tuples of int32 arrays.
        batch_size: Kept for backward compatibility. The batch dimension is
            declared as ``None`` so a shorter final batch is accepted.
        steps: Accepted but not used by this function.
        title_len: Number of word ids per title.
        history_len: Number of titles in a user history.

    Returns:
        A dataset whose elements are 1-tuples ``((candidate, history),)``.
        Keras treats a 2-tuple element as ``(inputs, targets)``; the extra
        nesting makes both tensors reach the model as inputs.
    """
    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(None, title_len), dtype=tf.int32),
            tf.TensorSpec(shape=(None, history_len, title_len), dtype=tf.int32),
        ),
    )
    # Re-nest so Keras does not interpret the history tensor as targets.
    return dataset.map(lambda item, user: ((item, user),))


In [14]:

def generate_batch_data_random(batch_size):
    """Endlessly yield shuffled training batches of word-id tensors.

    Reads the module-level arrays ``train_candidate``, ``train_user_his``,
    ``train_label`` and ``news_title``. Sample order is shuffled once when the
    generator starts; the same batches are then cycled forever.

    Only full batches are produced: a trailing group with fewer than
    ``batch_size`` samples is dropped so every batch has the fixed leading
    dimension.

    Args:
        batch_size: Number of samples per yielded batch.

    Yields:
        ``((item, user), labels)`` where ``item`` holds the candidate titles
        (``batch_size`` x candidates x title length), ``user`` the history
        titles (``batch_size`` x history length x title length) and ``labels``
        the rows of ``train_label`` for the batch.

    Raises:
        ValueError: If there are fewer than ``batch_size`` training samples
            (otherwise the generator would spin forever without yielding).
    """
    sample_ids = np.arange(len(train_label))
    np.random.shuffle(sample_ids)

    full_batches = len(sample_ids) // batch_size
    if full_batches == 0:
        raise ValueError(
            'need at least batch_size=%d training samples, got %d'
            % (batch_size, len(sample_ids))
        )
    batches = [
        sample_ids[k * batch_size:(k + 1) * batch_size]
        for k in range(full_batches)
    ]

    while True:
        for batch_ids in batches:
            # Indexing news_title with an id array appends the title axis,
            # so no reshape is required.
            item = news_title[train_candidate[batch_ids]]
            user = news_title[train_user_his[batch_ids]]
            labels = train_label[batch_ids]
            # Tuples (not lists) so the nesting matches a tf.data signature.
            yield (item, user), labels

In [15]:

def generate_batch_data(batch_size):
    """Endlessly yield inference batches in the original sample order.

    Reads the module-level arrays ``test_candidate`` (one article index per
    sample), ``test_user_his`` (one padded history per sample) and
    ``news_title``. Every sample is emitted exactly once per cycle; the last
    batch of a cycle may be shorter than ``batch_size``.

    Args:
        batch_size: Maximum number of samples per yielded batch.

    Yields:
        ``(item, user)`` where ``item`` has shape (batch, title length) -- a
        single candidate title per sample -- and ``user`` has shape
        (batch, history length, title length).

    Raises:
        ValueError: If there are no test samples (otherwise the generator
            would spin forever without yielding).
    """
    n_samples = len(test_candidate)
    if n_samples == 0:
        raise ValueError('no test samples to batch')

    while True:
        # range(0, n, step) never produces an empty trailing batch.
        for start in range(0, n_samples, batch_size):
            stop = start + batch_size
            # Indexing news_title with an id array appends the title axis,
            # so no reshape is required.
            item = news_title[test_candidate[start:stop]]
            user = news_title[test_user_his[start:stop]]
            # Tuple (not list) so the nesting matches a tf.data signature.
            yield (item, user)

In [17]:
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

In [15]:

class Attention(Layer):
    """Multi-head scaled dot-product attention.

    The layer is called with a list of either three tensors
    ``[Q_seq, K_seq, V_seq]`` or five tensors
    ``[Q_seq, K_seq, V_seq, Q_len, V_len]``. Each sequence is projected by its
    own weight matrix to ``nb_head * size_per_head`` features, split into
    heads, and combined with softmax-normalised dot-product scores.

    Args:
        nb_head: Number of attention heads.
        size_per_head: Feature size of each head.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, nb_head, size_per_head, **kwargs):
        super(Attention, self).__init__(**kwargs)
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.output_dim = nb_head*size_per_head

    def build(self, input_shape):
        """Create the query/key/value projection matrices.

        ``input_shape`` is indexed per input; the last dimension of the first
        three inputs fixes the row count of WQ, WK and WV respectively.
        """
        self.WQ = self.add_weight(name='WQ', 
                                  shape=(input_shape[0][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WK = self.add_weight(name='WK', 
                                  shape=(input_shape[1][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WV = self.add_weight(name='WV', 
                                  shape=(input_shape[2][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        super(Attention, self).build(input_shape)
        
    def Mask(self, inputs, seq_len, mode='mul'):
        """Mask positions along axis 1 of ``inputs`` from index ``seq_len`` on.

        Args:
            inputs: Tensor whose axis 1 is the sequence axis to mask.
            seq_len: ``None`` (no masking) or a tensor whose first column
                holds the per-example length.
            mode: ``'mul'`` zeroes the masked positions; ``'add'`` subtracts
                1e12 from them so a following softmax ignores them.

        Returns:
            ``inputs`` unchanged when ``seq_len`` is ``None``, otherwise the
            masked tensor.

        Raises:
            ValueError: If ``mode`` is neither ``'mul'`` nor ``'add'``.
        """
        # Identity check: `==` on a tensor is not a reliable None test.
        if seq_len is None:
            return inputs
        if mode not in ('mul', 'add'):
            raise ValueError("mode must be 'mul' or 'add', got %r" % (mode,))
        # one_hot marks index seq_len; 1 - cumsum is 1 before it and 0 from it on.
        mask = tf.one_hot(seq_len[:,0], tf.shape(inputs)[1])
        mask = 1 - tf.cumsum(mask, 1)
        # Add trailing singleton axes so the mask broadcasts over inputs.
        for _ in range(len(inputs.shape)-2):
            mask = tf.expand_dims(mask, 2)
        if mode == 'mul':
            return inputs * mask
        return inputs - (1 - mask) * 1e12
                
    def call(self, x):
        """Apply multi-head attention.

        Args:
            x: ``[Q_seq, K_seq, V_seq]`` or
                ``[Q_seq, K_seq, V_seq, Q_len, V_len]``.

        Returns:
            Tensor reshaped to ``(-1, query length, nb_head * size_per_head)``.

        Raises:
            ValueError: If ``x`` does not contain 3 or 5 elements.
        """
        if len(x) == 3:
            Q_seq,K_seq,V_seq = x
            Q_len,V_len = None,None
        elif len(x) == 5:
            Q_seq,K_seq,V_seq,Q_len,V_len = x
        else:
            raise ValueError(
                'Attention expects 3 or 5 inputs, got %d' % len(x))
        # Project, split the feature axis into heads, and move the head axis
        # in front of the sequence axis: (-1, head, seq, size_per_head).
        Q_seq = tf.linalg.matmul(Q_seq, self.WQ)
        Q_seq = tf.reshape(Q_seq, (-1, tf.shape(Q_seq)[1], self.nb_head, self.size_per_head))
        Q_seq = tf.transpose(Q_seq, perm=[0,2,1,3])
        K_seq = tf.linalg.matmul(K_seq, self.WK)
        K_seq = tf.reshape(K_seq, (-1, tf.shape(K_seq)[1], self.nb_head, self.size_per_head))
        K_seq = tf.transpose(K_seq, perm=[0,2,1,3])
        V_seq = tf.linalg.matmul(V_seq, self.WV)
        V_seq = tf.reshape(V_seq, (-1, tf.shape(V_seq)[1], self.nb_head, self.size_per_head))
        V_seq = tf.transpose(V_seq, perm=[0,2,1,3])
        # Scaled dot-product scores between every query and key position.
        A = tf.matmul(Q_seq, K_seq, transpose_b=True) / tf.sqrt(float(self.size_per_head))
        # Mask works on axis 1, so swap the key axis there and back again.
        A = tf.transpose(A, perm=[0, 3, 2, 1])
        A = self.Mask(A, V_len, 'add')
        A = tf.transpose(A, perm=[0, 3, 2, 1])
        A = tf.nn.softmax(A)
        # Weighted sum of values, then merge the heads back into one axis.
        O_seq = tf.matmul(A, V_seq)
        O_seq = tf.transpose(O_seq, perm=[0, 2, 1, 3])
        O_seq = tf.reshape(O_seq, (-1, tf.shape(O_seq)[1], self.output_dim))
        O_seq = self.Mask(O_seq, Q_len, 'mul')
        return O_seq
        
    def compute_output_shape(self, input_shape):
        """Return (batch, query length, nb_head * size_per_head)."""
        return (input_shape[0][0], input_shape[0][1], self.output_dim)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(Attention, self).get_config()
        config.update({
            'nb_head': self.nb_head,
            'size_per_head': self.size_per_head,
        })
        return config

In [18]:
MAX_SENT_LENGTH=30  # word ids per title
MAX_SENTS=50        # titles per user history

# --- Title encoder: word ids -> one vector per title -----------------------
title_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedding_layer = Embedding(len(word_dict), 300, trainable=True)
embedded_sequences = embedding_layer(title_input)
d_emb=Dropout(0.2)(embedded_sequences)
selfatt=Attention(20,20)([d_emb,d_emb,d_emb])
selfatt=Dropout(0.2)(selfatt)
# Additive attention pooling over the word positions.
attention = Dense(200,activation='tanh')(selfatt)
attention = Flatten()(Dense(1)(attention))
attention_weight = Activation('softmax')(attention)
rep=Dot((1, 1))([selfatt, attention_weight])
titleEncoder = Model([title_input], rep)

# --- User encoder: encoded history titles -> one user vector ---------------
news_input = Input((MAX_SENTS,MAX_SENT_LENGTH,))
news_encoders = TimeDistributed(titleEncoder)(news_input)
news_encoders=Dropout(0.2)(Attention(20,20)([news_encoders,news_encoders,news_encoders]))
# `keras` is never imported in this notebook; use the `Input`, `Model` and
# `Activation` names that the keras imports already bring into scope.
candidates = Input((1+npratio,MAX_SENT_LENGTH,))
candidate_vecs = TimeDistributed(titleEncoder)(candidates)  
news_attention= Dense(200,activation='tanh')(news_encoders)
news_attention = Flatten()(Dense(1)(news_attention))
news_attention_weight = Activation('softmax')(news_attention)
userrep=Dot((1, 1))([news_encoders, news_attention_weight])

# --- Training model: softmax over the 1 + npratio candidates ---------------
logits = dot([userrep, candidate_vecs], axes=-1)
logits = Activation('softmax')(logits)      
model = Model([candidates,news_input], logits)
model.compile(loss=['categorical_crossentropy'], optimizer='adam', metrics=['acc'])

# --- Scoring model: sigmoid score for a single candidate title -------------
# Shares titleEncoder and the user encoder with the training model.
candidate_one = Input((MAX_SENT_LENGTH,))
candidate_one_vec = titleEncoder([candidate_one])
score =Activation('sigmoid')(dot([userrep, candidate_one_vec], axes=-1))
modeltest = Model([candidate_one,news_input], score)

In [None]:
# Two training passes; the dataset is rebuilt for each pass, which restarts
# the generator and therefore re-shuffles the sample order.
for ep in range(2):
    traingen = get_dataset(
        lambda: generate_batch_data_random(30),
        30,
        len(train_label) // 30,
    )
    model.fit(traingen, epochs=1, steps_per_epoch=len(train_label) // 30)

# Score every test candidate one sample at a time.
valgen = get_test_dataset(lambda: generate_batch_data(1), 1, len(test_candidate))
pred = modeltest.predict(valgen, steps=len(test_candidate), verbose=1)

# Cut the flat prediction array back into per-session score vectors using the
# [start, end) offsets recorded in test_index.
predictsession = [pred[start:end, 0] for start, end in test_index]

In [None]:
from json import *
# Write one JSON object per session (JSON-lines): the user id, a mapping from
# each in-view article id to its predicted score, and the impression time.
with open('answer.json','w') as f:
    encoder = JSONEncoder()
    for m, scores in enumerate(predictsession):
        uid, inview_ids, timestamp = test_session_data[m]
        impression = {inview_ids[j]: float(value) for j, value in enumerate(scores)}
        record = {"uid": uid, "impression": impression, "time": timestamp}
        f.write(encoder.encode(record)+'\n')