In [2]:
import csv
import numpy as np
from nltk.tokenize import word_tokenize
import itertools
import random 

In [3]:
import nltk
# Download the 'punkt_tab' tokenizer resource into the local nltk_data directory.
# Presumably required by the word_tokenize calls used later in this notebook — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/annemoll/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
import pandas as pd

# Load the three raw input tables from the relative Data/ directory.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# 'Data/articles.csv', so the notebook only runs from a working directory that
# actually contains Data/ — confirm the expected location of the files.
articles = pd.read_csv('Data/articles.csv')
history_train = pd.read_csv('Data/history_train.csv')
behaviors_train = pd.read_csv('Data/behaviors_train.csv')


FileNotFoundError: [Errno 2] No such file or directory: 'Data/articles.csv'

In [8]:
def _tokenize_title(raw_title):
    """Coerce a title to str, lower-case it and split it into word tokens."""
    return word_tokenize(str(raw_title).lower())


articles['tokenized_title'] = articles['title'].apply(_tokenize_title)

# Vocabulary: index 0 is reserved for the PADDING token; every other word gets
# the next free index in order of first appearance.
word_to_index = {'PADDING': 0}
for title_tokens in articles['tokenized_title']:
    for token in title_tokens:
        word_to_index.setdefault(token, len(word_to_index))

# Every title is encoded as exactly this many vocabulary indices.
max_title_length = 30


def _encode_title(title_tokens):
    """Map tokens to vocabulary indices, truncated / right-padded with 0 to max_title_length."""
    kept_tokens = title_tokens[:max_title_length]
    encoded = [word_to_index[token] for token in kept_tokens]
    encoded.extend([0] * (max_title_length - len(kept_tokens)))
    return encoded


articles['title_sequence'] = articles['tokenized_title'].apply(_encode_title)

# Persist only the article id together with its encoded title.
encoded_columns = ['article_id', 'title_sequence']
articles[encoded_columns].to_csv('processed_articles.csv', index=False)


In [9]:
import ast


# The id lists are stored as whitespace-separated text; every whitespace run is
# turned into a comma so the text can be evaluated as a Python literal.
comma_separated_ids = history_train['article_id_fixed'].str.replace(r'\s+', ',', regex=True)
history_train['parsed_article_ids'] = comma_separated_ids.apply(ast.literal_eval)

# One concatenated id list per user (summing Python lists concatenates them).
user_histories = history_train.groupby('user_id')['parsed_article_ids'].sum()

# Every click history is cut / padded to exactly this many entries.
max_history_length = 50


def _fit_history_length(article_ids):
    """Keep the last max_history_length ids and right-pad with 0 up to that length."""
    padding_needed = max_history_length - len(article_ids)
    return article_ids[-max_history_length:] + [0] * padding_needed


user_histories = user_histories.apply(_fit_history_length)

# article_id (as int) -> encoded title sequence of that article.
article_to_index = {
    int(article_id): title_sequence
    for article_id, title_sequence in zip(articles['article_id'], articles['title_sequence'])
}


def _lookup_title_sequences(article_ids):
    """Replace every id by its title sequence; ids without an entry become 0."""
    # NOTE(review): hits are lists while misses (including the 0 padding ids)
    # are the scalar 0, so the resulting history mixes both — confirm intended.
    return [article_to_index.get(article_id, 0) for article_id in article_ids]


user_histories_mapped = user_histories.apply(_lookup_title_sequences)

# Flatten the per-user series into a two-column frame and write it out.
user_histories_df = pd.DataFrame(
    {
        'user_id': user_histories_mapped.index,
        'click_history': user_histories_mapped.values,
    }
)
user_histories_df.to_csv('aggregated_user_histories.csv', index=False)


In [11]:
# Step 1: rewrite the whitespace-separated id lists so that every whitespace
# run becomes a comma (done in place on the two raw text columns).
for raw_column in ('article_ids_inview', 'article_ids_clicked'):
    behaviors_train[raw_column] = behaviors_train[raw_column].str.replace(r'\s+', ',', regex=True)

# Step 2: evaluate the rewritten text as Python literals into new columns.
parsed_from_raw = {
    'articles_in_view': 'article_ids_inview',
    'articles_clicked': 'article_ids_clicked',
}
for parsed_column, raw_column in parsed_from_raw.items():
    behaviors_train[parsed_column] = behaviors_train[raw_column].apply(ast.literal_eval)

# Generate positive and negative samples
def generate_samples(row):
    """Split one impression row into clicked and non-clicked article ids.

    Args:
        row: Mapping-like object providing 'articles_clicked' and
            'articles_in_view' entries.

    Returns:
        Tuple ``(positives, negatives)``: the clicked ids exactly as stored in
        the row, and the in-view ids that are not among them, in view order.
    """
    clicked = row['articles_clicked']
    not_clicked = []
    for article_id in row['articles_in_view']:
        if article_id in clicked:
            continue
        not_clicked.append(article_id)
    return clicked, not_clicked

# Run the split for every row, then transpose the per-row
# (positives, negatives) pairs into one tuple per target column.
per_row_samples = behaviors_train.apply(generate_samples, axis=1)
positives_column, negatives_column = zip(*per_row_samples)
behaviors_train['positives'] = positives_column
behaviors_train['negatives'] = negatives_column

# Persist only the user id together with both sample lists.
sample_columns = ['user_id', 'positives', 'negatives']
behaviors_train[sample_columns].to_csv('processed_behaviors.csv', index=False)


In [12]:
# Lookup table used to attach a click history to every behaviour row:
# user_id -> aggregated click history.
history_by_user = user_histories_df.set_index('user_id')
click_histories_dict = history_by_user['click_history'].to_dict()

def prepare_model_inputs(row):
    """Assemble the (user_history, candidates, labels) triple for one row.

    Args:
        row: Mapping-like object providing 'user_id', 'positives' and
            'negatives' entries.

    Returns:
        Tuple ``(user_history, candidates, labels)``. ``user_history`` comes
        from ``click_histories_dict`` (a list of ``max_history_length`` zeros
        when the user is unknown), ``candidates`` is positives followed by
        negatives, and ``labels`` holds 1 for each positive and 0 for each
        negative in the same order.
    """
    history = click_histories_dict.get(row['user_id'], [0] * max_history_length)
    clicked, skipped = row['positives'], row['negatives']
    candidate_labels = [1 for _ in clicked] + [0 for _ in skipped]
    return history, clicked + skipped, candidate_labels

# One (user_history, candidates, labels) tuple per behaviour row.
behaviors_train['model_inputs'] = behaviors_train.apply(prepare_model_inputs, axis=1)

# Expand the tuples into a three-column frame and write it to disk.
model_inputs = behaviors_train['model_inputs'].tolist()
model_inputs_frame = pd.DataFrame(
    model_inputs,
    columns=['user_history', 'candidates', 'labels'],
)
model_inputs_frame.to_csv('model_inputs.csv', index=False)

In [None]:
# Read each input file into a list of raw lines (line endings are kept).
with open('Data/articles.csv') as articles_file:
    newsdata = list(articles_file)

with open('train.tsv') as train_file:
    trainuser = list(train_file)

with open('valid.tsv') as valid_file:
    validuser = list(valid_file)

In [None]:

# news id (first tab-separated field) -> lower-cased, tokenized text of the
# fourth tab-separated field.
# NOTE(review): newsdata is read from 'Data/articles.csv' but split on tabs
# here — confirm the file really is tab-separated with the text in field 3.
news = {}
for raw_line in newsdata:
    fields = raw_line.strip().split('\t')
    news_key, news_text = fields[0], fields[3]
    news[news_key] = word_tokenize(news_text.lower())

# news id -> integer index; index 0 is reserved for the 'NULL' entry.
newsindex = {'NULL': 0}
for news_key in news:
    next_free_index = len(newsindex)
    newsindex[news_key] = next_free_index

In [None]:

def newsample(array, ratio):
    """Draw ``ratio`` random elements from ``array``.

    When ``array`` holds fewer than ``ratio`` elements it is first repeated
    often enough to make the draw possible, so elements may then occur more
    than once in the result.

    Args:
        array: List to sample from.
        ratio: Number of elements to return.

    Returns:
        A new list of ``ratio`` elements chosen with ``random.sample``.
    """
    pool = array
    if ratio > len(array):
        # Repeat the list so that it contains at least ``ratio`` items.
        repeats = ratio // len(array) + 1
        pool = array * repeats
    return random.sample(pool, ratio)
    
# Number of negative candidates sampled for every clicked document.
npratio = 4
train_candidate, train_label, train_user_his = [], [], []

for raw_line in trainuser:
    fields = raw_line.replace('\n', '').split('\t')

    # Field 1: click history ('#N#'-separated entries, id before '#TAB#');
    # only the last 50 entries are kept.
    clickids = [newsindex[entry.split('#TAB#')[0]] for entry in fields[1].split('#N#')][-50:]
    # Field 2: two '#TAB#'-separated, whitespace-delimited id lists —
    # first part -> pdoc (labelled 1 below), second part -> ndoc (sampled as 0s).
    pdoc = [newsindex[news_id] for news_id in fields[2].split('#TAB#')[0].split()]
    ndoc = [newsindex[news_id] for news_id in fields[2].split('#TAB#')[1].split()]

    for clicked_doc in pdoc:
        # npratio sampled negatives followed by the positive, plus matching labels.
        pool = newsample(ndoc, npratio) + [clicked_doc]
        pool_labels = [0] * npratio + [1]

        # Apply one random permutation to candidates and labels alike.
        order = list(range(npratio + 1))
        random.shuffle(order)
        train_candidate.append([pool[position] for position in order])
        train_label.append([pool_labels[position] for position in order])

        # History right-padded with index 0 up to 50 entries.
        train_user_his.append(clickids + [0] * (50 - len(clickids)))
        
test_candidate, test_user_his, test_index, test_session_data = [], [], [], []

for raw_line in validuser:
    fields = raw_line.replace('\n', '').split('\t')

    # Last 50 history entries, mapped to news indices.
    clickids = [newsindex[entry.split('#TAB#')[0]] for entry in fields[1].split('#N#')][-50:]

    impression_parts = fields[2].split('#TAB#')
    alldoc = [newsindex[news_id] for news_id in impression_parts[0].split()]
    # Keep what is needed to write the answer file later: first field,
    # raw candidate ids and the second '#TAB#' part of the impression field.
    test_session_data.append([fields[0], impression_parts[0].split(), impression_parts[1]])

    # [start, end) range of this session inside the flat candidate list.
    session_start = len(test_candidate)
    for candidate_doc in alldoc:
        test_candidate.append(candidate_doc)
        test_user_his.append(clickids + [0] * (50 - len(clickids)))
    test_index.append([session_start, len(test_candidate)])



In [None]:
# Word vocabulary (index 0 = PADDING) and the title matrix. Row 0 is an
# all-zero title; the remaining rows follow the iteration order of `news`.
word_dict = {'PADDING': 0}
news_title = [[0] * 30]

for title_tokens in news.values():
    # Every token enters the vocabulary, even those beyond the 30-word cut.
    encoded = [word_dict.setdefault(token, len(word_dict)) for token in title_tokens]
    encoded = encoded[:30]
    news_title.append(encoded + [0] * (30 - len(encoded)))

news_title = np.array(news_title, dtype='int32')

In [None]:
# Convert the Python lists built above into int32 NumPy arrays (rebinding the
# same names). The batch generators use them as index arrays into news_title.
train_candidate=np.array(train_candidate,dtype='int32')
train_label=np.array(train_label,dtype='int32')
train_user_his=np.array(train_user_his,dtype='int32')

# Validation counterparts; test_candidate is a flat list of single news indices.
test_candidate=np.array(test_candidate,dtype='int32') 
test_user_his=np.array(test_user_his,dtype='int32')

In [None]:

def generate_batch_data_random(batch_size):
    """Endlessly yield shuffled training batches.

    The sample indices are shuffled once when the generator starts, cut into
    consecutive chunks of at most ``batch_size`` indices and then cycled
    through forever.

    Args:
        batch_size: Maximum number of samples per batch.

    Yields:
        ``([item, user], [labels])`` where ``item`` is ``news_title`` indexed
        by the batch's rows of ``train_candidate``, ``user`` is ``news_title``
        indexed by the batch's rows of ``train_user_his`` and ``labels`` are
        the matching rows of ``train_label``.

    Raises:
        ValueError: If there are no training samples (raised on first use).
    """
    y = train_label
    idlist = np.arange(len(y))
    np.random.shuffle(idlist)

    # Striding by batch_size guarantees that every batch is non-empty, also
    # when len(y) is an exact multiple of batch_size.
    batches = [idlist[start:start + batch_size] for start in range(0, len(y), batch_size)]
    if not batches:
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError('generate_batch_data_random: no training samples available')

    while True:
        for batch_ids in batches:
            item = news_title[train_candidate[batch_ids]]
            user = news_title[train_user_his[batch_ids]]
            yield ([item, user], [y[batch_ids]])
            

In [None]:

def generate_batch_data(batch_size):
    """Endlessly yield validation batches in their original order.

    The indices of ``test_candidate`` are cut into consecutive chunks of at
    most ``batch_size`` and cycled through forever.

    Args:
        batch_size: Maximum number of candidates per batch.

    Yields:
        ``[item, user]`` where ``item`` is ``news_title`` indexed by the
        batch's entries of ``test_candidate`` and ``user`` is ``news_title``
        indexed by the batch's rows of ``test_user_his``.

    Raises:
        ValueError: If there are no validation candidates (raised on first use).
    """
    idlist = np.arange(len(test_candidate))

    # Striding by batch_size guarantees that every batch is non-empty, also
    # when the candidate count is an exact multiple of batch_size.
    batches = [idlist[start:start + batch_size] for start in range(0, len(idlist), batch_size)]
    if not batches:
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError('generate_batch_data: no validation candidates available')

    while True:
        for batch_ids in batches:
            item = news_title[test_candidate[batch_ids]]
            user = news_title[test_user_his[batch_ids]]
            yield ([item, user])
            

In [None]:
import keras
from keras.layers import *
from keras.models import Model
from keras import backend as K
import os
# Expose only the GPU with index 3 to CUDA for this process.
# NOTE(review): the index is hard-coded for one particular machine, and the
# variable is set after keras has already been imported above — confirm that
# the backend has not initialised its devices by then, or move this up.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [None]:

class Attention(Layer):
    """Multi-head scaled dot-product attention as a Keras layer.

    The layer is called with a list of either three tensors
    ``[Q_seq, K_seq, V_seq]`` or five tensors
    ``[Q_seq, K_seq, V_seq, Q_len, V_len]``. The optional length tensors are
    used to mask positions at and beyond the given length (see ``Mask``).

    Args:
        nb_head: Number of attention heads.
        size_per_head: Width of each head; the output width is
            ``nb_head * size_per_head``.
        **kwargs: Passed on to the ``Layer`` constructor.
    """

    def __init__(self, nb_head, size_per_head, **kwargs):
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.output_dim = nb_head*size_per_head
        super(Attention, self).__init__(**kwargs)

    def get_config(self):
        """Return the layer config including the constructor arguments.

        Without this, a model containing the layer cannot be re-created from
        its config because ``nb_head`` and ``size_per_head`` would be missing.
        """
        config = super(Attention, self).get_config()
        config.update({'nb_head': self.nb_head,
                       'size_per_head': self.size_per_head})
        return config

    def build(self, input_shape):
        """Create the query, key and value projection matrices.

        Each matrix maps the last dimension of the corresponding input
        (``input_shape[0]``, ``[1]``, ``[2]``) to ``output_dim``.
        """
        self.WQ = self.add_weight(name='WQ',
                                  shape=(input_shape[0][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WK = self.add_weight(name='WK',
                                  shape=(input_shape[1][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WV = self.add_weight(name='WV',
                                  shape=(input_shape[2][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        super(Attention, self).build(input_shape)

    def Mask(self, inputs, seq_len, mode='mul'):
        """Mask positions along axis 1 of ``inputs`` from ``seq_len`` onwards.

        Args:
            inputs: Tensor to mask.
            seq_len: Tensor whose ``[:, 0]`` column holds one length per
                sample, or ``None`` to return ``inputs`` unchanged.
            mode: ``'mul'`` multiplies masked positions by 0; ``'add'``
                subtracts ``1e12`` from them (for use before a softmax).

        Returns:
            The masked tensor.

        Raises:
            ValueError: If ``seq_len`` is given and ``mode`` is neither
                ``'mul'`` nor ``'add'``.
        """
        # Identity test: ``==`` on a tensor may be overloaded.
        if seq_len is None:
            return inputs
        if mode not in ('mul', 'add'):
            raise ValueError("mode must be 'mul' or 'add', got %r" % (mode,))
        # one_hot marks the position equal to the length; 1 - cumsum turns it
        # into 1 for positions before the length and 0 from there on.
        mask = K.one_hot(seq_len[:,0], K.shape(inputs)[1])
        mask = 1 - K.cumsum(mask, 1)
        # Add trailing axes so the mask broadcasts against ``inputs``.
        for _ in range(len(inputs.shape)-2):
            mask = K.expand_dims(mask, 2)
        if mode == 'mul':
            return inputs * mask
        return inputs - (1 - mask) * 1e12

    def call(self, x):
        """Apply attention to ``[Q, K, V]`` or ``[Q, K, V, Q_len, V_len]``.

        Raises:
            ValueError: If ``x`` does not hold exactly 3 or 5 tensors.
        """
        if len(x) == 3:
            Q_seq,K_seq,V_seq = x
            Q_len,V_len = None,None
        elif len(x) == 5:
            Q_seq,K_seq,V_seq,Q_len,V_len = x
        else:
            raise ValueError('Attention expects a list of 3 or 5 tensors, got %d' % len(x))
        # Project each input and split the result into heads:
        # (batch, seq, output_dim) -> (batch, nb_head, seq, size_per_head).
        Q_seq = K.dot(Q_seq, self.WQ)
        Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
        Q_seq = K.permute_dimensions(Q_seq, (0,2,1,3))
        K_seq = K.dot(K_seq, self.WK)
        K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
        K_seq = K.permute_dimensions(K_seq, (0,2,1,3))
        V_seq = K.dot(V_seq, self.WV)
        V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
        V_seq = K.permute_dimensions(V_seq, (0,2,1,3))
        # Scaled dot-product scores between queries and keys.
        A = K.batch_dot(Q_seq, K_seq, axes=[3,3]) / self.size_per_head**0.5
        # Move the key axis to position 1 so Mask can act on it, then move it back.
        A = K.permute_dimensions(A, (0,3,2,1))
        A = self.Mask(A, V_len, 'add')
        A = K.permute_dimensions(A, (0,3,2,1))
        A = K.softmax(A)
        # Weighted sum of the values, then merge the heads again.
        O_seq = K.batch_dot(A, V_seq, axes=[3,2])
        O_seq = K.permute_dimensions(O_seq, (0,2,1,3))
        O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
        O_seq = self.Mask(O_seq, Q_len, 'mul')
        return O_seq

    def compute_output_shape(self, input_shape):
        """Output shape: batch and sequence axes of the query, width ``output_dim``."""
        return (input_shape[0][0], input_shape[0][1], self.output_dim)

In [None]:

    
# Words per title and titles per user history fed to the model.
MAX_SENT_LENGTH=30
MAX_SENTS=50

# --- Title encoder: word ids -> one vector per title -------------------------
title_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
# Trainable 300-dimensional word embedding with one row per word_dict entry.
embedding_layer = Embedding(len(word_dict), 300, trainable=True)
embedded_sequences = embedding_layer(title_input)
d_emb=Dropout(0.2)(embedded_sequences)
# Word-level self-attention: 20 heads of width 20, i.e. output width 400.
selfatt=Attention(20,20)([d_emb,d_emb,d_emb])
selfatt=Dropout(0.2)(selfatt)
# Attention pooling: score every position, softmax over positions, then take
# the weighted sum of the self-attention outputs.
attention = Dense(200,activation='tanh')(selfatt)
attention = Flatten()(Dense(1)(attention))
attention_weight = Activation('softmax')(attention)
rep=Dot((1, 1))([selfatt, attention_weight])
titleEncoder = Model([title_input], rep)

# --- User encoder: MAX_SENTS clicked titles -> one user vector ----------------
news_input = Input((MAX_SENTS,MAX_SENT_LENGTH,))
# The same titleEncoder is applied to every title in the history.
news_encoders = TimeDistributed(titleEncoder)(news_input)
# Title-level self-attention over the encoded history.
news_encoders=Dropout(0.2)(Attention(20,20)([news_encoders,news_encoders,news_encoders]))
# --- Training model: 1 + npratio candidate titles per sample ------------------
candidates = keras.Input((1+npratio,MAX_SENT_LENGTH,))
candidate_vecs = TimeDistributed(titleEncoder)(candidates)  
# Attention pooling over the history, same scheme as inside the title encoder.
news_attention= Dense(200,activation='tanh')(news_encoders)
news_attention = Flatten()(Dense(1)(news_attention))
news_attention_weight = Activation('softmax')(news_attention)
userrep=Dot((1, 1))([news_encoders, news_attention_weight])
# Dot product of the user vector with every candidate, softmax over candidates.
logits = dot([userrep, candidate_vecs], axes=-1)
logits = Activation(keras.activations.softmax)(logits)      
model = Model([candidates,news_input], logits)
model.compile(loss=['categorical_crossentropy'], optimizer='adam', metrics=['acc'])

# --- Scoring model: one candidate title per sample ----------------------------
# Reuses titleEncoder and userrep from above (shared layers) and applies a
# sigmoid to the dot product instead of a softmax over several candidates.
candidate_one = keras.Input((MAX_SENT_LENGTH,))
candidate_one_vec = titleEncoder([candidate_one])
score =Activation(keras.activations.sigmoid)(dot([userrep, candidate_one_vec], axes=-1))
modeltest = keras.Model([candidate_one,news_input], score)

In [None]:
# Two training passes; a new generator per pass means a new shuffle each time.
for ep in range(2):
    traingen = generate_batch_data_random(30)
    model.fit_generator(
        traingen,
        epochs=1,
        steps_per_epoch=len(train_label) // 30,
    )

# Score the validation candidates one per step.
valgen = generate_batch_data(1)
pred = modeltest.predict_generator(valgen, steps=len(test_candidate), verbose=1)

# Cut the flat score array back into one slice per validation session, using
# the [start, end) ranges recorded in test_index.
predictsession = [pred[start:end, 0] for start, end in test_index]

In [None]:
import json

# Write one JSON object per validation session, one per line, to answer.json.
# Each test_session_data entry is [uid, candidate ids, time]; the session's
# scores in predictsession are in the same order as its candidate ids.
with open('answer.json', 'w') as f:
    for session, scores in zip(test_session_data, predictsession):
        uid, doc_ids, session_time = session
        line = {
            "uid": uid,
            # float() turns the NumPy scalars into plain, JSON-serialisable floats.
            "impression": {doc_id: float(score) for doc_id, score in zip(doc_ids, scores)},
            "time": session_time,
        }
        f.write(json.dumps(line) + '\n')