In [1]:
import pandas as pd
import os
from typing import List, Tuple
import numpy as np

## LOad raw Data 
def load_articles(data_folder: str) -> pd.DataFrame:
    """
    Load the articles as pandas dataframe

    Keyword arguments:
        data_folder -- the path to the folder that contains the data

    Returns:
        pandas dataframe containing the article data
    """
    return pd.read_csv(
        open(os.path.join(data_folder, "raw-data.csv"), encoding="latin_1")
    )


# Load the tags (Useless)
def load_tags(data_folder: str) -> List[str]:
    """
    Load the tag names as list

    Keyword arguments:
        data_folder -- the path to the folder that contains the data

    Returns:
        list of tag names. The index of the tags in the list are the ids of the tags
    """
    tags = []
    with open(os.path.join(data_folder, "tags.dat")) as tag_file:
        for tag in tag_file.readlines():
            tags.append(tag.strip())
    return tags

## Article id -> tag matching
def load_article_tags(data_folder: str) -> List[List[str]]:
    """
    Load the tags for each article

    Keyword arguments:
        data_folder -- the path to the folder that contains the data

    Returns:
        list of tags per article. This is represented as a 2d list. Each element of the outer list
        represents the article at that index in the pandas Dataframe. The inner lists contain the
        tag ids for each article. So to get the first tag name for the first article you would call:
        tags[article_tags[0][0]]
    """
    item_tag = []
    with open(os.path.join(data_folder, "item-tag.dat")) as item_tag_file:
        for i, line in enumerate(item_tag_file.readlines()):
            item_tag.append([int(tag) for tag in line.strip().split()[1:]])
    return item_tag


def load_citations(data_folder: str, num_articles: int = 16980) -> np.ndarray:
    """
    Load the citation network

    Keyword arguments:
        data_folder -- the path to the folder that contains the data
        num_articles -- the number of articles in the dataset

    Returns:
        square 2d numpy matrix with citation information about articles.
        The citation matrix is num_articles X num_articles.
        There is a 1 in locations in which there is a citation between articles a -> b
        citations[a][b] and 0 otherwise.
    """
    citations = np.zeros([num_articles, num_articles])
    with open(os.path.join(data_folder, "citations.dat")) as citations_file:
        for i, line in enumerate(citations_file.readlines()):
            for citation in line.strip().split()[1:]:
                citations[i][int(citation)] = 1
    return citations

## train data user->article matching
def load_user_article_likes(
    data_folder: str, num_users: int = 5551, num_articles: int = 13584
) -> np.ndarray:
    """
    Load the article likes for each user

    Keyword arguments:
        data_folder -- the path to the folder that contains the data
        num_users -- the number of users in the dataset - default is 5551
        num_articles -- the number of articles in the dataset - default is 13584

    Returns:
        2d numpy matrix with describing user like information.
        This matrix is num_users X num_articles.
        There is a 1 in locations in which a user likes an article u -> a
        citations[u][a] and 0 otherwise.
    """
    user_items = np.zeros([num_users, num_articles])
    with open(os.path.join(data_folder, "train_data.dat")) as users_file:
        for i, line in enumerate(users_file.readlines()):
            for article in line.strip().split()[1:]:
                user_items[i][int(article)] = 1
    return user_items


def load_articles_and_user_article_likes(
    data_folder: str,
) -> Tuple[pd.DataFrame, np.ndarray]:
    """
    Load both the article dataframe and the user article likes numpy array.

    Keyword arguments:
        data_folder -- the path to the folder that contains the data

    Returns:
        Tuple
        First element is a dataframe containing article info
        Second is 2d numpy matrix with describing user like information.
    """
    articles_df = load_articles(data_folder)
    user_article_likes = load_user_article_likes(data_folder)
    return (articles_df, user_article_likes)

In [3]:
raw_data = load_articles('data')

In [4]:
raw_data.head(10)

Unnamed: 0,doc.id,title,citeulike.id,raw.title,raw.abstract
0,1,the metabolic world of escherichia coli is not...,42.0,The metabolic world of Escherichia coli is not...,To elucidate the organizational and evolutiona...
1,2,reverse engineering of biological complexity,43.0,Reverse Engineering of Biological Complexity,Advanced technologies and biology have extreme...
2,3,exploring complex networks,44.0,Exploring complex networks,"The study of networks pervades all of science,..."
3,4,comparative assessment of largescale data sets...,46.0,Comparative assessment of large-scale data set...,Comprehensive protein protein interaction maps...
4,5,navigation in a small world,47.0,Navigation in a small world,The small-world phenomenon â the principle t...
5,6,random graphs with arbitrary degree distributi...,48.0,Random graphs with arbitrary degree distributi...,Recent work on the structure of social network...
6,7,artificial gene networks for objective compari...,49.0,Artificial gene networks for objective compari...,Motivation: Large-scale gene expression profil...
7,8,the segment polarity network is a robust devel...,50.0,The segment polarity network is a robust devel...,"All insects possess homologous segments, but s..."
8,9,the evolutionary origin of complex features,52.0,The evolutionary origin of complex features.,A long-standing challenge to evolutionary theo...
9,10,early language acquisition cracking the speech...,60.0,Early language acquisition: cracking the speec...,"Infants learn language with remarkable speed, ..."


In [6]:
tags = load_tags('data')
tags

['ucsc-browser',
 'rr_interval',
 'userscripts',
 'newsciencenetwork',
 'sequence_similarity_search',
 'naturalcapital',
 'spiders',
 'localized',
 'segmentation_algorithm',
 'polymer_physics',
 'tandem-repeats',
 'gene_orthology',
 'ontologylearning',
 'environmental_effects',
 'originality',
 'trial_and_error',
 'fundation',
 '12_mar_10',
 'ioannidis',
 'canem',
 'mahalanobis_distance',
 'climate_variability',
 'eugenics',
 'appropriation',
 'communicating-uncertainty',
 'chemical_reaction_network_theory',
 'bringing',
 'advices',
 'berryphase',
 'noncoding_rna',
 'tcbr',
 'disease-network',
 'circuitry',
 'digital_researcher',
 'elgar',
 'projeto_tese',
 'geraint-rees',
 'ligation_independent_cloning',
 'fisherinformation',
 'oscilaltions',
 'regulatory_nw_inference',
 'cybermetrics',
 'preface',
 'feasibility',
 'open_innovation',
 'network_graphs',
 'dft_general',
 'reengineering',
 'scraped',
 'molecule_detection',
 'errors',
 'paired-end',
 'neuroprosthesis',
 'graphssocial-netw

In [12]:
vocab = []
with open(os.path.join("data/mult.dat")) as tag_file:
    for tag in tag_file.readlines():
        vocab.append(tag.strip())
vocab

['63 1:2 1666:1 132:1 901:1 1537:2 8:1 9:1 912:1 6594:1 6670:1 1168:1 1041:1 403:1 660:1 278:1 151:2 282:1 3483:1 1309:2 1438:2 32:1 1051:1 1572:1 37:2 1830:1 423:1 680:1 7:1 1197:1 349:1 5298:1 54:1 188:1 319:4 3265:1 322:1 1091:2 2373:1 454:1 4729:1 1747:2 469:1 86:1 1751:1 5024:1 93:2 229:2 6113:1 296:1 1659:1 633:1 101:1 486:1 1512:1 935:1 2159:1 1320:1 147:1 116:1 1401:2 890:1 1531:1 893:2',
 '59 1792:1 771:1 1859:1 518:1 2943:1 649:1 138:1 142:1 17:1 275:1 2452:1 661:1 4506:1 367:1 284:4 6560:1 1825:1 35:1 1030:2 1065:1 810:1 1351:1 1453:1 47:1 48:2 563:2 3893:1 313:1 714:1 2110:1 447:1 458:1 834:1 323:1 5317:1 1350:1 1735:1 1353:1 202:1 204:1 77:1 3665:1 3282:1 398:1 2134:1 345:2 805:1 992:1 2021:1 4326:1 1809:1 1519:1 2416:1 3439:1 1396:1 634:1 5499:1 1790:1 1151:1',
 '47 129:1 6:1 647:1 8:3 9:2 266:1 3467:1 2958:1 1039:1 16:1 88:1 4114:1 147:1 1429:1 25:1 1438:1 1439:1 3105:1 1698:1 7590:1 167:3 4651:1 49:2 1206:1 31:1 1212:1 319:1 708:1 326:1 203:1 845:1 3281:1 215:1 216:1 89

In [2]:
import numpy as np
import pickle, time
# import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
#init random seed
np.random.seed(5)

Instructions for updating:
non-resource variables are not supported in the long term


In [3]:
#find vocabulary_size = 8000
with open(r"data/vocabulary.dat") as vocabulary_file:
    vocabulary_size = len(vocabulary_file.readlines())
    
#find item_size = 16980
with open(r"data/mult.dat") as item_info_file:
    item_size = len(item_info_file.readlines())

#initialize item_infomation_matrix (16980 , 8000)
item_infomation_matrix = np.zeros((item_size , vocabulary_size))

#build item_infomation_matrix
with open(r"data/mult.dat") as item_info_file:
    sentences = item_info_file.readlines()
    
    for index,sentence in enumerate(sentences):
        words = sentence.strip().split(" ")[1:]
        for word in words:
            vocabulary_index , number = word.split(":")
            item_infomation_matrix[index][int(vocabulary_index)] =number

In [15]:
import numpy as np
rating_matrix = load_user_article_likes('data')
rating_matrix.shape

(5551, 13584)

In [17]:


class MF():
    def __init__(self , rating_matrix ):
        self.num_u = rating_matrix.shape[0] #5551
        self.num_v = rating_matrix.shape[1] #16980
        self.u_lambda = 100
        self.v_lambda = 0.1
        self.k = 50 
        self.a = 1
        self.b =0.01
        self.R = np.mat(rating_matrix)
        self.C = np.mat(np.ones(self.R.shape)) * self.b
        self.C[np.where(self.R>0)] = self.a
        self.I_U = np.mat(np.eye(self.k) * self.u_lambda)
        self.I_V = np.mat(np.eye(self.k) * self.v_lambda)
        self.U = np.mat(np.random.normal(0 , 1/self.u_lambda , size=(self.k,self.num_u)))
        self.V = np.mat(np.random.normal(0 , 1/self.v_lambda , size=(self.k,self.num_v)))
                        

    def test(self):
        print( ((U_cut*self.R[np.ravel(np.where(self.R[:,j]>0)[1]),j] + self.v_lambda * self.V_sdae[j])).shape)
    def ALS(self , V_sdae):
        self.V_sdae = np.mat(V_sdae)
        
        V_sq = self.V * self.V.T * self.b
        for i in range(self.num_u):
            idx_a = np.ravel(np.where(self.R[i,:]>0)[1])
            V_cut = self.V[:,idx_a]
            self.U[:,i] = np.linalg.pinv( V_sq+ V_cut * V_cut.T * (self.a-self.b) + self.I_U )*(V_cut*self.R[i,idx_a].T) #V_sq+V_cut*V_cut.T*a_m_b = VCV^T
        
        U_sq = self.U * self.U.T * self.b
        for j in range(self.num_v):
            idx_a = np.ravel(np.where(self.R[:,j]>0)[1])
            U_cut = self.U[:,idx_a]
            self.V[:,j] = np.linalg.pinv(U_sq+U_cut*U_cut.T*(self.a-self.b)+self.I_V)* (U_cut*self.R[idx_a,j] + self.v_lambda * np.resize(self.V_sdae[j],(self.k,1)))
        
        return self.U ,self.V



In [5]:
def mask(corruption_level ,size):
    mask = np.random.binomial(1, 1 - corruption_level, [size[0],size[1]])
    return mask

def add_noise(x , corruption_level ):
    x = x * mask(corruption_level , x.shape)
    return x

In [16]:
class CDL():
    def __init__(self , rating_matrix , item_infomation_matrix):
        
        self.n_input = item_infomation_matrix.shape[1]
        self.n_hidden1 = 200
        self.n_hidden2 = 50
        self.k = 50
        
        self.lambda_w = 0.1
        self.lambda_n = 10
        self.lambda_u = 1
        self.lambda_v = 10
        
        self.drop_ratio = 0.1
        self.learning_rate = 0.01
        self.epochs = 2
        self.batch_size = 256
        
        self.a = 1
        self.b =0.01
        self.P = 1
        
        self.num_u = rating_matrix.shape[0]
        self.num_v = rating_matrix.shape[1]
        
        self.Weights = {
            'w1' : tf.Variable(tf.truncated_normal( [self.n_input , self.n_hidden1] , mean=0.0, stddev= tf.truediv(1.0,self.lambda_w))),
            'w2' : tf.Variable(tf.truncated_normal( [self.n_hidden1 , self.n_hidden2] , mean=0.0, stddev= tf.truediv(1.0,self.lambda_w))),
            'w3' : tf.Variable(tf.truncated_normal( [self.n_hidden2 , self.n_hidden1] , mean=0.0, stddev= tf.truediv(1.0,self.lambda_w))),
            'w4' : tf.Variable(tf.truncated_normal( [self.n_hidden1 , self.n_input] , mean=0.0,  stddev= tf.truediv(1.0,self.lambda_w)))   
        }
        self.Biases = {
            'b1' : tf.Variable( tf.zeros(shape=self.n_hidden1) ),
            'b2' : tf.Variable( tf.zeros(shape=self.n_hidden2) ),
            'b3' : tf.Variable( tf.zeros(shape=self.n_hidden1) ),
            'b4' : tf.Variable( tf.zeros(shape=self.n_input) ),
        }
        
        self.item_infomation_matrix = item_infomation_matrix
        
        self.rating_matrix = rating_matrix
        
        for i in range(self.num_u):
            try:
                x = np.random.choice(np.where(self.rating_matrix[i,:]>0)[0] , self.P)
            except:
                x=1
            self.rating_matrix[i,:].fill(0)
            self.rating_matrix[i,x] = 1
        
        self.confidence = np.mat(np.ones(self.rating_matrix.shape)) * self.b
        self.confidence[np.where(self.rating_matrix>0)] = self.a
        
    def encoder(self , x , drop_ratio):
        w1, b1 = self.Weights['w1'], self.Biases['b1']
        L1 = tf.nn.sigmoid( tf.matmul(x,w1) + b1 )
        L1 = tf.nn.dropout( L1 , keep_prob= 1 - drop_ratio )
        
        w2, b2 = self.Weights['w2'],self.Biases['b2']
        
        L2 = tf.nn.sigmoid( tf.matmul(L1,w2) + b2 )
        L2 = tf.nn.dropout(L2 , keep_prob= 1 - drop_ratio)
        
        return L2
    
    def decoder(self , x , drop_ratio):
        w3, b3 = self.Weights['w3'], b2 = self.Biases['b3']
        L3 = tf.nn.sigmoid(tf.matmul(x,w3) + b3)
        L3 = tf.nn.dropout(L3 , keep_prob= 1 - drop_ratio)

        w4, b4 = self.Weights['w4'], self.Biases['b4']
        L4 = tf.nn.sigmoid(tf.matmul(L3,w4) + b4)
        L4 = tf.nn.dropout(L4 , keep_prob= 1 - drop_ratio)

        return L4
    

    def build_model(self):
        
        self.X_0 = tf.placeholder(tf.float32 , shape=(None , self.n_input))
        self.X_c = tf.placeholder(tf.float32 , shape=(None , self.n_input))
        self.C = tf.placeholder(tf.float32 , shape=(self.num_u,None) )
        self.R = tf.placeholder(tf.float32 , shape=(self.num_u,None) )
        self.drop_ratio = tf.placeholder(tf.float32)
        self.model_batch_data_idx = tf.placeholder( tf.int32 , shape=None )
        
        #SDAE item factor
        V_sdae = self.encoder(self.X_0, self.drop_ratio )
        
        #SDAE output 
        sdae_output = self.decoder( V_sdae, self.drop_ratio )
        
        
        
        
        batch_size = tf.cast(tf.shape(self.X_0)[0], tf.int32)
        
        
        self.V = tf.Variable( tf.zeros(shape=[self.num_v, self.k], dtype=tf.float32 ) ) 
        self.U = tf.Variable( tf.zeros(shape=[self.num_u, self.k], dtype=tf.float32 ) )
        
        batch_V = tf.reshape(tf.gather(self.V, self.model_batch_data_idx), shape=[batch_size, self.k])
        
        loss_1 = self.lambda_u * tf.nn.l2_loss( self.U ) 
        loss_2 = self.lambda_w * 1/2 * tf.reduce_sum([tf.nn.l2_loss(w)+tf.nn.l2_loss(b) for w,b in zip(self.Weights.values() , self.Biases.values())])
        loss_3 = self.lambda_v * tf.nn.l2_loss(batch_V - V_sdae)
        loss_4 = self.lambda_n * tf.nn.l2_loss(sdae_output - self.X_c)
        
        loss_5 = tf.reduce_sum(tf.multiply(self.C ,
                                    tf.square(self.R - tf.matmul(self.U , batch_V , transpose_b=True))) 
                                )
        
        self.loss = loss_1 + loss_2 + loss_3 + loss_4 + loss_5
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
        
    def train_model(self):
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        
        start_time = time.time()
        
        random_idx = np.random.permutation(self.num_v)
        
        self.item_infomation_matrix_noise = add_noise(self.item_infomation_matrix , 0.3)
        
        for epoch in range(self.epochs):
            batch_cost = 0
            for i in range(0 , self.item_infomation_matrix.shape[0] , self.batch_size):
                
                batch_idx = random_idx[i:i+self.batch_size]
                _ , loss = self.sess.run([self.optimizer, self.loss] , 
                                            feed_dict={self.X_0 : self.item_infomation_matrix_noise[batch_idx,:] , 
                                                       self.X_c : self.item_infomation_matrix[batch_idx,:] , 
                                                       self.R : self.rating_matrix[: , batch_idx], 
                                                       self.C : self.confidence[: , batch_idx], 
                                                       self.drop_ratio : 0.1 ,
                                                       self.model_batch_data_idx  : batch_idx })
                batch_cost = batch_cost + loss

            print ("Training //", "Epoch %d //" % (epoch+1), " Total cost = {:.2f}".format(batch_cost), "Elapsed time : %d sec" % (time.time() - start_time))
            
        return self.sess.run((tf.matmul(self.U, self.V, transpose_b=True)))

In [9]:
R_train = rating_matrix.copy()
cdl = CDL(R_train , item_infomation_matrix)
cdl.build_model()
R = cdl.train_model()

Training // Epoch 1 //  Total cost = 657817522.00 Elapsed time : 22 sec
Training // Epoch 2 //  Total cost = 517610760.50 Elapsed time : 41 sec


In [10]:
all_cnt = 0
for i in range(rating_matrix.shape[0]):
    l_score = np.ravel(R[i,:]).tolist()
    pl = sorted(enumerate(l_score),key=lambda d:d[1],reverse=True)
    l_rec = [i[0] for i in pl][:300]
    s_rec = set(l_rec)
    s_true = set(np.ravel(np.where(rating_matrix[i,:]>0)))
    cnt_hit = len(s_rec.intersection(s_true))
    all_cnt = all_cnt + cnt_hit/len(s_true)
all_cnt

1577.0

In [49]:
np.where(rating_matrix[i,:]>0), sum(rating_matrix), set(np.ravel(np.where(rating_matrix[0,:]>0)))

((array([], dtype=int64),),
 array([18., 23., 80., ...,  4., 10.,  7.]),
 {11650})

In [14]:
R.shape

(5551, 13584)