In [1]:
import numpy
import pandas as pd
from numpy import matrix

In [3]:
# Read the interaction log, one comma-separated record per line.
# Blank lines (for example a trailing newline at the end of the file) are
# dropped here: an empty record has no second field and would crash the
# `log[1]` lookup used when the tag list is built.
with open('./nametagrate_num_nonull.txt', 'r') as log_fp:
    logs = [ text for text in (line.strip() for line in log_fp) if text ]

In [4]:
# Split every raw record on commas; each record becomes a tuple of string fields.
logs_tuple = list(map(lambda record: tuple(record.split(",")), logs))

In [5]:
# Distinct restaurants (field 0) and tags (field 1) seen in the log.
# Sorted so that every name maps to the same matrix row/column on each run:
# the iteration order of a set of strings can change between interpreter
# sessions, which would silently reshuffle the index-only similarity matrices.
restaurants = sorted(set(log[0] for log in logs_tuple))
tags = sorted(set(log[1] for log in logs_tuple))

In [8]:
# `graph` is the restaurant x tag co-occurrence matrix:
# graph[i, j] counts the log records pairing restaurants[i] with tags[j].
graph = numpy.matrix(numpy.zeros([len(restaurants), len(tags)]))

# Name -> position lookups built once; calling list.index() for every record
# would rescan the whole list each time.
restaurant_pos = {name: pos for pos, name in enumerate(restaurants)}
tag_pos = {name: pos for pos, name in enumerate(tags)}

for log in logs_tuple:
    graph[restaurant_pos[log[0]], tag_pos[log[1]]] += 1

print (graph)

[[ 1.  1.  1. ...,  1.  1.  0.]
 [ 1.  1.  0. ...,  1.  1.  0.]
 [ 1.  1.  0. ...,  1.  0.  0.]
 ..., 
 [ 1.  0.  0. ...,  1.  1.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 1.  1.  0. ...,  1.  0.  0.]]


In [20]:
# Number of distinct restaurants, i.e. the row count of `graph`.
len(restaurants)

4851

In [21]:
# Number of distinct tags, i.e. the column count of `graph`.
len(tags)

14

In [9]:
# Initial restaurant-to-restaurant similarity: the identity matrix, so each
# restaurant starts out similar only to itself.
restaurant_sim = matrix(numpy.identity(len(restaurants), dtype=float))

In [10]:
# Display the freshly initialised restaurant similarity matrix.
restaurant_sim 

matrix([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  1., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  1.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  1.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  1.]])

In [11]:
# Initial tag-to-tag similarity: the identity matrix, so each tag starts out
# similar only to itself.
tag_sim = matrix(numpy.identity(len(tags), dtype=float))

In [12]:
# Display the freshly initialised tag similarity matrix.
tag_sim 

matrix([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.],
        [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.],
        [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.],
        [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.],
        [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
          0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
          0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
          0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
          0.],
        [ 0.,  0.,  0

In [13]:
# Raw count lookups into `graph` for a single restaurant or a single tag.
def get_tags_num(restaurant):
    """Return the `graph` row that belongs to `restaurant`.

    Each entry of the returned row is the number of log records linking
    `restaurant` to the tag at that position. Raises ValueError when
    `restaurant` is not present in `restaurants`.
    """
    row = restaurants.index(restaurant)
    return graph[row, :]

def get_restaurants_num(tag):
    """Return the `graph` column that belongs to `tag`, laid out as a row.

    Each entry is the number of log records linking the restaurant at that
    position to `tag`. Raises ValueError when `tag` is not present in `tags`.
    """
    col = tags.index(tag)
    return graph.T[col, :]

In [14]:
# Turn the non-zero entries of a count row back into names, kept in index order.
def get_tags(restaurant):
    """Return the tags linked to `restaurant`, in the order they appear in `tags`."""
    counts = get_tags_num(restaurant).tolist()[0]
    return [ tags[pos] for pos, count in enumerate(counts) if count > 0 ]

def get_restaurants(tag):
    """Return the restaurants linked to `tag`, in the order they appear in `restaurants`."""
    counts = get_restaurants_num(tag).tolist()[0]
    return [ restaurants[pos] for pos, count in enumerate(counts) if count > 0 ]

In [15]:
# Similarity between two restaurants, derived from the current tag similarities.
def restaurant_simrank(r1, r2, C):
    """Return the one-step SimRank score between restaurants `r1` and `r2`.

        score = C / (w(r1) * w(r2)) * sum(tag_sim[i, j])

    where the sum runs over every tag i linked to `r1` and every tag j linked
    to `r2`, "linked" means a non-zero entry in the restaurant's `graph` row,
    and w(r) is the sum of that row. A restaurant compared with itself
    always scores 1.

    Parameters:
        r1, r2: restaurant names; each must be present in `restaurants`.
        C: decay factor multiplying the normalised neighbour similarity.

    Reads the module-level `tag_sim`; nothing is modified.
    """
    if r1 == r2:
        return 1

    links_1 = get_tags_num(r1)
    links_2 = get_tags_num(r2)
    prefix = C / (links_1.sum() * links_2.sum())

    # 0/1 indicator rows of the tags linked to each restaurant. With numpy
    # matrix operands `*` is a matrix product, so the 1x1 result is exactly
    # the sum of tag_sim[i, j] over all linked (i, j) pairs, without building
    # name lists and calling tags.index() for every pair.
    member_1 = (links_1 > 0).astype(float)
    member_2 = (links_2 > 0).astype(float)
    postfix = (member_1 * tag_sim * member_2.T)[0, 0]
    return prefix * postfix

In [16]:
# Similarity between two tags, derived from the current restaurant similarities.
def tag_simrank(t1, t2, C):
    """Return the one-step SimRank score between tags `t1` and `t2`.

        score = C / (w(t1) * w(t2)) * sum(restaurant_sim[a, b])

    where the sum runs over every restaurant a linked to `t1` and every
    restaurant b linked to `t2`, "linked" means a non-zero entry in the tag's
    `graph` column, and w(t) is the sum of that column. A tag compared with
    itself always scores 1.

    Parameters:
        t1, t2: tag names; each must be present in `tags`.
        C: decay factor multiplying the normalised neighbour similarity.

    Reads the module-level `restaurant_sim`; nothing is modified.
    """
    if t1 == t2:
        return 1

    links_1 = get_restaurants_num(t1)
    links_2 = get_restaurants_num(t2)
    prefix = C / (links_1.sum() * links_2.sum())

    # 0/1 indicator rows of the restaurants linked to each tag. With numpy
    # matrix operands `*` is a matrix product, so the 1x1 result is exactly
    # the sum of restaurant_sim[a, b] over all linked (a, b) pairs, without
    # calling restaurants.index() for every pair.
    member_1 = (links_1 > 0).astype(float)
    member_2 = (links_2 > 0).astype(float)
    postfix = (member_1 * restaurant_sim * member_2.T)[0, 0]
    return prefix * postfix

In [22]:
# Rebuild both similarity matrices from the restaurant/tag graph.
def simrank(C=0.8, times=1):
    """Run `times` rounds of bipartite SimRank on `graph`.

    Each round replaces the module-level `restaurant_sim` and `tag_sim` with

        restaurant_sim[a, b] = C / (w(a) * w(b)) * sum(tag_sim[i, j])
        tag_sim[i, j]        = C / (w(i) * w(j)) * sum(restaurant_sim[a, b])

    where each sum runs over the tags (respectively restaurants) that have a
    non-zero `graph` entry for the two items being compared, w(.) is the sum
    of the item's `graph` row or column, and every diagonal entry is 1.
    Both new matrices are computed from the previous round's values before
    either global is overwritten.

    The whole round is expressed as matrix products instead of evaluating
    every pair in Python, which for thousands of restaurants means tens of
    millions of calls that each rescan the name lists.

    Parameters:
        C: decay factor multiplying the normalised neighbour similarity.
        times: number of rounds to run; 0 leaves both matrices untouched.

    Relies on `graph` being a numpy matrix, so that `*` below is a matrix
    product while `/` and `>` stay element-wise.
    """
    global restaurant_sim, tag_sim

    # 0/1 membership of each (restaurant, tag) pair, pre-divided by the row
    # sums (restaurant side) or column sums (tag side). Folding the
    # 1 / (w * w) normalisation into the operands avoids building a second
    # restaurants x restaurants matrix just for the denominators.
    linked = (graph > 0).astype(float)
    by_restaurant = linked / graph.sum(axis=1)
    by_tag = linked / graph.sum(axis=0)

    for run in range(times):
        # restaurants simrank
        new_restaurant_sim = C * (by_restaurant * tag_sim * by_restaurant.T)
        numpy.fill_diagonal(new_restaurant_sim, 1)

        # tags simrank (still reads the previous round's restaurant_sim)
        new_tag_sim = C * (by_tag.T * restaurant_sim * by_tag)
        numpy.fill_diagonal(new_tag_sim, 1)

        restaurant_sim = new_restaurant_sim
        tag_sim = new_tag_sim

In [None]:
if __name__ == '__main__':
    # Run one SimRank round with the default arguments, then print both
    # matrices. The tuple is built after simrank() returns, so it holds the
    # updated globals.
    simrank()
    for banner, similarity in (
        ("========================restaurant_sim======================", restaurant_sim),
        ("========================tag_sim=========================", tag_sim),
    ):
        print(banner)
        print (similarity)

In [None]:
# Display the current restaurant similarity matrix.
restaurant_sim

In [None]:
# Wrap the restaurant similarity matrix in a DataFrame whose rows and columns
# are labelled with the restaurant names, so the table (and any CSV written
# from it) can be read without separately knowing the order of `restaurants`.
tmp = pd.DataFrame(restaurant_sim, index=restaurants, columns=restaurants)

In [None]:
# Preview the CSV text for the top-left corner only. Calling to_csv() without
# a path returns the entire file as one string, which for the full
# restaurants x restaurants table would be dumped into the cell output.
print(tmp.iloc[:5, :5].to_csv())

In [None]:
# Write the similarity table to disk as CSV.
tmp.to_csv(path_or_buf='./emotion.csv')