In [2]:
import pandas as pd
import networkx as nx
# graphtools , networkkit

In [3]:
df_fav_authors = pd.read_csv('datasets_recsys/ff_users_follow_authors.csv', sep=';')
df_fav_authors.columns = ['user', 'author']
df_fav_authors.head()

Unnamed: 0,user,author
0,100012,2324222
1,100012,499755
2,100012,1785440
3,100012,1149077
4,100012,440495


In [4]:
df_stories = pd.read_csv('datasets_recsys/ff_books_dataset.csv', sep='|', engine='python')
df_stories.head()

Unnamed: 0,story_title,author,franchise,story_id,story_summary,q_words,date_submit,date_update,chapters,reviews
0,Sweet Silence,100016.0,Harry Potter,1895110.0,Draco meets Ron on the shore of the Great Lake...,725.0,"5/30/2002,1/11/2004,11/8/2003,12/2/2003,5/19/2...",6/5/2004,1.0,2.0
1,Draco Omniscient,100016.0,Harry Potter,1574594.0,Draco's views on the world around him.,1274.0,"5/30/2002,1/11/2004,11/8/2003,12/2/2003,5/19/2...",10/26/2003,1.0,8.0
2,Never Gone Forever,100035.0,Harry Potter,767307.0,*slash* Chapter 3 now up...very short...::is ...,3100.0,"5/9/2002,5/8/2002",7/31/2002,3.0,39.0
3,Shadows,100035.0,Harry Potter,765982.0,AU in a big way. Sev is a vampire in a great ...,3598.0,"5/9/2002,5/8/2002",5/15/2002,3.0,35.0
4,Gwen Potter:Harry\'s Girl,100062.0,Harry Potter,396405.0,Gwen Potter...Harry girl...Dark brown hair to ...,120.0,,9/8/2001,0.0,1.0


In [5]:
# lista de autores de libros 
books_authors = list(set([int(x) for x in list(df_stories['author'])]))

In [6]:
# dejamos solo authores de libros 
df_authors_books = df_fav_authors[df_fav_authors['author'].isin(books_authors)]
df_authors_books.head()

Unnamed: 0,user,author
4,100012,440495
6,100012,397833
7,100012,2365707
11,100016,205600
12,100016,147795


In [13]:
print('users: {}'.format(len(df_authors_books['user'].unique())))
print('authors: {}'.format(len(df_authors_books['author'].unique())))

users: 1543976
authors: 361601


In [30]:
# guardamos csv con fav_authors de libros (sin header)
#df_authors_books.to_csv('user_fav_author_books.csv', index=False, sep=';', header=False)

### calculamos page-rank para cada autor y lo exportamos a csv (author ; page_rank):

In [29]:
# abrimos archivo csv user; fav_author books como grafo
g = nx.read_edgelist('datasets_recsys/user_fav_author_books.csv', delimiter=";", data=True, encoding='utf-8', nodetype=int, create_using=nx.DiGraph())
N, K = g.order(), g.size()


In [31]:
# average degree 
avg_degr = float(K)/N
print('Nodes: {}'.format(N))
print('Edges: {}'.format(K))
print('Average degree: {}'.format(avg_degr))

Nodes: 1668103
Edges: 12830660
Average degree: 7.691767234996879


In [45]:
# declaramos grafo como unidirected graph
g_uni = g.to_undirected()

# obtenemos todos los componentes de red
g_net_components = nx.connected_component_subgraphs(g_uni)
g_node_mc = list(g_net_components)[0]

# calculamos page rank para cada autor 
page_rank = nx.pagerank(g_node_mc, alpha=0.85, max_iter=100) # alpha --> Damping parameter for PageRank

page_rank

{4194309: 2.456277007455133e-07,
 4194316: 1.0719266748432019e-07,
 4194322: 1.7216432456040163e-06,
 8388630: 1.016746959575412e-07,
 8388632: 1.2423408116442427e-06,
 4194330: 1.9409232353313185e-07,
 4194331: 1.4170666683673498e-07,
 4194332: 9.376073824854161e-08,
 4194333: 9.944100609099277e-08,
 4194336: 7.837752757612172e-06,
 4194340: 9.040453428441662e-08,
 4194341: 2.3767398355395908e-07,
 8388646: 1.0535371456126801e-07,
 4194343: 1.0062731984786778e-07,
 4194344: 1.0220360349465203e-07,
 4194345: 1.2140201453611726e-07,
 4194355: 9.203476224113973e-08,
 4194358: 1.852999524425077e-06,
 4194360: 1.2024581857443975e-07,
 4194361: 1.2654848845261634e-07,
 8388667: 9.212424455789577e-08,
 4194363: 1.0681021752755283e-07,
 4194365: 1.0014391548955697e-07,
 8388672: 1.0309613496352656e-07,
 4194369: 9.444661905814478e-08,
 4194380: 3.662343004136177e-07,
 4194387: 9.371463752402318e-08,
 4194390: 1.6191609532592734e-06,
 4194391: 9.798319250095964e-08,
 4194401: 9.758806270849886

**Parametros de pagerank:** <br>
<br>
**alpha:** click-through probability, is included to prevent sinks (i.e. pages with no outgoing links) from "absorbing" the PageRanks of those pages connected to the sinks.
It is easy to see that an infinite surfer would have to end up in a sink given enough time, so the damping factor allows a heuristic to offset the importance of those sinks. <br>

**max_iter:** iteraciones para alcanzar el resultado de page rank 



In [46]:
data = []

for x in page_rank:
    data.append([x, page_rank[x]])

df_results_pagerank = pd.DataFrame(data, columns=['author', 'page_rank'])

df_results_pagerank.to_csv('authors_pagerank.csv', sep=';', index=False)