In [257]:
import sys  
sys.path.insert(0, '../search-engine')
from utils import *
import pandas as pd
from igraph import *
import implicit
from scipy.sparse import csr_matrix
from sklearn.metrics import ndcg_score
from copy import deepcopy

## Create graph

In [30]:
# get data 
data = get_tweets(None, None, mode = "read", data_directory = '../data/')

In [38]:
# since igraph does not use node ids -> we need to create a dict storing the conversion
ids_mapping = {}
hashtags = {}
counter = 0
for tweet in data:
    if "retweeted_status" in tweet.keys():
        if tweet["user"]["id"] not in ids_mapping.keys():
            ids_mapping[tweet["user"]["id"]] = counter            
            counter += 1

In [256]:
retweets = {}
hashtags = {} 
for i, tweet in enumerate(data): 
    try: 
        id2 = ids_mapping[tweet["retweeted_status"]["user"]["id"]]        
        id1 = ids_mapping[tweet["user"]["id"]]
        try: retweets[id1].append(id2)
        except: retweets[id1] = [id2]
        
        if id1 not in hashtags.keys(): hashtags[id1] = []
        if id2 not in hashtags.keys(): hashtags[id2] = []
        
        # store hashtags list for each user for personalized score
        for hashtag in tweet["entities"]["hashtags"]:
            hashtags[id1].append(hashtag["text"])
            hashtags[id2].append(hashtag["text"])
    except: 
        continue

In [40]:
edgelist = []
for node in retweets:
    for i in retweets[node]:
        edgelist.append((node, i))
edgelist = list(set(edgelist)) # remove duplicated links

In [45]:
g = Graph(n = len(ids_mapping))
g.add_edges(edgelist)

Split graph into train and test

In [47]:
# fraction of edges to select as test-set
p = 0.2

# graphsize
N = len(g.es)

# idxs of all the edges
all_idxs = range(N)

# sample idxs of edges through the function "choice"
test_idxs = np.random.choice(a=all_idxs, size=int(p*N),replace=False)

In [112]:
ground_truth = set()
trainset = set()
for idx, one_edge in enumerate(g.es):
    
    # take n1 and n2 idx from one_edge, that is an igraph edge *object*
    n1 = one_edge.source
    n2 = one_edge.target

    if idx in test_idxs:
        ground_truth.add((n1, n2))
    else:
        trainset.add((n1, n2))

In [62]:
# test_idxs --> nodes q formen part del test
# ground_truth --> links entre els nodes del test set
# trainset --> links entre els nodes del train set 

In [113]:
def find_nodes_at_distance_2(graph):
    """
    starting from a graph this function returns all the nodes at distance 2
    """
    all_potential_recommendations = set()
    
    for n1 in graph.vs:
        # all the nodes at distance 1 and distance 2
        nodes_at_most_distant_2 = set(graph.neighborhood(n1, order = 2))
            
        if len(nodes_at_most_distant_2) > 0:
            for n2 in nodes_at_most_distant_2: 
                # since n1 is an igraph vertex object, we need to extract the id
                n1_index = n1.index
                if n2 != n1_index: 
                    a = min(n2, n1_index)
                    b = max(n2, n1_index)
                    all_potential_recommendations.add((a,b))
                
    return list(set(all_potential_recommendations))

Get nodes at distance 2 for future prediction

In [114]:
# Get nodes at distance 2 in test set
dist_2 = find_nodes_at_distance_2(g)
test_nodes = [(u,v) for u, v in dist_2 if (u in test_idxs) and (v in test_idxs)]

In [125]:
# Create dataframe with ground truth
test_list = []
for u, v in test_nodes: 
    if (u, v) in ground_truth: test_list.append((u,v,1))
    else: test_list.append((u,v,0))
        
test_df = pd.DataFrame(test_list, columns = ["u", "v", "ground_truth"])

## Adamic-Adar

In [63]:
def compute_ADA(u,v, graph):
    """
    compute adamic-adar from scratch
    """
    # set of neighbors of u
    outlinks_from_u = set(graph.neighbors(u))
    # set of neighbors of v
    inlinks_to_v = set(graph.neighbors(v))
    
    # set Z of neighbors of both
    bridges = outlinks_from_u.intersection(inlinks_to_v)
    
    # degree of nodes in set Z
    deg_ = [graph.degree(node) for node in bridges]
    
    # computing the reciprocal in log-scale
    out = [1./np.log2(dd+1) for dd in deg_]

    return sum(out)

In [158]:
pred_ADA = []
# Predictions
for i, row in test_df.iterrows():
    pred = compute_ADA(int(row["u"]), int(row["v"]), g)
    pred_ADA.append(pred)
test_df["ADA"] = pred_ADA

<font color = "Red">
    
    
**CALCULAR nDCG**
    

## ALS

In [None]:
# Get the adjacency matrix data
M = g.get_adjacency().data

In [151]:
M = csr_matrix(M)

In [152]:
# Run the model ALS
model = implicit.als.AlternatingLeastSquares(factors=10, calculate_training_loss=True,  iterations=5)

# Train the model on a sparse matrix of item/user/confidence weights
model.fit(M)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [153]:
def predict_ALS(testset, model):
    """
    predict for a list of observations the score for adding/removing a link
    """

    # initialize the empty list
    all_predictions = []

    # scroll the obs
    for n1, n2, w in testset:
        
        # take here the low-dimensional vectors returned by the matrix factorization
        
        array_n1 = model.user_factors[n1, :]
        array_n2 = model.item_factors[n2, :]

        # multiplying these vectors we generate an approximation for the edge score
        one_p = np.dot(array_n1, array_n2)

        all_predictions.append(one_p)
        
    return all_predictions

In [154]:
# generate the predictions
all_predictions = predict_ALS(test_df[["u", "v", "ground_truth"]].values, model)

# add predictions to df
test_df["ALS"] = all_predictions

In [159]:
test_df

Unnamed: 0,u,v,ground_truth,ADA,ALS
0,371,9935,0,0.095358,-0.000495
1,2260,6001,0,0.095358,-0.000151
2,4321,14803,0,0.095358,0.010087
3,2901,3619,0,0.095358,-0.000478
4,480,16009,0,0.095358,-0.000072
...,...,...,...,...,...
32383,2142,13032,0,0.113583,0.000629
32384,112,5267,0,0.127667,0.000258
32385,553,14304,0,0.095358,0.000457
32386,2132,7591,0,0.280150,0.086707


## PageRank

delete vertices from g --> all vertices except the ones in test_idxs

In [173]:
new_graph = deepcopy(g)
new_graph.delete_vertices([i for i in range(len(ids_mapping)) if i not in test_idxs])

In [201]:
def ppage_rank(graph, test_df):
    # here we need also the argument vid, which corresponds to a node-id
    probab = []
    for u, v in test_df[["u", "v"]].values: 
        probab.append(graph.personalized_pagerank(reset_vertices=u)[v])
    return probab

In [202]:
test_df["PPageRank"] = ppage_rank(g, test_df)

In [203]:
test_df

Unnamed: 0,u,v,ground_truth,ADA,ALS,PPageRank
0,371,9935,0,0.095358,-0.000495,0.000107
1,2260,6001,0,0.095358,-0.000151,0.000182
2,4321,14803,0,0.095358,0.010087,0.000025
3,2901,3619,0,0.095358,-0.000478,0.000065
4,480,16009,0,0.095358,-0.000072,0.000039
...,...,...,...,...,...,...
32383,2142,13032,0,0.113583,0.000629,0.000112
32384,112,5267,0,0.127667,0.000258,0.000325
32385,553,14304,0,0.095358,0.000457,0.000185
32386,2132,7591,0,0.280150,0.086707,0.000383


In [253]:
def get_topk (scores, column, topk = 10):
    scores = scores.sort_values(by=column, ascending = False)
    return scores[:topk]

## Our score

In [249]:
def compute_Jaccard(u,v):
    if len(set(u).union(set(v))) == 0:
        return 0.0
    return len(set(u).intersection(set(v)))/len(set(u).union(set(v)))

In [286]:
personalized_score = []
lamb = 1/3
max_ADA = test_df["ADA"].max()
# Predictions
for i, row in test_df.iterrows():
    pred = compute_Jaccard(hashtags[int(row["u"])], hashtags[int(row["v"])])
    personalized_score.append(row["ADA"] + lamb * max_ADA * pred)
test_df["Personalized"] = personalized_score

In [254]:
get_topk(test_df, "ADA")

Unnamed: 0,u,v,ground_truth,ADA,ALS,PPageRank,personalized
9882,6912,14506,0,5.788057,1.262843,0.019939,6.109616
5199,74,7317,0,5.187976,0.974226,0.029246,7.117328
4653,3044,3091,0,4.883447,0.686398,0.01997,6.8128
5961,46,1275,0,4.580674,0.240261,0.013795,5.866909
185,3066,14506,0,4.263924,-0.16817,0.012981,4.263924
18714,46,4800,0,4.154005,0.308623,0.022614,5.118681
23810,4321,14506,0,4.047398,-0.051517,0.015299,4.047398
15469,3091,7317,1,3.835365,0.812402,0.029696,5.764718
13172,3044,7317,0,3.789573,0.857237,0.029545,5.718925
14879,74,3044,0,3.499988,0.823073,0.020913,5.42934


In [289]:
get_topk(test_df, "ground_truth")

Unnamed: 0,u,v,ground_truth,ADA,ALS,PPageRank,Personalized
18948,6569,6912,1,0.0,0.009587,0.205396,0.0
9483,1612,18031,1,0.0,0.000118,0.231461,0.0
19184,2177,5111,1,0.0,0.031048,0.001848,0.0
11459,2177,6641,1,0.291919,0.022969,0.001839,0.291919
25376,9383,16177,1,0.0,0.000481,0.018723,0.0
14387,6912,8712,1,0.0,0.078688,0.001444,0.771741
12616,14016,14506,1,0.0,0.979228,0.292824,0.964676
17630,897,4803,1,0.142628,0.029225,0.002441,1.107305
14033,2177,11959,1,0.0,0.031353,0.001597,0.0
7699,3886,7823,1,0.0,0.000174,0.008582,0.0


## Compare scores - nDCG

Transform continuous scores to categorical ones (from 0 to 3) being 3 the best score. 

In [304]:
def compute_intervals(df, column, n):
    intervals = []
    minimum = df[column].min()
    maximum = df[column].max()
    interval_length = (maximum - minimum) / (n + 1)
    
    for i in range(n + 1):
        intervals.append((minimum + interval_length * i, minimum + interval_length * (i+1)))
    return intervals

def substitution (x, intervals):
    for i, pos in enumerate(intervals): 
        if x >= pos[0] and x <= pos[1]:
            return i

In [305]:
ranked_scores = test_df.copy()

In [306]:
ranked_scores["ground_truth"] = ranked_scores["ground_truth"].apply(lambda x: 3 if x == 1 else 0)
ranked_scores["ADA"] = ranked_scores["ADA"].apply(lambda x: substitution(x, 
                                                  compute_intervals(ranked_scores,"ADA",3)))
ranked_scores["ALS"] = ranked_scores["ALS"].apply(lambda x: substitution(x, 
                                                  compute_intervals(ranked_scores,"ALS",3)))
ranked_scores["PPageRank"] = ranked_scores["PPageRank"].apply(lambda x: substitution(x, 
                                                  compute_intervals(ranked_scores,"PPageRank",3)))
ranked_scores["Personalized"] = ranked_scores["Personalized"].apply(lambda x: substitution(x, 
                                                  compute_intervals(ranked_scores,"Personalized",3)))

Compute nDCG between scores

In [319]:
def dcg_at_k(y_true, y_score,  k=10):
    order = np.argsort(y_score)[::-1] # get the list of indexes of the predicted score sorted in descending order.
    y_true = np.take(list(y_true), order[:k]) # sort the actual relevance label of the documents based on predicted score(hint: np.take) and take first k.
    gain = 2**(y_true) - 1 # Compute gain (use formula 7 above)
    discounts = np.log2(np.arange(len(y_true)) + 2) # Compute denominator
    return np.sum(gain / discounts) #return dcg@k


def ndcg_at_k(y_true, y_score, k=10):    
    dcg_max = dcg_at_k(y_true, y_true, k) # Ideal dcg
    if not dcg_max:
        return 0
    return np.round(dcg_at_k(y_true, y_score, k) / dcg_max,4)  # return ndcg@k

In [350]:
def compute_ndcg(df, column, topk = 20): 
    sorted_ = df.sort_values(by = column, ascending = False)
    return ndcg_at_k(sorted_["ground_truth"], sorted_[column], topk)

In [356]:
print("nDCG score ranking with ADA: {}".format(compute_ndcg(ranked_scores, "ADA")))
print("nDCG score ranking with ALS: {}".format(compute_ndcg(ranked_scores, "ALS")))
print("nDCG score ranking with PPageRank: {}".format(compute_ndcg(ranked_scores, "PPageRank")))
print("nDCG score ranking with Personalized: {}".format(compute_ndcg(ranked_scores, "Personalized")))

nDCG score ranking with ADA: 0.0373
nDCG score ranking with ALS: 0.1476
nDCG score ranking with PPageRank: 0.2282
nDCG score ranking with Personalized: 0.1563
