In [150]:
import sys  
sys.path.insert(0, '../search-engine')
from utils import *
import pandas as pd
from igraph import *
import implicit
from scipy.sparse import csr_matrix
from copy import deepcopy

## Create graph

In [30]:
# get data 
data = get_tweets(None, None, mode = "read", data_directory = '../data/')

In [38]:
# since igraph does not use node ids -> we need to create a dict storing the conversion
ids_mapping = {}
hashtags = {}
counter = 0
for tweet in data:
    if "retweeted_status" in tweet.keys():
        if tweet["user"]["id"] not in ids_mapping.keys():
            ids_mapping[tweet["user"]["id"]] = counter            
            counter += 1

In [39]:
retweets = {}
for i, tweet in enumerate(data): 
    try: 
        id2 = ids_mapping[tweet["retweeted_status"]["user"]["id"]]        
        id1 = ids_mapping[tweet["user"]["id"]]
        try: retweets[id1].append(id2)
        except: retweets[id1] = [id2]
    except: 
        continue
    
    #try: hashtags[counter] += [tweet["entities"]["hashtags"]]



In [40]:
edgelist = []
for node in retweets:
    for i in retweets[node]:
        edgelist.append((node, i))
edgelist = list(set(edgelist)) # remove duplicated links

In [45]:
g = Graph(n = len(ids_mapping))
g.add_edges(edgelist)

Split graph into train and test

In [47]:
# fraction of edges to select as test-set
p = 0.2

# graphsize
N = len(g.es)

# idxs of all the edges
all_idxs = range(N)

# sample idxs of edges through the function "choice"
test_idxs = np.random.choice(a=all_idxs, size=int(p*N),replace=False)

In [112]:
ground_truth = set()
trainset = set()
for idx, one_edge in enumerate(g.es):
    
    # take n1 and n2 idx from one_edge, that is an igraph edge *object*
    n1 = one_edge.source
    n2 = one_edge.target

    if idx in test_idxs:
        ground_truth.add((n1, n2))
    else:
        trainset.add((n1, n2))

In [62]:
# test_idxs --> nodes q formen part del test
# ground_truth --> links entre els nodes del test set
# trainset --> links entre els nodes del train set 

In [113]:
def find_nodes_at_distance_2(graph):
    """
    starting from a graph this function returns all the nodes at distance 2
    """
    all_potential_recommendations = set()
    
    for n1 in graph.vs:
        # all the nodes at distance 1 and distance 2
        nodes_at_most_distant_2 = set(graph.neighborhood(n1, order = 2))
            
        if len(nodes_at_most_distant_2) > 0:
            for n2 in nodes_at_most_distant_2: 
                # since n1 is an igraph vertex object, we need to extract the id
                n1_index = n1.index
                if n2 != n1_index: 
                    a = min(n2, n1_index)
                    b = max(n2, n1_index)
                    all_potential_recommendations.add((a,b))
                
    return list(set(all_potential_recommendations))

Get nodes at distance 2 for future prediction

In [114]:
# Get nodes at distance 2 in test set
dist_2 = find_nodes_at_distance_2(g)
test_nodes = [(u,v) for u, v in dist_2 if (u in test_idxs) and (v in test_idxs)]

In [125]:
# Create dataframe with ground truth
test_list = []
for u, v in test_nodes: 
    if (u, v) in ground_truth: test_list.append((u,v,1))
    else: test_list.append((u,v,0))
        
test_df = pd.DataFrame(test_list, columns = ["u", "v", "ground_truth"])

## Adamic-Adar

In [63]:
def compute_ADA(u,v, graph):
    """
    compute adamic-adar from scratch
    """
    # set of neighbors of u
    outlinks_from_u = set(graph.neighbors(u))
    # set of neighbors of v
    inlinks_to_v = set(graph.neighbors(v))
    
    # set Z of neighbors of both
    bridges = outlinks_from_u.intersection(inlinks_to_v)
    
    # degree of nodes in set Z
    deg_ = [graph.degree(node) for node in bridges]
    
    # computing the reciprocal in log-scale
    out = [1./np.log2(dd+1) for dd in deg_]

    return sum(out)

In [158]:
pred_ADA = []
# Predictions
for i, row in test_df.iterrows():
    pred = compute_ADA(int(row["u"]), int(row["v"]), g)
    pred_ADA.append(pred)
test_df["ADA"] = pred_ADA

<font color = "Red">
    
    
**CALCULAR nDCG**
    

## ALS

In [None]:
# Get the adjacency matrix data
M = g.get_adjacency().data

In [151]:
M = csr_matrix(M)

In [152]:
# Run the model ALS
model = implicit.als.AlternatingLeastSquares(factors=10, calculate_training_loss=True,  iterations=5)

# Train the model on a sparse matrix of item/user/confidence weights
model.fit(M)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [153]:
def predict_ALS(testset, model):
    """
    predict for a list of observations the score for adding/removing a link
    """

    # initialize the empty list
    all_predictions = []

    # scroll the obs
    for n1, n2, w in testset:
        
        # take here the low-dimensional vectors returned by the matrix factorization
        
        array_n1 = model.user_factors[n1, :]
        array_n2 = model.item_factors[n2, :]

        # multiplying these vectors we generate an approximation for the edge score
        one_p = np.dot(array_n1, array_n2)

        all_predictions.append(one_p)
        
    return all_predictions

In [154]:
# generate the predictions
all_predictions = predict_ALS(test_df[["u", "v", "ground_truth"]].values, model)

# add predictions to df
test_df["ALS"] = all_predictions

In [159]:
test_df

Unnamed: 0,u,v,ground_truth,ADA,ALS
0,371,9935,0,0.095358,-0.000495
1,2260,6001,0,0.095358,-0.000151
2,4321,14803,0,0.095358,0.010087
3,2901,3619,0,0.095358,-0.000478
4,480,16009,0,0.095358,-0.000072
...,...,...,...,...,...
32383,2142,13032,0,0.113583,0.000629
32384,112,5267,0,0.127667,0.000258
32385,553,14304,0,0.095358,0.000457
32386,2132,7591,0,0.280150,0.086707


## PageRank

delete vertices from g --> all vertices except the ones in test_idxs

In [173]:
new_graph = deepcopy(g)
new_graph.delete_vertices([i for i in range(len(ids_mapping)) if i not in test_idxs])

In [201]:
def ppage_rank(graph, test_df):
    # here we need also the argument vid, which corresponds to a node-id
    probab = []
    for u, v in test_df[["u", "v"]].values: 
        probab.append(graph.personalized_pagerank(reset_vertices=u)[v])
    return probab

In [None]:
test_df["PPageRank"] = ppage_rank(g, test_df)

In [None]:
test_df

In [None]:
def get_topk (scores, column, topk)
    scores = scores.sort_values(by=column, ascending = False)
    return scores[:topk]

## Our score