In [1]:
import sys
sys.path.insert(0, '../search-engine')  # make the project's `utils` module importable
from utils import *
import pandas as pd
from igraph import *
import implicit
from scipy.sparse import csr_matrix
from sklearn.metrics import ndcg_score
from copy import deepcopy
# Original import order is kept on purpose: the star imports above can shadow names.
import networkx as nx
import numpy as np  # `np` is used throughout the notebook; import it explicitly


Bad key "text.kerning_factor" on line 4 in
/usr/local/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template
or from the matplotlib source distribution


# NB3 - Link analysis

## 1. Load data 
Read tweets from data directory.

In [2]:
# Load the tweet collection from '../data/' using the project's get_tweets helper in "read" mode.
# NOTE(review): the two leading None arguments are presumably parameters that are only
# needed when tweets are downloaded rather than read — confirm against utils.get_tweets.
data = get_tweets(None, None, mode = "read", data_directory = '../data/')

## 2. Graph creation

Graph will be created using `igraph` library. The graph will contain only users that have retweeted tweets from other users that also appear in the data collection. Main characteristics are: 
- Directed graph: A -> B means user A retweeted a tweet from user B. 
- Nodes: Users
- Edges: Retweets. Not weighted graph: meaning that, if user A retweeted three times user B, the number of retweets will not be represented in the graph, only the connection between users. 

NOTE: `igraph` graphs do not contain labels for nodes, therefore, there needs to be a mapping between users' id and the id of the node in the graph. 

In [3]:
# Mapping between user id and node id in the graph.
# Every user who authored at least one retweet receives the next free
# consecutive node id (0, 1, 2, ...) in order of first appearance.
ids_mapping = {}
for tweet in data:
    if "retweeted_status" not in tweet.keys():
        continue
    user_id = tweet["user"]["id"]
    if user_id not in ids_mapping:
        ids_mapping[user_id] = len(ids_mapping)

# Number of node ids handed out so far (equals the next free node id).
counter = len(ids_mapping)

In [4]:
# Create a mapping between node id and user name.
# Only retweets are considered; when a user appears several times the name
# found in the last retweet wins.
user_names_mapping = {}
retweeting_users = (entry["user"] for entry in data if "retweeted_status" in entry.keys())
for user in retweeting_users:
    node_id = ids_mapping[user["id"]]
    user_names_mapping[node_id] = user["name"]

Creation of a dictionary containing as key the node id of the users that retweeted at least a tweet and as value, the node id of the users that wrote the original tweets.

An additional dictionary with the hashtags of each tweet (having as key the node id of the users) that will be needed later. 

In [5]:
# Creation of two dictionaries keyed by node id:
#   retweets[a] -> list of node ids whose tweets user `a` retweeted (repetitions kept)
#   hashtags[n] -> list of hashtag texts found in the retweets that involve user `n`
retweets = {}
hashtags = {}

for tweet in data:
    try:
        # Store relations between users that have retweeted at least one tweet.
        # These lookups fail when the tweet is not a retweet, or when the original
        # author has no node id (ids_mapping only contains users that retweeted).
        id2 = ids_mapping[tweet["retweeted_status"]["user"]["id"]]
        id1 = ids_mapping[tweet["user"]["id"]]

        retweets.setdefault(id1, []).append(id2)

        # Make sure both users have an entry, even if the tweet has no hashtags
        hashtags.setdefault(id1, [])
        hashtags.setdefault(id2, [])

        # Store hashtags list for each user for personalized score
        for hashtag in tweet["entities"]["hashtags"]:
            hashtags[id1].append(hashtag["text"])
            hashtags[id2].append(hashtag["text"])
    except (KeyError, TypeError):
        # Skip tweets with missing fields. Only lookup failures are caught, so
        # KeyboardInterrupt and genuine programming errors are no longer hidden
        # as they were by the previous bare `except:`.
        continue

In [6]:
# Flatten the retweets dictionary into (retweeter, author) node-id pairs
# for the creation of the graph.
edgelist = [(node, target) for node in retweets for target in retweets[node]]

# Remove duplicated links (since the graph is not weighted)
edgelist = list(set(edgelist))

Creation of the graph. 

In [7]:
# Create a graph with one vertex per mapped user, then add the de-duplicated retweet links.
# NOTE(review): no `directed` argument is passed; igraph's Graph constructor presumably
# defaults to an undirected graph, whereas the prose above describes a directed graph.
# Confirm which is intended before relying on edge direction in later cells.
g = Graph(n = len(ids_mapping))
g.add_edges(edgelist)

## 3. Networks based predictions
Prepare the dataset: split graph into train and test, 80-20% respectively. 

In [8]:
p = 0.2       # fraction of edges to select as test-set
N = len(g.es) # number of edges in the graph

all_idxs = range(N) # idxs of all the edges

# Sample the test edges at random; the fixed seed makes the split reproducible
np.random.seed(0)
test_idxs = np.random.choice(a=all_idxs, size=int(p*N),replace=False)

# `idx in test_idxs` on a NumPy array scans the whole array for every edge.
# A set gives constant-time membership tests with the same result.
test_idx_lookup = set(test_idxs.tolist())

# Create the train-set and ground truth for the test set
ground_truth = set() # links (source, target) whose edge index was sampled for the test set
trainset = set()     # all remaining links

for idx, one_edge in enumerate(g.es):
    link = (one_edge.source, one_edge.target)

    if idx in test_idx_lookup:
        ground_truth.add(link)
    else:
        trainset.add(link)

### Distance 2 
The goal of this part of the project is to predict the probability of having an edge for those users at distance 2. 

Therefore, we need to consider those nodes that are at distance 2, to predict whether there is an edge between them. 

In [9]:
# Select pairs of nodes that are at most two hops apart (this includes direct neighbours)
def find_nodes_at_distance_2(graph):
    """Collect the unordered vertex pairs lying within two hops of each other.

    Parameters
    ----------
    graph : object exposing `vs` (iterable of vertices with an `index`
        attribute) and `neighborhood(vertex, order=...)`, e.g. an igraph Graph.

    Returns
    -------
    list of (int, int)
        Each pair is stored once as (smaller index, larger index); a vertex is
        never paired with itself.
    """
    candidate_pairs = set()

    for vertex in graph.vs:
        source = vertex.index
        # vertex indices reported by the graph for a radius of two hops
        within_two_hops = set(graph.neighborhood(vertex, order = 2))

        for other in within_two_hops:
            if other == source:
                continue
            candidate_pairs.add((min(other, source), max(other, source)))

    return list(set(candidate_pairs))

# Candidate pairs: every pair of vertices within two hops in the full graph
dist_2 = find_nodes_at_distance_2(g)

# Keep only pairs whose two endpoints both appear in test_idxs.
# NOTE(review): test_idxs was sampled from *edge* indices, but it is compared
# against *vertex* ids here — confirm that this selection is intended.
# A set replaces the per-pair scan of the NumPy array; the selected pairs are unchanged.
test_idx_lookup = set(int(i) for i in test_idxs)
test_nodes = [(u,v) for u, v in dist_2 if (u in test_idx_lookup) and (v in test_idx_lookup)]

Create a dataframe containing the ground truth for the test set. 0 means they have no direct link, while 1 means they are connected. 

NOTE: there is a path of length at most 2 between the pairs of nodes we are considering.

In [10]:
# Create dataframe with ground truth: label 1 when the candidate pair is one
# of the held-out links, 0 otherwise
test_list = [(u, v, 1 if (u, v) in ground_truth else 0) for u, v in test_nodes]

test_df = pd.DataFrame(test_list, columns = ["u", "v", "ground_truth"])
test_df.head()

Unnamed: 0,u,v,ground_truth
0,5951,13583,0
1,3908,10085,0
2,325,8039,0
3,1156,6255,0
4,14735,14835,0


## General statistics on train and test sets 

Statistics considered: 
- `number of nodes`
- `number of edges`
- `top-10 inlink users`: number of input links for each node 
- `top-10 pagerank users`: it defines a probability distribution over all the nodes in the graph. A score/probability assigned to each node indicates the importance of the single node, taking into account both local and global structure of the graph (link)
- `top-10 closeness users` and `total closeness centrality`: closeness is a measure of centrality in a network, calculated as the reciprocal of the sum of the lengths of the shortest paths between the node and all other nodes in the graph. Thus, the more central a node is, the closer it is to all other nodes

In [11]:
# Mapping to users names: build one edge table per split and replace the
# node ids with the users' display names.
# NOTE(review): distinct users sharing a display name would presumably end up
# with the same label after this replacement — confirm that is acceptable.
edge_columns = ["Retweeter", "Author"]

general_df = pd.DataFrame(edgelist,  columns = edge_columns).replace(user_names_mapping)

# The sets are materialised as lists so pandas receives an ordered sequence
train_df = pd.DataFrame(list(trainset), columns = edge_columns)
train_df_names = train_df.replace(user_names_mapping)

# Only candidate pairs labelled as real links are kept for the test table
linked_pairs = test_df[test_df["ground_truth"] != 0]
test_df_names = linked_pairs[["u", "v"]].replace(user_names_mapping)

ground_df = pd.DataFrame(list(ground_truth), columns = edge_columns).replace(user_names_mapping)

In [12]:
# Create directed graphs with networkx (imported as `nx` in the notebook's import cell).
# Nodes are user names; each row of an edge table becomes one edge.
general_G = nx.DiGraph()
general_G.add_edges_from(general_df.values)

train_G = nx.DiGraph()
train_G.add_edges_from(train_df_names.values)

# The test graph holds the candidate pairs labelled as links, plus every user
# that takes part in a ground-truth link (added as nodes without edges).
test_G = nx.DiGraph()
test_G.add_edges_from(test_df_names.values)
test_G.add_nodes_from(ground_df["Retweeter"].values)
test_G.add_nodes_from(ground_df["Author"].values)

ground_G = nx.DiGraph()
ground_G.add_edges_from(ground_df.values)

In [13]:
# Get number of nodes and links of every graph
size_reports = [
    ("General graph contains\n - {} users\n - {} retweet links", general_G),
    ("\nTraining graph contains\n - {} users\n - {} retweet links", train_G),
    ("\nTesting graph contains\n - {} users\n - {} retweet links", test_G),
    ("\nTesting ground truth graph contains\n - {} users\n - {} retweet links", ground_G),
]
for template, graph in size_reports:
    print(template.format(len(graph.nodes), len(graph.edges)))

General graph contains
 - 12270 users
 - 18552 retweet links

Training graph contains
 - 10485 users
 - 14709 retweet links

Testing graph contains
 - 3781 users
 - 76 retweet links

Testing ground truth graph contains
 - 3781 users
 - 3706 retweet links


In [14]:
# Get top-10 inlinks: users ranked by in-degree, one report per graph
inlink_reports = [
    ("\nTop-10 inlinks of the general graph", general_G),
    ("\nTop-10 inlinks of training graph", train_G),
    ("\nTop-10 inlinks of testing graph", test_G),
    ("\nTop-10 inlinks of testing ground trugh graph", ground_G),
]
for title, graph in inlink_reports:
    print(title)
    by_in_degree = sorted(graph.in_degree(), key=lambda x: x[1], reverse=True)
    for row in by_in_degree[:10]:
        print(row)


Top-10 inlinks of the general graph
('LORI HENDRY', 1417)
('Dr. Kelli Ward 🇺🇸', 691)
('Aaron Rupar', 467)
('❌🇺🇸Steve🇺🇸🇺🇸America First🇺🇸🇮🇹MAGA🇺🇸KAG', 463)
('Machiavelli', 444)
('DeplorableArmyBrat', 276)
('Ryan Pence ⭐⭐⭐', 234)
('John Harwood', 226)
('Citizens for Ethics', 224)
('Katie Johnson2020', 207)

Top-10 inlinks of training graph
('LORI HENDRY', 996)
('Dr. Kelli Ward 🇺🇸', 306)
('❌🇺🇸Steve🇺🇸🇺🇸America First🇺🇸🇮🇹MAGA🇺🇸KAG', 211)
('Aaron Rupar', 189)
('Machiavelli', 174)
('Ryan Pence ⭐⭐⭐', 154)
('DeplorableArmyBrat', 153)
('Citizens for Ethics', 146)
('John Harwood', 140)
('Jon Cooper 🇺🇸', 118)

Top-10 inlinks of testing graph
('❌🇺🇸Steve🇺🇸🇺🇸America First🇺🇸🇮🇹MAGA🇺🇸KAG', 11)
('Machiavelli', 10)
('Aaron Rupar', 7)
('sakshi🌸', 2)
('𝒜𝒩𝒿𝒶𝒶𝓃', 2)
('LivePDDave 🇺🇸 🚨 🥊', 2)
('Pigzzyy', 2)
('Speak out omaha', 2)
('Mike Engleman⭐⭐⭐', 2)
('#CompassionateRelease4Reality', 1)

Top-10 inlinks of testing ground trugh graph
('LORI HENDRY', 224)
('Dr. Kelli Ward 🇺🇸', 81)
('❌🇺🇸Steve🇺🇸🇺🇸America First🇺🇸🇮🇹

In [15]:
# Get top-10 pagerank users: one report per graph
pagerank_reports = [
    ("\nTop-10 pagerank of the general graph", general_G),
    ("\nTop-10 pagerank of training graph", train_G),
    ("\nTop-10 pagerank of testing graph", test_G),
    ("\nTop-10 pagerank of testing ground trugh graph", ground_G),
]
for title, graph in pagerank_reports:
    print(title)
    by_pagerank = sorted(nx.pagerank(graph).items(), key=lambda x: x[1], reverse=True)
    for row in by_pagerank[:10]:
        print(row)


Top-10 pagerank of the general graph
('LORI HENDRY', 0.10166842783178236)
('PlayTheTrumpCard ⚡️ Legal Votes Only Please', 0.030645838388244674)
('Ryan Pence ⭐⭐⭐', 0.030153812353229552)
('Michael Guy', 0.028978706097937325)
('John Harwood', 0.020153132846440026)
('Citizens for Ethics', 0.018765167825236947)
('LivePDDave 🇺🇸 🚨 🥊', 0.01654695811855514)
('Jon Cooper 🇺🇸', 0.015881267864422927)
('(((DeanObeidallah)))', 0.01524893854881681)
('Dr. Kelli Ward 🇺🇸', 0.014380354249883619)

Top-10 pagerank of training graph
('LORI HENDRY', 0.033385958877422434)
('Dr. Kelli Ward 🇺🇸', 0.008840814382755736)
('Ryan Pence ⭐⭐⭐', 0.00534753234688879)
('Aaron Rupar', 0.005237079053696349)
('John Harwood', 0.005005205827125815)
('❌🇺🇸Steve🇺🇸🇺🇸America First🇺🇸🇮🇹MAGA🇺🇸KAG', 0.004877377973597925)
('Citizens for Ethics', 0.004501607235068948)
('Machiavelli', 0.004221274018313596)
('DeplorableArmyBrat', 0.00406718578955062)
('Jason Miller', 0.0037276289435704854)

Top-10 pagerank of testing graph
('❌🇺🇸Steve🇺🇸🇺🇸Ame

In [16]:
# General closeness centrality: per-node closeness summed over each graph
def _report_total_closeness(graph, message):
    """Compute closeness centrality of `graph`, print its sum, return the per-node dict."""
    centrality = nx.closeness_centrality(graph)
    count = 0
    for elem in centrality:
        count += centrality[elem]
    print(message.format(count))
    return centrality

general_centrality = _report_total_closeness(general_G, "Closeness centrality of general graph: {}")
train_centrality = _report_total_closeness(train_G, "Closeness centrality of training graph: {}")
test_centrality = _report_total_closeness(test_G, "Closeness centrality of testing graph: {}")
ground_centrality = _report_total_closeness(ground_G, "Closeness centrality of ground truth graph: {}")

Closeness centrality of general graph: 4.319447339126578
Closeness centrality of training graph: 117.28396749144667
Closeness centrality of testing graph: 0.025374861699913473
Closeness centrality of ground truth graph: 5.122661381085046


In [17]:
# Get top-10 closeness users from the centrality dictionaries computed above
closeness_reports = [
    ("\nTop-10 closeness centrality of the general graph", general_centrality),
    ("\nTop-10 closeness centrality of training graph", train_centrality),
    ("\nTop-10 closeness centrality of testing graph", test_centrality),
    ("\nTop-10 closeness centrality of testing ground trugh graph", ground_centrality),
]
for title, centrality in closeness_reports:
    print(title)
    by_closeness = sorted(centrality.items(), key=lambda x: x[1], reverse=True)
    for row in by_closeness[:10]:
        print(row)


Top-10 closeness centrality of the general graph
('LORI HENDRY', 0.1237464910795504)
('PlayTheTrumpCard ⚡️ Legal Votes Only Please', 0.09859107503577307)
('Michael Guy', 0.08729944763851358)
('Dr. Kelli Ward 🇺🇸', 0.08413130367808799)
('Machiavelli', 0.06009053100161735)
('DeplorableArmyBrat', 0.05997625966192418)
('❌🇺🇸Steve🇺🇸🇺🇸America First🇺🇸🇮🇹MAGA🇺🇸KAG', 0.05954992073747574)
('Ryan Pence ⭐⭐⭐', 0.05357710798276674)
('VP Elect Storm Nicole', 0.05253837838076708)
('Sheri - Stop the Sloppy Steal!', 0.04722568252603301)

Top-10 closeness centrality of training graph
('LORI HENDRY', 0.11323693857528437)
('Danny Meilleur', 0.0888863347655578)
('Sue', 0.08480430734020797)
('MJ', 0.08404947269205874)
('Skip', 0.08375471142873275)
('Sandi', 0.08279679337089903)
('JWS - Parler- @JWSNMNJ hit me there.', 0.08256384638220617)
('Hanakuli', 0.08223192622611299)
('Mike', 0.08223002792945346)
('Robin Lynne Yates', 0.08221938660903651)

Top-10 closeness centrality of testing graph
('❌🇺🇸Steve🇺🇸🇺🇸America

## 3.1. Adamic-Adar (ADA)
In this section we will use Adamic-Adar to compute the probability of having a direct link between all nodes that are at most at distance 2. 
This algorithm does not require a training set, it can be directly applied to the test set. 

In [18]:
def compute_ADA(u,v, graph):
    """Adamic-Adar style score between vertices `u` and `v`.

    Every vertex that is a neighbour of both `u` and `v` contributes
    1 / log2(degree + 1), so low-degree common neighbours weigh more.

    Parameters
    ----------
    u, v : vertex ids accepted by `graph.neighbors` / `graph.degree`.
    graph : object exposing `neighbors(vertex)` and `degree(vertex)`.

    Returns
    -------
    Sum of the contributions; 0 when the two vertices share no neighbour.
    """
    # common neighbours of both end points (the set Z of the formula)
    shared = set(graph.neighbors(u)).intersection(set(graph.neighbors(v)))

    # reciprocal of the log-scaled degree of each common neighbour
    return sum([1./np.log2(graph.degree(node)+1) for node in shared])

In [19]:
# Predict the score of every candidate pair with Adamic-Adar on the graph `g`
pred_ADA = [
    compute_ADA(int(u), int(v), g)
    for u, v in zip(test_df["u"], test_df["v"])
]

test_df["ADA"] = pred_ADA # Append the scores to the dataframe
test_df.head()

Unnamed: 0,u,v,ground_truth,ADA
0,5951,13583,0,0.095358
1,3908,10085,0,0.095358
2,325,8039,0,0.113583
3,1156,6255,0,0.172195
4,14735,14835,0,0.127667


## 3.2. Alternating Least Squares (ALS)
This algorithm has been tested using the `implicit` library. It requires a sparse adjacency matrix. The approximated predictions are obtained from the dot product of the low-dimensionality vectors obtained from the matrix factorization performed by the model.

In [20]:
# Get the adjacency matrix data from igraph and store it as a SciPy CSR matrix
M = csr_matrix(g.get_adjacency().data)

In [21]:
# Declare the ALS model: 10 latent factors, 5 training iterations
# NOTE(review): no random seed is passed, so the fitted factors presumably
# differ between runs — confirm whether reproducibility matters here.
model = implicit.als.AlternatingLeastSquares(
    factors=10,
    iterations=5,
    calculate_training_loss=True,
)

# Train the model on the sparse adjacency matrix
model.fit(M)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [22]:
# Link prediction
def predict_ALS(testset, model):
    """Score each (n1, n2, label) triple with the factors of a trained ALS model.

    Parameters
    ----------
    testset : iterable of 3-item rows (n1, n2, label); the label is ignored.
    model : object exposing 2-D `user_factors` and `item_factors` arrays.

    Returns
    -------
    list
        One score per row: the dot product of row n1 of `user_factors` and
        row n2 of `item_factors`, i.e. the approximated edge score.
    """
    return [
        np.dot(model.user_factors[n1, :], model.item_factors[n2, :])
        for n1, n2, _ in testset
    ]

In [23]:
# Predict: score every candidate pair with the trained ALS model
als_rows = test_df[["u", "v", "ground_truth"]].values
all_predictions = predict_ALS(als_rows, model)

# Append predictions to the dataframe
test_df["ALS"] = all_predictions

In [24]:
# Preview the candidate pairs together with the ADA and ALS scores added so far
test_df.head()

Unnamed: 0,u,v,ground_truth,ADA,ALS
0,5951,13583,0,0.095358,-7e-05
1,3908,10085,0,0.095358,-6.1e-05
2,325,8039,0,0.113583,0.012325
3,1156,6255,0,0.172195,4.2e-05
4,14735,14835,0,0.127667,-4e-05


## 3.3. Personalized PageRank (PPageRank)

The library `igraph` has implemented a personalized page rank computation. In personalized page rank, the starting node needs to be specified.


For each pair of nodes (A, B) in the graph, the probability of having a link between them is computed by computing the PPageRank starting from A and getting the probability for B. 

In [25]:
# Since PPageRank is performed given a graph, all edges from outside the testset should
# be dropped.
# NOTE(review): test_idxs was sampled from edge indices, but here it decides which
# vertex ids are deleted — confirm this is intended.
# NOTE(review): the cell that fills the PPageRank column calls ppage_rank with `g`,
# so `new_graph` appears to be unused there — confirm whether it should be used.
new_graph = deepcopy(g)
new_graph.delete_vertices([i for i in range(len(ids_mapping)) if i not in test_idxs])

In [26]:
def ppage_rank(graph, test_df):
    """Personalized PageRank score of `v` for a walk restarting at `u`, per row.

    Parameters
    ----------
    graph : object exposing `personalized_pagerank(reset_vertices=...)` that
        returns one score per vertex, indexable by vertex id.
    test_df : pandas.DataFrame with columns "u" and "v".

    Returns
    -------
    list
        One score per row of `test_df`, in row order.
    """
    pairs = test_df[["u", "v"]].values
    probab = [None] * len(pairs)

    # Group the row positions by source vertex so the PageRank vector of each
    # source is computed once instead of once per row.
    rows_by_source = {}
    for position, (u, v) in enumerate(pairs):
        rows_by_source.setdefault(u, []).append((position, v))

    # Only one score vector is alive at a time, which keeps memory bounded.
    for u, targets in rows_by_source.items():
        scores = graph.personalized_pagerank(reset_vertices=u)
        for position, v in targets:
            probab[position] = scores[v]
    return probab

In [27]:
# Score every candidate pair with personalized PageRank computed on the graph `g`
test_df["PPageRank"] = ppage_rank(g, test_df)

In [28]:
# Display the score table with the ADA, ALS and PPageRank columns
test_df

Unnamed: 0,u,v,ground_truth,ADA,ALS,PPageRank
0,5951,13583,0,0.095358,-0.000070,0.000097
1,3908,10085,0,0.095358,-0.000061,0.000210
2,325,8039,0,0.113583,0.012325,0.000453
3,1156,6255,0,0.172195,0.000042,0.004614
4,14735,14835,0,0.127667,-0.000040,0.000632
...,...,...,...,...,...,...
30969,2306,8998,0,0.095358,-0.000092,0.000188
30970,4288,5111,0,0.140676,0.000436,0.000501
30971,521,9591,0,0.129401,0.000195,0.000456
30972,4235,12262,0,0.095358,-0.000215,0.000062


## 3.4. Personalized score
**Main idea**

Improve ADA algorithm by adding a partial score to the one returned by ADA algorithm. 
The partial score considers information from the texts: the hashtags that each user has used. By doing so, we can relate the users taking into account their common interests. 

The dictionary containing the hashtags used by each user has been created at the beginning of section 2. 


**Personalized score**

\begin{equation*} \mathbf{diversity\_score = ADA + \lambda * partial\_score} \end{equation*}
Where: 

- $\lambda = 1/3 * max(ADA)$ regulating the impact on the score 

- $ partial\_score = Jaccard(hashtags\_userA, hashtags\_userB)$

By considering the Jaccard similarity between the sets of hashtags used by both users, we are selecting the keywords that the users themselves have selected to summarize the content they are posting. 

In [29]:
# Jaccard similarity
def compute_Jaccard(u,v):
    """Jaccard similarity of two collections, compared as sets.

    Returns |u ∩ v| / |u ∪ v|, or 0.0 when both collections are empty.
    """
    first, second = set(u), set(v)
    combined = first.union(second)
    if not combined:
        return 0.0
    return len(first.intersection(second))/len(combined)

In [30]:
lamb = 1/3
max_ADA = test_df["ADA"].max()

# Compute personalized score: ADA plus the hashtag Jaccard similarity of the
# two users, scaled by lamb * max(ADA)
personalized_score = [
    ada + lamb * max_ADA * compute_Jaccard(hashtags[int(u)], hashtags[int(v)])
    for u, v, ada in zip(test_df["u"], test_df["v"], test_df["ADA"])
]

# Add it to the test dataframe
test_df["Personalized"] = personalized_score
test_df.head()

Unnamed: 0,u,v,ground_truth,ADA,ALS,PPageRank,Personalized
0,5951,13583,0,0.095358,-7e-05,9.7e-05,0.095358
1,3908,10085,0,0.095358,-6.1e-05,0.00021,0.095358
2,325,8039,0,0.113583,0.012325,0.000453,0.113583
3,1156,6255,0,0.172195,4.2e-05,0.004614,0.172195
4,14735,14835,0,0.127667,-4e-05,0.000632,0.127667


## 4. Predictions
Top predictions for each algorithm: the five highest-scoring candidate pairs are shown. 

In [31]:
def get_topk (scores, column, topk = 10):
    """Return the first `topk` rows of `scores` after sorting by `column`, largest first.

    Parameters
    ----------
    scores : pandas.DataFrame
    column : name of the column to rank by.
    topk : number of rows to keep (default 10).
    """
    ranked = scores.sort_values(by=column, ascending = False)
    return ranked.iloc[:topk]

In [32]:
# For every scoring method, print the five best-scored candidate pairs
algorithms = ["ADA", "ALS", "PPageRank", "Personalized"]
for algorithm in algorithms:
    print("\n\nTop-5 recommendations from {} algoritm".format(algorithm))
    best = get_topk(test_df, algorithm)
    shown = best[["u", "v", algorithm, "ground_truth"]]
    print(shown.head())



Top-10 recommendations from ADA algoritm
          u      v        ADA  ground_truth
26804  9670  12512  13.543626             0
22720  5539  12512   7.772859             0
20160  1547   3420   7.477919             0
29239   346   3420   7.012568             1
23051   346   1547   6.853266             0


Top-10 recommendations from ALS algoritm
          u      v       ALS  ground_truth
20160  1547   3420  1.169639             0
8809    503  14583  1.144972             0
29239   346   3420  1.132539             1
5404   2359  12512  1.123251             0
23051   346   1547  1.112424             0


Top-10 recommendations from PPageRank algoritm
           u      v  PPageRank  ground_truth
25854   8790  14994   0.596491             0
13216   8509  17594   0.459459             0
17212   6023   7683   0.459459             0
9504    9917  13967   0.459459             1
1783   11396  17172   0.351615             1


Top-10 recommendations from Personalized algoritm
          u      v  P

In [40]:
# Same report as above, with node ids replaced by the users' display names
algorithms = ["ADA", "ALS", "PPageRank", "Personalized"]
for algorithm in algorithms:
    print("\n\nTop-5 recommendations from {} algoritm".format(algorithm))
    top = get_topk(test_df, algorithm)
    # build a new frame with the two id columns translated to names
    top = top.assign(
        u=top["u"].replace(user_names_mapping),
        v=top["v"].replace(user_names_mapping),
    )
    print(top[["u", "v", algorithm, "ground_truth"]].head())



Top-10 recommendations from ADA algoritm
                       u                                       v        ADA  \
26804        Machiavelli  ❌🇺🇸Steve🇺🇸🇺🇸America First🇺🇸🇮🇹MAGA🇺🇸KAG  13.543626   
22720  LivePDDave 🇺🇸 🚨 🥊  ❌🇺🇸Steve🇺🇸🇺🇸America First🇺🇸🇮🇹MAGA🇺🇸KAG   7.772859   
20160       𝙊𝙣 𝙖 𝙗𝙧𝙚𝙖𝙠 ♡                                 sakshi🌸   7.477919   
29239          Mrs.Kabir                                 sakshi🌸   7.012568   
23051          Mrs.Kabir                            𝙊𝙣 𝙖 𝙗𝙧𝙚𝙖𝙠 ♡   6.853266   

       ground_truth  
26804             0  
22720             0  
20160             0  
29239             1  
23051             0  


Top-10 recommendations from ALS algoritm
                        u                                       v       ALS  \
20160        𝙊𝙣 𝙖 𝙗𝙧𝙚𝙖𝙠 ♡                                 sakshi🌸  1.169639   
8809              Tanis42                             Aaron Rupar  1.144972   
29239           Mrs.Kabir                                 sakshi🌸  1.1

## 5. Scores comparison - nDCG and precision@k

**nDCG**


nDCG is used to assess the performance of ranking algorithms for categorical scores. It assumes that the added value of an element decreases as it is placed in lower positions of the ranking. 

To compute nDCG, we used a scoring from 0 to 3, being 3 the best score. 
For the ground truth, all 1s (indicating a link) have been replaced with the top score 3, while 0s (indicating no direct link) have been left as the worst score 0. 

For the output of the algorithms, they have been mapped into the scale from 0 to 3 taking into account their minimum and maximum values.

In [33]:
#Transform continuous scores to categorical ones (from 0 to 3) being 3 the best score. 

def compute_intervals(df, column, n):
    """Split the value range of `df[column]` into n + 1 equal-width intervals.

    Parameters
    ----------
    df : pandas.DataFrame
    column : name of the numeric column whose min/max define the range.
    n : highest category index; n + 1 intervals are produced.

    Returns
    -------
    list of (lower, upper) tuples, ordered from the minimum upwards.
    """
    minimum = df[column].min()
    maximum = df[column].max()
    interval_length = (maximum - minimum) / (n + 1)

    return [
        (minimum + interval_length * i, minimum + interval_length * (i+1))
        for i in range(n + 1)
    ]

def substitution (x, intervals):
    """Return the index of the interval that contains `x`.

    Parameters
    ----------
    x : numeric value to categorise.
    intervals : list of (lower, upper) tuples in increasing order; both bounds
        are inclusive, so a value on a shared boundary gets the lower index.

    Returns
    -------
    int or None
        The interval index. Values beyond the outer bounds are clamped to the
        nearest end interval, because floating-point rounding can leave the
        column maximum slightly above the last computed upper bound. None is
        returned only for an empty interval list or a value that cannot be
        compared (e.g. NaN).
    """
    for i, pos in enumerate(intervals):
        if x >= pos[0] and x <= pos[1]:
            return i

    if intervals:
        if x > intervals[-1][1]:
            return len(intervals) - 1
        if x < intervals[0][0]:
            return 0
    return None
        
ranked_scores = test_df.copy()

# Ground truth: a real link gets the top grade 3, everything else grade 0
ranked_scores["ground_truth"] = ranked_scores["ground_truth"].apply(lambda x: 3 if x == 1 else 0)

# Map every continuous score column onto the grades 0..3.
# The intervals depend only on the column's min/max, so they are computed once
# per column instead of once per row inside the applied function.
for score_column in ["ADA", "ALS", "PPageRank", "Personalized"]:
    intervals = compute_intervals(ranked_scores, score_column, 3)
    ranked_scores[score_column] = ranked_scores[score_column].apply(substitution, args=(intervals,))

ranked_scores.sort_values(by="ground_truth", ascending=False).head()

Unnamed: 0,u,v,ground_truth,ADA,ALS,PPageRank,Personalized
8267,1096,12512,3,0,3,1,0
12044,346,3550,3,0,3,0,2
19585,1178,14583,3,0,3,1,0
6235,1547,3550,3,0,3,0,2
21086,9378,9670,3,0,3,0,0


nDCG scores computation for each algorithm

In [34]:
def dcg_at_k(y_true, y_score,  k=10):
    """Discounted cumulative gain of the `k` best-scored items.

    Parameters
    ----------
    y_true : sequence of relevance grades.
    y_score : sequence of predicted scores, aligned with `y_true`.
    k : number of top-ranked items to consider (default 10).

    Returns
    -------
    Sum over the top-k items of (2**grade - 1) / log2(rank + 1), rank starting at 1.
    """
    # positions of the items ordered by predicted score, best first, cut at k
    ranking = np.argsort(y_score)[::-1][:k]
    # true relevance grades in ranking order
    relevance = np.take(list(y_true), ranking)
    gains = 2**(relevance) - 1
    # log2(rank + 1) for ranks 1..len(relevance)
    discounts = np.log2(np.arange(len(relevance)) + 2)
    return np.sum(gains / discounts)


def ndcg_at_k(y_true, y_score, k=10):
    """Normalised DCG@k: DCG of the predicted ranking divided by the ideal DCG.

    Returns 0 when the ideal DCG is zero; otherwise the ratio rounded to
    four decimals.
    """
    # Ideal dcg: rank the items by their true relevance
    ideal = dcg_at_k(y_true, y_true, k)
    if ideal:
        return np.round(dcg_at_k(y_true, y_score, k) / ideal,4)
    return 0

def compute_ndcg(df, column, topk = 20):
    """nDCG@topk of the scores in `column` against the "ground_truth" column.

    The frame is first sorted by `column` (largest first); both columns are
    then passed to `ndcg_at_k`.
    """
    ranked = df.sort_values(by = column, ascending = False)
    truth, predicted = ranked["ground_truth"], ranked[column]
    return ndcg_at_k(truth, predicted, topk)

In [35]:
# Report the nDCG of the graded scores for every algorithm
for algorithm in algorithms:
    ndcg_value = compute_ndcg(ranked_scores, algorithm)
    print("nDCG score ranking with {} algorithm: {}".format(algorithm, ndcg_value))

nDCG score ranking with ADA algorithm: 0.2914
nDCG score ranking with ALS algorithm: 0.1141
nDCG score ranking with PPageRank algorithm: 0.2048
nDCG score ranking with Personalized algorithm: 0.4043


**Precision@k**

p@20 score has also been used to compare the performance of the algorithms, by checking whether the top-20 link connections recommended by each algorithm are in the actual ground truth. 

In [36]:
def precision_at_k (scores, column, topk = 20):
    """Sum of "ground_truth" over the `topk` best rows by `column`, divided by `topk`."""
    best_rows = get_topk(scores, column, topk)
    hits = best_rows["ground_truth"].sum()
    return hits/(topk)

In [37]:
# Report precision@k of the raw (continuous) scores for every algorithm
k = 20
for algorithm in algorithms:
    precision = precision_at_k(test_df, algorithm, k)
    print("Precision@{} of {} algorithm: {}".format(k, algorithm, precision))

Precision@20 of ADA algorithm: 0.25
Precision@20 of ALS algorithm: 0.3
Precision@20 of PPageRank algorithm: 0.35
Precision@20 of Personalized algorithm: 0.3
