# Libraries and Utility function

In [114]:
import time

start = time.time()

import networkit as nk
import numpy as np
import csv
from sklearn import preprocessing as pre

# working with text
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

end = time.time()
print('Importing libraries and setting up parameters takes %.4f s' % (end-start))

Importing libraries and setting up parameters takes 0.0008 s


In [3]:
def build_graph(nodes, edges):
    g = nk.Graph(len(nodes)) # adding nodes

    for edge in edges:
        if not g.hasEdge(edge[0], edge[1]): # avoid multiple edges
            g.addEdge(edge[0], edge[1])
            
    return g

In [4]:
# a function to preprocess text
def preprocess(text, dg_removal=True, sw_removal=True, stemming=True):
    '''
    Preprocess text: stopword removal, stemming, digit removal
    
    Parameters
    ----------
    text: text on which preprocessing is applied
    dg_removal: whether to apply digit removal or not
    sw_removal: whether to apply stopword removal or not
    stemming: whether to apply stemming or not
    
    Returns
    -------
    the text after preprocessing
    '''
    result = text
    
    sw = set(nltk.corpus.stopwords.words('english')) # set of stopwords
    stemmer = nltk.stem.PorterStemmer() # stemmer
    
    if dg_removal:
        result = re.sub('[0-9]', '', result)
    
    if sw_removal:
        result = ' '.join([token for token in result.split() if token not in sw])
        
    if stemming:
        result = ' '.join([stemmer.stem(token) for token in result.split()])
    
    return result

# Reading data

In [5]:
path_data = '../data/' # path to the data
path_submission = '../submission/' # path to submission files

In [6]:
start = time.time()

# ====== read in node informations ====== #
with open(path_data + 'node_information.csv', 'r') as f:
    reader = csv.reader(f)
    node_info = list(reader)

end = time.time()
print('Reading node information takes %.4f s' % (end-start))

Reading node information takes 0.6638 s


In [7]:
start = time.time()

# ====== read training data as str ====== #
training = np.genfromtxt(path_data + 'training_set.txt', dtype=str)

end = time.time()
print('Reading training set takes %.4f s' % (end-start))

Reading training set takes 8.5168 s


In [8]:
start = time.time()

# ====== read testing data as str ====== #
testing = np.genfromtxt(path_data + 'testing_set.txt', dtype=str)

end = time.time()
print('Reading testing set takes %.4f s' % (end-start))

Reading testing set takes 0.3988 s


# Building the citation graph

In [9]:
start = time.time()

# ====== build the graph ====== #

nodes = [element[0] for element in node_info] # create index list to be passed as nodes
edges = [(nodes.index(element[0]), nodes.index(element[1])) for element in training if element[2] == '1']
g = build_graph(nodes, edges)

end = time.time()
print('Building the citation graph takes %.4f s' % (end-start))

Building the citation graph takes 299.2728 s


In [10]:
# check for general information of the graph
print('Number of vertices: %d' % g.numberOfNodes())
print('Number of edges (after multiple edges removal): %d' % g.numberOfEdges())

Number of vertices: 27770
Number of edges (after multiple edges removal): 334690


# Computing features

The list of features is described as follows, and the computation rule is the same for both training and testing set.

| Feature                | Explanation                                                        | Type      | Range   |
|:----------------------:|:------------------------------------------------------------------:|:---------:|:-------:|
| Temporal difference    | Difference in publication year (absolute value)                    | numerical | $\ge$ 0 |
| Common authors         | The number of common authors between two articles                  | numerical | $\ge$ 0 |
| Same journal           | Whether two articles are published in the same journal             | binary    | 0, 1    |
| Cosine similarity      | Cosine similarity between word vectors of abstracts                | numerical | [0,1]   |
| Title overlap          | Number of overlapping words in title                               | numerical | $\ge$ 0 |
| Degree difference      | Difference in measure of degrees of two nodes (absolute value)     | numerical | $\ge$ 0 |
| Common neighbors       | Number of common neighbors                                         | numercial | $\ge$ 0 |
| Jaccard coefficient    | Link-based Jaccard coefficient                                     | numerical | [0,1]   |
| Same cluster           | Check whether two nodes are in the same cluster                    | binary    | [0,1]   |
| PageRank difference    | Difference in PageRank index of two nodes (absolute value)         | numerical | $\ge$ 0 |
| Betweenness centrality | Difference in betweenness centrality of two nodes (absolute value) | numerical | $\ge$ 0 |
| In the same k-core     | Whether both nodes/one of them/none of them are in the same k-core | ordinal   |[0,0.5,1]|

In [11]:
# compute the dictionary of (ID-index) to accelerate access to node'ID
ID = dict(zip(nodes, [nodes.index(n) for n in nodes]))

## 1. Temporal difference

In [12]:
def temporal_difference(ds):
    '''
    Compute feature: Difference in publication year
    
    Parameters
    ----------
    ds: the dataset to compute
    
    Returns
    -------
    A numpy array where each entry corresponds to the temporal difference of a pair of nodes
    '''
    size = len(ds)
    temp_diff = np.zeros(size)
    
    for i in range(size):
        src, dest = ds[i][0], ds[i][1] # get the source and dest ID
        src_info, dest_info = node_info[ID[src]], node_info[ID[dest]] # get the associated node information by index
        
        # compute the difference in publication year in absolute value (because we don't know which one cites the other)
        temp_diff[i] = abs(
            int(src_info[1]) - int(dest_info[1])
        )
        
    return temp_diff

In [13]:
start = time.time()

# compute the temporal difference
train_temp_diff = temporal_difference(training)

end = time.time()
print('Computing temporal difference for training set takes %.4f s' %(end-start))

Computing temporal difference for training set takes 1.6023 s


In [14]:
start = time.time()

# compute the temporal difference
test_temp_diff = temporal_difference(testing)

end = time.time()
print('Computing temporal difference for testing set takes %.4f s' %(end-start))

Computing temporal difference for testing set takes 0.1491 s


In [15]:
print('Training:', train_temp_diff[0:10])
print('Testing:', test_temp_diff[0:10])

Training: [0. 1. 2. 4. 5. 0. 4. 7. 0. 8.]
Testing: [0. 1. 2. 0. 5. 4. 0. 1. 7. 0.]


## 2. Number of common authors

In [16]:
def common_authors(ds):
    '''
    Compute feature: number of common authors
    
    Parameters
    ----------
    ds: dataset to compute feature from
    
    Returns
    -------
    A numpy array
    '''
    size = len(ds)
    common_auth = np.zeros(size)
    
    for i in range(size):
        src, dest = ds[i][0], ds[i][1] # get the source and dest ID
        src_info, dest_info = node_info[ID[src]], node_info[ID[dest]] # get the associated node information by index
        
        # compute the difference in publication year in absolute value (because we don't know which one cites the other)
        common_auth[i] = len(
            set(src_info[3].split(','))
            .intersection(set(dest_info[3].split(',')))
        )
        
        
    return common_auth

In [17]:
start = time.time()

# compute the temporal difference
train_common_auth = common_authors(training)

end = time.time()
print('Computing the number of common authors for training set takes %.4f s' %(end-start))

Computing the number of common authors for training set takes 2.4319 s


In [18]:
start = time.time()

# compute the temporal difference
test_common_auth = common_authors(testing)

end = time.time()
print('Computing the number of common authors for testing set takes %.4f s' %(end-start))

Computing the number of common authors for testing set takes 0.2100 s


In [19]:
print('Training:', train_common_auth[0:10])
print('Testing:', test_common_auth[0:10])

Training: [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
Testing: [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]


## 3. Same journal

In [20]:
def same_journal(ds):
    '''
    Compute feature: whether two articles are published in the same journal
    
    Parameters
    ----------
    ds: dataset to compute feature from
    
    Returns
    -------
    A numpy array of binary values (0|1)
    '''
    size = len(ds)
    same_journal = np.zeros(size)
    
    for i in range(size):
        src, dest = ds[i][0], ds[i][1] # get the source and dest ID
        src_info, dest_info = node_info[ID[src]], node_info[ID[dest]] # get the associated node information by index
        
        # 1 if two articles are published in the same journal, 0 otherwise
        same_journal[i] = int(
            len(src_info[4])>0 and  # journal info of source not null
            len(dest_info[4])>0 and # journal info of dest not null
            src_info[4] == dest_info[4] # the same journal title
        )
        
        
    return same_journal

In [21]:
start = time.time()

# compute the temporal difference
train_same_journal = same_journal(training)

end = time.time()
print('Computing whether two articles are published in the same journal for training set takes %.4f s' %(end-start))

Computing whether two articles are published in the same journal for training set takes 1.8569 s


In [24]:
start = time.time()

# compute the temporal difference
test_same_journal = same_journal(testing)

end = time.time()
print('Computing whether two articles are published in the same journal for testing set takes %.4f s' %(end-start))

Computing whether two articles are published in the same journal for testing set takes 0.1140 s


In [23]:
print('Training:', train_same_journal[0:10])
print('Testing:', test_same_journal[0:10])

Training: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Testing: [0. 0. 1. 1. 0. 0. 0. 1. 0. 1.]


## 4. Cosine similarity in title + abstract

In [31]:
start = time.time()

# ====== corpus is the set of titles + abstracts, apply preprocessing to each article ======#
#nltk.download('stopwords') # uncomment if haven't downloaded stopwords
corpus = [preprocess(element[2] + ' ' + element[5], dg_removal=True, sw_removal=True, stemming=True) 
          for element in node_info]
vectorizer = TfidfVectorizer(stop_words='english') # create a TF-IDF vectorizer
tfidf = vectorizer.fit_transform(corpus) # TD-IDF matrix of the entire corpus (set of abstracts)

end = time.time()
print('Computing the TF-IDF matrix takes %.4f s' % (end-start))

[nltk_data] Downloading package stopwords to /home/huong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Computing the TF-IDF matrix takes 54.7182 s


In [48]:
def cosine_sim_text(ds, tfidf):
    '''
    Compute feature: cosine similarity in title and abstract
    
    Parameters
    ----------
    ds: dataset to compute feature from
    
    Returns
    -------
    A numpy array of cosine values
    '''
    size = len(ds)
    cosines = np.zeros(size)
    
    for i in range(size):
        src, dest = ds[i][0], ds[i][1] # get the source and dest ID
        
        # collect the cosine similarity
        src_vect, dest_vect = tfidf[ID[src]], tfidf[ID[dest]] # get the corresponding vector in TD-IDF matrix
        cos = cosine_similarity(src_vect, dest_vect) # compute cosine similarity
        cosines[i] = cos
        
    return cosines

In [49]:
start = time.time()

# compute the cosine similarity
train_cosine = cosine_sim_text(training, tfidf)

end = time.time()
print('Computing cosine similarity for training set takes %.4f s' %(end-start))

Computing cosine similarity for training set takes 355.2014 s


In [50]:
start = time.time()

# compute the cosine similarity
test_cosine = cosine_sim_text(testing, tfidf)

end = time.time()
print('Computing cosine similarity for testing set takes %.4f s' %(end-start))

Computing cosine similarity for testing set takes 22.1789 s


In [51]:
print('Training:', train_cosine[0:10])
print('Testing:', test_cosine[0:10])

Training: [0.19996622 0.06436945 0.02053711 0.05937844 0.09852643 0.39581923
 0.18722569 0.08627054 0.04181436 0.06044751]
Testing: [0.11804009 0.30786265 0.20753805 0.16112407 0.31824453 0.03466872
 0.02490266 0.19991048 0.         0.3283665 ]


## 5. Number of overlapped words in title

In [55]:
def overlap_title(ds):
    '''
    Compute feature: number of overlapping words in the title
    
    Parameters
    ----------
    ds: dataset to compute feature from
    
    Returns
    -------
    A numpy array of numerical values
    '''
    size = len(ds)
    overlap_title = np.zeros(size)
    
    for i in range(size):
        src, dest = ds[i][0], ds[i][1] # get the source and dest ID
        src_info, dest_info = node_info[ID[src]], node_info[ID[dest]] # get the associated node information by index
        
        # collect the number of overlapping words in title
        src_title, dest_title = preprocess(src_info[2]).split(), preprocess(dest_info[2]).split()
        overlap_title[i] = len(
            set(src_title)
            .intersection(set(dest_title))
        )
        
    return overlap_title

In [56]:
start = time.time()

# compute the number of overlapping words in title
train_overlap_title = overlap_title(training)

end = time.time()
print('Computing number of overlapping words in title for training set takes %.4f s' %(end-start))

Computing number of overlapping words in title for training set takes 620.0249 s


In [57]:
start = time.time()

# compute the number of overlapping words in title
test_overlap_title = overlap_title(testing)

end = time.time()
print('Computing number of overlapping words in title for testing set takes %.4f s' %(end-start))

Computing number of overlapping words in title for testing set takes 35.2748 s


In [58]:
print('Training:', train_overlap_title[0:10])
print('Testing:', test_overlap_title[0:10])

Training: [2. 1. 0. 0. 0. 0. 0. 1. 0. 0.]
Testing: [0. 2. 1. 1. 0. 0. 1. 1. 0. 1.]


## 6. Average of degrees

In [119]:
def average_degrees(ds, g):
    '''
    Compute feature: Average degrees of 2 nodes
    
    Parameters
    ----------
    ds: dataset to compute feature from
    g: the graph
    
    Returns
    -------
    A numpy array of numerical values
    '''
    size = len(ds)
    avg_degree = np.zeros(size)
    
    for i in range(size):
        src, dest = ds[i][0], ds[i][1] # get the source and dest ID
        
        # collect the number of overlapping words in title
        src_deg = g.degree(ID[src])
        dest_deg = g.degree(ID[dest])
        avg_degree[i] = float(src_deg + dest_deg)/2.0
        
    return avg_degree

In [120]:
start = time.time()

# compute the average degree
train_avg_degrees = average_degrees(training, g)

end = time.time()
print('Computing the average degree for training set takes %.4f s' %(end-start))

Computing the average degree for training set takes 3.3527 s


In [121]:
start = time.time()

# compute the average degree
test_avg_degrees = average_degrees(testing, g)

end = time.time()
print('Computing the average degree for testing set takes %.4f s' %(end-start))

Computing the average degree for testing set takes 0.2641 s


In [122]:
print('Training:', train_avg_degrees[0:10])
print('Testing:', test_avg_degrees[0:10])

Training: [  9.  113.    3.   17.   15.5  36.5 400.   50.5  70.5  17. ]
Testing: [ 38.5 173.5 481.   58.   78.5  24.5   2.5  31.5   4.5  12.5]


## 7. Number of common neighbors

In [67]:
def common_neighbors(ds, g):
    '''
    Compute feature: The number of common neighbors
    
    Parameters
    ----------
    ds: dataset to compute feature from
    g: the graph
    
    Returns
    -------
    A numpy array of numerical values
    '''
    size = len(ds)
    common_neigh = np.zeros(size)
    
    for i in range(size):
        src, dest = ds[i][0], ds[i][1] # get the source and dest ID
        
        # collect the number of overlapping words in title
        common_neigh[i] = len(
            set(g.neighbors(ID[src]))
            .intersection(set(g.neighbors(ID[dest])))
        )
        
    return common_neigh

In [68]:
start = time.time()

# compute the average degree
train_common_neigh = common_neighbors(training, g)

end = time.time()
print('Computing the number of common neighbors for training set takes %.4f s' %(end-start))

Computing the number of common neighbors for training set takes 8.2902 s


In [69]:
start = time.time()

# compute the average degree
test_common_neigh = common_neighbors(testing, g)

end = time.time()
print('Computing the number of common neighbors for testing set takes %.4f s' %(end-start))

Computing the number of common neighbors for testing set takes 0.4579 s


In [70]:
print('Training:', train_common_neigh[0:10])
print('Testing:', test_common_neigh[0:10])

Training: [ 1. 20.  0.  0.  0. 14. 12.  0.  5.  0.]
Testing: [ 0. 24. 59. 21.  0.  0.  0.  6.  0.  4.]


## 8. Link-based Jaccard coefficient

In [71]:
def jaccard_coeff(ds, g):
    '''
    Compute feature: Link-based Jaccard coefficient
    
    Parameters
    ----------
    ds: dataset to compute feature from
    g: the graph
    
    Returns
    -------
    A numpy array of numerical values
    '''
    size = len(ds)
    coeff = np.zeros(size)
    
    for i in range(size):
        src, dest = ds[i][0], ds[i][1] # get the source and dest ID
        
        # collect the number of overlapping words in title
        inters = len(
            set(g.neighbors(ID[src]))
            .intersection(set(g.neighbors(ID[dest])))
        ) # intersection of neighbors
        
        union = len(
            set(g.neighbors(ID[src]))
            .union(set(g.neighbors(ID[dest])))
        ) # union of neighbors
        
        coeff[i] = (float(inters)/float(union) if union != 0 else 0)
        
    return coeff

In [72]:
start = time.time()

# compute the average degree
train_jaccard_coeff = jaccard_coeff(training, g)

end = time.time()
print('Computing link-based Jaccard coefficient for training set takes %.4f s' %(end-start))

Computing link-based Jaccard coefficient for training set takes 18.4065 s


In [73]:
start = time.time()

# compute the average degree
test_jaccard_coeff = jaccard_coeff(testing, g)

end = time.time()
print('Computing link-based Jaccard coefficient for testing set takes %.4f s' %(end-start))

Computing link-based Jaccard coefficient for testing set takes 1.0820 s


In [74]:
print('Training:', train_jaccard_coeff[0:10])
print('Testing:', test_jaccard_coeff[0:10])

Training: [0.05882353 0.09708738 0.         0.         0.         0.23728814
 0.01522843 0.         0.03676471 0.        ]
Testing: [0.         0.07430341 0.06533776 0.22105263 0.         0.
 0.         0.10526316 0.         0.19047619]


## 9. Same cluster

## 10. Average of PageRank index

In [139]:
# ====== compute PageRank index ====== #
start = time.time()

page_rank_g = nk.centrality.PageRank(g)
page_rank_g.run()

end = time.time()
print('Computing the PageRank index of the graph takes %.4f s' % (end-start))

Computing the PageRank index of the graph takes 0.1994 s


In [146]:
def avg_pagerank(ds, pr):
    '''
    Compute feature: average of pagerank
    
    Parameters
    ----------
    ds: dataset to compute feature from
    g: the graph
    
    Returns
    -------
    A numpy array of numerical values
    '''
    size = len(ds)
    avg_pr = np.zeros(size)
    
    for i in range(size):
        src, dest = ds[i][0], ds[i][1] # get the source and dest ID
        
        # collect the average of betweenness centrality of 2 nodes
        # log to "dampen" too small values
        avg_pr[i] = np.log(float(pr[ID[src]] + pr[ID[dest]])/2.0)
        
    return avg_pr

In [147]:
start = time.time()

# compute the average pagerank
train_avg_pr = avg_pagerank(training, page_rank_g.scores())

end = time.time()
print('Computing the average page rank for training set takes %.4f s' %(end-start))

Computing the average page rank for training set takes 1.6807 s


In [148]:
start = time.time()

# compute the average pagerank
test_avg_pr = avg_pagerank(testing, page_rank_g.scores())

end = time.time()
print('Computing the average page rank for testing set takes %.4f s' %(end-start))

Computing the average page rank for testing set takes 0.1304 s


In [150]:
print('Training:', train_avg_pr[0:10])
print('Testing:', test_avg_pr[0:10])

Training: [-10.75542888  -9.28819986 -11.27892234 -10.79563113 -10.66865404
 -10.28702328  -7.90007968  -9.70137278  -9.57712104 -10.0708701 ]
Testing: [-10.11196423  -8.59294688  -7.70950565  -9.84766778  -9.44479929
 -10.56567714 -11.30701705 -10.06990187 -11.24745838 -10.83202494]


## 11. Average of betweenness centrality

In [80]:
# ====== compute betweenness centrality ====== #
start = time.time()

# use the traditional approach of betweeness computation
btwn = nk.centrality.Betweenness(g)
btwn.run()

end = time.time()
print('Compute betweenness centrality of every node in the graph takes %.4f s' % (end-start))

Compute betweenness centrality of every node in the graph takes 436.9762 s


In [154]:
def avg_betweeness(ds, btwn):
    '''
    Compute feature: average in betweenness centrality
    
    Parameters
    ----------
    ds: dataset to compute feature from
    g: the graph
    
    Returns
    -------
    A numpy array of numerical values
    '''
    size = len(ds)
    avg_btw = np.zeros(size)
    
    for i in range(size):
        src, dest = ds[i][0], ds[i][1] # get the source and dest ID
        
        # collect the average of betweenness centrality of 2 nodes
        _avg = float(btwn[ID[src]] + btwn[ID[dest]])
        avg_btw[i] = np.log(_avg/2.0) if _avg != 0.0 else 0.0
        
    return avg_btw

In [155]:
start = time.time()

# compute the average degree
train_avg_btwn = avg_betweeness(training, btwn.scores())

end = time.time()
print('Computing the average betweenness for training set takes %.4f s' %(end-start))

Computing the average betweenness for training set takes 1.9107 s


In [156]:
start = time.time()

# compute the average degree
test_avg_btwn = avg_betweeness(testing, btwn.scores())

end = time.time()
print('Computing the average betweenness for testing set takes %.4f s' %(end-start))

Computing the average betweenness for testing set takes 0.1240 s


In [157]:
print('Training:', train_avg_btwn[0:10])
print('Testing:', test_avg_btwn[0:10])

Training: [10.08688482 11.16590802  8.84607013  7.65815856  9.07845272  8.44864804
 15.30895205 12.10404087 12.35357254 11.4065003 ]
Testing: [11.89953033 14.12541191 15.37221353 11.0787389  12.47171402  9.63258213
  4.84337128 11.06101491  6.74711568  7.60931935]


## 12. Core Decomposition

**Intuition:** retrieve the core of a network, where many articles are connected to each other. Given a pair of articles, if both are found in the core, they are likely to connect to each other (assign value 1). If one is in the core and one is not, they might be connect to each other (assign value 0.5). Otherwise, they are highly unlikely to connect to each other (value 0)

In [176]:
start = time.time()

core_decomp = nk.community.CoreDecomposition(g, storeNodeOrder=True)
core_decomp.run()
cover_g = core_decomp.getCover()
order = 20

end = time.time()
print('Core decomposition of the graph takes %.4f s' % (end-start))

Core decomposition of the graph takes 0.2839 s


In [177]:
# idx = 1
# for ss in cover_g.subsetSizes():
#     print('Subset of order %d has %d elements' % (idx, ss))
#     idx += 1

In [180]:
print('There are %d nodes that belong in %d-core decomposition of this graph' 
      % (len(cover_g.getMembers(order)), order))

There are 7285 nodes that belong in 20-core decomposition of this graph


In [175]:
def in_kcore(ds, kcore):
    '''
    Compute feature: whether a pair of nodes is found in the same k-core graph
    
    Parameters
    ----------
    ds: dataset
    kcore: the k-core graph after decomposition as a set of nodes index (ranged from 0 to 27,770)
    
    Returns
    -------
    A numpy array of ordinal values: 
        - 0 if both nodes are not in the kcores, 
        - 0.5 if one of them is in the kcores, 
        - 1 of they are both in the k-core
    '''
    size = len(ds)
    same_kcore = np.zeros(size)
    
    for i in range(size):
        src, dest = ds[i][0], ds[i][1] # get the source and dest ID
        
        # compute whether two nodes are in the given kcore | one of them is in the kcore | none of them
        index_src = ID[src] # index of src
        index_dest = ID[dest] # index of dest
        
        if index_src in kcore and index_dest in kcore:
            result = 1.0
        elif index_src not in kcore and index_dest not in kcore:
            result = 0.0
        else:
            result = 0.5
            
        same_kcore[i] = result
        
    return same_kcore

In [181]:
start = time.time()

# compute the position of two nodes wrt k-core
train_in_kcore = in_kcore(training, cover_g.getMembers(order))

end = time.time()
print('Computing the in k-core feature for training set takes %.4f s' %(end-start))

Computing the in k-core feature for training set takes 1.1343 s


In [187]:
start = time.time()

# compute the position of two nodes wrt k-core
test_in_kcore = in_kcore(testing, cover_g.getMembers(order))

end = time.time()
print('Computing the in k-core feature for testing set takes %.4f s' %(end-start))

Computing the in k-core feature for testing set takes 0.1085 s


In [188]:
print('Training:', train_in_kcore[0:10])
print('Testing:', test_in_kcore[0:10])

Training: [0.  1.  0.  0.  0.5 1.  1.  0.5 1.  0. ]
Testing: [0.5 1.  1.  1.  0.5 0.5 0.  0.5 0.  0. ]


# Saving features

In [189]:
# list of selected features
features = [
    'temporal_difference',
    'common_authors',
    'same_journal',
    'cosine_sim',
    'overlapping_title',
    'average_degrees',
    'common_neighbors',
    'jaccard_coefficient',
    'avg_pagerank',
    'average_betweenness',
    'in_kcore'
]

In [190]:
# ====== create array of training feature ====== #
training_features = np.array([
    train_temp_diff,
    train_common_auth,
    train_same_journal,
    train_cosine,
    train_overlap_title,
    train_avg_degrees,
    train_common_neigh,
    train_jaccard_coeff,
    train_avg_pr,
    train_avg_btwn,
    train_in_kcore
]).T

In [191]:
# ====== Saving features (training_features) ====== #
with open(path_data + 'training_features.csv', 'w', newline='') as f:
    csv_out = csv.writer(f)
    csv_out.writerow(features)
    for row in training_features:
        csv_out.writerow(row)

In [192]:
# ====== create array of testing features ====== #
testing_features = np.array([
    test_temp_diff,
    test_common_auth,
    test_same_journal,
    test_cosine,
    test_overlap_title,
    test_avg_degrees,
    test_common_neigh,
    test_jaccard_coeff,
    test_avg_pr,
    test_avg_btwn,
    test_in_kcore
]).T

In [193]:
# ====== Saving features (testing_features) ====== #
with open(path_data + 'testing_features.csv', 'w', newline='') as f:
    csv_out = csv.writer(f)
    csv_out.writerow(features)
    for row in testing_features:
        csv_out.writerow(row)