# Examining the citation networks

In [285]:
import numpy as np
import igraph as ig
import csv
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from sklearn import preprocessing as pre
from sklearn.metrics.pairwise import linear_kernel
from sklearn import svm

## Reading dataset (training, testing, node information)

In [326]:
path_data = '../data/' # path to the data
path_submission = '../submission/' # path to submission files

In [46]:
# read in node informations
with open(path_data + 'node_information.csv', 'r') as f:
    reader = csv.reader(f)
    node_info = list(reader)

In [151]:
# read training data as str
training = np.genfromtxt(path_data + 'training_set.txt', dtype=str)

In [294]:
# read testing data as str
testing = np.genfromtxt(path_data + 'testing_set.txt', dtype=str)

In [65]:
def build_graph(nodes, edges):
    '''
    Build a graph using igraph library
    
    Parameters
    ----------
    nodes: a list of nodes
    edges: a list of tuples (source, target)
    
    Returns
    -------
    a graph g
    '''
    g = ig.Graph(directed=True) # create an undirected graph
    g.add_vertices(nodes) # add nodes
    g.add_edges(edges) # add edges
    
    # TODO: add weights --> which metrics should we take into account ?
    
    return g

In [13]:
# a function used to build small graph for test purpose
def test_graph():
    nodes = ['hello','this','is','my','test']
    edges = [(0,1), (0,3), (1,4), (3,2), (4,3)]
    g = build_graph(nodes, edges)
    print(g)
    ig.plot(g, layout=g.layout('kk'))

In [199]:
# a function to preprocess text
def preprocess(text, dg_removal=True, sw_removal=True, stemming=True):
    '''
    Preprocess text: stopword removal, stemming, digit removal
    
    Parameters
    ----------
    text: text on which preprocessing is applied
    dg_removal: whether to apply digit removal or not
    sw_removal: whether to apply stopword removal or not
    stemming: whether to apply stemming or not
    
    Returns
    -------
    the text after preprocessing
    '''
    result = text
    
    sw = set(nltk.corpus.stopwords.words('english')) # set of stopwords
    stemmer = nltk.stem.PorterStemmer() # stemmer
    
    if dg_removal:
        result = re.sub('[0-9]', '', result)
    
    if sw_removal:
        result = ' '.join([token for token in result.split() if token not in sw])
        
    if stemming:
        result = ' '.join([stemmer.stem(token) for token in result.split()])
    
    return result

In [311]:
def compute_features(ds, scale=False):
    '''
    Compute the set of predefined features from the given dataset (training/testing)
    - cosine similarity between abstracts (texts already preprocessed)
    - temporal difference in term of publication year
    - number of common authors
    - whether two articles were published in the same journal
    
    Parameters
    ----------
    ds: dataset to compute features from
    scale: whether to scale the features or not
    
    Returns
    -------
    An array of computed features
    '''
    size = len(ds)
    
    cosines = np.zeros(size) # array of cosine similarities, between each abstract (already preprocessed)
    temp_diff = [] # array of feature "Temporal distance" i.e. integer values --> numerical feature
    common_auth = [] # array of feature "Number of common authors" i.e. integer values --> numerical feature
    same_journal = [] # array of feature "Same journal" i.e. binary label 0/1 --> categorical feature
    
    for i in range(size):
        src, dest = ds[i][0], ds[i][1] # get the source and dest ID
        src_info, dest_info = node_info[ID.index(src)], node_info[ID.index(dest)] # get the associated node information
    
        # collect the cosine similarity
        src_vect, dest_vect = tfidf[ID.index(src)], tfidf[ID.index(dest)] # get the corresponding vector in TD-IDF matrix
        cos = cosine_similarity(src_vect, dest_vect) # compute cosine similarity
        cosines[i] = cos
        
        # collect the temporal difference (in absolute value)
        temp_diff.append(abs(int(src_info[1]) - int(dest_info[1])))
    
        # collect the number of common authors
        common_auth.append(len(
            set(src_info[3].split(',')).intersection(set(dest_info[3].split(',')))
        ))
    
        # collect the information whether the two 2 articles were published in the same journal
        same_journal.append(int(
            len(src_info[4])>0 and  # journal info of source not null
            len(dest_info[4])>0 and # journal info of dest not null
            src_info[4] == dest_info[4] # the same journal title
        ))
    
    features = np.array([temp_diff, common_auth, same_journal, cosines]).T
    
    if scale:
        features = pre.scale(features)
    
    return features

In [336]:
def write_submission(filename, pred):
    '''
    Write prediction result in a submission file
    
    Parameters
    ----------
    filename: name of submission file
    pred: prediction array
    
    '''
    with open(path_submission + filename, 'wb') as f:
        csv_out = csv.writer(f)
        csv_out.writerow(['id','category'])
        for row in pred:
            csv_out.writerow(row)

In [152]:
labels = training[:, 2].astype(int) # get the labels

In [153]:
nb_edges = np.count_nonzero(labels)
print('%d edges among %d training instances --> %.2f%%' % (nb_edges, len(labels), 100*nb_edges/len(labels)))

335130 edges among 615512 training instances --> 54.00%


In [154]:
sources = training[:,0] # extract source nodes' ID
targets = training[:,1] # extract target nodes' ID

In [155]:
edges = [(element[0], element[1]) for element in training if int(element[2]) == 1] # extract all the edges
nodes = [element[0] for element in node_info] # extract all the vertices
g = build_graph(nodes, edges) # build the graph

In [156]:
# check the number of vertices and edges
print('Number of vertices: %d' % len(g.vs))
print('Number of edges: %d' % len(g.es))

Number of vertices: 27770
Number of edges: 335130


## Computing features

In [201]:
# corpus is the set of abstracts, apply preprocessing to each abstract
corpus = [preprocess(element[5], dg_removal=True, sw_removal=True, stemming=True) for element in node_info]
vectorizer = TfidfVectorizer(stop_words='english') # create a TF-IDF vectorizer
tfidf = vectorizer.fit_transform(corpus) # TD-IDF matrix of the entire corpus (set of abstracts)

# index list, to facilite access to a node by its id
ID = [element[0] for element in node_info]

### Computing training and testing features

In [318]:
# compute the training features
training_features = compute_features(training, scale=False)

In [319]:
# compute the testing features
testing_features = compute_features(testing, scale=False)

In [331]:
# scaling does NOT seem to improve the prediction
# training_features = pre.scale(training_features)
# testing_features = pre.scale(testing_features)

### SVM classifier

In [332]:
clf_svm = svm.LinearSVC()
clf_svm.fit(training_features, labels)
pred_svm = list(clf_svm.predict(testing_features))
pred_svm = zip(range(len(testing)), pred_svm)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)