# Examining the citation networks

In [2]:
import numpy as np
import igraph as ig
import csv
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from sklearn import preprocessing as pre
from sklearn.metrics.pairwise import linear_kernel
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
%matplotlib inline

## Reading dataset (training, testing, node information)

In [3]:
path_data = '../data/' # path to the data
path_submission = '../submission/' # path to submission files

In [4]:
# read in node informations
with open(path_data + 'node_information.csv', 'r') as f:
    reader = csv.reader(f)
    node_info = list(reader)

In [5]:
# read training data as str
training = np.genfromtxt(path_data + 'training_set.txt', dtype=str)

In [6]:
# read testing data as str
testing = np.genfromtxt(path_data + 'testing_set.txt', dtype=str)

## Utility functions

In [7]:
def build_graph(nodes, edges):
    '''
    Build a graph using igraph library
    
    Parameters
    ----------
    nodes: a list of nodes
    edges: a list of tuples (source, target)
    
    Returns
    -------
    a graph g
    '''
    g = ig.Graph(directed=False) # create an undirected graph
    g.add_vertices(nodes) # add nodes
    g.add_edges(edges) # add edges
    
    # TODO: add weights --> which metrics should we take into account ?
    
    return g

In [7]:
# a function used to build small graph for test purpose
def test_graph():
    nodes = ['hello','this','is','my','test']
    edges = [(0,1), (0,3), (1,4), (3,2), (4,3)]
    g = build_graph(nodes, edges)
    print(g)
    ig.plot(g, layout=g.layout('kk'))

In [8]:
# a function to preprocess text
def preprocess(text, dg_removal=True, sw_removal=True, stemming=True):
    '''
    Preprocess text: stopword removal, stemming, digit removal
    
    Parameters
    ----------
    text: text on which preprocessing is applied
    dg_removal: whether to apply digit removal or not
    sw_removal: whether to apply stopword removal or not
    stemming: whether to apply stemming or not
    
    Returns
    -------
    the text after preprocessing
    '''
    result = text
    
    sw = set(nltk.corpus.stopwords.words('english')) # set of stopwords
    stemmer = nltk.stem.PorterStemmer() # stemmer
    
    if dg_removal:
        result = re.sub('[0-9]', '', result)
    
    if sw_removal:
        result = ' '.join([token for token in result.split() if token not in sw])
        
    if stemming:
        result = ' '.join([stemmer.stem(token) for token in result.split()])
    
    return result

In [None]:
def compute_features(ds, g, scale=False):
    '''
    Compute the set of predefined features from the given dataset (training/testing)
    - cosine similarity between abstracts (texts already preprocessed)
    - temporal difference in term of publication year
    - number of common authors
    - whether two articles were published in the same journal
    - number of overlapped words in preprocessed title
    - average degree of both nodes (because orientation is not taken into account)
    - number of common neighbors in the graph
    
    Parameters
    ----------
    ds: dataset to compute features from
    g: citation graph
    scale: whether to scale the features or not
    
    Returns
    -------
    An array of computed features
    '''
    size = len(ds)
    
    cosines = np.zeros(size) # feature "Cosine similarities", between each abstract (already preprocessed)
    temp_diff = [] # feature "Temporal distance" i.e. integer values --> numerical feature
    common_auth = [] # feature "Number of common authors" i.e. integer values --> numerical feature
    same_journal = [] # feature "Same journal" i.e. binary label 0/1 --> categorical feature
    overlap_title = [] # feature "Overlapping title" i.e. number of common words in title
    avg_degrees = [] # feature "Average degree" i.e. the average degree of two nodes participating in the edge
    common_neigh = [] # feature "Common neighbors" i.e. the citations cited in common between two nodes
    jaccard_coeff = [] # feature "Jaccard coefficient" i.e. the relative number of common neighbors
    betw_centrality = [] # feature "Betweenness centrality" i.e. the difference of betweenness of two connected nodes
    
    for i in range(size):
        src, dest = ds[i][0], ds[i][1] # get the source and dest ID
        src_info, dest_info = node_info[ID.index(src)], node_info[ID.index(dest)] # get the associated node information
    
        # collect the cosine similarity
        src_vect, dest_vect = tfidf[ID.index(src)], tfidf[ID.index(dest)] # get the corresponding vector in TD-IDF matrix
        cos = cosine_similarity(src_vect, dest_vect) # compute cosine similarity
        cosines[i] = cos
        
        # collect the temporal difference (in absolute value)
        temp_diff.append(abs(int(src_info[1]) - int(dest_info[1])))
    
        # collect the number of common authors
        common_auth.append(len(
            set(src_info[3].split(',')).intersection(set(dest_info[3].split(',')))
        ))
    
        # collect the information whether the two 2 articles were published in the same journal
        same_journal.append(int(
            len(src_info[4])>0 and  # journal info of source not null
            len(dest_info[4])>0 and # journal info of dest not null
            src_info[4] == dest_info[4] # the same journal title
        ))
        
        # collect the number of overlapping words in title
        src_title, dest_title = preprocess(src_info[2]).split(), preprocess(dest_info[2]).split()
        overlap_title.append(len(
            set(src_title).intersection(set(dest_title))
        ))
        
        # collect the average degree
        src_deg, dest_deg = g.degree(src), g.degree(dest)
        avg_deg = float(src_deg + dest_deg)/2.0
        avg_degrees.append(avg_deg)
        
        # collect the number of common neighbors
        common_neigh.append(len(
            set(g.neighbors(src)).intersection(set(g.neighbors(dest)))
        ))
        
        # collect the relative number of common neighbors based on Jaccard coefficient
        inters = len(set(g.neighbors(src)).intersection(set(g.neighbors(dest)))) # intersection of neighbors
        union = len(set(g.neighbors(src)).union(set(g.neighbors(dest)))) # union of neighbors
        jaccard_coeff.append(
            (float(inters)/float(union) if union != 0 else 0)
        )
        
        # collect the betweenness centrality between two nodes
        betw_centrality.append(
            g.betweenness(dest, directed=False, cutoff=10)-g.betweenness(src, directed=False, cutoff=10)
        )
    
    features = np.array([
        temp_diff, 
        common_auth, 
        same_journal, 
        cosines, 
        overlap_title, 
        avg_degrees,
        common_neigh,
        jaccard_coeff,
        betw_centrality
    ]).T
    
    if scale:
        features = pre.scale(features)
    
    return features

In [9]:
def write_submission(filename, pred):
    '''
    Write prediction result in a submission file
    
    Parameters
    ----------
    filename: name of submission file
    pred: prediction array
    
    '''
    with open(path_submission + filename, 'wb') as f:
        csv_out = csv.writer(f)
        csv_out.writerow(['id','category'])
        for row in pred:
            csv_out.writerow(row)

## Building the citation graph

In [10]:
labels = training[:, 2].astype(int) # get the labels

In [13]:
nb_edges = np.count_nonzero(labels)
print('%d edges among %d training instances --> %.2f%%' % (nb_edges, len(labels), 100*nb_edges/len(labels)))

335130 edges among 615512 training instances --> 54.00%


In [14]:
sources = training[:,0] # extract source nodes' ID
targets = training[:,1] # extract target nodes' ID

In [73]:
edges = [(element[0], element[1]) for element in training if int(element[2]) == 1] # extract all the edges
nodes = [element[0] for element in node_info] # extract all the vertices
g = build_graph(nodes, edges) # build the graph

In [16]:
# check the number of vertices and edges
print('Number of vertices: %d' % len(g.vs))
print('Number of edges: %d' % len(g.es))

Number of vertices: 27770
Number of edges: 335130


## Computing features

In [17]:
# corpus is the set of abstracts, apply preprocessing to each abstract
corpus = [preprocess(element[5], dg_removal=True, sw_removal=True, stemming=True) for element in node_info]
vectorizer = TfidfVectorizer(stop_words='english') # create a TF-IDF vectorizer
tfidf = vectorizer.fit_transform(corpus) # TD-IDF matrix of the entire corpus (set of abstracts)

# index list, to facilite access to a node by its id
ID = [element[0] for element in node_info]

### Computing training and testing features

In [98]:
# compute the training features
training_features = compute_features(training, g, scale=False)

In [100]:
# compute the testing features
testing_features = compute_features(testing, g, scale=False)

In [None]:
feat = compute_features(training[0:5], g, scale=False)
feat
# scaling does NOT seem to improve the prediction
# training_features = pre.scale(training_features)
# testing_features = pre.scale(testing_features)

## SVM classifier

### SVM without scaling

In [107]:
clf_svm = svm.LinearSVC()
clf_svm.fit(training_features, labels)
pred_svm = list(clf_svm.predict(testing_features))
pred_svm = zip(range(len(testing)), pred_svm)

In [108]:
write_submission('submission_svm_05.csv', pred_svm)

### SVM with scaling

In [109]:
# scale the features
training_features_scale = pre.scale(training_features)
testing_features_scale = pre.scale(testing_features)

# training &  issuing predictions
clf_svm_scale = svm.LinearSVC()
clf_svm_scale.fit(training_features_scale, labels)
pred_svm_scale = list(clf_svm_scale.predict(testing_features_scale))
pred_svm_scale = zip(range(len(testing)), pred_svm_scale)

In [110]:
write_submission('submission_svm_05_scale.csv', pred_svm_scale)

## RandomForest classifier

In [11]:
#training_features = np.genfromtxt(path_data + 'training_features.csv', delimiter=',',skip_header=1)

In [12]:
#testing_features = np.genfromtxt(path_data + 'testing_features.csv', delimiter=',',skip_header=1)

In [13]:
clf_rf = RandomForestClassifier()
clf_rf.fit(training_features, labels)
pred_rf = list(clf_rf.predict(testing_features))
pred_rf = zip(range(len(testing)), pred_rf)

In [106]:
write_submission('submission_rf_03.csv', pred_rf)

## Logistic Regression classifier

In [111]:
clf_lg = LogisticRegression()
clf_lg.fit(training_features, labels)
pred_lg = list(clf_lg.predict(testing_features))
pred_lg = zip(range(len(testing)), pred_lg)
write_submission('submission_lg_02.csv', pred_lg)

In [112]:
# scale the features
training_features_scale = pre.scale(training_features)
testing_features_scale = pre.scale(testing_features)

# training + issuing predictions
clf_lg_scale = LogisticRegression()
clf_lg_scale.fit(training_features_scale, labels)
pred_lg_scale = list(clf_lg_scale.predict(testing_features_scale))
pred_lg_scale = zip(range(len(testing)), pred_lg_scale)
write_submission('submission_lg_02_scale.csv', pred_lg_scale)

## Neural Network (simple version)

In [113]:
clf_nn = MLPClassifier(
    hidden_layer_sizes = (50,60,40,30,20),
    activation = 'relu',
    solver = 'adam',
    early_stopping = True
)
clf_nn.fit(training_features, labels)
pred_nn = clf_nn.predict(testing_features)
pred_nn = zip(range(len(testing)), pred_nn)

In [19]:
# with scaling (not good result)
training_features_scale = pre.scale(training_features)
testing_features_scale = pre.scale(testing_features)

clf_nn_scale = MLPClassifier(
    hidden_layer_sizes = (50,60,40,30,20),
    activation = 'relu',
    solver = 'adam',
    early_stopping = True
)
clf_nn_scale.fit(training_features_scale, labels)
pred_nn_scale = clf_nn_scale.predict(testing_features_scale)
pred_nn_scale = zip(range(len(testing)), pred_nn_scale)

In [20]:
write_submission('submission_nn_03_scale.csv', pred_nn_norm)

## Feature importance

In [15]:
features = [
    'temp_diff', 
    'common_auth', 
    'same_journal', 
    'cosines', 
    'overlap_title', 
    'avg_degrees', 
    'common_neigh', 
    'jaccard_coeff',
    'betw_centrality'
]

idx = np.argsort(-clf_rf.feature_importances_)

for i in idx:
    print('Feature \'%s\' of importance %.3f' % (features[i], clf_rf.feature_importances_[i]))

Feature 'jaccard_coeff' of importance 0.397
Feature 'common_neigh' of importance 0.272
Feature 'cosines' of importance 0.186
Feature 'avg_degrees' of importance 0.095
Feature 'overlap_title' of importance 0.022
Feature 'temp_diff' of importance 0.020
Feature 'common_auth' of importance 0.005
Feature 'same_journal' of importance 0.003


In [104]:
# Saving features to reload it faster (testing_features)
with open(path_data + 'testing_features.csv', 'wb') as f:
    csv_out = csv.writer(f)
    csv_out.writerow(features)
    for row in testing_features:
        csv_out.writerow(row)

In [105]:
# Saving features to reload it faster (training_features)
with open(path_data + 'training_features.csv', 'wb') as f:
    csv_out = csv.writer(f)
    csv_out.writerow(features)
    for row in training_features:
        csv_out.writerow(row)