# Training model and Issuing predictions (main script)

In [50]:
# import time to find consuming steps
import time
start = time.time()

# utility libraries
import numpy as np
import igraph as ig
import csv
from sklearn import preprocessing as pre
import re

# working with text
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# classifier for classification
from sklearn.metrics.pairwise import linear_kernel
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

# plotting stuffs
import matplotlib.pyplot as plt
%matplotlib inline

end = time.time()
print('Loading libraries takes %.4f s' % (end-start))

Loading libraries takes 0.0160 s


# Reading dataset (training, testing, node information)

In [2]:
path_data = '../data/' # path to the data
path_submission = '../submission/' # path to submission files

In [3]:
start = time.time()

# ====== read in node informations ====== #
with open(path_data + 'node_information.csv', 'r') as f:
    reader = csv.reader(f)
    node_info = list(reader)

    
end = time.time()
print('Reading node information takes %.4f s' % (end-start))

Reading node information takes 0.2810 s


In [4]:
start = time.time()

# ====== read training data as str ====== #
training = np.genfromtxt(path_data + 'training_set.txt', dtype=str)

end = time.time()
print('Reading training set takes %.4f s' % (end-start))

Reading training set takes 3.1360 s


In [5]:
start = time.time()

# ====== read testing data as str ====== #
testing = np.genfromtxt(path_data + 'testing_set.txt', dtype=str)

end = time.time()
print('Reading testing set takes %.4f s' % (end-start))

Reading testing set takes 0.1700 s


# Utility functions

In [6]:
def build_graph(nodes, edges):
    '''
    Build a graph using igraph library
    
    Parameters
    ----------
    nodes: a list of nodes
    edges: a list of tuples (source, target)
    
    Returns
    -------
    a graph g
    '''
    g = ig.Graph(directed=False) # create an undirected graph
    g.add_vertices(nodes) # add nodes
    g.add_edges(edges) # add edges
    
    #betweenness = [(v['name'], g.betweenness(v, directed=False, cutoff=3)) for v in g.vs]
    
    return g

In [9]:
# a function used to build small graph for test purpose
def test_graph():
    nodes = ['hello','this','is','my','test']
    edges = [(0,1), (0,3), (1,4), (3,2), (4,3)]
    gr = build_graph(nodes, edges)
    print(gr.summary)
    
gr = test_graph()

<bound method Graph.summary of <igraph.Graph object at 0x00000000174A25E8>>


In [10]:
# a function to preprocess text
def preprocess(text, dg_removal=True, sw_removal=True, stemming=True):
    '''
    Preprocess text: stopword removal, stemming, digit removal
    
    Parameters
    ----------
    text: text on which preprocessing is applied
    dg_removal: whether to apply digit removal or not
    sw_removal: whether to apply stopword removal or not
    stemming: whether to apply stemming or not
    
    Returns
    -------
    the text after preprocessing
    '''
    result = text
    
    sw = set(nltk.corpus.stopwords.words('english')) # set of stopwords
    stemmer = nltk.stem.PorterStemmer() # stemmer
    
    if dg_removal:
        result = re.sub('[0-9]', '', result)
    
    if sw_removal:
        result = ' '.join([token for token in result.split() if token not in sw])
        
    if stemming:
        result = ' '.join([stemmer.stem(token) for token in result.split()])
    
    return result

In [11]:
def compute_features(ds, g, scale=False):
    '''
    Compute the set of predefined features from the given dataset (training/testing)
    - cosine similarity between abstracts (texts already preprocessed)
    - temporal difference in term of publication year
    - number of common authors
    - whether two articles were published in the same journal
    - number of overlapped words in preprocessed title
    - average degree of both nodes (because orientation is not taken into account)
    - number of common neighbors in the graph
    
    Parameters
    ----------
    ds: dataset to compute features from
    g: citation graph
    betweenness: betweenness centrality of every node (already computed when building graph)
    scale: whether to scale the features or not
    
    Returns
    -------
    An array of computed features
    '''
    size = len(ds)
    
    cosines = np.zeros(size) # feature "Cosine similarities", between each abstract (already preprocessed)
    temp_diff = [] # feature "Temporal distance" i.e. integer values --> numerical feature
    common_auth = [] # feature "Number of common authors" i.e. integer values --> numerical feature
    same_journal = [] # feature "Same journal" i.e. binary label 0/1 --> categorical feature
    overlap_title = [] # feature "Overlapping title" i.e. number of common words in title
    avg_degrees = [] # feature "Average degree" i.e. the average degree of two nodes participating in the edge
    common_neigh = [] # feature "Common neighbors" i.e. the citations cited in common between two nodes
    jaccard_coeff = [] # feature "Jaccard coefficient" i.e. the relative number of common neighbors
    
    #betw_centrality = [] # feature "Betweenness centrality" i.e. the difference of betweenness of two connected nodes
    
    for i in range(size):
        src, dest = ds[i][0], ds[i][1] # get the source and dest ID
        src_info, dest_info = node_info[ID.index(src)], node_info[ID.index(dest)] # get the associated node information
    
        # collect the cosine similarity
        src_vect, dest_vect = tfidf[ID.index(src)], tfidf[ID.index(dest)] # get the corresponding vector in TD-IDF matrix
        cos = cosine_similarity(src_vect, dest_vect) # compute cosine similarity
        cosines[i] = cos
        
        # collect the temporal difference (in absolute value)
        temp_diff.append(
            abs(int(src_info[1]) - int(dest_info[1]))
        )
    
        # collect the number of common authors
        common_auth.append(len(
            set(src_info[3].split(',')).intersection(set(dest_info[3].split(',')))
        ))
    
        # collect the information whether the two 2 articles were published in the same journal
        same_journal.append(int(
            len(src_info[4])>0 and  # journal info of source not null
            len(dest_info[4])>0 and # journal info of dest not null
            src_info[4] == dest_info[4] # the same journal title
        ))
        
        # collect the number of overlapping words in title
        src_title, dest_title = preprocess(src_info[2]).split(), preprocess(dest_info[2]).split()
        overlap_title.append(len(
            set(src_title).intersection(set(dest_title))
        ))
        
        # collect the average degree
        src_deg, dest_deg = g.degree(src), g.degree(dest)
        avg_deg = float(src_deg + dest_deg)/2.0
        avg_degrees.append(avg_deg)
        
        # collect the number of common neighbors
        common_neigh.append(len(
            set(g.neighbors(src)).intersection(set(g.neighbors(dest)))
        ))
        
        # collect the relative number of common neighbors based on Jaccard coefficient
        inters = len(set(g.neighbors(src)).intersection(set(g.neighbors(dest)))) # intersection of neighbors
        union = len(set(g.neighbors(src)).union(set(g.neighbors(dest)))) # union of neighbors
        jaccard_coeff.append(
            (float(inters)/float(union) if union != 0 else 0)
        )
        
        # computational cost is terrible!!!
        # collect the betweenness centrality between two nodes
        #betw_centrality.append(
        #    abs(betweennes[dest] - betweenness[src])
        #)
    
    features = np.array([
        temp_diff, 
        common_auth, 
        same_journal, 
        cosines, 
        overlap_title, 
        avg_degrees,
        common_neigh,
        jaccard_coeff
        #betw_centrality
    ]).T
    
    if scale:
        features = pre.scale(features)
    
    return features

In [12]:
def write_submission(filename, pred):
    '''
    Write prediction result in a submission file
    
    Parameters
    ----------
    filename: name of submission file
    pred: prediction array
    
    '''
    with open(path_submission + filename, 'wb') as f:
        csv_out = csv.writer(f)
        csv_out.writerow(['id','category'])
        for row in pred:
            csv_out.writerow(row)

# Building the citation graph (time consuming)

In [13]:
labels = training[:, 2].astype(int) # get the labels

In [14]:
nb_edges = np.count_nonzero(labels)
print('%d edges among %d training instances --> %.2f%%' % (nb_edges, len(labels), 100*nb_edges/len(labels)))

335130 edges among 615512 training instances --> 54.00%


In [15]:
start = time.time()

edges = [(element[0], element[1]) for element in training if int(element[2]) == 1] # extract all the edges
nodes = [element[0] for element in node_info] # extract all the vertices
g = build_graph(nodes, edges) # build the graph

end = time.time()
print('Building the graph takes %.4f s' % (end-start))

Building the graph takes 1.1560 s


In [16]:
# check the number of vertices and edges
print('Number of vertices: %d' % len(g.vs))
print('Number of edges: %d' % len(g.es))

Number of vertices: 27770
Number of edges: 335130


# Computing features (time consuming)

First, we prepare some ingredients that serve the computation of features:
- TF-IDF matrix, built on the corpus of abstracts
- index list containing the ID of each article, to facilitate access to node information

In [17]:
start = time.time()

# ====== corpus is the set of titles + abstracts, apply preprocessing to each article ======#
corpus = [preprocess(element[2] + ' ' + element[5], dg_removal=True, sw_removal=True, stemming=True) 
          for element in node_info]
vectorizer = TfidfVectorizer(stop_words='english') # create a TF-IDF vectorizer
tfidf = vectorizer.fit_transform(corpus) # TD-IDF matrix of the entire corpus (set of abstracts)

end = time.time()
print('Computing the TF-IDF matrix takes %.4f s' % (end-start))

Computing the TF-IDF matrix takes 78.0290 s


In [18]:
# ====== create an index list, to facilite access to a node by its id ====== #
ID = [element[0] for element in node_info]

In [29]:
features = [
    'temp_diff', 
    'common_auth', 
    'same_journal', 
    'cosines', 
    'overlap_title', 
    'avg_degrees', 
    'common_neigh', 
    'jaccard_coeff'
]

When everything is ready, we start computing features for the training and testing set. The list of features is described as follows, and the computation rule is the same for both training and testing set.

| Feature                | Explanation                                                        | Type      | Range   |
|:----------------------:|:------------------------------------------------------------------:|:---------:|:-------:|
| Temporal difference    | Difference in publication year (absolute value)                    | numerical | $\ge$ 0 |
| Same journal           | Whether two articles are published in the same journal             | binary    | 0, 1    |
| Cosine similarity      | Cosine similarity between word vectors of abstracts                | numerical | [0,1]   |
| Title overlap          | Number of overlapping words in title                               | numerical | $\ge$ 0 |
| Average degree         | Average degree of two nodes (disregard connected or not            | numerical | $\ge$ 0 |
| Common neighbors       | Number of common neighbors                                         | numercial | $\ge$ 0 |
| Jaccard coefficient    | Link-based Jaccard coefficient                                     | numerical | [0,1]   |
| Betweenness centrality | Difference in betweenness centrality of two nodes (absolute value) | numerical | $\ge$ 0 |

This step is highly time consuming.

In [24]:
# might read already computed features to save time
#training_features = np.genfromtxt(path_data + 'training_features.csv', delimiter=',',skip_header=1)
#testing_features = np.genfromtxt(path_data + 'testing_features.csv', delimiter=',',skip_header=1)

In [25]:
start = time.time()

# ====== compute training features : (615,521 x 8) ====== #
training_features = compute_features(training, g, scale=False)

end = time.time()
print('Computing the training features takes %.4f s' % (end-start))

Computing the training features takes 1655.8490 s


In [27]:
start = time.time()

# ====== compute testing features : (32,648 x 8) ====== #
testing_features = compute_features(testing, g, scale=False)

end = time.time()
print('Computing the training features takes %.4f s' % (end-start))

Computing the training features takes 87.4310 s


In [None]:
# test purpose
feat = compute_features(training[0:5], g, scale=False)
feat

In [30]:
# ====== Saving features to reload it faster (training_features) ====== #
with open(path_data + 'training_features.csv', 'wb') as f:
    csv_out = csv.writer(f)
    csv_out.writerow(features)
    for row in training_features:
        csv_out.writerow(row)

In [32]:
# ====== Saving features to reload it faster (testing_features) ====== #
with open(path_data + 'testing_features.csv', 'wb') as f:
    csv_out = csv.writer(f)
    csv_out.writerow(features)
    for row in testing_features:
        csv_out.writerow(row)

In [33]:
# ====== Scaling features ====== #
training_features_scale = pre.scale(training_features)
testing_features_scale = pre.scale(testing_features)

# SVM classifier

## A. SVM without scaling

In [34]:
start = time.time()

# ====== training and predicting with SVM ====== #
clf_svm = svm.LinearSVC()
clf_svm.fit(training_features, labels)
pred_svm = list(clf_svm.predict(testing_features))
pred_svm = zip(range(len(testing)), pred_svm)

end = time.time()
print('Training with SVM Linear SVC takes %.4f s' % (end-start))

Training with SVM Linear SVC takes 110.5520 s


In [35]:
write_submission('submission_svm_06.csv', pred_svm)

## B. SVM with scaling

In [36]:
start = time.time()

# ====== training and prediction with SVM and scaled features ====== #
clf_svm_scale = svm.LinearSVC()
clf_svm_scale.fit(training_features_scale, labels)
pred_svm_scale = list(clf_svm_scale.predict(testing_features_scale))
pred_svm_scale = zip(range(len(testing)), pred_svm_scale)

end = time.time()
print('Training with SVM Linear SVC + scaling takes %.4f s' % (end-start))

Training with SVM Linear SVC + scaling takes 64.6380 s


In [37]:
write_submission('submission_svm_06_scale.csv', pred_svm_scale)

# RandomForest classifier

In [38]:
start = time.time()

# ====== training and prediction with Random Forest ====== #
clf_rf = RandomForestClassifier()
clf_rf.fit(training_features, labels)
pred_rf = list(clf_rf.predict(testing_features))
pred_rf = zip(range(len(testing)), pred_rf)

end = time.time()
print('Training with Random Forest takes %.4f s' % (end-start))

Training with Random Forest takes 11.2460 s


In [39]:
write_submission('submission_rf_04.csv', pred_rf)

# Logistic Regression

## A. Logistic Regression without scaling

In [40]:
start = time.time()

# ====== training and prediction with Logistic Regression ====== #
clf_lg = LogisticRegression()
clf_lg.fit(training_features, labels)
pred_lg = list(clf_lg.predict(testing_features))
pred_lg = zip(range(len(testing)), pred_lg)

end = time.time()
print('Training with Logistic Regression takes %.4f s' % (end-start))

Training with Logistic Regression takes 4.0260 s


In [41]:
write_submission('submission_lg_03.csv', pred_lg)

## B. Logistic Regression with scaling

In [42]:
start = time.time()

# ====== training and prediction with Logistic Regression + scaling ====== #
clf_lg_scale = LogisticRegression()
clf_lg_scale.fit(training_features_scale, labels)
pred_lg_scale = list(clf_lg_scale.predict(testing_features_scale))
pred_lg_scale = zip(range(len(testing)), pred_lg_scale)

end = time.time()
print('Training with Logistic Regression + scaling takes %.4f s' % (end-start))

Training with Logistic Regression + scaling takes 4.0500 s


In [43]:
write_submission('submission_lg_03_scale.csv', pred_lg_scale)

# Neural Network (simple version)

## A. Neural Network without scaling

In [44]:
start = time.time()

# ====== training and prediction with Logistic Regression + scaling ====== #
clf_nn = MLPClassifier(
    hidden_layer_sizes = (50,60,40,30,20),
    activation = 'relu',
    solver = 'adam',
    early_stopping = True
)
clf_nn.fit(training_features, labels)
pred_nn = clf_nn.predict(testing_features)
pred_nn = zip(range(len(testing)), pred_nn)

end = time.time()
print('Training with Neural Networks takes %.4f s' % (end-start))

Training with Neural Networks takes 29.6230 s


In [45]:
write_submission('submission_nn_04.csv', pred_nn)

## B. Neural Network with scaling

In [46]:
start = time.time()

# ====== training and prediction with Logistic Regression + scaling ====== #
training_features_scale = pre.scale(training_features)
testing_features_scale = pre.scale(testing_features)

clf_nn_scale = MLPClassifier(
    hidden_layer_sizes = (50,60,40,30,20),
    activation = 'relu',
    solver = 'adam',
    early_stopping = True
)
clf_nn_scale.fit(training_features_scale, labels)
pred_nn_scale = clf_nn_scale.predict(testing_features_scale)
pred_nn_scale = zip(range(len(testing)), pred_nn_scale)

end = time.time()
print('Training with Neural Networks + scaling takes %.4f s' % (end-start))

Training with Neural Networks + scaling takes 39.8180 s


In [48]:
write_submission('submission_nn_04_scale.csv', pred_nn_scale)

# Gradient Boosting

## Gradient Boosting

In [52]:
start = time.time()

# ====== Training and predicting with Gradient Boosting ====== #
clf_gboost = GradientBoostingClassifier(
    loss = 'deviance',
    n_estimators = 150
)
clf_gboost.fit(training_features, labels)
pred_gboost = clf_gboost.predict(testing_features)
pred_gboost = zip(range(len(testing)), pred_gboost)

end = time.time()
print('Training with Gradient Boosting takes %.4f s' % (end-start))

Training with Gradient Boosting takes 81.1830 s


In [53]:
write_submission('submission_gboost_01.csv', pred_gboost)

## AdaBoost

In [57]:
start = time.time()

# ====== Training and predicting with Gradient Boosting ====== #
clf_ada = GradientBoostingClassifier(
    loss = 'exponential',
    n_estimators = 200
)
clf_ada.fit(training_features, labels)
pred_ada = clf_ada.predict(testing_features)
pred_ada = zip(range(len(testing)), pred_ada)

end = time.time()
print('Training with Adaboost takes %.4f s' % (end-start))

Training with Adaboost takes 108.2860 s


In [58]:
write_submission('submission_ada_02.csv', pred_ada)

# Feature importance

In [56]:
# ====== compute feature importance ====== #
idx = np.argsort(-clf_rf.feature_importances_) # sort the indicator of feature important by decreasing order

for i in idx:
    print('Feature \'%s\' of importance %.3f' % (features[i], clf_rf.feature_importances_[i]))

Feature 'jaccard_coeff' of importance 0.442
Feature 'common_neigh' of importance 0.245
Feature 'avg_degrees' of importance 0.140
Feature 'cosines' of importance 0.126
Feature 'overlap_title' of importance 0.031
Feature 'temp_diff' of importance 0.013
Feature 'common_auth' of importance 0.002
Feature 'same_journal' of importance 0.001
