# Training model and Issuing predictions (main script)

In [1]:
# import time to find consuming steps
import time
start = time.time()

# utility libraries
import numpy as np
import igraph as ig
import csv
from sklearn import preprocessing as pre
import re

# working with text
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# classifier for classification
from sklearn.metrics.pairwise import linear_kernel
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

# plotting stuffs
import matplotlib.pyplot as plt
%matplotlib inline

end = time.time()
print('Loading libraries takes %.4f s' % (end-start))

Loading libraries takes 1.1090 s


# Reading dataset (training, testing, node information)

In [2]:
path_data = '../data/' # path to the data
path_submission = '../submission/' # path to submission files

In [3]:
start = time.time()

# ====== read in node informations ====== #
with open(path_data + 'node_information.csv', 'r') as f:
    reader = csv.reader(f)
    node_info = list(reader)

    
end = time.time()
print('Reading node information takes %.4f s' % (end-start))

Reading node information takes 0.2350 s


In [4]:
start = time.time()

# ====== read training data as str ====== #
training = np.genfromtxt(path_data + 'training_set.txt', dtype=str)

end = time.time()
print('Reading training set takes %.4f s' % (end-start))

Reading training set takes 2.7660 s


In [5]:
start = time.time()

# ====== read testing data as str ====== #
testing = np.genfromtxt(path_data + 'testing_set.txt', dtype=str)

end = time.time()
print('Reading testing set takes %.4f s' % (end-start))

Reading testing set takes 0.1560 s


# Utility functions

In [6]:
def build_graph(nodes, edges):
    '''
    Build a graph using igraph library
    
    Parameters
    ----------
    nodes: a list of nodes
    edges: a list of tuples (source, target)
    
    Returns
    -------
    a graph g
    '''
    g = ig.Graph(directed=False) # create an undirected graph
    g.add_vertices(nodes) # add nodes
    g.add_edges(edges) # add edges
    
    # beware of multiple edges in the graph --> remove all redundant edges between the same pairs of vertices
    multiple_edges = [e for e in edges if g.is_multiple(e)]
    g.delete_edges(multiple_edges)
    
    return g

In [7]:
# a function used to build small graph for test purpose
def test_graph():
    nodes = ['hello','this','is','my','test']
    edges = [(0,1), (0,3), (1,4), (3,2), (4,3)]
    gr = build_graph(nodes, edges)
    print(gr.summary)
    
gr = test_graph()

<bound method Graph.summary of <igraph.Graph object at 0x0000000016D904F8>>


In [8]:
# a function to preprocess text
def preprocess(text, dg_removal=True, sw_removal=True, stemming=True):
    '''
    Preprocess text: stopword removal, stemming, digit removal
    
    Parameters
    ----------
    text: text on which preprocessing is applied
    dg_removal: whether to apply digit removal or not
    sw_removal: whether to apply stopword removal or not
    stemming: whether to apply stemming or not
    
    Returns
    -------
    the text after preprocessing
    '''
    result = text
    
    sw = set(nltk.corpus.stopwords.words('english')) # set of stopwords
    stemmer = nltk.stem.PorterStemmer() # stemmer
    
    if dg_removal:
        result = re.sub('[0-9]', '', result)
    
    if sw_removal:
        result = ' '.join([token for token in result.split() if token not in sw])
        
    if stemming:
        result = ' '.join([stemmer.stem(token) for token in result.split()])
    
    return result

In [9]:
def preprocess_authors(authors):
    '''
    Replace all (...) additional information from the names
    
    Parameters
    ----------
    authors: an author field e.g. "M. Jinzenji (U. of Tokyo)"
    
    Returns
    -------
    the authors name that is stripped of "(...)" e.g. "M. Jinzenji (U. of Tokyo)" --> "M. Jinzenji "
    (some blank, trailing space may be left behind)
    '''
    
    stripped = re.sub('\(([a-zA-Z0-9\\s,\.\-\'\/&]+)\)?', '', authors).strip() # remove (...) from author name
    
    return stripped

In [26]:
def compute_features(ds, g):
    '''
    Compute the set of predefined features from the given dataset (training/testing)
    - cosine similarity between abstracts (texts already preprocessed)
    - temporal difference in term of publication year
    - number of common authors
    - whether two articles were published in the same journal
    - number of overlapped words in preprocessed title
    - average degree of both nodes (because orientation is not taken into account)
    - number of common neighbors in the graph
    
    Parameters
    ----------
    ds: dataset to compute features from
    g: citation graph
    betweenness: betweenness centrality of every node (already computed when building graph)
    
    Returns
    -------
    An array of computed features
    '''
    size = len(ds)
    
    cosines = np.zeros(size) # feature "Cosine similarities", between each abstract (already preprocessed)
    temp_diff = [] # feature "Temporal distance" i.e. integer values --> numerical feature
    common_auth = [] # feature "Number of common authors" i.e. integer values --> numerical feature
    same_journal = [] # feature "Same journal" i.e. binary label 0/1 --> categorical feature
    overlap_title = [] # feature "Overlapping title" i.e. number of common words in title
    avg_degrees = [] # feature "Average degree" i.e. the average degree of two nodes participating in the edge
    common_neigh = [] # feature "Common neighbors" i.e. the citations cited in common between two nodes
    jaccard_coeff = [] # feature "Jaccard coefficient" i.e. the relative number of common neighbors
    same_cluster = [] # feature "Same cluster" i.e. to check whether two nodes lie in the same cluster
    pr_scores = [] # pagerank score
    
    #betw_centrality = [] # feature "Betweenness centrality" i.e. the difference of betweenness of two connected nodes
    
    for i in range(size):
        src, dest = ds[i][0], ds[i][1] # get the source and dest ID
        src_info, dest_info = node_info[ID.index(src)], node_info[ID.index(dest)] # get the associated node information
    
        # collect the cosine similarity
        src_vect, dest_vect = tfidf[ID.index(src)], tfidf[ID.index(dest)] # get the corresponding vector in TD-IDF matrix
        cos = cosine_similarity(src_vect, dest_vect) # compute cosine similarity
        cosines[i] = cos
        
        # collect the temporal difference (in absolute value)
        temp_diff.append(
            abs(int(src_info[1]) - int(dest_info[1]))
        )
    
        # collect the number of common authors
        common_auth.append(len(
            set(src_info[3].split(',')).intersection(set(dest_info[3].split(',')))
        ))
    
        # collect the information whether the two 2 articles were published in the same journal
        same_journal.append(int(
            len(src_info[4])>0 and  # journal info of source not null
            len(dest_info[4])>0 and # journal info of dest not null
            src_info[4] == dest_info[4] # the same journal title
        ))
        
        # collect the number of overlapping words in title
        src_title, dest_title = preprocess(src_info[2]).split(), preprocess(dest_info[2]).split()
        overlap_title.append(len(
            set(src_title).intersection(set(dest_title))
        ))
        
        # collect the difference in degrees
        src_deg, dest_deg = g.degree(src), g.degree(dest)
        avg_deg = float(src_deg + dest_deg)/2.0
        avg_degrees.append(avg_deg)
        
        # collect the number of common neighbors
        common_neigh.append(len(
            set(g.neighbors(src)).intersection(set(g.neighbors(dest)))
        ))
        
        # collect the relative number of common neighbors based on Jaccard coefficient
        inters = len(set(g.neighbors(src)).intersection(set(g.neighbors(dest)))) # intersection of neighbors
        union = len(set(g.neighbors(src)).union(set(g.neighbors(dest)))) # union of neighbors
        jaccard_coeff.append(
            (float(inters)/float(union) if union != 0 else 0)
        )
        
        # collect the information of same cluster
        same_cluster.append(int(
            node_cluster[src] == node_cluster[dest]
        ))
        
        # collect the pagerank score (average of pagerank of two nodes)
        pr_src, pr_dest = pg[ID.index(src)], pg[ID.index(dest)]
        pr_scores.append(
            float(pr_src+pr_dest)*10000/2.0
        )
        
        # computational cost is terrible!!!
        # collect the betweenness centrality between two nodes
        #betw_centrality.append(
        #    abs(betweennes[dest] - betweenness[src])
        #)
    
    features = np.array([
        temp_diff, 
        common_auth, 
        same_journal, 
        cosines, 
        overlap_title, 
        avg_degrees,
        common_neigh,
        jaccard_coeff,
        same_cluster,
        pr_scores
        #betw_centrality
    ]).T
    
    return features

In [11]:
def write_submission(filename, pred):
    '''
    Write prediction result in a submission file
    
    Parameters
    ----------
    filename: name of submission file
    pred: prediction array
    
    '''
    with open(path_submission + filename, 'wb') as f:
        csv_out = csv.writer(f)
        csv_out.writerow(['id','category'])
        for row in pred:
            csv_out.writerow(row)

# Building the citation graph

In [12]:
labels = training[:, 2].astype(int) # get the labels

In [13]:
nb_edges = np.count_nonzero(labels)
print('%d edges (before processing) among %d training instances --> %.2f%%' % (nb_edges, len(labels), 100*nb_edges/len(labels)))

335130 edges (before processing) among 615512 training instances --> 54.00%


## Build the graph (and remove multiple edges)

In [14]:
start = time.time()

edges = [(element[0], element[1]) for element in training if int(element[2]) == 1] # extract all the edges
nodes = [element[0] for element in node_info] # extract all the vertices
g = build_graph(nodes, edges) # build the graph

end = time.time()
print('Building the graph takes %.4f s' % (end-start))

Building the graph takes 23.6430 s


In [15]:
# check the number of vertices and edges
print('Number of vertices: %d' % len(g.vs))
print('Number of edges (after multiple edges removal): %d' % len(g.es))

Number of vertices: 27770
Number of edges (after multiple edges removal): 334690


## Find clusters

In [16]:
# ====== find the community structures ====== #
start = time.time()

dendogram = g.community_fastgreedy()
clusters_g = dendogram.as_clustering()
subg = clusters_g.subgraphs()

end = time.time()
print('Finding community takes %.4f s' % (end-start))

Finding community takes 99.3380 s


In [17]:
# create a mapping of (node : #cluster) for easy access when computing features
node_cluster = dict(zip(nodes, [0]*len(nodes)))

idx = 1 # assign a number for each distinct cluster
for sb in subg:
    for v in sb.vs:
        node_cluster[v['name']] = idx
    idx += 1

# Computing features (time consuming)

First, we prepare some ingredients that serve the computation of features:
- TF-IDF matrix, built on the corpus of abstracts
- index list containing the ID of each article, to facilitate access to node information

In [18]:
start = time.time()

# ====== corpus is the set of titles + abstracts, apply preprocessing to each article ======#
corpus = [preprocess(element[2] + ' ' + element[5], dg_removal=True, sw_removal=True, stemming=True) 
          for element in node_info]
vectorizer = TfidfVectorizer(stop_words='english') # create a TF-IDF vectorizer
tfidf = vectorizer.fit_transform(corpus) # TD-IDF matrix of the entire corpus (set of abstracts)

end = time.time()
print('Computing the TF-IDF matrix takes %.4f s' % (end-start))

Computing the TF-IDF matrix takes 63.4280 s


In [19]:
# ====== create an index list, to facilite access to a node by its id ====== #
ID = [element[0] for element in node_info]

## List of features

In [20]:
features = [
    'temp_diff', 
    'common_auth', 
    'same_journal', 
    'cosines', 
    'overlap_title', 
    'diff_degrees', 
    'common_neigh', 
    'jaccard_coeff',
    'same_cluster', 
    'page_rank'
]

When everything is ready, we start computing features for the training and testing set. The list of features is described as follows, and the computation rule is the same for both training and testing set.

| Feature                | Explanation                                                        | Type      | Range   |
|:----------------------:|:------------------------------------------------------------------:|:---------:|:-------:|
| Temporal difference    | Difference in publication year (absolute value)                    | numerical | $\ge$ 0 |
| Same journal           | Whether two articles are published in the same journal             | binary    | 0, 1    |
| Cosine similarity      | Cosine similarity between word vectors of abstracts                | numerical | [0,1]   |
| Title overlap          | Number of overlapping words in title                               | numerical | $\ge$ 0 |
| Degree difference      | Difference in measure of degrees of two nodes (absolute value)     | numerical | $\ge$ 0 |
| Common neighbors       | Number of common neighbors                                         | numercial | $\ge$ 0 |
| Jaccard coefficient    | Link-based Jaccard coefficient                                     | numerical | [0,1]   |
| Same cluster           | Check whether two nodes are in the same cluster                    | binary    | [0,1]   |
| Betweenness centrality | Difference in betweenness centrality of two nodes (absolute value) | numerical | $\ge$ 0 |

This step is highly time consuming.

In [21]:
# might read already computed features to save time
#training_features = np.genfromtxt(path_data + 'training_features.csv', delimiter=',',skip_header=1)
#testing_features = np.genfromtxt(path_data + 'testing_features.csv', delimiter=',',skip_header=1)

## Preprocessing author field (not sure if this is necessary though)

By looking at the list of authors, we find out that some records are not 'correctly' formatted, e.g., even though L.D. Paniak (Princeton University) and "L.D. Paniak (Princeton)" are the same person, only splitting the author field by ',' as separator would consider this two formats different, hence causing information loss. Before proceeding to compute features, we need to preprocess the author field first.

In [22]:
# group_authors = [group[3] for group in node_info] # read in all authors
# authors = []

# # TODO: also modify the details of authors in node_info (remove (..) and strip)
# for group in group_authors:
#     if group != '': # some author field is left empty
#         #stripped = re.sub('\(([a-zA-Z0-9\\s,\.\-\'\/&]+)\)?', '', group).strip() # remove (...) from author name
#         stripped = preprocess_authors(group).strip()
#         for indi in stripped.split(','):
#             indi = indi.strip()
#             if indi != '':
#                 authors.append(indi)

In [23]:
# print('Individual authors: %d' % len(authors))

# uni_authors, count = np.unique(authors, return_counts=True)
# author_score = dict(zip(uni_authors, count))

## Preparing the PageRank index

In [24]:
# ====== compute page rank of each node in the entire graph ====== #
start = time.time()

pg = g.pagerank()

end = time.time()
print('Finding page rank of all nodes in the graph takes %.4fs' % (end-start))

Finding page rank of all nodes in the graph takes 0.3280s


## Computing training and testing features

In [28]:
start = time.time()

# ====== compute training features : (615,521 x 10) ====== #
training_features = compute_features(training, g)

end = time.time()
print('Computing the training features takes %.4f' % (end-start))

Computing the training features takes 1854.1320


In [29]:
start = time.time()

# ====== compute testing features : (32,648 x 10) ====== #
testing_features = compute_features(testing, g)

end = time.time()
print('Computing the testing features takes %.4f s' % (end-start))

Computing the testing features takes 98.0200 s


In [30]:
# test purpose
feat = compute_features(training[0:5], g)
feat

array([[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.99966215e-01,
        2.00000000e+00, 9.00000000e+00, 1.00000000e+00, 5.88235294e-02,
        1.00000000e+00, 2.13292993e-01],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 6.43694475e-02,
        1.00000000e+00, 1.13000000e+02, 2.00000000e+01, 9.70873786e-02,
        1.00000000e+00, 9.25094418e-01],
       [2.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.05371115e-02,
        0.00000000e+00, 3.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.26364844e-01],
       [4.00000000e+00, 0.00000000e+00, 0.00000000e+00, 5.93784382e-02,
        0.00000000e+00, 1.70000000e+01, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 2.04888212e-01],
       [5.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.85264277e-02,
        0.00000000e+00, 1.55000000e+01, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 2.32628234e-01]])

In [31]:
# ====== Saving features to reload it faster (training_features) ====== #
with open(path_data + 'training_features.csv', 'wb') as f:
    csv_out = csv.writer(f)
    csv_out.writerow(features)
    for row in training_features:
        csv_out.writerow(row)

In [32]:
# ====== Saving features to reload it faster (testing_features) ====== #
with open(path_data + 'testing_features.csv', 'wb') as f:
    csv_out = csv.writer(f)
    csv_out.writerow(features)
    for row in testing_features:
        csv_out.writerow(row)

In [33]:
# ====== Scaling features ====== #
training_features_scale = pre.scale(training_features)
testing_features_scale = pre.scale(testing_features)

# SVM classifier

## A. SVM without scaling

In [34]:
start = time.time()

# ====== training and predicting with SVM ====== #
clf_svm = svm.LinearSVC()
clf_svm.fit(training_features, labels)
pred_svm = list(clf_svm.predict(testing_features))
pred_svm = zip(range(len(testing)), pred_svm)

end = time.time()
print('Training with SVM Linear SVC takes %.4f s' % (end-start))

Training with SVM Linear SVC takes 110.0520 s


In [35]:
write_submission('submission_svm_08.csv', pred_svm)

## B. SVM with scaling

In [36]:
start = time.time()

# ====== training and prediction with SVM and scaled features ====== #
clf_svm_scale = svm.LinearSVC()
clf_svm_scale.fit(training_features_scale, labels)
pred_svm_scale = list(clf_svm_scale.predict(testing_features_scale))
pred_svm_scale = zip(range(len(testing)), pred_svm_scale)

end = time.time()
print('Training with SVM Linear SVC + scaling takes %.4f s' % (end-start))

Training with SVM Linear SVC + scaling takes 67.3250 s


In [37]:
write_submission('submission_svm_08_scale.csv', pred_svm_scale)

# RandomForest classifier

In [38]:
start = time.time()

# ====== training and prediction with Random Forest ====== #
clf_rf = RandomForestClassifier()
clf_rf.fit(training_features, labels)
pred_rf = list(clf_rf.predict(testing_features))
pred_rf = zip(range(len(testing)), pred_rf)

end = time.time()
print('Training with Random Forest takes %.4f s' % (end-start))

Training with Random Forest takes 14.4230 s


In [39]:
write_submission('submission_rf_06.csv', pred_rf)

# Logistic Regression

## A. Logistic Regression without scaling

In [40]:
start = time.time()

# ====== training and prediction with Logistic Regression ====== #
clf_lg = LogisticRegression()
clf_lg.fit(training_features, labels)
pred_lg = list(clf_lg.predict(testing_features))
pred_lg = zip(range(len(testing)), pred_lg)

end = time.time()
print('Training with Logistic Regression takes %.4f s' % (end-start))

Training with Logistic Regression takes 5.5160 s


In [41]:
write_submission('submission_lg_05.csv', pred_lg)

## B. Logistic Regression with scaling

In [42]:
start = time.time()

# ====== training and prediction with Logistic Regression + scaling ====== #
clf_lg_scale = LogisticRegression()
clf_lg_scale.fit(training_features_scale, labels)
pred_lg_scale = list(clf_lg_scale.predict(testing_features_scale))
pred_lg_scale = zip(range(len(testing)), pred_lg_scale)

end = time.time()
print('Training with Logistic Regression + scaling takes %.4f s' % (end-start))

Training with Logistic Regression + scaling takes 5.7820 s


In [43]:
write_submission('submission_lg_05_scale.csv', pred_lg_scale)

# Neural Network (simple version)

## A. Neural Network without scaling

In [44]:
start = time.time()

# ====== training and prediction with Logistic Regression + scaling ====== #
clf_nn = MLPClassifier(
    hidden_layer_sizes = (50,60,70,40,30,20),
    activation = 'relu',
    solver = 'adam',
    early_stopping = True
)
clf_nn.fit(training_features, labels)
pred_nn = clf_nn.predict(testing_features)
pred_nn = zip(range(len(testing)), pred_nn)

end = time.time()
print('Training with Neural Networks takes %.4f s' % (end-start))

Training with Neural Networks takes 92.6970 s


In [45]:
write_submission('submission_nn_06.csv', pred_nn)

## B. Neural Network with scaling

In [46]:
start = time.time()

# ====== training and prediction with Logistic Regression + scaling ====== #
training_features_scale = pre.scale(training_features)
testing_features_scale = pre.scale(testing_features)

clf_nn_scale = MLPClassifier(
    hidden_layer_sizes = (50,60,70,40,30,20),
    activation = 'relu',
    solver = 'adam',
    early_stopping = True
)
clf_nn_scale.fit(training_features_scale, labels)
pred_nn_scale = clf_nn_scale.predict(testing_features_scale)
pred_nn_scale = zip(range(len(testing)), pred_nn_scale)

end = time.time()
print('Training with Neural Networks + scaling takes %.4f s' % (end-start))

Training with Neural Networks + scaling takes 54.7090 s


In [47]:
write_submission('submission_nn_06_scale.csv', pred_nn_scale)

# Gradient Boosting

## Gradient Boosting

In [48]:
start = time.time()

# ====== Training and predicting with Gradient Boosting ====== #
clf_gboost = GradientBoostingClassifier(
    loss = 'deviance',
    n_estimators = 200
)
clf_gboost.fit(training_features, labels)
pred_gboost = clf_gboost.predict(testing_features)
pred_gboost = zip(range(len(testing)), pred_gboost)

end = time.time()
print('Training with Gradient Boosting takes %.4f s' % (end-start))

Training with Gradient Boosting takes 157.5080 s


In [49]:
write_submission('submission_gboost_03.csv', pred_gboost)

## AdaBoost

In [50]:
start = time.time()

# ====== Training and predicting with Gradient Boosting ====== #
clf_ada = GradientBoostingClassifier(
    loss = 'exponential',
    n_estimators = 200
)
clf_ada.fit(training_features, labels)
pred_ada = clf_ada.predict(testing_features)
pred_ada = zip(range(len(testing)), pred_ada)

end = time.time()
print('Training with Adaboost takes %.4f s' % (end-start))

Training with Adaboost takes 153.1410 s


In [51]:
write_submission('submission_ada_04.csv', pred_ada)

# Feature importance

In [52]:
# ====== compute feature importance ====== #
idx = np.argsort(-clf_rf.feature_importances_) # sort the indicator of feature important by decreasing order

for i in idx:
    print('Feature \'%s\' of importance %.3f' % (features[i], clf_rf.feature_importances_[i]))

Feature 'jaccard_coeff' of importance 0.445
Feature 'common_neigh' of importance 0.302
Feature 'cosines' of importance 0.104
Feature 'diff_degrees' of importance 0.077
Feature 'page_rank' of importance 0.041
Feature 'same_cluster' of importance 0.012
Feature 'temp_diff' of importance 0.010
Feature 'overlap_title' of importance 0.005
Feature 'common_auth' of importance 0.002
Feature 'same_journal' of importance 0.002
