# Libraries and Utility functions

In [1]:
# `time` is imported first so the cost of loading the remaining libraries can be
# measured; `start`/`end` bracket the imports below.
import time
start = time.time()

# utility libraries: array handling, CSV access, feature scaling (`pre`),
# iteration helpers, interpolation and plotting
import numpy as np
import csv as csv
from sklearn import preprocessing as pre
from itertools import cycle
# NOTE(review): `scipy.interp` appears to be a deprecated alias of `numpy.interp`
# in newer SciPy releases — verify against the installed version.
from scipy import interp
import matplotlib.pyplot as plt

# classifiers, model-selection helpers and evaluation metrics from scikit-learn
from sklearn.metrics.pairwise import linear_kernel
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import recall_score, roc_curve, auc, average_precision_score, precision_recall_curve

# report the wall-clock time spent on the imports above
end = time.time()
print('Loading libraries takes %.4f s' % (end-start))

Loading libraries takes 1.3198 s


# Reading dataset (training, testing, node information)

In [2]:
# Directory (relative to the notebook) that holds the input data files.
path_data = '../data/'
# Directory (relative to the notebook) for submission files.
path_submission = '../submission/'

In [3]:
# Load the training set, extract its labels, and report how long that takes.
start = time.time()

# Every column of the training file is parsed as a string.
training_set_file = path_data + 'training_set.txt'
training = np.genfromtxt(training_set_file, dtype=str)

# The column at index 2 carries the label; convert it to integers.
label_column = training[:, 2]
labels = label_column.astype(int)

end = time.time()
print('Reading training set & extracting labels takes %.4f s' % (end-start))

Reading training set & extracting labels takes 3.4442 s


In [4]:
# Load the training feature matrix and report how long the read takes.
start = time.time()

# Comma-separated file; the single header row is skipped and values are parsed as float.
training_features_file = path_data + 'training_features.csv'
orig_training_features = np.genfromtxt(
    training_features_file,
    delimiter=',',
    skip_header=1,
    dtype=float,
)

end = time.time()
print('Reading training features takes %.4f s' % (end-start))

Reading training features takes 12.9700 s


In [5]:
# Load the testing feature matrix and report how long the read takes.
start = time.time()

# Comma-separated file; the single header row is skipped and values are parsed
# as float (not as strings).
testing_features_file = path_data + 'testing_features.csv'
orig_testing_features = np.genfromtxt(
    testing_features_file,
    delimiter=',',
    skip_header=1,
    dtype=float,
)

end = time.time()
print('Reading testing features takes %.4f s' % (end-start))

Reading testing features takes 0.7599 s


In [6]:
# Sanity check: show the shape of each array that was just loaded.
loaded_arrays = (
    ('Training features:', orig_training_features),
    ('Labels:', labels),
    ('Testing features:', orig_testing_features),
)
for caption, array in loaded_arrays:
    print(caption, array.shape)

Training features: (615512, 17)
Labels: (615512,)
Testing features: (32648, 17)


# Picking up some features

Sometimes we need to drop some of the features read from file. Here, features are removed by their column index.

In [7]:
# Names of the feature columns. A name's position in this list is the index
# used when selecting or removing columns of the feature matrices.
orig_features = [
    'temporal_difference',   # index 0
    'common_authors',        # index 1
    'same_journal',          # index 2
    'cosine_sim',            # index 3
    'overlapping_title',     # index 4
    'max_degrees',           # index 5
    'common_neighbors',      # index 6
    'jaccard_coefficient',   # index 7
    'max_pagerank',          # index 8
    'max_betweenness',       # index 9
    'in_kcore',              # index 10
    'adamic_adar',           # index 11
    'katz_index',            # index 12
    'cosine_sim_w2v',        # index 13
    'katz_linkpred',         # index 14
    'pref_attach',           # index 15
    'res_alloc',             # index 16
]

In [8]:
# Indices (positions in orig_features) of the columns to drop before training.
to_remove = [13, 14]

# Drop those columns from both feature matrices, then pass the result through
# np.nan_to_num so no NaN entries remain.
training_features = np.nan_to_num(np.delete(orig_training_features, to_remove, axis=1))
testing_features = np.nan_to_num(np.delete(orig_testing_features, to_remove, axis=1))

# Apply the same removal to the name list so names stay aligned with columns.
features = np.delete(orig_features, to_remove)

print('Training features:', training_features.shape)

Training features: (615512, 15)


# Evaluation

In [9]:
# Hold out 35% of the labelled samples for evaluation; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    training_features,
    labels,
    test_size=0.35,
    random_state=42,
)

print('Training set:', X_train.shape, y_train.shape)
print('Testing set:', X_test.shape, y_test.shape)

Training set: (400082, 15) (400082,)
Testing set: (215430, 15) (215430,)


In [10]:
# scaling features
# Standardise each column using statistics learned from the training split only,
# then apply that same transformation to the held-out split. Scaling the two
# splits independently (each with its own mean / standard deviation) would map
# identical raw values to different scaled values, so a model fitted on the
# scaled training data would be evaluated on inconsistently transformed inputs.
scaler = pre.StandardScaler().fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

In [14]:
# Seed shared by every estimator below that involves randomness, so that
# re-running the notebook reproduces the same scores (the train/test split is
# already seeded with the same value).
RANDOM_STATE = 42

# logistic regression (L2 penalty, Newton-CG solver)
clf_lg = LogisticRegression(penalty='l2', solver='newton-cg')

# linear SVM (hinge loss, L2 penalty)
clf_svm = svm.LinearSVC(
    penalty='l2',
    loss='hinge',
    C=1.0,
    fit_intercept=True,
    random_state=RANDOM_STATE
)

# random forest: 90 trees, each split considers 30% of the features
clf_rf = RandomForestClassifier(
    max_features=0.3,
    n_estimators=90,
    random_state=RANDOM_STATE
)

# neural network: 8 hidden layers, ReLU activations, Adam optimiser,
# training stops early based on a validation score
clf_nn = MLPClassifier(
    hidden_layer_sizes = (50,60,70,40,50,30,20,10),
    activation = 'relu',
    solver = 'adam',
    early_stopping = True,
    random_state = RANDOM_STATE
)

# gradient boosting
# NOTE(review): newer scikit-learn releases rename the 'deviance' loss to
# 'log_loss' — verify against the installed version before upgrading.
clf_gboost = GradientBoostingClassifier(
    loss = 'deviance',
    n_estimators = 120,
    subsample = 0.8,
    max_depth = 5,
    learning_rate = 0.1,
    random_state = RANDOM_STATE
)

# "adaboost": this is a GradientBoostingClassifier with the exponential loss,
# not sklearn's AdaBoostClassifier
clf_ada = GradientBoostingClassifier(
    loss = 'exponential',
    n_estimators = 120,
    subsample = 0.8,
    max_depth = 5,
    random_state = RANDOM_STATE
)

# knn (11 nearest neighbours)
clf_knn = KNeighborsClassifier(
    n_neighbors = 11
)

In [15]:
# Runs to evaluate, as (display name, estimator, use scaled features) tuples.
# Commented-out entries are disabled; uncomment a line to include that run.
clfs = [
#     ('Logistic Regression', clf_lg, False),
#     ('Logistic Regression', clf_lg, True),
#     ('SVM', clf_svm, False),
#     ('SVM', clf_svm, True),
#     ('Random Forest', clf_rf, False),
#     ('Random Forest', clf_rf, True),
#     ('Neural Network', clf_nn, False),
#     ('Neural Network', clf_nn, True),
    ('Gradient Boosting', clf_gboost, False),
#     ('Gradient Boosting', clf_gboost, True),
    ('AdaBoost', clf_ada, False),
#     ('AdaBoost', clf_ada, True),
#     ('k-NN', clf_knn, False),
#     ('k-NN', clf_knn, True)
]

# One (name, score on the held-out split, scaled flag) tuple per run.
result_score = []

for clf in clfs:
    clf_name, estimator, use_scaled = clf
    start = time.time()

    # Choose the feature matrices that match this run's scaling flag.
    if use_scaled:
        _X_train, _X_test = X_train_scale, X_test_scale
    else:
        _X_train, _X_test = X_train, X_test

    # Fit on the training split, then score on the held-out split.
    estimator.fit(_X_train, y_train)
    held_out_score = estimator.score(_X_test, y_test)
    result_score.append((clf_name, held_out_score, use_scaled))

    end = time.time()
    print('Evaluation on %s takes %.4f s' % (clf_name, end-start))

Evaluation on Gradient Boosting takes 199.0381 s
Evaluation on AdaBoost takes 182.5642 s


In [16]:
# Print one summary line per evaluated run: name, scaling flag, score.
for res in result_score:
    run_name, run_score, run_scaled = res
    print('%s (scaling: %r): %.5f' % (run_name, run_scaled, run_score))

Gradient Boosting (scaling: False): 0.96929
AdaBoost (scaling: False): 0.96948
