# Training model and Issuing predictions (main script)

In [15]:
# import time to find consuming steps
import time
start = time.time()

# utility libraries
import numpy as np
import csv as csv
from sklearn import preprocessing as pre

# classifier for classification
from sklearn.metrics.pairwise import linear_kernel
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import recall_score

end = time.time()
print('Loading libraries takes %.4f s' % (end-start))

Loading libraries takes 0.0002 s


# Reading dataset (training, testing, node information)

In [16]:
path_data = '../data/' # path to the data
path_submission = '../submission/' # path to submission files

In [17]:
start = time.time()

# ====== read training data as str ====== #
training = np.genfromtxt(path_data + 'training_set.txt', dtype=str)

# ====== extract labels ====== #
labels = training[:, 2].astype(int) # get the labels

end = time.time()
print('Reading training set & extracting labels takes %.4f s' % (end-start))

Reading training set & extracting labels takes 2.9731 s


In [47]:
start = time.time()

# ====== read training features ====== #
training_features = np.genfromtxt(path_data + 'training_features.csv', delimiter=',', skip_header=1, dtype=float)

end = time.time()
print('Reading training features takes %.4f s' % (end-start))

Reading training features takes 9.2582 s


In [48]:
start = time.time()

# ====== read testing features as str ====== #
testing_features = np.genfromtxt(path_data + 'testing_features.csv', delimiter=',', skip_header=1, dtype=float)

end = time.time()
print('Reading testing features takes %.4f s' % (end-start))

Reading testing features takes 0.4289 s


In [49]:
print('Training features:', training_features.shape)
print('Labels:', labels.shape)
print('Testing features:', testing_features.shape)

Training features: (615512, 13)
Labels: (615512,)
Testing features: (32648, 13)


# Utility functions

In [52]:
def write_submission(filename, pred):
    '''
    Write prediction result in a submission file
    
    Parameters
    ----------
    filename: name of submission file
    pred: prediction array
    
    '''
    with open(path_submission + filename, 'w', newline='') as f:
        csv_out = csv.writer(f)
        csv_out.writerow(['id','category'])
        for row in pred:
            csv_out.writerow(row)

# Tuning classifiers

In [50]:
# ====== Scaling features ====== #
training_features_scale = pre.scale(training_features)
testing_features_scale = pre.scale(testing_features)

In [51]:
testing_size = testing_features.shape[0]
testing_size

32648

## 1. SVM classifier

### 1-A. SVM without scaling

In [48]:
start = time.time()

# ====== training and predicting with SVM ====== #
clf_svm = svm.LinearSVC()
clf_svm.fit(training_features, labels)
pred_svm = list(clf_svm.predict(testing_features))
pred_svm = zip(range(len(testing_features)), pred_svm)

end = time.time()
print('Training with SVM Linear SVC takes %.4f s' % (end-start))

Training with SVM Linear SVC takes 111.8000 s


In [49]:
write_submission('submission_svm_11.csv', pred_svm)

### 1-B. SVM with scaling

In [53]:
# ====== choosing C with cross validation ====== #
start = time.time()

clf_svm_scale = svm.LinearSVC(C=1.0, dual=False)
scores = cross_val_score(clf_svm_scale, training_features, labels, cv=5)
print("Accuracy: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

end = time.time()
print('Cross validating takes %.4f s' % (end-start))

Accuracy: 0.963 (+/- 0.00)
Cross validating takes 18.7288 s


In [54]:
start = time.time()

# ====== training and prediction with SVM and scaled features ====== #
clf_svm_scale.fit(training_features_scale, labels)
pred_svm_scale = list(clf_svm_scale.predict(testing_features_scale))
pred_svm_scale = zip(range(testing_size), pred_svm_scale)

end = time.time()
print('Training with SVM Linear SVC + scaling takes %.4f s' % (end-start))

Training with SVM Linear SVC + scaling takes 1.1296 s


In [55]:
write_submission('submission_svm_14_scale.csv', pred_svm_scale)

### 1-C. SVM with RBF as kernel

In [None]:
# beware: takes too long to complete

# start = time.time()

# # ====== training and prediction with SVM and scaled features ====== #
# clf_svm_rbf = svm.SVC(kernel='rbf')
# clf_svm_rbf.fit(training_features_scale, labels)
# pred_svm_rbf = list(clf_svm_rbf.predict(testing_features_scale))
# pred_svm_rbf = zip(range(testing_size), pred_svm_rbf)

# end = time.time()
# print('Training with SVM Linear SVC + scaling takes %.4f s' % (end-start))

In [None]:
# write_submission('submission_svm_rbf_01_scale.csv', pred_svm_rbf)

## 2. RandomForest classifier

In [60]:
start = time.time()

# ====== training and prediction with Random Forest ====== #
clf_rf = RandomForestClassifier()
clf_rf.fit(training_features, labels)
pred_rf = list(clf_rf.predict(testing_features))
pred_rf = zip(range(testing_size), pred_rf)

end = time.time()
print('Training with Random Forest takes %.4f s' % (end-start))

Training with Random Forest takes 9.2478 s


In [53]:
write_submission('submission_rf_09.csv', pred_rf)

# Logistic Regression

## A. Logistic Regression without scaling

In [54]:
start = time.time()

# ====== training and prediction with Logistic Regression ====== #
clf_lg = LogisticRegression()
clf_lg.fit(training_features, labels)
pred_lg = list(clf_lg.predict(testing_features))
pred_lg = zip(range(testing_size), pred_lg)

end = time.time()
print('Training with Logistic Regression takes %.4f s' % (end-start))

Training with Logistic Regression takes 5.3049 s


In [55]:
write_submission('submission_lg_07.csv', pred_lg)

## B. Logistic Regression with scaling

In [56]:
start = time.time()

# ====== training and prediction with Logistic Regression + scaling ====== #
clf_lg_scale = LogisticRegression()
clf_lg_scale.fit(training_features_scale, labels)
pred_lg_scale = list(clf_lg_scale.predict(testing_features_scale))
pred_lg_scale = zip(range(testing_size), pred_lg_scale)

end = time.time()
print('Training with Logistic Regression + scaling takes %.4f s' % (end-start))

Training with Logistic Regression + scaling takes 2.8288 s


In [57]:
write_submission('submission_lg_07_scale.csv', pred_lg_scale)

# Neural Network (simple version)

## A. Neural Network without scaling

In [56]:
start = time.time()

clf_nn = MLPClassifier(
    hidden_layer_sizes = (50,60,70,40,50,30,20,10),
    activation = 'relu',
    solver = 'adam',
    early_stopping = True
)
clf_nn.fit(training_features, labels)
pred_nn = clf_nn.predict(testing_features)
pred_nn = zip(range(testing_size), pred_nn)

end = time.time()
print('Training with Neural Networks takes %.4f s' % (end-start))

Training with Neural Networks takes 256.8148 s


In [59]:
write_submission('submission_nn_15.csv', pred_nn)

## B. Neural Network with scaling

In [67]:
start = time.time()

clf_nn_scale = MLPClassifier(
    hidden_layer_sizes = (50,60,70,40,50,30,20,10),
    activation = 'relu',
    solver = 'adam',
    early_stopping = True
)
clf_nn_scale.fit(training_features_scale, labels)
pred_nn_scale = clf_nn_scale.predict(testing_features_scale)
pred_nn_scale = zip(range(testing_size), pred_nn_scale)

end = time.time()
print('Training with Neural Networks + scaling takes %.4f s' % (end-start))

Training with Neural Networks + scaling takes 250.2031 s


In [68]:
write_submission('submission_nn_10_scale.csv', pred_nn_scale)

# Gradient Boosting

## Gradient Boosting

In [69]:
start = time.time()

# ====== Training and predicting with Gradient Boosting ====== #
clf_gboost = GradientBoostingClassifier(
    loss = 'deviance',
    n_estimators = 200
)
clf_gboost.fit(training_features, labels)
pred_gboost = clf_gboost.predict(testing_features)
pred_gboost = zip(range(testing_size), pred_gboost)

end = time.time()
print('Training with Gradient Boosting takes %.4f s' % (end-start))

Training with Gradient Boosting takes 442.9257 s


In [70]:
write_submission('submission_gboost_05.csv', pred_gboost)

## AdaBoost

In [71]:
start = time.time()

# ====== Training and predicting with Gradient Boosting ====== #
clf_ada = GradientBoostingClassifier(
    loss = 'exponential',
    n_estimators = 200
)
clf_ada.fit(training_features, labels)
pred_ada = clf_ada.predict(testing_features)
pred_ada = zip(range(testing_size), pred_ada)

end = time.time()
print('Training with Adaboost takes %.4f s' % (end-start))

Training with Adaboost takes 300.6069 s


In [72]:
write_submission('submission_ada_05.csv', pred_ada)

# Feature importance

In [58]:
# ====== compute feature importance ====== #
# list of selected features
features = [
    'temporal_difference',
    'common_authors',
    'same_journal',
    'cosine_sim',
    'overlapping_title',
    'average_degrees',
    'common_neighbors',
    'jaccard_coefficient',
    'avg_pagerank',
    'average_betweenness',
    'in_kcore',
    'adamic_adar',
    'katz_index'
]

idx = np.argsort(-clf_rf.feature_importances_) # sort the indicator of feature important by decreasing order

for i in idx:
    print('Feature \'%s\' of importance %.5f' % (features[i], clf_rf.feature_importances_[i]))

Feature 'adamic_adar' of importance 0.35864
Feature 'katz_index' of importance 0.30099
Feature 'jaccard_coefficient' of importance 0.24968
Feature 'common_neighbors' of importance 0.03826
Feature 'average_degrees' of importance 0.02634
Feature 'cosine_sim' of importance 0.01515
Feature 'in_kcore' of importance 0.00585
Feature 'average_betweenness' of importance 0.00179
Feature 'overlapping_title' of importance 0.00176
Feature 'avg_pagerank' of importance 0.00093
Feature 'temporal_difference' of importance 0.00029
Feature 'common_authors' of importance 0.00029
Feature 'same_journal' of importance 0.00003
