# Training model and Issuing predictions (main script)

In [2]:
# import time to find consuming steps
import time
start = time.time()

# utility libraries
import numpy as np
import csv as csv
from sklearn import preprocessing as pre

# classifier for classification
from sklearn.metrics.pairwise import linear_kernel
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import recall_score

end = time.time()
print('Loading libraries takes %.4f s' % (end-start))

Loading libraries takes 0.8376 s


# Reading dataset (training, testing, node information)

In [3]:
path_data = '../data/' # path to the data
path_submission = '../submission/' # path to submission files

In [4]:
start = time.time()

# ====== read training data as str ====== #
training = np.genfromtxt(path_data + 'training_set.txt', dtype=str)

# ====== extract labels ====== #
labels = training[:, 2].astype(int) # get the labels

end = time.time()
print('Reading training set & extracting labels takes %.4f s' % (end-start))

Reading training set & extracting labels takes 3.0708 s


In [5]:
start = time.time()

# ====== read training features ====== #
orig_training_features = np.genfromtxt(path_data + 'training_features.csv', delimiter=',', skip_header=1, dtype=float)

end = time.time()
print('Reading training features takes %.4f s' % (end-start))

Reading training features takes 8.6342 s


In [6]:
start = time.time()

# ====== read testing features as str ====== #
orig_testing_features = np.genfromtxt(path_data + 'testing_features.csv', delimiter=',', skip_header=1, dtype=float)

end = time.time()
print('Reading testing features takes %.4f s' % (end-start))

Reading testing features takes 0.4785 s


In [7]:
print('Training features:', orig_training_features.shape)
print('Labels:', labels.shape)
print('Testing features:', orig_testing_features.shape)

Training features: (615512, 13)
Labels: (615512,)
Testing features: (32648, 13)


# Picking up some features

Sometimes, we might need to remove some features read from file. Here, we remove features by its index.

In [29]:
orig_features = [
    'temporal_difference', # 0
    'common_authors', # 1
    'same_journal', # 2
    'cosine_sim', # 3
    'overlapping_title', # 4
    'average_degrees', # 5
    'common_neighbors', # 6
    'jaccard_coefficient', # 7
    'avg_pagerank', # 8
    'average_betweenness', # 9
    'in_kcore', # 10
    'adamic_adar', # 11
    'katz_index' # 12
]

In [30]:
# remove very useless features : same_journal (useless), common neighbors (strongly correlated with adamic-adar), page rank (strongly correlated with betweeness)
to_remove = [2, 6, 8]
training_features = np.delete(orig_training_features, to_remove, 1)
testing_features = np.delete(orig_testing_features, to_remove, 1)
features = np.delete(orig_features, to_remove)

print('Training features:', training_features.shape)

Training features: (615512, 10)


# Utility functions

In [31]:
def write_submission(filename, pred):
    '''
    Write prediction result in a submission file
    
    Parameters
    ----------
    filename: name of submission file
    pred: prediction array
    
    '''
    with open(path_submission + filename, 'w', newline='') as f:
        csv_out = csv.writer(f)
        csv_out.writerow(['id','category'])
        for row in pred:
            csv_out.writerow(row)

# Tuning classifiers

In [32]:
# ====== Scaling features ====== #
training_features_scale = pre.scale(training_features)
testing_features_scale = pre.scale(testing_features)

In [33]:
testing_size = testing_features.shape[0]
testing_size

32648

## 1. SVM classifier

### 1-A. SVM without scaling

In [26]:
start = time.time()

# ====== training and predicting with SVM ====== #
clf_svm = svm.LinearSVC()
clf_svm.fit(training_features, labels)
pred_svm = list(clf_svm.predict(testing_features))
pred_svm = zip(range(len(testing_features)), pred_svm)

end = time.time()
print('Training with SVM Linear SVC takes %.4f s' % (end-start))

Training with SVM Linear SVC takes 139.0131 s


In [49]:
write_submission('submission_svm_11.csv', pred_svm)

In [25]:
svm_scores = cross_val_score(clf_svm, training_features, labels, cv=5, scoring='f1')
print("Accuracy with SVM scaling: %0.4f (+/- %0.4f)" % (svm_scale_scores.mean(), svm_scale_scores.std() * 2))

Accuracy with SVM scaling: 0.9650 (+/- 0.0021)


### 1-B. SVM with scaling

In [34]:
start = time.time()

# ====== training and prediction with SVM and scaled features ====== #
clf_svm_scale = svm.LinearSVC(C=1.0, dual=False)
clf_svm_scale.fit(training_features_scale, labels)
pred_svm_scale = list(clf_svm_scale.predict(testing_features_scale))
pred_svm_scale = zip(range(testing_size), pred_svm_scale)

end = time.time()
print('Training with SVM Linear SVC + scaling takes %.4f s' % (end-start))

Training with SVM Linear SVC + scaling takes 1.0714 s


In [35]:
write_submission('submission_svm_15_scale.csv', pred_svm_scale)

In [36]:
svm_scale_scores = cross_val_score(clf_svm_scale, training_features, labels, cv=5, scoring='f1')
print("Accuracy with SVM scaling: %0.4f (+/- %0.4f)" % (svm_scale_scores.mean(), svm_scale_scores.std() * 2))

Accuracy with SVM scaling: 0.9632 (+/- 0.0013)


### 1-C. SVM with RBF as kernel

In [None]:
# beware: takes too long to complete

# start = time.time()

# # ====== training and prediction with SVM and scaled features ====== #
# clf_svm_rbf = svm.SVC(kernel='rbf')
# clf_svm_rbf.fit(training_features_scale, labels)
# pred_svm_rbf = list(clf_svm_rbf.predict(testing_features_scale))
# pred_svm_rbf = zip(range(testing_size), pred_svm_rbf)

# end = time.time()
# print('Training with SVM Linear SVC + scaling takes %.4f s' % (end-start))

In [None]:
# write_submission('submission_svm_rbf_01_scale.csv', pred_svm_rbf)

## 2. RandomForest classifier

In [25]:
start = time.time()

# ====== training and prediction with Random Forest ====== #
clf_rf = RandomForestClassifier()
clf_rf.fit(training_features, labels)
pred_rf = list(clf_rf.predict(testing_features))
pred_rf = zip(range(testing_size), pred_rf)

end = time.time()
print('Training with Random Forest takes %.4f s' % (end-start))

Training with Random Forest takes 3.1240 s


In [27]:
write_submission('submission_rf_11.csv', pred_rf)

In [26]:
rf_scores = cross_val_score(clf_rf, training_features, labels, cv=5, scoring='f1')
print("Accuracy of RandomForest: %0.4f (+/- %0.4f)" % (rf_scores.mean(), rf_scores.std() * 2))

Accuracy of RandomForest: 1.0000 (+/- 0.0000)


# Logistic Regression

## A. Logistic Regression without scaling

In [28]:
start = time.time()

# ====== training and prediction with Logistic Regression ====== #
clf_lg = LogisticRegression()
clf_lg.fit(training_features, labels)
pred_lg = list(clf_lg.predict(testing_features))
pred_lg = zip(range(testing_size), pred_lg)

end = time.time()
print('Training with Logistic Regression takes %.4f s' % (end-start))

Training with Logistic Regression takes 7.7417 s


In [55]:
write_submission('submission_lg_07.csv', pred_lg)

In [31]:
lg_scores = cross_val_score(clf_lg, training_features, labels, cv=5, scoring='f1')
print("Accuracy of Logistic Regression: %0.4f (+/- %0.4f)" % (lg_scores.mean(), lg_scores.std() * 2))

Accuracy of Logistic Regression: 0.9669 (+/- 0.0008)


## B. Logistic Regression with scaling

In [56]:
start = time.time()

# ====== training and prediction with Logistic Regression + scaling ====== #
clf_lg_scale = LogisticRegression()
clf_lg_scale.fit(training_features_scale, labels)
pred_lg_scale = list(clf_lg_scale.predict(testing_features_scale))
pred_lg_scale = zip(range(testing_size), pred_lg_scale)

end = time.time()
print('Training with Logistic Regression + scaling takes %.4f s' % (end-start))

Training with Logistic Regression + scaling takes 2.8288 s


In [57]:
write_submission('submission_lg_07_scale.csv', pred_lg_scale)

# Neural Network (simple version)

## A. Neural Network without scaling

In [29]:
start = time.time()

clf_nn = MLPClassifier(
    hidden_layer_sizes = (50,60,70,40,50,30,20,10),
    activation = 'relu',
    solver = 'adam',
    early_stopping = True
)
clf_nn.fit(training_features, labels)
pred_nn = clf_nn.predict(testing_features)
pred_nn = zip(range(testing_size), pred_nn)

end = time.time()
print('Training with Neural Networks takes %.4f s' % (end-start))

Training with Neural Networks takes 562.4595 s


In [59]:
write_submission('submission_nn_15.csv', pred_nn)

In [30]:
nn_scores = cross_val_score(clf_nn, training_features, labels, cv=5, scoring='f1')
print("Accuracy of Neural Network + scaling: %0.4f (+/- %0.4f)" % (nn_scores.mean(), nn_scores.std() * 2))



Accuracy of Neural Network + scaling: 0.9912 (+/- 0.0195)


## B. Neural Network with scaling

In [32]:
start = time.time()

clf_nn_scale = MLPClassifier(
    hidden_layer_sizes = (50,60,70,40,50,30,20,10),
    activation = 'relu',
    solver = 'adam',
    early_stopping = True
)
clf_nn_scale.fit(training_features_scale, labels)
pred_nn_scale = clf_nn_scale.predict(testing_features_scale)
pred_nn_scale = zip(range(testing_size), pred_nn_scale)

end = time.time()
print('Training with Neural Networks + scaling takes %.4f s' % (end-start))

Training with Neural Networks + scaling takes 49.4728 s


In [68]:
write_submission('submission_nn_10_scale.csv', pred_nn_scale)

In [None]:
nn_scale_scores = cross_val_score(clf_nn_scale, training_features, labels, cv=5, scoring='f1')
print("Accuracy of Neural Network + scaling: %0.4f (+/- %0.4f)" % (nn_scale_scores.mean(), nn_scale_scores.std() * 2))

# Gradient Boosting

## Gradient Boosting

In [69]:
start = time.time()

# ====== Training and predicting with Gradient Boosting ====== #
clf_gboost = GradientBoostingClassifier(
    loss = 'deviance',
    n_estimators = 200
)
clf_gboost.fit(training_features, labels)
pred_gboost = clf_gboost.predict(testing_features)
pred_gboost = zip(range(testing_size), pred_gboost)

end = time.time()
print('Training with Gradient Boosting takes %.4f s' % (end-start))

Training with Gradient Boosting takes 442.9257 s


In [70]:
write_submission('submission_gboost_05.csv', pred_gboost)

## AdaBoost

In [71]:
start = time.time()

# ====== Training and predicting with Gradient Boosting ====== #
clf_ada = GradientBoostingClassifier(
    loss = 'exponential',
    n_estimators = 200
)
clf_ada.fit(training_features, labels)
pred_ada = clf_ada.predict(testing_features)
pred_ada = zip(range(testing_size), pred_ada)

end = time.time()
print('Training with Adaboost takes %.4f s' % (end-start))

Training with Adaboost takes 300.6069 s


In [72]:
write_submission('submission_ada_05.csv', pred_ada)

# Feature importance

In [28]:
# ====== compute feature importance ====== #
idx = np.argsort(-clf_rf.feature_importances_) # sort the indicator of feature important by decreasing order

for i in idx:
    print('Feature \'%s\' of importance %.5f' % (features[i], clf_rf.feature_importances_[i]))

Feature 'katz_index' of importance 0.70817
Feature 'adamic_adar' of importance 0.16941
Feature 'jaccard_coefficient' of importance 0.08326
Feature 'in_kcore' of importance 0.03162
Feature 'cosine_sim' of importance 0.00695
Feature 'average_degrees' of importance 0.00033
Feature 'average_betweenness' of importance 0.00014
Feature 'common_authors' of importance 0.00009
Feature 'temporal_difference' of importance 0.00002
Feature 'overlapping_title' of importance 0.00001
