# Libraries and Utility functions

In [15]:
# import time to find consuming steps
import time
start = time.time()

# utility libraries
import numpy as np
import csv as csv
from sklearn import preprocessing as pre
from itertools import cycle
from scipy import interp
import matplotlib.pyplot as plt

# classifier for classification
from sklearn.metrics.pairwise import linear_kernel
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import recall_score, roc_curve, auc, average_precision_score, precision_recall_curve

end = time.time()
print('Loading libraries takes %.4f s' % (end-start))

Loading libraries takes 0.0003 s


# Reading dataset (training, testing, node information)

In [16]:
path_data = '../data/' # path to the data
path_submission = '../submission/' # path to submission files

In [17]:
start = time.time()

# ====== read training data as str ====== #
training = np.genfromtxt(path_data + 'training_set.txt', dtype=str)

# ====== extract labels ====== #
labels = training[:, 2].astype(int) # get the labels

end = time.time()
print('Reading training set & extracting labels takes %.4f s' % (end-start))

Reading training set & extracting labels takes 3.0183 s


In [18]:
start = time.time()

# ====== read training features ====== #
orig_training_features = np.genfromtxt(path_data + 'training_features.csv', delimiter=',', skip_header=1, dtype=float)

end = time.time()
print('Reading training features takes %.4f s' % (end-start))

Reading training features takes 11.7031 s


In [19]:
start = time.time()

# ====== read testing features as str ====== #
orig_testing_features = np.genfromtxt(path_data + 'testing_features.csv', delimiter=',', skip_header=1, dtype=float)

end = time.time()
print('Reading testing features takes %.4f s' % (end-start))

Reading testing features takes 0.5495 s


In [20]:
print('Training features:', orig_training_features.shape)
print('Labels:', labels.shape)
print('Testing features:', orig_testing_features.shape)

Training features: (615512, 17)
Labels: (615512,)
Testing features: (32648, 17)


# Picking up some features

Sometimes, we might need to remove some features read from file. Here, we remove features by its index.

In [21]:
orig_features = [
    'temporal_difference', # 0
    'common_authors', # 1
    'same_journal', # 2
    'cosine_sim', # 3
    'overlapping_title', # 4
    'max_degrees', # 5
    'common_neighbors', # 6
    'jaccard_coefficient', # 7
    'max_pagerank', # 8
    'max_betweenness', # 9
    'in_kcore', # 10
    'adamic_adar', # 11
    'katz_index', # 12
    'cosine_sim_w2v', # 13
    'katz_linkpred', # 14
    'pref_attach', # 15
    'res_alloc' # 16
]

In [22]:
# remove very features before training
to_remove = [14,13]
training_features = np.nan_to_num(np.delete(orig_training_features, to_remove, 1))
testing_features = np.nan_to_num(np.delete(orig_testing_features, to_remove, 1))
features = np.delete(orig_features, to_remove)

print('Training features:', training_features.shape)

Training features: (615512, 15)


In [23]:
print('Training:', training_features[0:10])
print('Testing:', testing_features[0:10])

Training: [[ 0.00000000e+00  0.00000000e+00  1.00000000e+00  1.99966215e-01
   2.00000000e+00  1.20000000e+01  1.00000000e+00  5.88235294e-02
  -1.04952174e+01  1.05093708e+01  0.00000000e+00  5.13898342e-01
  -5.58044870e+00  4.27666612e+00  1.42857143e-01]
 [ 1.00000000e+00  0.00000000e+00  0.00000000e+00  6.43694475e-02
   1.00000000e+00  1.47000000e+02  2.00000000e+01  9.70873786e-02
  -9.04507102e+00  1.13932537e+01  1.00000000e+00  4.32036615e+00
  -4.21648550e+00  9.35988044e+00  2.26400795e-01]
 [ 2.00000000e+00  0.00000000e+00  0.00000000e+00  2.05371115e-02
   0.00000000e+00  5.00000000e+00  0.00000000e+00  0.00000000e+00
  -1.10308202e+01  9.53921731e+00  0.00000000e+00  0.00000000e+00
  -5.64652461e+00  1.60943791e+00  0.00000000e+00]
 [ 4.00000000e+00  0.00000000e+00  0.00000000e+00  5.93784382e-02
   0.00000000e+00  2.00000000e+01  0.00000000e+00  0.00000000e+00
  -1.06715646e+01  8.04019582e+00  5.00000000e-01  0.00000000e+00
  -5.25171866e+00  5.63478960e+00  0.00000000

# Utility functions

In [24]:
def write_submission(filename, pred):
    '''
    Write prediction result in a submission file
    
    Parameters
    ----------
    filename: name of submission file
    pred: prediction array
    
    '''
    with open(path_submission + filename, 'w', newline='') as f:
        csv_out = csv.writer(f)
        csv_out.writerow(['id','category'])
        for row in pred:
            csv_out.writerow(row)

# Tuning classifiers

In [25]:
# ====== Scaling features ====== #
training_features_scale = pre.scale(training_features)
testing_features_scale = pre.scale(testing_features)

In [26]:
testing_size = len(testing_features)

## 1. SVM classifier

In [81]:
start = time.time()

# a list of svm classifiers with differenet settings
clfs_svm = [
    svm.LinearSVC(penalty='l2', loss='squared_hinge', C=1.0, fit_intercept=True),
    svm.LinearSVC(penalty='l2', loss='hinge', C=1.0, fit_intercept=True),
    svm.LinearSVC(penalty='l1', loss='squared_hinge', C=1.0, fit_intercept=True, dual=False)
]

# check for best settings (without tuning C)
tune_svm_scores = []
for index, clf in enumerate(clfs_svm):
    print('Running cross-validation for:', index)
    tune_svm_scores.append(cross_val_score(clf, training_features_scale, labels, cv=10, scoring='f1'))
    
end = time.time()
print('Tuning parameters for SVM takes %.4f s' % (end-start))

Running cross-validation for: 0
Running cross-validation for: 1
Running cross-validation for: 2
Tuning parameters for SVM takes 1768.1033 s


In [91]:
print(np.mean(tune_svm_scores,axis=1))
best_clf_svm = clfs_svm[np.argmax(np.mean(tune_svm_scores,axis=1))]
print('Best setting for SVM:', best_clf_svm)

[0.96474375 0.96784078 0.96441065]
Best setting for SVM: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0)


In [40]:
start = time.time()

best_clf_svm = svm.LinearSVC(penalty='l2', loss='hinge', C=1.0, fit_intercept=True)

# ====== training and prediction with SVM and scaled features ====== #
best_clf_svm.fit(training_features, labels)
pred_svm = list(best_clf_svm.predict(testing_features))
pred_svm = zip(range(testing_size), pred_svm)

end = time.time()
print('Training with SVM Linear SVC takes %.4f s' % (end-start))

Training with SVM Linear SVC takes 136.2333 s


In [41]:
write_submission('tuned_submission_svm_01.csv', pred_svm)

In [95]:
svm_scores = cross_val_score(best_clf_svm, training_features_scale, labels, cv=5, scoring='f1')
print("F1-score with SVM: %0.4f (+/- %0.4f)" % (svm_scores.mean(), svm_scores.std() * 2))

F1-score with SVM: 0.9678 (+/- 0.0005)


### 1-A. SVM without scaling (do not use, always scale for better performance)

In [12]:
# start = time.time()

# # ====== training and predicting with SVM ====== #
# clf_svm = svm.LinearSVC(dual=False, C=1.0)
# clf_svm.fit(training_features, labels)
# pred_svm = list(clf_svm.predict(testing_features))
# pred_svm = zip(range(len(testing_features)), pred_svm)

# end = time.time()
# print('Training with SVM Linear SVC takes %.4f s' % (end-start))

Training with SVM Linear SVC takes 4.2013 s


In [13]:
# write_submission('submission_svm_11.csv', pred_svm)

In [15]:
# svm_scores = cross_val_score(clf_svm, training_features, labels, cv=5, scoring='f1')
# print("F1-score with SVM: %0.4f (+/- %0.4f)" % (svm_scores.mean(), svm_scores.std() * 2))

F1-score with SVM: 0.9650 (+/- 0.0021)


### 1-B. SVM with scaling

In [16]:
# start = time.time()

# # ====== training and prediction with SVM and scaled features ====== #
# clf_svm_scale = svm.LinearSVC(dual=False, C=1.0)
# clf_svm_scale.fit(training_features_scale, labels)
# pred_svm_scale = list(clf_svm_scale.predict(testing_features_scale))
# pred_svm_scale = zip(range(testing_size), pred_svm_scale)

# end = time.time()
# print('Training with SVM Linear SVC + scaling takes %.4f s' % (end-start))

Training with SVM Linear SVC + scaling takes 1.8097 s


In [17]:
# write_submission('submission_svm_16_scale.csv', pred_svm_scale)

In [18]:
# svm_scale_scores = cross_val_score(clf_svm_scale, training_features, labels, cv=5, scoring='f1')
# print("F1-score with SVM scaling: %0.4f (+/- %0.4f)" % (svm_scale_scores.mean(), svm_scale_scores.std() * 2))

F1-score with SVM scaling: 0.9650 (+/- 0.0021)


## 2. RandomForest classifier

In [22]:
clfs_rf = [
    RandomForestClassifier(),
    RandomForestClassifier(max_features='sqrt'),
    RandomForestClassifier(max_features='log2'),
    RandomForestClassifier(max_features=0.3),
]

# check for best settings (without tuning C)
tune_rf_scores = []
for index, clf in enumerate(clfs_rf):
    start = time.time()
    
    print('Running cross-validation for:', index)
    tune_rf_scores.append(cross_val_score(clf, training_features, labels, cv=10, scoring='f1'))

    end = time.time()
    print('Tuning parameters for Random Forest, clf[%d] takes %.4f s' % (index, end-start))

Running cross-validation for: 0
Tuning parameters for Random Forest, clf[0] takes 144.8391 s
Running cross-validation for: 1
Tuning parameters for Random Forest, clf[1] takes 141.8941 s
Running cross-validation for: 2
Tuning parameters for Random Forest, clf[2] takes 142.2216 s
Running cross-validation for: 3
Tuning parameters for Random Forest, clf[3] takes 142.0169 s


In [23]:
print(np.mean(tune_rf_scores, axis=1))
best_clf_rf = clfs_rf[np.argmax(np.mean(tune_rf_scores,axis=1))]
print('Best setting for Logistic Regression:', best_clf_rf)

[0.96977626 0.9697922  0.96988879 0.97002613]
Best setting for Logistic Regression: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [24]:
max_feat = 0.3

clfs_rf = [
    RandomForestClassifier(max_features=max_feat, n_estimators=10),
    RandomForestClassifier(max_features=max_feat, n_estimators=30),
    RandomForestClassifier(max_features=max_feat, n_estimators=50),
    RandomForestClassifier(max_features=max_feat, n_estimators=70),
    RandomForestClassifier(max_features=max_feat, n_estimators=90),
]

# check for best settings (without tuning C)
tune_rf_scores = []
for index, clf in enumerate(clfs_rf):
    start = time.time()
    
    print('Running cross-validation for:', index)
    tune_rf_scores.append(cross_val_score(clf, training_features, labels, cv=10, scoring='f1'))

    end = time.time()
    print('Tuning parameters for Random Forest, clf[%d] takes %.4f s' % (index, end-start))

Running cross-validation for: 0
Tuning parameters for Random Forest, clf[0] takes 155.6879 s
Running cross-validation for: 1
Tuning parameters for Random Forest, clf[1] takes 437.9482 s
Running cross-validation for: 2
Tuning parameters for Random Forest, clf[2] takes 1808.1578 s
Running cross-validation for: 3
Tuning parameters for Random Forest, clf[3] takes 990.1995 s
Running cross-validation for: 4
Tuning parameters for Random Forest, clf[4] takes 1310.2704 s


In [29]:
print(np.mean(tune_rf_scores, axis=1))
best_clf_rf = clfs_rf[np.argmax(np.mean(tune_rf_scores,axis=1))]
print('Best setting for Logistic Regression:', best_clf_rf)

[0.96991399 0.97098628 0.97121565 0.97128654 0.97139856]
Best setting for Logistic Regression: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=90, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [29]:
start = time.time()

best_clf_rf = RandomForestClassifier(
    max_features=0.3, 
    n_estimators=90
)

# ====== training and prediction with Random Forest ====== #
best_clf_rf.fit(training_features, labels)
pred_rf = list(best_clf_rf.predict(testing_features))
pred_rf = zip(range(testing_size), pred_rf)

end = time.time()
print('Training with Random Forest takes %.4f s' % (end-start))

Training with Random Forest takes 737.4384 s


In [30]:
write_submission('tuned_submission_rf_02.csv', pred_rf)

In [34]:
rf_scores = cross_val_score(best_clf_rf, training_features, labels, cv=5, scoring='f1')
print("Accuracy of RandomForest: %0.4f (+/- %0.4f)" % (rf_scores.mean(), rf_scores.std() * 2))

Accuracy of RandomForest: 0.9716 (+/- 0.0006)


# Logistic Regression

In [27]:
start = time.time()

# a list of svm classifiers with differenet settings
clfs_logreg = [
    LogisticRegression(penalty='l2', solver='liblinear'),
    LogisticRegression(penalty='l2', solver='newton-cg'),
    LogisticRegression(penalty='l2', solver='lbfgs')
    #LogisticRegression(penalty='l1', solver='saga', max_iter=5000) #takes too long
]

# check for best settings (without tuning C)
tune_logreg_scores = []
for index, clf in enumerate(clfs_logreg):
    print('Running cross-validation for:', index)
    tune_logreg_scores.append(cross_val_score(clf, training_features_scale, labels, cv=10, scoring='f1'))
    
end = time.time()
print('Tuning parameters for Logistic Regression takes %.4f s' % (end-start))

Running cross-validation for: 0
Running cross-validation for: 1
Running cross-validation for: 2
Tuning parameters for Logistic Regression takes 358.0291 s


In [28]:
print(np.mean(tune_logreg_scores, axis=1))
best_clf_logreg = clfs_logreg[np.argmax(np.mean(tune_logreg_scores,axis=1))]
print('Best setting for Logistic Regression:', best_clf_logreg)

[0.96163912 0.96170016 0.96169857]
Best setting for Logistic Regression: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)


In [30]:
start = time.time()

# ====== training and prediction with Logistic Regression and scaled features ====== #
best_clf_logreg.fit(training_features_scale, labels)
pred_logreg = list(best_clf_logreg.predict(testing_features_scale))
pred_logreg = zip(range(testing_size), pred_logreg)

end = time.time()
print('Training with Logistic Regression takes %.4f s' % (end-start))

Training with Logistic Regression takes 17.7848 s


In [31]:
write_submission('tuned_submission_lg_02.csv', pred_logreg)

In [32]:
logreg_scores = cross_val_score(best_clf_logreg, training_features_scale, labels, cv=5, scoring='f1')
print("Accuracy of Logistic Regression: %0.4f (+/- %0.4f)" % (logreg_scores.mean(), logreg_scores.std() * 2))

Accuracy of Logistic Regression: 0.9616 (+/- 0.0006)


## A. Logistic Regression without scaling

In [22]:
# start = time.time()

# # ====== training and prediction with Logistic Regression ====== #
# clf_lg = LogisticRegression()
# clf_lg.fit(training_features, labels)
# pred_lg = list(clf_lg.predict(testing_features))
# pred_lg = zip(range(testing_size), pred_lg)

# end = time.time()
# print('Training with Logistic Regression takes %.4f s' % (end-start))

Training with Logistic Regression takes 7.7782 s


In [23]:
# write_submission('submission_lg_07.csv', pred_lg)

In [24]:
# lg_scores = cross_val_score(clf_lg, training_features, labels, cv=5, scoring='f1')
# print("Accuracy of Logistic Regression: %0.4f (+/- %0.4f)" % (lg_scores.mean(), lg_scores.std() * 2))

Accuracy of Logistic Regression: 0.9669 (+/- 0.0008)


## B. Logistic Regression with scaling

In [25]:
# start = time.time()

# # ====== training and prediction with Logistic Regression + scaling ====== #
# clf_lg_scale = LogisticRegression()
# clf_lg_scale.fit(training_features_scale, labels)
# pred_lg_scale = list(clf_lg_scale.predict(testing_features_scale))
# pred_lg_scale = zip(range(testing_size), pred_lg_scale)

# end = time.time()
# print('Training with Logistic Regression + scaling takes %.4f s' % (end-start))

Training with Logistic Regression + scaling takes 2.5854 s


In [26]:
# write_submission('submission_lg_07_scale.csv', pred_lg_scale)

In [27]:
# lg_scale_scores = cross_val_score(clf_lg_scale, training_features_scale, labels, cv=5, scoring='f1')
# print("F1-score with Logistic Regression + scaling: %0.4f (+/- %0.4f)" % (lg_scale_scores.mean(), lg_scale_scores.std() * 2))

F1-score with Logistic Regression + scaling: 1.0000 (+/- 0.0000)


# Neural Network (simple version)

## A. Neural Network without scaling

In [28]:
start = time.time()

clf_nn = MLPClassifier(
    hidden_layer_sizes = (50,60,70,40,50,30,20,10),
    activation = 'relu',
    solver = 'adam',
    early_stopping = True
)
clf_nn.fit(training_features, labels)
pred_nn = clf_nn.predict(testing_features)
pred_nn = zip(range(testing_size), pred_nn)

end = time.time()
print('Training with Neural Networks takes %.4f s' % (end-start))

Training with Neural Networks takes 629.5872 s


In [29]:
write_submission('submission_nn_27.csv', pred_nn)

In [None]:
start = time.time()

# cross validation
nn_scores = cross_val_score(clf_nn, training_features, labels, cv=5, scoring='f1')
print("F1-score of Neural Network: %0.4f (+/- %0.4f)" % (nn_scores.mean(), nn_scores.std() * 2))

end = time.time()
print('Cross validation evaluation on Neural Network takes %.4f s' % (end-start))



## B. Neural Network with scaling

In [42]:
start = time.time()

clf_nn_scale = MLPClassifier(
    hidden_layer_sizes = (50,60,70,40,50,30,20,10),
    activation = 'relu',
    solver = 'adam',
    early_stopping = True
)
clf_nn_scale.fit(training_features_scale, labels)
pred_nn_scale = clf_nn_scale.predict(testing_features_scale)
pred_nn_scale = zip(range(testing_size), pred_nn_scale)

end = time.time()
print('Training with Neural Networks + scaling takes %.4f s' % (end-start))

Training with Neural Networks + scaling takes 212.0493 s


In [43]:
write_submission('tuned_submission_nn_01_scale.csv', pred_nn_scale)

In [33]:
nn_scale_scores = cross_val_score(clf_nn_scale, training_features, labels, cv=5, scoring='f1')
print("F1-score of Neural Network + scaling: %0.4f (+/- %0.4f)" % (nn_scale_scores.mean(), nn_scale_scores.std() * 2))

F1-score of Neural Network + scaling: 0.9978 (+/- 0.0022)


# Gradient Boosting

In [17]:
# a list of svm classifiers with differenet settings
clfs_gb = [
    GradientBoostingClassifier(n_estimators=40),
    GradientBoostingClassifier(n_estimators=60),
    GradientBoostingClassifier(n_estimators=80),
    GradientBoostingClassifier(n_estimators=100),
    GradientBoostingClassifier(n_estimators=120)
]

# check for best settings: number of estimators
tune_gb_scores = []
for index, clf in enumerate(clfs_gb):
    start = time.time()
    print('Running cross-validation for:', index)
    tune_gb_scores.append(cross_val_score(clf, training_features_scale, labels, cv=10, scoring='f1'))
    end = time.time()
    print('Tuning parameters for Gradient Boosting, setting %d takes %.4f s' % (index, end-start))

Running cross-validation for: 0
Tuning parameters for Gradient Boosting, setting 0 takes 412.8917 s
Running cross-validation for: 1
Tuning parameters for Gradient Boosting, setting 1 takes 738.6681 s
Running cross-validation for: 2
Tuning parameters for Gradient Boosting, setting 2 takes 1298.6631 s
Running cross-validation for: 3
Tuning parameters for Gradient Boosting, setting 3 takes 1258.4368 s
Running cross-validation for: 4
Tuning parameters for Gradient Boosting, setting 4 takes 1242.2505 s


In [18]:
print(np.mean(tune_gb_scores, axis=1))
best_clf_gb = clfs_gb[np.argmax(np.mean(tune_gb_scores,axis=1))]
print('Best setting for Gradient Boosting:', best_clf_gb)

[0.96959925 0.97038891 0.97089067 0.97112219 0.97123316]
Best setting for Gradient Boosting: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=120,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


In [None]:
# a list of svm classifiers with differenet settings
n_estim = 120
_max_depth = 5
_subsample = 0.8

clfs_gb_depth = [
    GradientBoostingClassifier(n_estimators=n_estim, max_depth=_max_depth, subsampl=_subsample), 
]

# check for best settings: number of estimators
tune_gb_scores_depth = []
for index, clf in enumerate(clfs_gb_depth):
    start = time.time()
    print('Running cross-validation for:', index)
    tune_gb_scores_depth.append(cross_val_score(clf, training_features_scale, labels, cv=10, scoring='f1'))
    end = time.time()
    print('Tuning parameters for Gradient Boosting, setting %d takes %.4f s' % (index, end-start))

In [30]:
start = time.time()

# ====== training and prediction with Logistic Regression and scaled features ====== #
best_clf_gb.fit(training_features_scale, labels)
pred_gb = list(best_clf_gb.predict(testing_features_scale))
pred_gb = zip(range(testing_size), pred_gb)

end = time.time()
print('Training with Gradient Boosting takes %.4f s' % (end-start))

Training with Logistic Regression takes 17.7848 s


In [31]:
write_submission('tuned_submission_gb_01.csv', pred_gb)

In [95]:
gb_scores = cross_val_score(best_gb_svm, training_features_scale, labels, cv=5, scoring='f1')
print("F1-score with Gradient Boosting: %0.4f (+/- %0.4f)" % (gb_scores.mean(), gb_scores.std() * 2))

F1-score with SVM: 0.9678 (+/- 0.0005)


## Gradient Boosting

In [32]:
start = time.time()

# ====== Training and predicting with Gradient Boosting ====== #
clf_gboost = GradientBoostingClassifier(
    loss = 'deviance',
    n_estimators = 120,
    subsample = 0.8,
    max_depth = 5
)

clf_gboost.fit(training_features, labels)
pred_gboost = clf_gboost.predict(testing_features)
pred_gboost = zip(range(testing_size), pred_gboost)

end = time.time()
print('Training with Gradient Boosting takes %.4f s' % (end-start))

Training with Gradient Boosting takes 493.5955 s


In [33]:
write_submission('tuned_submission_gboost_02.csv', pred_gboost)

In [36]:
gd_scores = cross_val_score(clf_gboost, training_features, labels, cv=5, scoring='f1')
print("F1-score with Gradient Boosting: %0.4f (+/- %0.4f)" % (gd_scores.mean(), gd_scores.std() * 2))

F1-score with Gradient Boosting: 1.0000 (+/- 0.0000)


## AdaBoost

In [35]:
start = time.time()

# ====== Training and predicting with Gradient Boosting ====== #
clf_ada = GradientBoostingClassifier(
    loss = 'exponential',
    n_estimators = 120,
    subsample = 0.8,
    max_depth = 5
)
clf_ada.fit(training_features, labels)
pred_ada = clf_ada.predict(testing_features)
pred_ada = zip(range(testing_size), pred_ada)

end = time.time()
print('Training with Adaboost takes %.4f s' % (end-start))

Training with Adaboost takes 356.5677 s


In [36]:
write_submission('tuned_submission_ada_02.csv', pred_ada)

In [37]:
ada_scores = cross_val_score(clf_ada, training_features, labels, cv=5, scoring='f1')
print("F1-score with AdaBoost: %0.4f (+/- %0.4f)" % (ada_scores.mean(), ada_scores.std() * 2))

KeyboardInterrupt: 

# Nearest Neighbors (kNN)

In [40]:
start = time.time()

# creating odd list of K for KNN
X_train, X_test, y_train, y_test = train_test_split(training_features, labels, test_size=0.35, random_state=42)

myList = list(range(1,50))

# subsetting just the odd ones
neighbors = filter(lambda x: x % 2 != 0, myList)

# empty list that will hold cv scores
cv_scores = []

# perform 10-fold cross validation
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
    
# changing to misclassification error
MSE = [1 - x for x in cv_scores]

# determining best k

end = time.time()
print('Cross-validating to pick up the optimal number of neighbors takes %.4f s' % (end-start))

Cross-validating to pick up the optimal number of neighbors takes 4485.4285 s


In [41]:
neighbors = filter(lambda x: x % 2 != 0, myList)
_neighbors = list(neighbors)
optimal_k = _neighbors[np.argmin(MSE)]
print('The optimal number of neighbors is %d' % optimal_k)

The optimal number of neighbors is 11


In [42]:
start = time.time()

# ====== Training & predicting with k-Neareat Neighbors ====== #
clf_knn = KNeighborsClassifier(
    n_neighbors = optimal_k
)
clf_knn.fit(training_features, labels)
pred_knn = clf_knn.predict(testing_features)
pred_knn = zip(range(testing_size), pred_knn)

end = time.time()
print('Training with k-nearest neighbors takes %.4f s' % (end-start))

Training with k-nearest neighbors takes 44.9709 s


In [43]:
write_submission('submission_knn_01.csv', pred_knn)

In [44]:
knn_scores = cross_val_score(clf_knn, training_features, labels, cv=5, scoring='f1')
print("F1-score with K-NearestNeighbors: %0.4f (+/- %0.4f)" % (knn_scores.mean(), knn_scores.std() * 2))

F1-score with K-NearestNeighbors: 0.9588 (+/- 0.0008)


# Feature importance

In [28]:
# ====== compute feature importance ====== #
best_clf_rf.fit(np.nan_to_num(orig_training_features), labels)
idx = np.argsort(-best_clf_rf.feature_importances_) # sort the indicator of feature important by decreasing order

for i in idx:
    print('Feature \'%s\' of importance %.5f' % (orig_features[i], best_clf_rf.feature_importances_[i]))

Feature 'res_alloc' of importance 0.30824
Feature 'katz_linkpred' of importance 0.19527
Feature 'adamic_adar' of importance 0.18733
Feature 'common_neighbors' of importance 0.11597
Feature 'jaccard_coefficient' of importance 0.10649
Feature 'cosine_sim' of importance 0.03725
Feature 'pref_attach' of importance 0.01517
Feature 'in_kcore' of importance 0.01285
Feature 'max_pagerank' of importance 0.00584
Feature 'max_degrees' of importance 0.00518
Feature 'overlapping_title' of importance 0.00370
Feature 'max_betweenness' of importance 0.00286
Feature 'katz_index' of importance 0.00178
Feature 'cosine_sim_w2v' of importance 0.00147
Feature 'common_authors' of importance 0.00030
Feature 'temporal_difference' of importance 0.00027
Feature 'same_journal' of importance 0.00003
