# Libraries and Utility functions

In [1]:
# import time to find consuming steps
import time
start = time.time()

# utility libraries
import numpy as np
import csv as csv
from sklearn import preprocessing as pre
from itertools import cycle
from scipy import interp
import matplotlib.pyplot as plt

# classifier for classification
from sklearn.metrics.pairwise import linear_kernel
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import recall_score, roc_curve, auc, average_precision_score, precision_recall_curve

end = time.time()
print('Loading libraries takes %.4f s' % (end-start))

Loading libraries takes 1.1800 s


# Reading dataset (training, testing, node information)

In [2]:
path_data = '../data/' # path to the data
path_submission = '../submission/' # path to submission files

In [3]:
start = time.time()

# ====== read training data as str ====== #
training = np.genfromtxt(path_data + 'training_set.txt', dtype=str)

# ====== extract labels ====== #
labels = training[:, 2].astype(int) # get the labels

end = time.time()
print('Reading training set & extracting labels takes %.4f s' % (end-start))

Reading training set & extracting labels takes 2.8691 s


In [4]:
start = time.time()

# ====== read training features ====== #
orig_training_features = np.genfromtxt(path_data + 'training_features.csv', delimiter=',', skip_header=1, dtype=float)

end = time.time()
print('Reading training features takes %.4f s' % (end-start))

Reading training features takes 8.0128 s


In [5]:
start = time.time()

# ====== read testing features as str ====== #
orig_testing_features = np.genfromtxt(path_data + 'testing_features.csv', delimiter=',', skip_header=1, dtype=float)

end = time.time()
print('Reading testing features takes %.4f s' % (end-start))

Reading testing features takes 0.4422 s


In [6]:
print('Training features:', orig_training_features.shape)
print('Labels:', labels.shape)
print('Testing features:', orig_testing_features.shape)

Training features: (615512, 13)
Labels: (615512,)
Testing features: (32648, 13)


# Picking up some features

Sometimes, we might need to remove some features read from file. Here, we remove features by its index.

In [7]:
orig_features = [
    'temporal_difference', # 0
    'common_authors', # 1
    'same_journal', # 2
    'cosine_sim', # 3
    'overlapping_title', # 4
    'average_degrees', # 5
    'common_neighbors', # 6
    'jaccard_coefficient', # 7
    'avg_pagerank', # 8
    'average_betweenness', # 9
    'in_kcore', # 10
    'adamic_adar', # 11
    'katz_index' # 12
]

In [8]:
# remove very useless features : same_journal (useless), common neighbors (strongly correlated with adamic-adar), page rank (strongly correlated with betweeness)
to_remove = []
training_features = np.delete(orig_training_features, to_remove, 1)
testing_features = np.delete(orig_testing_features, to_remove, 1)
features = np.delete(orig_features, to_remove)

print('Training features:', training_features.shape)

Training features: (615512, 13)


# Utility functions

In [9]:
def write_submission(filename, pred):
    '''
    Write prediction result in a submission file
    
    Parameters
    ----------
    filename: name of submission file
    pred: prediction array
    
    '''
    with open(path_submission + filename, 'w', newline='') as f:
        csv_out = csv.writer(f)
        csv_out.writerow(['id','category'])
        for row in pred:
            csv_out.writerow(row)

# Tuning classifiers

In [10]:
# ====== Scaling features ====== #
training_features_scale = pre.scale(training_features)
testing_features_scale = pre.scale(testing_features)

In [11]:
testing_size = testing_features.shape[0]
testing_size

32648

## 1. SVM classifier

### 1-A. SVM without scaling

In [12]:
start = time.time()

# ====== training and predicting with SVM ====== #
clf_svm = svm.LinearSVC(dual=False, C=1.0)
clf_svm.fit(training_features, labels)
pred_svm = list(clf_svm.predict(testing_features))
pred_svm = zip(range(len(testing_features)), pred_svm)

end = time.time()
print('Training with SVM Linear SVC takes %.4f s' % (end-start))

Training with SVM Linear SVC takes 4.2013 s


In [13]:
write_submission('submission_svm_11.csv', pred_svm)

In [15]:
svm_scores = cross_val_score(clf_svm, training_features, labels, cv=5, scoring='f1')
print("F1-score with SVM: %0.4f (+/- %0.4f)" % (svm_scores.mean(), svm_scores.std() * 2))

F1-score with SVM: 0.9650 (+/- 0.0021)


### 1-B. SVM with scaling

In [16]:
start = time.time()

# ====== training and prediction with SVM and scaled features ====== #
clf_svm_scale = svm.LinearSVC(dual=False, C=1.0)
clf_svm_scale.fit(training_features_scale, labels)
pred_svm_scale = list(clf_svm_scale.predict(testing_features_scale))
pred_svm_scale = zip(range(testing_size), pred_svm_scale)

end = time.time()
print('Training with SVM Linear SVC + scaling takes %.4f s' % (end-start))

Training with SVM Linear SVC + scaling takes 1.8097 s


In [17]:
write_submission('submission_svm_16_scale.csv', pred_svm_scale)

In [18]:
svm_scale_scores = cross_val_score(clf_svm_scale, training_features, labels, cv=5, scoring='f1')
print("F1-score with SVM scaling: %0.4f (+/- %0.4f)" % (svm_scale_scores.mean(), svm_scale_scores.std() * 2))

F1-score with SVM scaling: 0.9650 (+/- 0.0021)


## 2. RandomForest classifier

In [19]:
start = time.time()

# ====== training and prediction with Random Forest ====== #
clf_rf = RandomForestClassifier()
clf_rf.fit(training_features, labels)
pred_rf = list(clf_rf.predict(testing_features))
pred_rf = zip(range(testing_size), pred_rf)

end = time.time()
print('Training with Random Forest takes %.4f s' % (end-start))

Training with Random Forest takes 7.3797 s


In [20]:
write_submission('submission_rf_11.csv', pred_rf)

In [21]:
rf_scores = cross_val_score(clf_rf, training_features, labels, cv=5, scoring='f1')
print("Accuracy of RandomForest: %0.4f (+/- %0.4f)" % (rf_scores.mean(), rf_scores.std() * 2))

Accuracy of RandomForest: 1.0000 (+/- 0.0000)


# Logistic Regression

## A. Logistic Regression without scaling

In [22]:
start = time.time()

# ====== training and prediction with Logistic Regression ====== #
clf_lg = LogisticRegression()
clf_lg.fit(training_features, labels)
pred_lg = list(clf_lg.predict(testing_features))
pred_lg = zip(range(testing_size), pred_lg)

end = time.time()
print('Training with Logistic Regression takes %.4f s' % (end-start))

Training with Logistic Regression takes 7.7782 s


In [23]:
write_submission('submission_lg_07.csv', pred_lg)

In [24]:
lg_scores = cross_val_score(clf_lg, training_features, labels, cv=5, scoring='f1')
print("Accuracy of Logistic Regression: %0.4f (+/- %0.4f)" % (lg_scores.mean(), lg_scores.std() * 2))

Accuracy of Logistic Regression: 0.9669 (+/- 0.0008)


## B. Logistic Regression with scaling

In [25]:
start = time.time()

# ====== training and prediction with Logistic Regression + scaling ====== #
clf_lg_scale = LogisticRegression()
clf_lg_scale.fit(training_features_scale, labels)
pred_lg_scale = list(clf_lg_scale.predict(testing_features_scale))
pred_lg_scale = zip(range(testing_size), pred_lg_scale)

end = time.time()
print('Training with Logistic Regression + scaling takes %.4f s' % (end-start))

Training with Logistic Regression + scaling takes 2.5854 s


In [26]:
write_submission('submission_lg_07_scale.csv', pred_lg_scale)

In [27]:
lg_scale_scores = cross_val_score(clf_lg_scale, training_features_scale, labels, cv=5, scoring='f1')
print("F1-score with Logistic Regression + scaling: %0.4f (+/- %0.4f)" % (lg_scale_scores.mean(), lg_scale_scores.std() * 2))

F1-score with Logistic Regression + scaling: 1.0000 (+/- 0.0000)


# Neural Network (simple version)

## A. Neural Network without scaling

In [28]:
start = time.time()

clf_nn = MLPClassifier(
    hidden_layer_sizes = (50,60,70,40,50,30,20,10),
    activation = 'relu',
    solver = 'adam',
    early_stopping = True
)
clf_nn.fit(training_features, labels)
pred_nn = clf_nn.predict(testing_features)
pred_nn = zip(range(testing_size), pred_nn)

end = time.time()
print('Training with Neural Networks takes %.4f s' % (end-start))

Training with Neural Networks takes 456.2616 s


In [29]:
write_submission('submission_nn_16.csv', pred_nn)

In [30]:
nn_scores = cross_val_score(clf_nn, training_features, labels, cv=5, scoring='f1')
print("F1-score of Neural Network: %0.4f (+/- %0.4f)" % (nn_scores.mean(), nn_scores.std() * 2))

F1-score of Neural Network: 0.9974 (+/- 0.0039)


## B. Neural Network with scaling

In [31]:
start = time.time()

clf_nn_scale = MLPClassifier(
    hidden_layer_sizes = (50,60,70,40,50,30,20,10),
    activation = 'relu',
    solver = 'adam',
    early_stopping = True
)
clf_nn_scale.fit(training_features_scale, labels)
pred_nn_scale = clf_nn_scale.predict(testing_features_scale)
pred_nn_scale = zip(range(testing_size), pred_nn_scale)

end = time.time()
print('Training with Neural Networks + scaling takes %.4f s' % (end-start))

Training with Neural Networks + scaling takes 50.3232 s


In [32]:
write_submission('submission_nn_10_scale.csv', pred_nn_scale)

In [33]:
nn_scale_scores = cross_val_score(clf_nn_scale, training_features, labels, cv=5, scoring='f1')
print("F1-score of Neural Network + scaling: %0.4f (+/- %0.4f)" % (nn_scale_scores.mean(), nn_scale_scores.std() * 2))

F1-score of Neural Network + scaling: 0.9978 (+/- 0.0022)


# Gradient Boosting

## Gradient Boosting

In [34]:
start = time.time()

# ====== Training and predicting with Gradient Boosting ====== #
clf_gboost = GradientBoostingClassifier(
    loss = 'deviance',
    n_estimators = 200
)
clf_gboost.fit(training_features, labels)
pred_gboost = clf_gboost.predict(testing_features)
pred_gboost = zip(range(testing_size), pred_gboost)

end = time.time()
print('Training with Gradient Boosting takes %.4f s' % (end-start))

Training with Gradient Boosting takes 35.0845 s


In [35]:
write_submission('submission_gboost_05.csv', pred_gboost)

In [36]:
gd_scores = cross_val_score(clf_gboost, training_features, labels, cv=5, scoring='f1')
print("F1-score with Gradient Boosting: %0.4f (+/- %0.4f)" % (gd_scores.mean(), gd_scores.std() * 2))

F1-score with Gradient Boosting: 1.0000 (+/- 0.0000)


## AdaBoost

In [37]:
start = time.time()

# ====== Training and predicting with Gradient Boosting ====== #
clf_ada = GradientBoostingClassifier(
    loss = 'exponential',
    n_estimators = 200
)
clf_ada.fit(training_features, labels)
pred_ada = clf_ada.predict(testing_features)
pred_ada = zip(range(testing_size), pred_ada)

end = time.time()
print('Training with Adaboost takes %.4f s' % (end-start))

Training with Adaboost takes 47.4125 s


In [38]:
write_submission('submission_ada_05.csv', pred_ada)

In [39]:
ada_scores = cross_val_score(clf_ada, training_features, labels, cv=5, scoring='f1')
print("F1-score with AdaBoost: %0.4f (+/- %0.4f)" % (ada_scores.mean(), ada_scores.std() * 2))

F1-score with AdaBoost: 1.0000 (+/- 0.0000)


# Nearest Neighbors (kNN)

In [40]:
start = time.time()

# creating odd list of K for KNN
X_train, X_test, y_train, y_test = train_test_split(training_features, labels, test_size=0.35, random_state=42)

myList = list(range(1,50))

# subsetting just the odd ones
neighbors = filter(lambda x: x % 2 != 0, myList)

# empty list that will hold cv scores
cv_scores = []

# perform 10-fold cross validation
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
    
# changing to misclassification error
MSE = [1 - x for x in cv_scores]

# determining best k

end = time.time()
print('Cross-validating to pick up the optimal number of neighbors takes %.4f s' % (end-start))

Cross-validating to pick up the optimal number of neighbors takes 4485.4285 s


In [41]:
neighbors = filter(lambda x: x % 2 != 0, myList)
_neighbors = list(neighbors)
optimal_k = _neighbors[np.argmin(MSE)]
print('The optimal number of neighbors is %d' % optimal_k)

The optimal number of neighbors is 11


In [42]:
start = time.time()

# ====== Training & predicting with k-Neareat Neighbors ====== #
clf_knn = KNeighborsClassifier(
    n_neighbors = optimal_k
)
clf_knn.fit(training_features, labels)
pred_knn = clf_knn.predict(testing_features)
pred_knn = zip(range(testing_size), pred_knn)

end = time.time()
print('Training with k-nearest neighbors takes %.4f s' % (end-start))

Training with k-nearest neighbors takes 44.9709 s


In [43]:
write_submission('submission_knn_01.csv', pred_knn)

In [44]:
knn_scores = cross_val_score(clf_knn, training_features, labels, cv=5, scoring='f1')
print("F1-score with K-NearestNeighbors: %0.4f (+/- %0.4f)" % (knn_scores.mean(), knn_scores.std() * 2))

F1-score with K-NearestNeighbors: 0.9588 (+/- 0.0008)


# Feature importance

In [45]:
# ====== compute feature importance ====== #
idx = np.argsort(-clf_rf.feature_importances_) # sort the indicator of feature important by decreasing order

for i in idx:
    print('Feature \'%s\' of importance %.5f' % (features[i], clf_rf.feature_importances_[i]))

Feature 'jaccard_coefficient' of importance 0.27180
Feature 'katz_index' of importance 0.26861
Feature 'adamic_adar' of importance 0.20545
Feature 'cosine_sim' of importance 0.10024
Feature 'common_neighbors' of importance 0.05078
Feature 'average_degrees' of importance 0.03634
Feature 'in_kcore' of importance 0.03274
Feature 'avg_pagerank' of importance 0.02964
Feature 'average_betweenness' of importance 0.00196
Feature 'overlapping_title' of importance 0.00186
Feature 'common_authors' of importance 0.00029
Feature 'temporal_difference' of importance 0.00028
Feature 'same_journal' of importance 0.00003
