In [1]:
import random, csv
import numpy as np
import scipy
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import networkx as nx

### Reading test and train data, and node info

In [2]:
# One-off NLTK resource downloads; uncomment on a machine that lacks them.
# nltk.download('punkt')      # tokenizer models
# nltk.download('stopwords')  # stopword lists
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

# Every line of the file is read as a single CSV field and then split on
# spaces, giving one list of string tokens per line.
with open("testing_set.txt", "r") as f:
    testing_set = [row[0].split(" ") for row in csv.reader(f)]

In [3]:
# Every line of the file is read as a single CSV field and then split on
# spaces, giving one list of string tokens per line.
with open("training_set.txt", "r") as f:
    training_set = [row[0].split(" ") for row in csv.reader(f)]

In [4]:
# Load the node metadata table as a list of CSV rows.
with open("node_information.csv", "r") as f:
    node_info = [row for row in csv.reader(f)]

# The first column of every row is kept as the node ID.
IDs = [row[0] for row in node_info]

### Creating graph (nodes and edges from training data)

In [5]:
# An edge is kept only for training rows whose third field is the string "1".
edges = [(row[0], row[1]) for row in training_set if row[2] == "1"]

# Every endpoint of every training row becomes a node, whatever the row's label
# (first-column IDs first, then second-column IDs; duplicates are left in).
first_endpoints = [row[0] for row in training_set]
second_endpoints = [row[1] for row in training_set]
nodes = first_endpoints + second_endpoints

In [6]:
# Directed graph: `nodes` supplies the vertices and each (source, target)
# tuple in `edges` becomes one directed edge.
G = nx.DiGraph()
# Nodes are added explicitly so that IDs with no entry in `edges` still exist in G.
G.add_nodes_from(nodes)
G.add_edges_from(edges)

In [7]:
# Undirected version of the graph, built from the same `nodes` and `edges`
# (used for jaccard similarity and the other neighbourhood-based measures).
G_undirected = nx.Graph()
# Nodes are added explicitly so that IDs with no entry in `edges` still exist in the graph.
G_undirected.add_nodes_from(nodes)
G_undirected.add_edges_from(edges)

In [8]:
#### Katz index - disabled because it is very slow

#adj_mat = nx.to_scipy_sparse_matrix(G, format='csc')
#identity = scipy.sparse.eye(adj_mat.shape[0])
#temp = identity - (0.9)*adj_mat
#katz_score = scipy.sparse.linalg.inv(temp)

### Global variables

In [39]:
# Train / validation split sizes for the reduced run.
NUM_TRAIN = 7000
NUM_VALIDATION = 3000

# Total number of examples processed: the two splits together (10000).
NUM_EXAMPLES = NUM_TRAIN + NUM_VALIDATION

# Width of the feature slice taken from each row. Slices clamp, so a row with
# fewer feature columns simply yields fewer values.
# NOTE(review): the commented-out full-data settings use 7 here -- confirm which
# value matches the feature file in use.
NUM_FEATURES = 8

In [81]:
# parameters to run on entire data
# NUM_EXAMPLES = len(training_set)
# NUM_TRAIN = int(0.7*NUM_EXAMPLES)
# NUM_VALIDATION = int(0.3*NUM_EXAMPLES)
# NUM_FEATURES = 7

### Reading the existing features for the entire data, adding the new features, and storing them in a CSV file
##### Since this is very slow, I am writing only 50000 examples for now

### Reading features from features.csv

In [84]:
featuresFile = "featuresNEW.csv" if NUM_EXAMPLES == 10000 else "features.csv" if NUM_EXAMPLES == 100000 \
        else "features_train_2.csv"
with open(featuresFile, "r") as f:
    reader = csv.reader(f)
    existing_features  = list(reader)
print existing_features[0]

['source', 'target', 'label', 'overlap_title', 'diff', 'common_author', 'jaccard_sim']


In [85]:
# Number of rows read from the feature file (the first row is the header).
len(existing_features)

615513

In [86]:
# create the next feature - ? measure
resource_alloc, pre_at, common_neigh = [], [], []
counter = 0

for i in xrange(len(training_set[:NUM_EXAMPLES])):
    source = training_set[i][0]
    target = training_set[i][1]
    
    # calculate the measure
    common_neigh.append(len(sorted(nx.common_neighbors(G_undirected, source, target))))

    res = nx.preferential_attachment(G_undirected, [(source, target)])
    for r in res:
        pre_at.append(r[2])
    
    res = nx.resource_allocation_index(G_undirected, [(source, target)])
    for r in res:
        resource_alloc.append(r[2])
    
    counter += 1
    if counter % 2000 == True:
        print counter, "training examples processsed"

1 training examples processsed
2001 training examples processsed
4001 training examples processsed
6001 training examples processsed
8001 training examples processsed
10001 training examples processsed
12001 training examples processsed
14001 training examples processsed
16001 training examples processsed
18001 training examples processsed
20001 training examples processsed
22001 training examples processsed
24001 training examples processsed
26001 training examples processsed
28001 training examples processsed
30001 training examples processsed
32001 training examples processsed
34001 training examples processsed
36001 training examples processsed
38001 training examples processsed
40001 training examples processsed
42001 training examples processsed
44001 training examples processsed
46001 training examples processsed
48001 training examples processsed
50001 training examples processsed
52001 training examples processsed
54001 training examples processsed
56001 training examples proc

460001 training examples processsed
462001 training examples processsed
464001 training examples processsed
466001 training examples processsed
468001 training examples processsed
470001 training examples processsed
472001 training examples processsed
474001 training examples processsed
476001 training examples processsed
478001 training examples processsed
480001 training examples processsed
482001 training examples processsed
484001 training examples processsed
486001 training examples processsed
488001 training examples processsed
490001 training examples processsed
492001 training examples processsed
494001 training examples processsed
496001 training examples processsed
498001 training examples processsed
500001 training examples processsed
502001 training examples processsed
504001 training examples processsed
506001 training examples processsed
508001 training examples processsed
510001 training examples processsed
512001 training examples processsed
514001 training examples pro

In [87]:
# Standardise each new feature column; the raw lists are replaced by the
# scaled arrays.
# NOTE(review): the test-set columns are scaled separately with their own
# statistics further down -- confirm that this is intended.
common_neigh, pre_at, resource_alloc = [
    preprocessing.scale(column)
    for column in (common_neigh, pre_at, resource_alloc)
]

In [88]:
feats = []
feats.append(existing_features[0] + ['common_neigh'] + ['pre_at'] + ['resource_alloc'])
for i in range(NUM_EXAMPLES):
    feats.append(existing_features[i+1] + [common_neigh[i]] + [pre_at[i]] + [resource_alloc[i]])
print feats[0]

['source', 'target', 'label', 'overlap_title', 'diff', 'common_author', 'jaccard_sim', 'common_neigh', 'pre_at', 'resource_alloc']


### Writing features to features.csv

In [89]:
# Persist the extended feature table, header row first.
# NOTE(review): featuresFile is the same file the existing features were read
# from, so it is overwritten in place -- confirm re-running is safe.
with open(featuresFile, "wb") as feat:
    csv.writer(feat).writerows(feats)

In [90]:
# Sanity check: total rows in feats (header included) and the feature slice of the first data row.
len(feats), feats[1][3:NUM_FEATURES+3]

(615513,
 ['1.6332984567897153',
  '-0.32844506272978868',
  '-0.23343490079502244',
  '0.0043584186047233879',
  -0.46980830711435273,
  -0.22321280779425962,
  0.070963551867491301])

In [145]:
# Train on every example instead of the NUM_TRAIN split configured earlier.
# NOTE(review): with NUM_TRAIN == NUM_EXAMPLES the validation slice
# feats[NUM_TRAIN+1:] is empty when the cells run top to bottom -- confirm the
# intended execution order.
NUM_TRAIN = NUM_EXAMPLES

### Creating training_features and validation_features

In [146]:
# Training split: the first NUM_TRAIN data rows (row 0 of feats is the header).
train_rows = feats[1:NUM_TRAIN+1]

# Column 2 holds the label; convert it to int and pack into a 1-D array.
labels = [int(row[2]) for row in train_rows]
labels_array = np.array(labels)

# Columns from index 3 onward hold the features; cast them to float64.
training_features = np.array(
    [row[3:NUM_FEATURES+3] for row in train_rows], dtype=np.float64)

In [147]:
# Sanity check: number of training rows and the first feature vector.
len(training_features), training_features[0]

(615512,
 array([ 1.63329846, -0.32844506, -0.2334349 ,  0.00435842, -0.46980831,
        -0.22321281,  0.07096355]))

In [93]:
# Validation split: every data row after the first NUM_TRAIN examples.
validation_rows = feats[NUM_TRAIN+1:]

# Column 2 holds the label; convert it to int and pack into a 1-D array.
labels_validation = [int(row[2]) for row in validation_rows]
labels_array_validation = np.array(labels_validation)

# Columns from index 3 onward hold the features; cast them to float64.
validation_features = np.array(
    [row[3:NUM_FEATURES+3] for row in validation_rows], dtype=np.float64)

In [94]:
# Sanity check: number of validation rows and the first feature vector.
len(validation_features), validation_features[0]

(184654,
 array([-0.57150124,  1.09132875,  2.5652013 , -0.42066353, -0.29023304,
        -0.05322793, -0.00262718]))

### Initializing basic SVM and predict on validation set

In [95]:
# initialize basic SVM (linear kernel, library default hyper-parameters)
classifier = svm.LinearSVC()

# train on the training split; fit() is left as the last expression so the
# fitted estimator is displayed as the cell output
classifier.fit(training_features, labels_array)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

### Random Forest & predict on validation set

In [148]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 80 trees, each limited to a depth of 9.
# The class is referenced through the name imported above: the earlier
# spelling `sklearn.ensemble.RandomForestClassifier` needs the bare name
# `sklearn` to be bound, which no preceding cell does, so it raised a
# NameError on a fresh kernel.
rf = RandomForestClassifier(n_estimators=80, max_depth=9)
rf.fit(training_features, labels_array)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=80, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [149]:
# Predict the validation labels with the random forest `rf`.
# NOTE: despite its name, predictions_SVM does not hold SVM output.
predictions_SVM = list(rf.predict(validation_features))

### Calculating the metrics

In [150]:
import sklearn 
f1score = sklearn.metrics.f1_score(labels_array_validation, predictions_SVM)
acc = sklearn.metrics.accuracy_score(labels_array_validation, predictions_SVM)

print 'Network size: ', len(G.nodes())
print 'Number of training samples: ', NUM_TRAIN
print 'Number of validation samples: ', NUM_VALIDATION
print 'F1-score: ', f1score
print 'Accuracy: ', acc

Network size:  27770
Number of training samples:  615512
Number of validation samples:  184653
F1-score:  0.969030361397
Accuracy:  0.966911087764


### Reading the test data from features_test.csv

In [102]:
with open("features_test.csv", "r") as f:
    reader = csv.reader(f)
    existing_features_test  = list(reader)
print existing_features_test[0], existing_features_test[0][:6]

['source', 'target', 'overlap_title', 'diff', 'common_author', 'jaccard_sim', 's_path', 'common_neigh', 'pref_att', 'resource_alloc'] ['source', 'target', 'overlap_title', 'diff', 'common_author', 'jaccard_sim']


### Creating the next feature for test data

In [103]:
# create the next feature
counter = 0
resource_alloc_test = []
pre_at_test, common_neigh_test = [], []
for i in xrange(len(testing_set)):
    source = testing_set[i][0]
    target = testing_set[i][1]    
    
    # calculate the measure
    common_neigh_test.append(len(sorted(nx.common_neighbors(G_undirected, source, target))))

    res = nx.preferential_attachment(G_undirected, [(source, target)])
    for r in res:
        pre_at_test.append(r[2])
    
    res = nx.resource_allocation_index(G_undirected, [(source, target)])
    for r in res:
        resource_alloc_test.append(r[2])
    
    counter += 1
    if counter % 2000 == True:
        print counter, "training examples processsed"

1 training examples processsed
2001 training examples processsed
4001 training examples processsed
6001 training examples processsed
8001 training examples processsed
10001 training examples processsed
12001 training examples processsed
14001 training examples processsed
16001 training examples processsed
18001 training examples processsed
20001 training examples processsed
22001 training examples processsed
24001 training examples processsed
26001 training examples processsed
28001 training examples processsed
30001 training examples processsed
32001 training examples processsed


In [104]:
# Standardise each new test feature column; the raw lists are replaced by the
# scaled arrays.
# NOTE(review): scaling here uses the test columns' own statistics rather than
# those of the training columns -- confirm that this is intended.
resource_alloc_test, pre_at_test, common_neigh_test = [
    preprocessing.scale(column)
    for column in (resource_alloc_test, pre_at_test, common_neigh_test)
]

In [105]:
feats_test = []
feats_test.append(existing_features_test[0][:6] + ['common_neigh'] + ['pre_at'] + ['resource_alloc'])
for i in range(len(testing_set)):
    feats_test.append(existing_features_test[i+1][:6] + [common_neigh_test[i]] + [pre_at_test[i]] + [resource_alloc_test[i]])
print feats_test[0]

['source', 'target', 'overlap_title', 'diff', 'common_author', 'jaccard_sim', 'common_neigh', 'pre_at', 'resource_alloc']


### Writing features for test data to csv

In [107]:
# Persist the rebuilt test feature table, header row first.
with open("features_test.csv", "wb") as feat:
    csv.writer(feat).writerows(feats_test)

In [108]:
# Numeric feature matrix for the test pairs: skip the header row and take the
# columns from index 2 onward, cast to float64.
test_rows = feats_test[1:]
testing_features = np.array(
    [row[2:NUM_FEATURES+2] for row in test_rows], dtype=np.float64)

In [109]:
# Sanity check: the feature matrix and testing_set should have the same number of rows.
print len(testing_features), testing_features[0], len(testing_set), testing_set[0]

32648 [-0.56631167 -0.31764894 -0.23392443 -0.6281111  -0.56276568 -0.18520986
 -0.51311621] 32648 ['9807076', '9807139']


### Running the model for test data and write the link predictions in the file

In [151]:
# Predict a label for every test pair with the random forest `rf`.
predicted_labels = list(rf.predict(testing_features))

# Pair each prediction with its row index in testing_set; these
# (index, prediction) pairs are the rows of the submission file.
test_predictions_SVM = zip(range(len(testing_set)), predicted_labels)

# Write the Kaggle-style CSV: a header line, then one row per test pair.
with open("improved_predictions_common_neigh_rf.csv","wb") as pred1:
    csv_out = csv.writer(pred1)
    csv_out.writerow(('ID','category'))
    csv_out.writerows(test_predictions_SVM)