In [140]:
import random, csv
import numpy as np
#import igraph
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import networkx as nx

### Reading test and train data, and node info

In [141]:
#nltk.download('punkt') # for tokenization
#nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

with open("testing_set.txt", "r") as f:
    reader = csv.reader(f)
    testing_set  = list(reader)

testing_set = [element[0].split(" ") for element in testing_set]

In [142]:
with open("training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)

training_set = [element[0].split(" ") for element in training_set]

In [143]:
with open("node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)

IDs = [element[0] for element in node_info]

### Creating graph (nodes and edges from training data)

In [144]:
edges = [(element[0],element[1]) for element in training_set if element[2]=="1"]
nodes = [element[0] for element in training_set] + [element[1] for element in training_set]

In [145]:
# this is the actual directed graph
G = nx.DiGraph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)

In [146]:
# undirected version of the graph (used for jaccard similarity)
G_undirected = nx.Graph()
G_undirected.add_nodes_from(nodes)
G_undirected.add_edges_from(edges)

### Global variables

In [236]:
NUM_EXAMPLES = 100000
NUM_TRAIN = 70000
NUM_VALIDATION = 30000

### Creating features for the entire data to store in csv file
##### since this is very slow, I am writing only 50000 examples for now

In [237]:
### creating features and store in a csv file

# number of overlapping words in title
overlap_title = []

# temporal distance between the papers
temp_diff = []

# number of common authors
comm_auth = []

# jaccard similarity
jaccard_sim = []

counter = 0
for i in xrange(len(training_set[:NUM_EXAMPLES])):
    source = training_set[i][0]
    target = training_set[i][1]
    
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
	# convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
	# remove stopwords
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")
    
    overlap_title.append(len(set(source_title).intersection(set(target_title))))
    temp_diff.append(int(source_info[1]) - int(target_info[1]))
    comm_auth.append(len(set(source_auth).intersection(set(target_auth))))
   
    preds = nx.jaccard_coefficient(G_undirected, [(source, target)])
    for _, _, p in preds:
        jaccard_sim.append(p)
    
    counter += 1
    if counter % 2000 == True:
        print counter, "training examples processsed"

1 training examples processsed
2001 training examples processsed
4001 training examples processsed
6001 training examples processsed
8001 training examples processsed
10001 training examples processsed
12001 training examples processsed
14001 training examples processsed
16001 training examples processsed
18001 training examples processsed
20001 training examples processsed
22001 training examples processsed
24001 training examples processsed
26001 training examples processsed
28001 training examples processsed
30001 training examples processsed
32001 training examples processsed
34001 training examples processsed
36001 training examples processsed
38001 training examples processsed
40001 training examples processsed
42001 training examples processsed
44001 training examples processsed
46001 training examples processsed
48001 training examples processsed
50001 training examples processsed
52001 training examples processsed
54001 training examples processsed
56001 training examples proc

In [238]:
# convert list of lists into array
# documents as rows, unique words as columns (i.e., example as rows, features as columns)
training_features = np.array([overlap_title, temp_diff, comm_auth, jaccard_sim]).T

# scale
training_features = preprocessing.scale(training_features)

In [239]:
# write features to .csv file
X_train = np.array(training_set[:NUM_EXAMPLES]).T
X_features = training_features.T
features = zip(X_train[0], X_train[1], X_train[2], X_features[0], X_features[1], X_features[2], X_features[3])
with open("features.csv","wb") as feat:
    csv_out = csv.writer(feat)
    csv_out.writerow(('source', 'target', 'label', 'overlap_title', 'diff', 'common_author', 'jaccard_sim'))
    for row in features:
        csv_out.writerow(row)

### Reading features from features.csv

In [240]:
# reading features
with open("features.csv", "r") as f:
    reader = csv.reader(f)
    feats = list(reader)

In [241]:
len(feats), feats[1][3:6]

(100001,
 ['1.6372487962605144', '-0.32515275343668099', '-0.23481201628467319'])

### Creating training_features and validation_features

In [242]:
# convert labels into integers then into column array

labels = [int(element[2]) for element in feats[1:NUM_TRAIN+1]]
labels_array = np.array(labels)
training_features = [element[3:7] for element in feats[1:NUM_TRAIN+1]]
training_features = np.array(training_features, dtype=np.float64)

In [243]:
len(training_features), training_features[0]

(70000, array([  1.63724880e+00,  -3.25152753e-01,  -2.34812016e-01,
          1.31323942e-03]))

In [244]:
# convert labels into integers then into column array
labels_validation = [int(element[2]) for element in feats[NUM_TRAIN+1:]]
#labels_validation = list(labels_validation)
labels_array_validation = np.array(labels_validation)
validation_features = [element[3:7] for element in feats[NUM_TRAIN+1:]]
validation_features = np.array(validation_features, dtype=np.float64)

In [245]:
len(validation_features), validation_features[0]

(30000, array([ 0.53333424,  0.52539418, -0.23481202,  1.02823117]))

### Initializing basic SVM and predict on validation set

In [246]:
# initialize basic SVM
classifier = svm.LinearSVC()

# train
classifier.fit(training_features, labels_array)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [247]:
predictions_SVM = list(classifier.predict(validation_features))

### Calculating the metrics

In [248]:
import sklearn 
f1score = sklearn.metrics.f1_score(labels_array_validation, predictions_SVM)
acc = sklearn.metrics.accuracy_score(labels_array_validation, predictions_SVM)

print 'Network size: ', len(G.nodes())
print 'Number of training samples: ', NUM_TRAIN
print 'Number of validation samples: ', NUM_VALIDATION
print 'F1-score: ', f1score
print 'Accuracy: ', acc

Network size:  27770
Number of training samples:  70000
Number of validation samples:  30000
F1-score:  0.933435396932
Accuracy:  0.930433333333


### Creating features for test data

In [249]:
# test
# we need to compute the features for the testing set

overlap_title_test = []
temp_diff_test = []
comm_auth_test = []
jaccard_sim=[]

counter = 0
for i in xrange(len(testing_set)):
    source = testing_set[i][0]
    target = testing_set[i][1]
    
    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
    source_title = source_info[2].lower().split(" ")
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")
    
    overlap_title_test.append(len(set(source_title).intersection(set(target_title))))
    temp_diff_test.append(int(source_info[1]) - int(target_info[1]))
    comm_auth_test.append(len(set(source_auth).intersection(set(target_auth))))
   
    preds = nx.jaccard_coefficient(G_undirected, [(source, target)])
    for _, _, p in preds:
        jaccard_sim.append(p)
        
    counter += 1
    if counter % 1000 == True:
        print counter, "testing examples processsed"

1 testing examples processsed
1001 testing examples processsed
2001 testing examples processsed
3001 testing examples processsed
4001 testing examples processsed
5001 testing examples processsed
6001 testing examples processsed
7001 testing examples processsed
8001 testing examples processsed
9001 testing examples processsed
10001 testing examples processsed
11001 testing examples processsed
12001 testing examples processsed
13001 testing examples processsed
14001 testing examples processsed
15001 testing examples processsed
16001 testing examples processsed
17001 testing examples processsed
18001 testing examples processsed
19001 testing examples processsed
20001 testing examples processsed
21001 testing examples processsed
22001 testing examples processsed
23001 testing examples processsed
24001 testing examples processsed
25001 testing examples processsed
26001 testing examples processsed
27001 testing examples processsed
28001 testing examples processsed
29001 testing examples proc

In [250]:
# convert list of lists into array
# documents as rows, unique words as columns (i.e., example as rows, features as columns)
testing_features = np.array([overlap_title_test, temp_diff_test, comm_auth_test, jaccard_sim]).T

# scale
testing_features = preprocessing.scale(testing_features)


In [251]:
# write features to .csv file
X_test = np.array(testing_set).T
X_features_test = testing_features.T
features_test = zip(X_test[0], X_test[1], X_features_test[0], X_features_test[1], X_features_test[2], X_features_test[3])
with open("features_test.csv","wb") as feat:
    csv_out = csv.writer(feat)
    csv_out.writerow(('source', 'target', 'overlap_title', 'diff', 'common_author', 'jaccard_sim'))
    for row in features_test:
        csv_out.writerow(row)

### Reading the test data from features_test.csv

In [252]:
# reading test features
with open("features_test.csv", "r") as f:
    reader = csv.reader(f)
    feats_test = list(reader)

In [255]:
# convert labels into integers then into column array

testing_features = [element[2:7] for element in feats_test[1:]]
testing_features = np.array(testing_features, dtype=np.float64)

In [256]:
print len(testing_features), testing_features[0]

32648 [-0.56631167 -0.31764894 -0.23392443 -0.6281111 ]


### Running the model for test data and write the link predictions in the file

In [257]:
# issue predictions
test_predictions_SVM = list(classifier.predict(testing_features))

# write predictions to .csv file suitable for Kaggle (just make sure to add the column names)
test_predictions_SVM = zip(range(len(testing_set)), test_predictions_SVM)

with open("improved_predictions_jaccard.csv","wb") as pred1:
    csv_out = csv.writer(pred1)
    csv_out.writerow(('ID','category'))
    for row in test_predictions_SVM:
        csv_out.writerow(row)