In [1]:
import random
import numpy as np
from sklearn import svm
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import csv
from nltk.stem import SnowballStemmer

# Fetch the NLTK stop-word corpus (a no-op when already present), then build
# the English stop-word set and the English Snowball stemmer.
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = SnowballStemmer('english')

# Read the test file as CSV and split the first field of every row on single
# spaces, giving one list of string tokens per row.
with open("testing_set.txt", "r") as f:
    reader = csv.reader(f)
    testing_set = [row[0].split(" ") for row in reader]



[nltk_data] Downloading package stopwords to /home/bat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
import networkx as nx
###############################
# beating the random baseline #
###############################

# the following script gets an F1 score of approximately 0.66

# data loading and preprocessing 

# the columns of node_information.csv (read into node_info below) are:
# (1) paper unique ID (integer)
# (2) publication year (integer)
# (3) paper title (string)
# (4) authors (strings separated by ,)
# (5) name of journal (optional) (string)
# (6) abstract (string) - lowercased, free of punctuation except intra-word dashes

# Training rows: the first CSV field of every row is split on single spaces.
with open("training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set = [row[0].split(" ") for row in reader]

# Node metadata rows, kept as lists of strings exactly as the CSV reader
# yields them.
with open("node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info = list(reader)

# First column of every metadata row.
IDs = [row[0] for row in node_info]

# Randomly keep 5% of the training rows: sample row indices without
# replacement, then gather the chosen rows into a 2-D string array.
to_keep = random.sample(range(len(training_set)), k=int(round(len(training_set)*0.05)))
training_set_reduced = np.array([training_set[i] for i in to_keep])

# Containers for the three basic baseline features (all still empty here).
overlap_title = []  # number of overlapping words in title
temp_diff = []      # temporal distance between the papers
comm_auth = []      # number of common authors

counter = 0
def make_graph(training_set_reduced):
    """Build a directed graph from labelled node pairs.

    Parameters
    ----------
    training_set_reduced : 2-D array or sequence of rows
        Every row must hold at least three entries, read by position:
        ``row[0]`` is the source node, ``row[1]`` the target node and
        ``row[2]`` the label.

    Returns
    -------
    networkx.DiGraph
        Contains every node seen as a source or a target. An edge
        source -> target is added only for rows whose label equals the
        string ``'1'``.
    """
    G = nx.DiGraph()
    # Iterate over the rows themselves rather than xrange(len(...)): xrange
    # exists only on Python 2, and row iteration works for both NumPy arrays
    # and plain lists of lists.
    for row in training_set_reduced:
        source, target, label = row[0], row[1], row[2]
        # Both endpoints are registered even when the pair is not linked, so
        # nodes that only appear in unlinked pairs are still in the graph.
        G.add_node(source)
        G.add_node(target)
        if label == '1':
            G.add_edge(source, target)
    return G

# Build the directed graph G from the reduced training sample.
G = make_graph(training_set_reduced)   

In [46]:
def make_common_neighbors(G, source, target):
    """Count the common neighbours of each (source[i], target[i]) pair.

    Edge direction is ignored: neighbours are looked up in the undirected
    version of ``G``.

    Parameters
    ----------
    G : graph exposing ``to_undirected()`` (e.g. ``networkx.DiGraph``)
    source : sequence of node labels
    target : sequence of node labels, indexed in parallel with ``source``

    Returns
    -------
    numpy.ndarray
        One count per element of ``source``.
    """
    # to_undirected() builds a new graph on every call and does not depend on
    # the pair being examined, so convert once instead of once per pair.
    undirected = G.to_undirected()
    counts = [
        len(list(nx.common_neighbors(undirected, n1, target[i])))
        for i, n1 in enumerate(source)
    ]
    return np.array(counts)

In [47]:
# Copy columns 0 and 1 of the reduced training array into standalone 1-D
# arrays holding the source and target node IDs respectively.
source, target = (np.array(training_set_reduced[:, col]) for col in (0, 1))
    
def make_features(G, source, target):
    """Assemble the feature array for the given node pairs.

    Only one feature is computed at present: the number of common
    neighbours of each (source[i], target[i]) pair, as returned by
    ``make_common_neighbors``.

    Parameters
    ----------
    G : graph passed through to ``make_common_neighbors``
    source : sequence of node labels
    target : sequence of node labels, parallel to ``source``

    Returns
    -------
    numpy.ndarray
        1-D array with one entry per pair.
    """
    common_neighbors = make_common_neighbors(G, source, target)

    # The trailing comma matters: without it the parentheses are plain
    # grouping, np.hstack receives the bare array, treats every scalar as a
    # separate block to stack, and fails on an empty array. A real tuple of
    # feature columns keeps the result the same and handles empty input.
    features = np.hstack((common_neighbors,))
    return features

In [48]:
# Display the array of source node IDs.
source

array(['9801013', '9602156', '9807050', ..., '8060', '9506087', '9405134'], 
      dtype='|S7')

In [50]:
# Compute the feature array for the (source, target) pairs on graph G.
make_features(G, source, target)

9801013
9602156
9807050
9702086
9812167
207119
203192
10065
201047
101077
205218
302221
9812247
9804126
9906211
9407026
212211
110206
302179
9807146
110259
9806085
10195
9508078
304006
8027
9411047
9605011
210102
9810123
110046
205091
9906156
9706221
3078
9807107
201163
9901144
9604030
9810041
9803073
9504066
9804097
206109
9607162
9604035
9601117
11070
9610008
6146
211276
107061
206043
9904040
203030
9905018
107101
9503021
9906242
9702107
102098
9902034
106074
9406060
101211
9712249
9705168
9507136
301091
106116
107174
9809006
9711130
9609084
9801151
105039
9808192
11286
9712194
9905111
111047
204016
204109
10048
211001
11197
9704173
204155
212208
2120
7029
9210047
9608136
9803051
111087
9306069
9403068
9807233
107081
9910211
110024
212032
6003
9201040
9902029
2150
304195
104237
9712004
5053
206083
9909050
9811036
212245
4146
205227
9711157
9605028
9905111
102020
9609052
5055
9610132
204148
210154
210149
9906200
9710208
9603031
9812060
9609098
4026
7238
201259
10237
105229
9708105
970

KeyboardInterrupt: 

In [None]:

# Count common neighbours for every ordered pair of nodes (self-pairs
# included); the counts are computed and then discarded.
# G.to_undirected() builds a new graph on each call and does not depend on
# n1 or n2, so it is converted once here instead of inside the inner loop.
# NOTE: the loop is still quadratic in the number of nodes.
G_undirected = G.to_undirected()
for n1 in G.nodes():
    for n2 in G.nodes():
        len(list(nx.common_neighbors(G_undirected, n1, n2)))

In [12]:
# Number of nodes in G; each label is passed through int() on the way, so a
# non-numeric label would raise here.
len(list(map(int, G.nodes())))

14220

In [11]:
# Sorted array of the distinct node labels of G, converted to integers.
np.unique(list(map(int, G.nodes())))

array([   1001,    1002,    1005, ..., 9912288, 9912290, 9912293])

In [None]:
# Count common neighbours for every ordered pair of nodes (self-pairs
# included); the counts are computed and then discarded.
# G.to_undirected() builds a new graph on each call and does not depend on
# n1 or n2, so it is converted once here instead of inside the inner loop.
# NOTE: the loop is still quadratic in the number of nodes.
G_undirected = G.to_undirected()
for n1 in G.nodes():
    for n2 in G.nodes():
        len(list(nx.common_neighbors(G_undirected, n1, n2)))

In [3]:
# Print every node label of G (the full collection, not a sample).
print(G.nodes())

['110005', '9609135', '110007', '110001', '110002', '110008', '110006', '9310108', '211020', '9510188', '9510184', '9510186', '9510183', '9510182', '9706201', '102199', '9305010', '2116', '9305016', '9605154', '9305014', '301063', '301061', '301066', '304011', '304012', '9603030', '9603031', '301068', '9603033', '304018', '9603035', '9603037', '9612051', '205115', '205119', '9303029', '9312018', '210170', '9205081', '210175', '9205086', '9205085', '9205084', '210178', '9205089', '9205088', '9710029', '9902067', '105001', '105006', '12018', '105005', '12015', '9710021', '9710022', '9710023', '12011', '12013', '9710027', '9604026', '9604024', '9604025', '9604023', '9705169', '9805068', '9805065', '9705164', '9705163', '9705162', '9604028', '9705160', '9252', '9608023', '9712151', '9608026', '9608024', '9608028', '208080', '208081', '208082', '208084', '9712154', '208086', '208088', '103110', '9506096', '9609145', '9506091', '9506098', '9506099', '9806008', '9806009', '9605087', '9605086'

In [7]:
# Spot check on one pair: number of neighbours shared by nodes '9306070' and
# '9912106' when edge direction is ignored.
sum(1 for _shared in nx.common_neighbors(G.to_undirected(), '9306070', '9912106'))

0

In [10]:
# IPython help query (trailing "?"): shows the documentation of nx.common_neighbors.
nx.common_neighbors?

In [13]:
# Toy check of nx.common_neighbors on a 5-node complete graph: nodes 0 and 1
# share every other node. The example uses its own variable so that the
# graph G built from the training pairs is not overwritten by this demo.
demo_graph = nx.complete_graph(5)
list(nx.common_neighbors(demo_graph, 0, 1))

[2, 3, 4]