In [10]:
import random, csv
import numpy as np
import cPickle
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import networkx as nx

### Reading test and train data, and node info

In [2]:
#nltk.download('punkt') # for tokenization
#nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()
stemmer_new = nltk.stem.SnowballStemmer("english")
with open("testing_set.txt", "r") as f:
    reader = csv.reader(f)
    testing_set  = list(reader)

testing_set = [element[0].split(" ") for element in testing_set]

In [3]:
with open("training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)

training_set = [element[0].split(" ") for element in training_set]

In [4]:
with open("node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)

IDs = [element[0] for element in node_info]

### Creating graph (nodes and edges from training data)

In [5]:
edges = [(element[0],element[1]) for element in training_set if element[2]=="1"]
nodes = [element[0] for element in training_set] + [element[1] for element in training_set]

In [6]:
# this is the actual directed graph
G = nx.DiGraph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)

In [7]:
# undirected version of the graph (used for jaccard similarity)
G_undirected = nx.Graph()
G_undirected.add_nodes_from(nodes)
G_undirected.add_edges_from(edges)

### Global variables

In [8]:
# NUM_EXAMPLES = 100000
# NUM_TRAIN = 70000
# NUM_VALIDATION = 30000

In [15]:
tfidf_dict = cPickle.load(open('tfidf_features.p', 'rb'))
print type(tfidf_dict)

<type 'dict'>


In [16]:
print tfidf_dict[tfidf_dict.keys()[0]]
print tfidf_dict.keys()[0]

0.0182400016638
('9312155', '9506142')


### Creating features for the entire data to store in csv file
##### since this is very slow, I am writing only 50000 examples for now

In [17]:
### creating features and store in a csv file

# number of overlapping words in title
overlap_title = []

# number of overlapping words in summary
overlap_summary = []

# temporal distance between the papers
temp_diff = []

# number of common authors
comm_auth = []

# jaccard similarity
jaccard_sim = []

# adamic score
adamic = []

tfidf = []

counter = 0
for i in xrange(len(training_set)):
    source = training_set[i][0]
    target = training_set[i][1]
    
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
    # Title
	# convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
	# remove stopwords
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    # Author
    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")
    
    overlap_title.append(len(set(source_title).intersection(set(target_title))))
    temp_diff.append(int(source_info[1]) - int(target_info[1]))
    comm_auth.append(len(set(source_auth).intersection(set(target_auth))))
   
    # Summary
    # convert to lowercase and tokenize
    source_summary = source_info[2].lower().split(" ")
    source_summary = [token for token in source_summary if token not in stpwds]
    source_summary = [stemmer_new.stem(token) for token in source_summary]
    
    target_summary = target_info[2].lower().split(" ")
    target_summary = [token for token in target_summary if token not in stpwds]
    target_summary = [stemmer_new.stem(token) for token in target_summary]
    
    overlap_summary.append(len(set(source_summary).intersection(set(target_summary))))
    
    # Jaccard
    preds = nx.jaccard_coefficient(G_undirected, [(source, target)])
    for _, _, p in preds:
        jaccard_sim.append(p)
        
    # Adamic adar
    preds = nx.adamic_adar_index(G_undirected, [(source, target)])
    for _, _, p in preds:
        adamic.append(p)
    
    key = (source, target)
    tfidf.append(tfidf_dict[key])
    
    counter += 1
    if counter % 20000 == True:
        print counter, "training examples processsed"

1 training examples processsed
20001 training examples processsed
40001 training examples processsed
60001 training examples processsed
80001 training examples processsed
100001 training examples processsed
120001 training examples processsed
140001 training examples processsed
160001 training examples processsed
180001 training examples processsed
200001 training examples processsed
220001 training examples processsed
240001 training examples processsed
260001 training examples processsed
280001 training examples processsed
300001 training examples processsed
320001 training examples processsed
340001 training examples processsed
360001 training examples processsed
380001 training examples processsed
400001 training examples processsed
420001 training examples processsed
440001 training examples processsed
460001 training examples processsed
480001 training examples processsed
500001 training examples processsed
520001 training examples processsed
540001 training examples processsed
5

In [24]:
training_features = np.array([overlap_title, temp_diff, comm_auth, overlap_summary, jaccard_sim, adamic, tfidf]).T

In [24]:
# # convert list of lists into array
# # documents as rows, unique words as columns (i.e., example as rows, features as columns)
# training_features = np.array([overlap_title, temp_diff, comm_auth, jaccard_sim]).T

# # scale
# training_features = preprocessing.scale(training_features)

In [25]:
print training_features.shape

(615512, 7)


In [26]:
# write features to .csv file
X_train = np.array(training_set[:]).T
X_features = training_features.T
features = zip(X_train[0], X_train[1], X_train[2], X_features[0], X_features[1], X_features[2], X_features[3], X_features[4], \
              X_features[5], X_features[6])
with open("features_train_unscaled.csv","wb") as feat:
    csv_out = csv.writer(feat)
    csv_out.writerow(('source', 'target', 'label', 'overlap_title', 'temp_diff', 'common_author', 'overlap_summary', 'jaccard_sim'\
                     ,'adamic', 'tfidf_cosine_sim'))
    for row in features:
        csv_out.writerow(row)

### Computing features for test set:

In [28]:
tfidf_dict_test = cPickle.load(open('tfidf_features_test.p', 'rb'))
print type(tfidf_dict_test)

<type 'dict'>


In [29]:
### creating features and store in a csv file

# number of overlapping words in title
overlap_title = []

# number of overlapping words in summary
overlap_summary = []

# temporal distance between the papers
temp_diff = []

# number of common authors
comm_auth = []

# jaccard similarity
jaccard_sim = []

# adamic score
adamic = []

tfidf = []

counter = 0
for i in xrange(len(testing_set)):
    source = testing_set[i][0]
    target = testing_set[i][1]
    
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
    # Title
	# convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
	# remove stopwords
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    # Author
    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")
    
    overlap_title.append(len(set(source_title).intersection(set(target_title))))
    temp_diff.append(int(source_info[1]) - int(target_info[1]))
    comm_auth.append(len(set(source_auth).intersection(set(target_auth))))
   
    # Summary
    # convert to lowercase and tokenize
    source_summary = source_info[2].lower().split(" ")
    source_summary = [token for token in source_summary if token not in stpwds]
    source_summary = [stemmer_new.stem(token) for token in source_summary]
    
    target_summary = target_info[2].lower().split(" ")
    target_summary = [token for token in target_summary if token not in stpwds]
    target_summary = [stemmer_new.stem(token) for token in target_summary]
    
    overlap_summary.append(len(set(source_summary).intersection(set(target_summary))))
    
    # Jaccard
    preds = nx.jaccard_coefficient(G_undirected, [(source, target)])
    for _, _, p in preds:
        jaccard_sim.append(p)
        
    # Adamic adar
    preds = nx.adamic_adar_index(G_undirected, [(source, target)])
    for _, _, p in preds:
        adamic.append(p)
    
    key = (source, target)
    tfidf.append(tfidf_dict_test[key])
    
    counter += 1
    if counter % 10000 == True:
        print counter, "testing examples processsed"

1 testing examples processsed
10001 testing examples processsed
20001 testing examples processsed
30001 testing examples processsed


In [30]:
testing_features = np.array([overlap_title, temp_diff, comm_auth, overlap_summary, jaccard_sim, adamic, tfidf]).T

In [31]:
testing_features.shape

(32648, 7)

In [32]:
# write features to .csv file
X_test = np.array(testing_set[:]).T
X_test_features = testing_features.T
features = zip(X_test[0], X_test[1], X_test_features[0], X_test_features[1], X_test_features[2], X_test_features[3], X_test_features[4], \
              X_test_features[5], X_test_features[6])
with open("features_test_unscaled.csv","wb") as feat:
    csv_out = csv.writer(feat)
    csv_out.writerow(('source', 'target', 'overlap_title', 'temp_diff', 'common_author', 'overlap_summary', 'jaccard_sim'\
                     ,'adamic', 'tfidf_cosine_sim'))
    for row in features:
        csv_out.writerow(row)