
print functions for python3 #26

Merged: 1 commit, Feb 7, 2019
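Why this change matters at runtime (an aside, not from the PR itself): the Python 2 print statement writes its arguments space-separated, but calling print() with several arguments on Python 2 without the __future__ import prints them as one tuple. Only Python 3, or Python 2 with "from __future__ import print_function", reproduces the old output. A minimal sketch with hypothetical values:

# Minimal sketch, hypothetical values -- not part of the diff.
# from __future__ import print_function   # uncomment to get the same output on Python 2
counter = ('accept', 3)
shape = (10, 5)
# Python 2 statement form (old code):  print counter, shape   -> ('accept', 3) (10, 5)
# Python 2, no future import:          print(counter, shape)  -> (('accept', 3), (10, 5))
print(counter, shape)                  # Python 3 -> ('accept', 3) (10, 5)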
14 changes: 7 additions & 7 deletions code/accept_classify/classify.py
@@ -36,9 +36,9 @@ def main(args, scale=False):
train_counter = Counter(train_labels)
dev_counter = Counter(dev_labels)
test_counter = Counter(test_labels)
print train_counter, train_features.shape
print dev_counter, dev_features.shape
print test_counter, test_features.shape
print(train_counter, train_features.shape)
print(dev_counter, dev_features.shape)
print(test_counter, test_features.shape)
print("Train majority: {}, Dev majority: {} Test majorit: {}".format(
round(100.0*train_counter[0]/(train_counter[0]+train_counter[1]),3),
round(100.0*dev_counter[0]/(dev_counter[0]+dev_counter[1]),3),
@@ -61,7 +61,7 @@ def main(args, scale=False):
neural_network.MLPClassifier(alpha=1),
ensemble.AdaBoostClassifier()]
random.shuffle(clfs)
print 'Total number of classifiers',len(clfs)
print('Total number of classifiers',len(clfs))

###########################
# training (CV) and testing
@@ -80,15 +80,15 @@ def main(args, scale=False):
# train
train_y_hat = best_classifier.predict(train_features)
train_score = 100.0 * sum(train_labels == train_y_hat) / len(train_y_hat)
print 'Train accuracy: %.2f in %d examples' %(round(train_score,3), sum(train_labels))
print('Train accuracy: %.2f in %d examples' %(round(train_score,3), sum(train_labels)))
# dev
dev_y_hat = best_classifier.predict(dev_features)
dev_score = 100.0 * sum(dev_labels == dev_y_hat) / len(dev_y_hat)
print 'Dev accuracy: %.2f in %d examples' %(round(dev_score,3), sum(dev_labels))
print('Dev accuracy: %.2f in %d examples' %(round(dev_score,3), sum(dev_labels)))
# test
test_y_hat = best_classifier.predict(test_features)
test_score = 100.0 * sum(test_labels == test_y_hat) / len(test_y_hat)
print 'Test accuracy: %.2f in %d examples' %(round(test_score,3),sum(test_labels))
print('Test accuracy: %.2f in %d examples' %(round(test_score,3),sum(test_labels)))


if __name__ == "__main__": sys.exit(main(sys.argv))
43 changes: 26 additions & 17 deletions code/accept_classify/featurize.py
@@ -3,7 +3,8 @@
"""

import sys,os,random,json,glob,operator,re
import cPickle as pkl
# import cPickle as pkl
import pickle as pkl
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from itertools import dropwhile
@@ -98,10 +99,10 @@ def main(args, lower=True, max_vocab_size = False, encoder='bowtfidf'):
is_train = False
idToFeature = read_features(feature_output_file)
if encoder:
print 'Loading vector file from...',vect_file
print('Loading vector file from...',vect_file)
vect = load_vect(vect_file)
else:
print 'Loading vector file from scratch..'
print('Loading vector file from scratch..')
idToFeature = dict()

outLabelsFile = open(out_dir + '/labels_%s_%s_%s.tsv'%(str(max_vocab_size), str(encoder),str(hand)), 'w')
@@ -114,7 +115,7 @@ def main(args, lower=True, max_vocab_size = False, encoder='bowtfidf'):
################################
# read reviews
################################
print 'Reading reviews from...',paper_json_dir
print('Reading reviews from...',paper_json_dir)
paper_content_corpus = [] #""
paper_json_filenames = sorted(glob.glob('{}/*.json'.format(paper_json_dir)))
papers = []
@@ -124,7 +125,7 @@ def main(args, lower=True, max_vocab_size = False, encoder='bowtfidf'):
paper_content_corpus.append(paper.SCIENCEPARSE.get_paper_content())
papers.append(paper)
random.shuffle(papers)
print 'Total number of reviews',len(papers)
print('Total number of reviews',len(papers))


def get_feature_id(feature):
@@ -150,7 +151,7 @@ def addFeatureToDict(fname):
pkl.dump(paper_content_corpus_words, open(outCorpusFilename, 'wb'))
else:
paper_content_corpus_words = pkl.load(open(outCorpusFilename,'rb'))
print 'Total words in corpus',len(paper_content_corpus_words)
print('Total words in corpus',len(paper_content_corpus_words))



@@ -159,10 +160,10 @@ def addFeatureToDict(fname):
################################
# Encoding
################################
print 'Encoding..',encoder
print('Encoding..',encoder)
# 1) tf-idf features on title/author_names/domains
if not encoder:
print 'No encoder',encoder
print('No encoder',encoder)
elif encoder in ['bow', 'bowtfidf']:
word_counter = Counter(paper_content_corpus_words)
# vocab limit by frequency
@@ -180,7 +181,7 @@ def addFeatureToDict(fname):
vocabulary[w] = fid
print("Got vocab of size",len(vocabulary))
if is_train:
print 'Saving vectorized',vect_file
print('Saving vectorized',vect_file)
if encoder == 'bow':
vect = CountVectorizer( max_df=0.5, analyzer='word', stop_words='english', vocabulary=vocabulary)
else:
@@ -192,18 +193,26 @@ def addFeatureToDict(fname):
# 2) sentence encoder features
elif encoder in ['w2v', 'w2vtfidf']:
from sent2vec import MeanEmbeddingVectorizer,TFIDFEmbeddingVectorizer,import_embeddings

# vocab limit by frequency
word_counter = False
if max_vocab_size:
word_counter = Counter(paper_content_corpus_words)
word_counter = dict(word_counter.most_common()[:max_vocab_size])

if is_train:
w2v = import_embeddings()
vect = MeanEmbeddingVectorizer(w2v) if encoder=='w2v' else TFIDFEmbeddingVectorizer(w2v)
vect = MeanEmbeddingVectorizer(w2v,word_counter) if encoder=='w2v' else TFIDFEmbeddingVectorizer(w2v,word_counter)
for f in range(vect.dim):
#fid = get_feature_id()
addFeatureToDict('%s%d'%(encoder,f))
print 'Saving vectorized',vect_file
print('Saving vectorized',vect_file)

if encoder == 'w2vtfidf':
vect.fit([p for p in paper_content_corpus])
save_vect(vect, vect_file)
else:
print 'Wrong type of encoder',encoder
print('Wrong type of encoder',encoder)
sys.exit(1)


@@ -220,7 +229,7 @@ def addFeatureToDict(fname):
all_titles_features = vect.transform(all_titles)

if is_train:
print 'saving features to file',feature_output_file
print('saving features to file',feature_output_file)
if hand:
addFeatureToDict("get_most_recent_reference_year")
addFeatureToDict("get_num_references")
@@ -273,7 +282,7 @@ def addFeatureToDict(fname):
for word_id in range(vect.dim):
outSvmLiteFile.write(str(word_id)+":"+ str(title_tfidf[word_id])+" ")
else:
print 'wrong ecndoer', encoder
print('wrong ecndoer', encoder)
sys.exit(1)

if hand:
Expand Down Expand Up @@ -359,9 +368,9 @@ def addFeatureToDict(fname):
outLabelsFile.close()
outIDFile.close()
outSvmLiteFile.close()
print 'saved',outLabelsFile.name
print 'saved',outIDFile.name
print 'saved',outSvmLiteFile.name
print('saved',outLabelsFile.name)
print('saved',outIDFile.name)
print('saved',outSvmLiteFile.name)


if __name__ == "__main__":
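featurize.py now imports pickle instead of cPickle. pickle exists on both interpreters (on Python 2 it is the slower pure-Python module), while cPickle is Python 2 only, so the new import also works if the script is still run under Python 2. A shim that keeps the C-accelerated module on Python 2 would look like the sketch below (shown only for comparison; the PR simply switches):

# Hypothetical compatibility shim -- not what this PR does.
try:
    import cPickle as pkl    # Python 2: C-accelerated pickle
except ImportError:
    import pickle as pkl     # Python 3: the C implementation is used automatically

A corpus pickle written earlier by the Python 2 code may also need pkl.load(f, encoding='latin1') when re-read under Python 3.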
2 changes: 1 addition & 1 deletion code/accept_classify/run_featurize_classify.sh
@@ -4,7 +4,7 @@
DATADIR=../../data/iclr_2017
DATASETS=("train" "dev" "test")
FEATDIR=dataset
MAX_VOCAB=False
MAX_VOCAB=30000 #False
ENCODER=w2v
HAND=True

35 changes: 28 additions & 7 deletions code/accept_classify/sent2vec.py
@@ -1,7 +1,7 @@
"""
contains different embedding vectorizers and embedding loader
"""

import sys
import numpy as np
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
@@ -13,20 +13,32 @@ def import_embeddings(filename="../../data/word2vec/glove.840B.300d.w2v.txt", bi
Loading pre-trained word embeddings
For speed-up, you can convert the text file to binary and turn on the switch "binary=True"
"""
w2v = gensim.models.KeyedVectors.load_word2vec_format(filename, binary=binary)

if sys.version_info[0] < 3:
w2v = gensim.models.Word2Vec.load_word2vec_format(filename, binary=binary)
else:
w2v = gensim.models.KeyedVectors.load_word2vec_format(filename, binary=binary)
return w2v

class MeanEmbeddingVectorizer(object):
"""
Given a input sentence, output averaged vector of word embeddings in the sentence
"""

def __init__(self, word2vec):
self.word2vec = word2vec
def __init__(self, word2vec,word_counter=False):
if word_counter:
self.word2vec = {}
for w in word_counter:
if w in word2vec:
self.word2vec[w] = word2vec[w]
#import pdb; pdb.set_trace()
else:
self.word2vec = word2vec

# if a text is empty we should return a vector of zeros
# with the same dimensionality as all the other vectors
self.dim = len(word2vec['a'])
print 'Dimension: ',self.dim
print('Dimension: ',self.dim)

def fit(self, X, y=None):
return self
@@ -44,8 +56,17 @@ class TFIDFEmbeddingVectorizer(object):
"""
Given a input sentence, output averaged vector of word embeddings weighted by TFIDF scores
"""
def __init__(self, word2vec):
self.word2vec = word2vec
def __init__(self, word2vec,word_counter=False):
if word_counter:
self.word2vec = {}
for w in word2vec:
if w in word_counter:
self.word2vec[w] = word2vec[w]
else:
self.word2vec = word2vec



self.word2weight = None
self.dim = len(word2vec['a'])

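Beyond the print fixes, sent2vec.py's two vectorizers now take an optional word_counter and keep only the embeddings for words in that vocabulary, so memory scales with MAX_VOCAB from run_featurize_classify.sh rather than with the full GloVe vocabulary. As a rough illustration of what the averaged representation then looks like, a sketch with assumed tokenization and shapes (not the classes' actual transform code):

import numpy as np

def mean_embedding(tokens, word2vec, dim):
    # Average the vectors of in-vocabulary tokens; fall back to zeros when nothing matches,
    # mirroring the "vector of zeros" comment in MeanEmbeddingVectorizer.
    vecs = [word2vec[w] for w in tokens if w in word2vec]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

Filtering at construction time keeps only the intersection of the embedding vocabulary and the most frequent corpus words, which is why the constructors accept the counter rather than the raw corpus.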
10 changes: 5 additions & 5 deletions code/aspect_predict/assign_annot_iclr_2017.py
@@ -18,7 +18,7 @@ def get_annots_dic(data_dir='./', input_filename = 'annotation_full.tsv'):
#print 'Keys[%d]'%(len(tks)),tks
continue
if len(tks) != 12:
print 'WRONG token length',len(tks), tks
print('WRONG token length',len(tks), tks)
continue

aspect_dic = {}
@@ -50,12 +50,12 @@ def main():

# Loading annotaions
annots = get_annots_dic()
print 'Loaded annots: %d papers and %d reviews'%(len(annots), sum([len(v) for k,v in annots.items() ]))
print('Loaded annots: %d papers and %d reviews'%(len(annots), sum([len(v) for k,v in annots.items() ])))

# Loading reviews, merging them with annotations, and saving into new directory
data_dir = "../../data/iclr_2017" # args[1] #train/reviews
datasets = ['train','dev','test']
print 'Reading reviews from...'
print('Reading reviews from...')
for dataset in datasets:

cnt_p, cnt_r = 0, 0
@@ -101,9 +101,9 @@ def main():

# save to /reviews_annotated
json.dump(paper.to_json_object(), open(review_annotated_dir+'/%s.json'%(paper.ID),'w'))
print paper.ID, len(paper.REVIEWS)
print(paper.ID, len(paper.REVIEWS))
cnt_p += 1
print dataset, cnt_p, cnt_r
print(dataset, cnt_p, cnt_r)

# note that we replace reviews/ with reviews_annotated/ to reduce duplicates now
