
print functions for python3 #26

Merged: 1 commit, Feb 7, 2019
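Why this change matters at runtime (an aside, not from the PR itself): the Python 2 print statement writes its arguments space-separated, but calling print() with several arguments on Python 2 without the __future__ import prints them as one tuple. Only Python 3, or Python 2 with "from __future__ import print_function", reproduces the old output. A minimal sketch with hypothetical values:

# Minimal sketch, hypothetical values -- not part of the diff.
# from __future__ import print_function   # uncomment to get the same output on Python 2
counter = ('accept', 3)
shape = (10, 5)
# Python 2 statement form (old code):  print counter, shape   -> ('accept', 3) (10, 5)
# Python 2, no future import:          print(counter, shape)  -> (('accept', 3), (10, 5))
print(counter, shape)                  # Python 3 -> ('accept', 3) (10, 5)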
14 changes: 7 additions & 7 deletions code/accept_classify/classify.py
@@ -36,9 +36,9 @@ def main(args, scale=False):
train_counter = Counter(train_labels)
dev_counter = Counter(dev_labels)
test_counter = Counter(test_labels)
print train_counter, train_features.shape
print dev_counter, dev_features.shape
print test_counter, test_features.shape
print(train_counter, train_features.shape)
print(dev_counter, dev_features.shape)
print(test_counter, test_features.shape)
print("Train majority: {}, Dev majority: {} Test majorit: {}".format(
round(100.0*train_counter[0]/(train_counter[0]+train_counter[1]),3),
round(100.0*dev_counter[0]/(dev_counter[0]+dev_counter[1]),3),
@@ -61,7 +61,7 @@ def main(args, scale=False):
neural_network.MLPClassifier(alpha=1),
ensemble.AdaBoostClassifier()]
random.shuffle(clfs)
print 'Total number of classifiers',len(clfs)
print('Total number of classifiers',len(clfs))

###########################
# training (CV) and testing
@@ -80,15 +80,15 @@ def main(args, scale=False):
# train
train_y_hat = best_classifier.predict(train_features)
train_score = 100.0 * sum(train_labels == train_y_hat) / len(train_y_hat)
print 'Train accuracy: %.2f in %d examples' %(round(train_score,3), sum(train_labels))
print('Train accuracy: %.2f in %d examples' %(round(train_score,3), sum(train_labels)))
# dev
dev_y_hat = best_classifier.predict(dev_features)
dev_score = 100.0 * sum(dev_labels == dev_y_hat) / len(dev_y_hat)
print 'Dev accuracy: %.2f in %d examples' %(round(dev_score,3), sum(dev_labels))
print('Dev accuracy: %.2f in %d examples' %(round(dev_score,3), sum(dev_labels)))
# test
test_y_hat = best_classifier.predict(test_features)
test_score = 100.0 * sum(test_labels == test_y_hat) / len(test_y_hat)
print 'Test accuracy: %.2f in %d examples' %(round(test_score,3),sum(test_labels))
print('Test accuracy: %.2f in %d examples' %(round(test_score,3),sum(test_labels)))


if __name__ == "__main__": sys.exit(main(sys.argv))
43 changes: 26 additions & 17 deletions code/accept_classify/featurize.py
@@ -3,7 +3,8 @@
"""

import sys,os,random,json,glob,operator,re
import cPickle as pkl
# import cPickle as pkl
import pickle as pkl
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from itertools import dropwhile
@@ -98,10 +99,10 @@ def main(args, lower=True, max_vocab_size = False, encoder='bowtfidf'):
is_train = False
idToFeature = read_features(feature_output_file)
if encoder:
print 'Loading vector file from...',vect_file
print('Loading vector file from...',vect_file)
vect = load_vect(vect_file)
else:
print 'Loading vector file from scratch..'
print('Loading vector file from scratch..')
idToFeature = dict()

outLabelsFile = open(out_dir + '/labels_%s_%s_%s.tsv'%(str(max_vocab_size), str(encoder),str(hand)), 'w')
@@ -114,7 +115,7 @@ def main(args, lower=True, max_vocab_size = False, encoder='bowtfidf'):
################################
# read reviews
################################
print 'Reading reviews from...',paper_json_dir
print('Reading reviews from...',paper_json_dir)
paper_content_corpus = [] #""
paper_json_filenames = sorted(glob.glob('{}/*.json'.format(paper_json_dir)))
papers = []
@@ -124,7 +125,7 @@ def main(args, lower=True, max_vocab_size = False, encoder='bowtfidf'):
paper_content_corpus.append(paper.SCIENCEPARSE.get_paper_content())
papers.append(paper)
random.shuffle(papers)
print 'Total number of reviews',len(papers)
print('Total number of reviews',len(papers))


def get_feature_id(feature):
@@ -150,7 +151,7 @@ def addFeatureToDict(fname):
pkl.dump(paper_content_corpus_words, open(outCorpusFilename, 'wb'))
else:
paper_content_corpus_words = pkl.load(open(outCorpusFilename,'rb'))
print 'Total words in corpus',len(paper_content_corpus_words)
print('Total words in corpus',len(paper_content_corpus_words))



@@ -159,10 +160,10 @@ def addFeatureToDict(fname):
################################
# Encoding
################################
print 'Encoding..',encoder
print('Encoding..',encoder)
# 1) tf-idf features on title/author_names/domains
if not encoder:
print 'No encoder',encoder
print('No encoder',encoder)
elif encoder in ['bow', 'bowtfidf']:
word_counter = Counter(paper_content_corpus_words)
# vocab limit by frequency
@@ -180,7 +181,7 @@ def addFeatureToDict(fname):
vocabulary[w] = fid
print("Got vocab of size",len(vocabulary))
if is_train:
print 'Saving vectorized',vect_file
print('Saving vectorized',vect_file)
if encoder == 'bow':
vect = CountVectorizer( max_df=0.5, analyzer='word', stop_words='english', vocabulary=vocabulary)
else:
@@ -192,18 +193,26 @@ def addFeatureToDict(fname):
# 2) sentence encoder features
elif encoder in ['w2v', 'w2vtfidf']:
from sent2vec import MeanEmbeddingVectorizer,TFIDFEmbeddingVectorizer,import_embeddings

# vocab limit by frequency
word_counter = False
if max_vocab_size:
word_counter = Counter(paper_content_corpus_words)
word_counter = dict(word_counter.most_common()[:max_vocab_size])

if is_train:
w2v = import_embeddings()
vect = MeanEmbeddingVectorizer(w2v) if encoder=='w2v' else TFIDFEmbeddingVectorizer(w2v)
vect = MeanEmbeddingVectorizer(w2v,word_counter) if encoder=='w2v' else TFIDFEmbeddingVectorizer(w2v,word_counter)
for f in range(vect.dim):
#fid = get_feature_id()
addFeatureToDict('%s%d'%(encoder,f))
print 'Saving vectorized',vect_file
print('Saving vectorized',vect_file)

if encoder == 'w2vtfidf':
vect.fit([p for p in paper_content_corpus])
save_vect(vect, vect_file)
else:
print 'Wrong type of encoder',encoder
print('Wrong type of encoder',encoder)
sys.exit(1)


@@ -220,7 +229,7 @@ def addFeatureToDict(fname):
all_titles_features = vect.transform(all_titles)

if is_train:
print 'saving features to file',feature_output_file
print('saving features to file',feature_output_file)
if hand:
addFeatureToDict("get_most_recent_reference_year")
addFeatureToDict("get_num_references")
@@ -273,7 +282,7 @@ def addFeatureToDict(fname):
for word_id in range(vect.dim):
outSvmLiteFile.write(str(word_id)+":"+ str(title_tfidf[word_id])+" ")
else:
print 'wrong ecndoer', encoder
print('wrong ecndoer', encoder)
sys.exit(1)

if hand:
Expand Down Expand Up @@ -359,9 +368,9 @@ def addFeatureToDict(fname):
outLabelsFile.close()
outIDFile.close()
outSvmLiteFile.close()
print 'saved',outLabelsFile.name
print 'saved',outIDFile.name
print 'saved',outSvmLiteFile.name
print('saved',outLabelsFile.name)
print('saved',outIDFile.name)
print('saved',outSvmLiteFile.name)


if __name__ == "__main__":
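featurize.py now imports pickle instead of cPickle. pickle exists on both interpreters (on Python 2 it is the slower pure-Python module), while cPickle is Python 2 only, so the new import also works if the script is still run under Python 2. A shim that keeps the C-accelerated module on Python 2 would look like the sketch below (shown only for comparison; the PR simply switches):

# Hypothetical compatibility shim -- not what this PR does.
try:
    import cPickle as pkl    # Python 2: C-accelerated pickle
except ImportError:
    import pickle as pkl     # Python 3: the C implementation is used automatically

A corpus pickle written earlier by the Python 2 code may also need pkl.load(f, encoding='latin1') when re-read under Python 3.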
2 changes: 1 addition & 1 deletion code/accept_classify/run_featurize_classify.sh
@@ -4,7 +4,7 @@
DATADIR=../../data/iclr_2017
DATASETS=("train" "dev" "test")
FEATDIR=dataset
MAX_VOCAB=False
MAX_VOCAB=30000 #False
ENCODER=w2v
HAND=True

35 changes: 28 additions & 7 deletions code/accept_classify/sent2vec.py
@@ -1,7 +1,7 @@
"""
contains different embedding vectorizers and embedding loader
"""

import sys
import numpy as np
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
@@ -13,20 +13,32 @@ def import_embeddings(filename="../../data/word2vec/glove.840B.300d.w2v.txt", bi
Loading pre-trained word embeddings
For speed-up, you can convert the text file to binary and turn on the switch "binary=True"
"""
w2v = gensim.models.KeyedVectors.load_word2vec_format(filename, binary=binary)

if sys.version_info[0] < 3:
w2v = gensim.models.Word2Vec.load_word2vec_format(filename, binary=binary)
else:
w2v = gensim.models.KeyedVectors.load_word2vec_format(filename, binary=binary)
return w2v

class MeanEmbeddingVectorizer(object):
"""
Given a input sentence, output averaged vector of word embeddings in the sentence
"""

def __init__(self, word2vec):
self.word2vec = word2vec
def __init__(self, word2vec,word_counter=False):
if word_counter:
self.word2vec = {}
for w in word_counter:
if w in word2vec:
self.word2vec[w] = word2vec[w]
#import pdb; pdb.set_trace()
else:
self.word2vec = word2vec

# if a text is empty we should return a vector of zeros
# with the same dimensionality as all the other vectors
self.dim = len(word2vec['a'])
print 'Dimension: ',self.dim
print('Dimension: ',self.dim)

def fit(self, X, y=None):
return self
@@ -44,8 +56,17 @@ class TFIDFEmbeddingVectorizer(object):
"""
Given a input sentence, output averaged vector of word embeddings weighted by TFIDF scores
"""
def __init__(self, word2vec):
self.word2vec = word2vec
def __init__(self, word2vec,word_counter=False):
if word_counter:
self.word2vec = {}
for w in word2vec:
if w in word_counter:
self.word2vec[w] = word2vec[w]
else:
self.word2vec = word2vec



self.word2weight = None
self.dim = len(word2vec['a'])

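Beyond the print fixes, sent2vec.py's two vectorizers now take an optional word_counter and keep only the embeddings for words in that vocabulary, so memory scales with MAX_VOCAB from run_featurize_classify.sh rather than with the full GloVe vocabulary. As a rough illustration of what the averaged representation then looks like, a sketch with assumed tokenization and shapes (not the classes' actual transform code):

import numpy as np

def mean_embedding(tokens, word2vec, dim):
    # Average the vectors of in-vocabulary tokens; fall back to zeros when nothing matches,
    # mirroring the "vector of zeros" comment in MeanEmbeddingVectorizer.
    vecs = [word2vec[w] for w in tokens if w in word2vec]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

Filtering at construction time keeps only the intersection of the embedding vocabulary and the most frequent corpus words, which is why the constructors accept the counter rather than the raw corpus.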
10 changes: 5 additions & 5 deletions code/aspect_predict/assign_annot_iclr_2017.py
@@ -18,7 +18,7 @@ def get_annots_dic(data_dir='./', input_filename = 'annotation_full.tsv'):
#print 'Keys[%d]'%(len(tks)),tks
continue
if len(tks) != 12:
print 'WRONG token length',len(tks), tks
print('WRONG token length',len(tks), tks)
continue

aspect_dic = {}
@@ -50,12 +50,12 @@ def main():

# Loading annotaions
annots = get_annots_dic()
print 'Loaded annots: %d papers and %d reviews'%(len(annots), sum([len(v) for k,v in annots.items() ]))
print('Loaded annots: %d papers and %d reviews'%(len(annots), sum([len(v) for k,v in annots.items() ])))

# Loading reviews, merging them with annotations, and saving into new directory
data_dir = "../../data/iclr_2017" # args[1] #train/reviews
datasets = ['train','dev','test']
print 'Reading reviews from...'
print('Reading reviews from...')
for dataset in datasets:

cnt_p, cnt_r = 0, 0
@@ -101,9 +101,9 @@ def main():

# save to /reviews_annotated
json.dump(paper.to_json_object(), open(review_annotated_dir+'/%s.json'%(paper.ID),'w'))
print paper.ID, len(paper.REVIEWS)
print(paper.ID, len(paper.REVIEWS))
cnt_p += 1
print dataset, cnt_p, cnt_r
print(dataset, cnt_p, cnt_r)

# note that we replace reviews/ with reviews_annotated/ to reduce duplicates now
