In [1]:
from sklearn import linear_model
import gensim
import operator
import string
import random
import numpy as np

In [2]:
def load_data(filename):
    res = []
    with open(filename, 'r') as f:
        for line in f:
            label, question = line.split(" ", 1)
            res.append((label, question))
    return res

def average_vector2(dictionary, question):
    cnt = 0
    s = [0]*vector_dim
    for w in question.split(" "):
        w = w.lower()
        cnt += 1
        try:
            # print word, word_vector[word]
            s = map(operator.add, dictionary[w], s)
        except KeyError:
            cnt -= 1
            # pass #Use random vector or skip?
#             s = map(operator.add, dictionary.seeded_vector(random_generator(50)), s)
    if cnt == 0:
        return s
    return [elem/float(cnt) for elem in s]

def average_vector(dictionary, question):
    splitted = question.split(" ")
    s = [0]*vector_dim
    cnt = 2.0
    try:
        if (len(splitted) == 0):
            return s
        else:
            s = map(operator.add, dictionary[splitted[0].lower()], s)
            if (len(splitted) <= 1):
                return s
            s = map(operator.add, dictionary[splitted[1].lower()], s)
            if (splitted[0].lower() == 'what' and splitted[1].lower() == 'is'):
                return average_vector2(dictionary, question)
#                 s = map(operator.add, dictionary[splitted[3].lower()], s)
#                 cnt += 1.0
            return [elem/cnt for elem in s]         
    except KeyError:
        return s 
    
def compute_accuracy(predicted, original):
    eq = [z[0] == z[1] for z in zip(predicted, original)]
    return eq.count(True)/float(len(eq))

def is_in_class(dictionary, questions, cls, train_lab, test_lab):
    model = linear_model.LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
    tr_lab = [x.split(":")[0] for x in train_lab]
    ts_lab = [x.split(":")[0] for x in test_lab]
    model.fit(questions, np.array(tr_lab) == cls)
    train_data_prediction = [model.predict(vec) for vec in questions]
    test_data_prediction = [model.predict(average_vector(dictionary, line[1].lower())) for line in test_data]
    print "Train accuracy for class " + cls + ": " + str(compute_accuracy(train_data_prediction, np.array(tr_lab) == cls))
    print "Test accuracy for class " + cls + ": " + str(compute_accuracy(test_data_prediction, np.array(ts_lab) == cls))
    # print [model.predict(average_vector(dictionary, line[1].lower())) for line in train_data]
    return [model.predict_proba(average_vector(dictionary, line[1].lower())) for line in train_data]
    # return model

In [3]:
def remove_stop_words(data, threshold=150):
    result_data = []
    stop_word_dict = dict()
    for line in data:
        words = line[1].split()
        first = True
        for w in words:
            if first:
                first = False
                continue
            w = w.lower()
            if (w in stop_word_dict):
                cnt = stop_word_dict[w]
                stop_word_dict[w] = cnt + 1
            else:
                stop_word_dict[w] = 1
#     print sorted(stop_word_dict.items(), key=operator.itemgetter(1))
    for i, line in enumerate(data):
        res = []
        first = True
        for w in line[1].split():            
            if (first or stop_word_dict[w.lower()] < threshold):     
                res.append(w)
            first = False
        result_data.append((line[0]," ".join(res)))
    
    return result_data

In [4]:
word_vector_path = "data/glove.6B.50d.txt"
training_data_path = "data/train_5500.label"
testing_data_path = "data/TREC_10.label"
vector_dim = 50
word_vector = gensim.models.Word2Vec.load_word2vec_format(word_vector_path, binary=False)



In [5]:
train_data = load_data(training_data_path)
test_data = load_data(testing_data_path)
# train_data = remove_stop_words(train_data, 100)
# test_data = remove_stop_words(test_data, 100)
question_vectors = [average_vector(word_vector, line[1]) for line in train_data]
train_labels = [line[0] for line in train_data]
test_labels = [line[0] for line in test_data]

cfier = linear_model.LogisticRegression(multi_class='multinomial',solver='lbfgs')
cfier.fit(question_vectors, train_labels)
train_data_prediction = [cfier.predict(average_vector(word_vector, line[1].lower())) for line in train_data]
test_data_prediction = [cfier.predict(average_vector(word_vector, line[1].lower())) for line in test_data]

print "Accuracy with fine grained question classes:"
print "Train accuracy " + str(compute_accuracy(train_data_prediction, train_labels))
print "Test accuracy " + str(compute_accuracy(test_data_prediction, test_labels))

print ("Accuracy with coarse grained question classes:")
cfier = linear_model.LogisticRegression(multi_class='multinomial',solver='lbfgs')
coarse_test_labels = [line[0].split(":")[0] for line in test_data]
coarse_train_labels = [line[0].split(":")[0] for line in train_data]
cfier.fit(question_vectors, coarse_train_labels)
train_data_prediction = [cfier.predict(average_vector(word_vector, line[1].lower())) for line in train_data]
test_data_prediction = [cfier.predict(average_vector(word_vector, line[1].lower())) for line in test_data]
print "Train accuracy " + str(compute_accuracy(train_data_prediction, coarse_train_labels))
print "Test accuracy " + str(compute_accuracy(test_data_prediction, coarse_test_labels))

Accuracy with fine grained question classes:
Train accuracy 0.523110785033
Test accuracy 0.53
Accuracy with coarse grained question classes:
Train accuracy 0.687820983125
Test accuracy 0.722


In [6]:
#Reduced data set
s = set()
[s.add(elem) for elem in coarse_train_labels]

lab_len = coarse_train_labels.__len__()
cls_len = []
matrix = []
for c in s:
    prob = coarse_train_labels.count(c) / float(lab_len)
    questions = []
    labs = []
    for i, q_lab in enumerate(coarse_train_labels):
        if (q_lab == c or random.random() < prob):
            questions.append(question_vectors[i])
            labs.append(q_lab)
    print len(questions), 2*coarse_train_labels.count(c)
    matrix.append(is_in_class(word_vector, questions, c, labs, coarse_test_labels))

1544 1670
Train accuracy for class LOC: 0.847150259067
Test accuracy for class LOC: 0.818
2180 2446
Train accuracy for class HUM: 0.842201834862
Test accuracy for class HUM: 0.898
1631 1792
Train accuracy for class NUM: 0.82219497241
Test accuracy for class NUM: 0.806
163 172
Train accuracy for class ABBR: 0.815950920245
Test accuracy for class ABBR: 0.728
2207 2500
Train accuracy for class ENTY: 0.750792931581
Test accuracy for class ENTY: 0.588
2096 2324
Train accuracy for class DESC: 0.787213740458
Test accuracy for class DESC: 0.782


In [41]:
matrix = []
for c in s:
    matrix.append(is_in_class(word_vector, question_vectors, c, train_labels, test_labels))

Train accuracy for class LOC: 0.927549523111
Test accuracy for class LOC: 0.93
Train accuracy for class HUM: 0.893617021277
Test accuracy for class HUM: 0.964
Train accuracy for class NUM: 0.920579603815
Test accuracy for class NUM: 0.852
Train accuracy for class ABBR: 0.98422597212
Test accuracy for class ABBR: 0.982
Train accuracy for class ENTY: 0.811812179017
Test accuracy for class ENTY: 0.838
Train accuracy for class DESC: 0.837674247982
Test accuracy for class DESC: 0.728


In [7]:
npmatrix = np.array(matrix)

m2 = []
for i in range(len(matrix[0])):
    m2.append(max(npmatrix[:,i,0,1]) == npmatrix[:,i,0,1])

l = [i for i in s]    
mymap = {}
for i,e in enumerate(s):
    mymap[e] = i
new_lab = np.zeros([len(coarse_train_labels), 6], dtype=bool)
for i,l in enumerate(coarse_train_labels):
    new_lab[i][mymap[l]] = True

cnt = 0
for row1, row2 in zip(m2, new_lab):
    if False not in (row1 == row2):
        cnt += 1

print ("Classification into each class separately:")
print ("Train Accuracy " + str(cnt / float(len(m2))))

Classification into each class separately:
Train Accuracy 0.656089508437


In [141]:
cnt = 0
what_cnt = 0
whatis_cnt = 0
for line in test_data:
#     print line, cfier.predict(average_vector(word_vector, line[1].lower()))
    if (line[0].split(":")[0] != cfier.predict(average_vector(word_vector, line[1].lower()))):
        print line, cfier.predict(average_vector(word_vector, line[1].lower()))
        cnt += 1
    if (line[1].split(" ")[0].lower() == 'what'):
        what_cnt += 1
        if (line[1].split(" ")[1].lower() == 'is'):
            whatis_cnt += 1
print cnt, len(test_data), cnt / float(len(test_data))
print what_cnt/float(len(test_data)), whatis_cnt/float(len(test_data))

 ('ENTY:plant', "What is Australia 's national flower ?\n") ['LOC']
('HUM:ind', "What person 's head is on a dime ?\n") ['DESC']
('NUM:weight', 'What is the average weight of a Yellow Labrador ?\n') ['ENTY']
('NUM:other', 'What is the life expectancy for crickets ?\n') ['ENTY']
('LOC:other', 'What imaginary line is halfway between the North and South Poles ?\n') ['ENTY']
('NUM:speed', 'What is the average speed of the horses at the Kentucky Derby ?\n') ['ENTY']
('NUM:temp', 'What is the temperature at the center of the earth ?\n') ['LOC']
('HUM:gr', 'What is the name of the chocolate company in San Francisco ?\n') ['LOC']
('DESC:desc', 'What is done with worn or outdated flags ?\n') ['ENTY']
('NUM:speed', 'What is the speed hummingbirds fly ?\n') ['ENTY']
('HUM:ind', "What was W.C. Fields ' real name ?\n") ['ENTY']
('ENTY:food', 'What do bats eat ?\n') ['DESC']
('ENTY:termeq', 'What do you call a newborn kangaroo ?\n') ['DESC']
('LOC:other', 'What strait separates North America from As

AttributeError: 'list' object has no attribute 'lower'