Added notebook for testing classification using LAT features
pichljan committed Jul 29, 2015
1 parent 2a3f9dd commit 20bd6ca
Showing 7 changed files with 18,104 additions and 0 deletions.
8 changes: 8 additions & 0 deletions README.md
@@ -24,6 +24,14 @@ The accuracy on the test data set is around 72%. The question vector is computed as
If the first two words are "what is" then the question vector is computed as an average over all words. The questions starting with "what"
or "what is" are hard to classify because this would need more information about which word in the question is relevant to its type.

Classification using LAT features
=================================

With question LAT features, the average accuracy with cross-validation is around 82%.
The notebook testing this type of classification is called classify-from-features.ipynb.
TODO: dump train and test data sets at once in order to get the same number of features for both data sets
TODO: combine this classifier with the one using word embeddings
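For illustration, a minimal sketch of such a cross-validation run (the feature keys mirror q_to_fdict from fbpathtrain.py below; the classifier choice, cv=10 and the 'type' field are assumptions, not necessarily the notebook's exact setup):

    from sklearn.feature_extraction import DictVectorizer
    from sklearn.model_selection import cross_val_score
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import make_pipeline

    def lat_features(q):
        # one binary feature per LAT, keyed by its text and type
        return {'lat/%s/%s' % (lat['text'], lat['type']): 1 for lat in q['LAT']}

    # questions: list of dicts with 'LAT' annotations and a question-type label
    X = [lat_features(q) for q in questions]
    y = [q['type'] for q in questions]
    cfier = make_pipeline(DictVectorizer(), MultinomialNB())
    print(cross_val_score(cfier, X, y, cv=10).mean())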


DATASETS
========
6,084 changes: 6,084 additions & 0 deletions classify-from-features.ipynb

500 changes: 500 additions & 0 deletions data/test-data.json

500 changes: 500 additions & 0 deletions data/test-data.tsv

5,452 changes: 5,452 additions & 0 deletions data/train-data.json

5,452 changes: 5,452 additions & 0 deletions data/train-data.tsv
108 changes: 108 additions & 0 deletions fbpathtrain.py
@@ -0,0 +1,108 @@
"""
Service routines for training a Naive Bayes classifier to predict which
Freebase property paths would match answers given the question features.
"""

from __future__ import print_function

import sys

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MultiLabelBinarizer


def q_to_fdict(q):
    fdict = {}
    for lat in q['LAT']:
        fdict['lat/' + lat['text'] + '/' + lat['type']] = 1
    for sv in q['SV']:
        # NB: with multiple SVs, only the last one is kept; DictVectorizer
        # later one-hot encodes the string value
        fdict['sv'] = sv
    return fdict
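# For illustration, on a hypothetical question record:
#   q_to_fdict({'LAT': [{'text': 'city', 'type': 'headWord'}], 'SV': ['live']})
#   -> {'lat/city/headWord': 1, 'sv': 'live'}
# (DictVectorizer then encodes the string-valued 'sv' feature as 'sv=live'.)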


def q_to_lset(q):
    lset = set()
    for rp in q['relPaths']:
        lset.add('|'.join(rp[0]))
    return lset
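# For illustration, on a hypothetical record:
#   q_to_lset({'relPaths': [[['/film/actor/film', '/film/performance/film'], 2]]})
#   -> {'/film/actor/film|/film/performance/film'}
# (the per-path match count, 2 here, is ignored; see the TODO in VectorizedData)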


def mrr_by_score(Y, Yscores):
    recipr_ranks = []
    for i in range(np.size(Y, axis=0)):
        pathj_by_score = [k[0] for k in sorted(enumerate(Yscores[i]), key=lambda k: k[1], reverse=True)]
        n_j = 0
        rank = None
        for j in pathj_by_score:
            if Y[i][j] == 1:
                rank = n_j + 1
                break
            n_j += 1
        if rank is not None:
            recipr_ranks.append(1 / float(rank))
        else:
            # we are interested in MRR just for questions that *have* a solution
            pass
            # recipr_ranks.append(0)
    return np.mean(recipr_ranks)
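# For illustration: with Y = [[0, 1, 0]] and Yscores = [[0.2, 0.5, 0.9]],
# the paths are ranked (2, 1, 0) by score and the first correct path sits
# at rank 2, so mrr_by_score returns 1/2 = 0.5.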


class VectorizedData:
    """ Simple container that holds the input dataset
    in a sklearn-friendly form, with X, Y numpy vectors.
    TODO: we ignore # of matches for each fbpath """
    def __init__(self, data, Xdict=None, Ydict=None):
        fdict = [q_to_fdict(q) for q in data]
        lset = [q_to_lset(q) for q in data]

        if Xdict is None:
            self.Xdict = DictVectorizer()
            self.X = self.Xdict.fit_transform(fdict)
        else:
            self.Xdict = Xdict
            self.X = self.Xdict.transform(fdict)

        if Ydict is None:
            self.Ydict = MultiLabelBinarizer()
            self.Y = self.Ydict.fit_transform(lset)
        else:
            self.Ydict = Ydict

            # Filter out data with unknown labels, MultiLabelBinarizer() cannot
            # handle this
            known_lset = [set([label for label in ls if label in self.Ydict.classes_]) for ls in lset]
            lset_n = sum([len(ls) for ls in lset])
            known_lset_n = sum([len(ls) for ls in known_lset])
            if known_lset_n < lset_n:
                print('dropped %d out of %d labels (not in training set)' % (lset_n - known_lset_n, lset_n), file=sys.stderr)

            self.Y = self.Ydict.transform(known_lset)

    def cfier_score(self, cfier, scorer):
        """ Measure cfier performance on this dataset.
        scorer -> lambda cfier, X: cfier.predict_proba(X)
        (or decision_function when probabilities are not predicted) """
        skl_score = cfier.score(self.X.toarray(), self.Y)

        # XXX: matched paths could also be weighted by their nMatches

        # Measure prediction performance
        Ypred = cfier.predict(self.X.toarray())
        n_q = float(np.size(self.Y, axis=0))
        # proportion of questions where all correct paths have been recalled
        recall_all = np.sum(np.sum(self.Y, axis=1) == np.sum(Ypred * self.Y, axis=1)) / n_q
        # proportion of questions where at least one correct path has been recalled
        recall_any = np.sum((np.sum(self.Y, axis=1) != 0) == (np.sum(Ypred * self.Y, axis=1) != 0)) / n_q
        # proportion of predicted *PATHS* (not questions) that are correct
        precision = np.sum((Ypred + self.Y) == 2) / float(np.sum(Ypred))

        # Measure scoring performance
        Yscores = scorer(cfier, self.X.toarray())
        # MRR of the first correct path
        mrr = mrr_by_score(self.Y, Yscores)
        # TODO: number of questions with at least one correct path recalled in the top N paths

        return {'sklScore': skl_score, 'qRecallAll': recall_all, 'qRecallAny': recall_any, 'pPrec': precision, 'qScoreMRR': mrr}
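
A hypothetical end-to-end usage sketch of these routines (the Naive Bayes choice follows the module docstring; the file names and the assumption that the records carry 'LAT', 'SV' and 'relPaths' annotations are illustrative):

    import json
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.naive_bayes import MultinomialNB
    from fbpathtrain import VectorizedData

    with open('data/train-data.json') as f:
        train = VectorizedData(json.load(f))
    with open('data/test-data.json') as f:
        # reusing the train-time Xdict/Ydict keeps the test feature and label
        # spaces aligned with training (cf. the first TODO in the README)
        test = VectorizedData(json.load(f), train.Xdict, train.Ydict)

    cfier = OneVsRestClassifier(MultinomialNB()).fit(train.X.toarray(), train.Y)
    print(test.cfier_score(cfier, lambda c, X: c.predict_proba(X)))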
