Added notebook for testing classification using LAT features
pichljan committed Jul 29, 2015
1 parent 2a3f9dd commit 20bd6ca
Showing 7 changed files with 18,104 additions and 0 deletions.
8 changes: 8 additions & 0 deletions README.md
@@ -24,6 +24,14 @@ The accuracy on the test data set is around 72%. The question vector is computed as
If the first two words are "what is" then the question vector is computed as an average over all words. The questions starting with "what"
or "what is" are hard to classify because this would need more information about which word in the question is relevant to its type.

Classification using LAT features
=================================

With question LAT features, the average accuracy with cross-validation is around 82%.
The notebook testing this type of classification is called classify-from-features.ipynb.
TODO: dump train and test data sets at once in order to get the same number of features for both data sets
TODO: combine this classifier with the one using word embeddings
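For illustration, a minimal sketch of such a cross-validation run (the feature keys mirror q_to_fdict from fbpathtrain.py below; the classifier choice, cv=10 and the 'type' field are assumptions, not necessarily the notebook's exact setup):

    from sklearn.feature_extraction import DictVectorizer
    from sklearn.model_selection import cross_val_score
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import make_pipeline

    def lat_features(q):
        # one binary feature per LAT, keyed by its text and type
        return {'lat/%s/%s' % (lat['text'], lat['type']): 1 for lat in q['LAT']}

    # questions: list of dicts with 'LAT' annotations and a question-type label
    X = [lat_features(q) for q in questions]
    y = [q['type'] for q in questions]
    cfier = make_pipeline(DictVectorizer(), MultinomialNB())
    print(cross_val_score(cfier, X, y, cv=10).mean())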


DATASETS
========
6,084 changes: 6,084 additions & 0 deletions classify-from-features.ipynb

500 changes: 500 additions & 0 deletions data/test-data.json

500 changes: 500 additions & 0 deletions data/test-data.tsv

5,452 changes: 5,452 additions & 0 deletions data/train-data.json

5,452 changes: 5,452 additions & 0 deletions data/train-data.tsv
108 changes: 108 additions & 0 deletions fbpathtrain.py
@@ -0,0 +1,108 @@
"""
Service routines for training a Naive Bayes classifier to predict which
Freebase property paths would match answers given the question features.
"""

from __future__ import print_function

import sys

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MultiLabelBinarizer


def q_to_fdict(q):
    fdict = {}
    for lat in q['LAT']:
        fdict['lat/' + lat['text'] + '/' + lat['type']] = 1
    for sv in q['SV']:
        # NB: with multiple SVs, only the last one is kept; DictVectorizer
        # later one-hot encodes the string value
        fdict['sv'] = sv
    return fdict
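# For illustration, on a hypothetical question record:
#   q_to_fdict({'LAT': [{'text': 'city', 'type': 'headWord'}], 'SV': ['live']})
#   -> {'lat/city/headWord': 1, 'sv': 'live'}
# (DictVectorizer then encodes the string-valued 'sv' feature as 'sv=live'.)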


def q_to_lset(q):
    lset = set()
    for rp in q['relPaths']:
        lset.add('|'.join(rp[0]))
    return lset
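# For illustration, on a hypothetical record:
#   q_to_lset({'relPaths': [[['/film/actor/film', '/film/performance/film'], 2]]})
#   -> {'/film/actor/film|/film/performance/film'}
# (the per-path match count, 2 here, is ignored; see the TODO in VectorizedData)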


def mrr_by_score(Y, Yscores):
    recipr_ranks = []
    for i in range(np.size(Y, axis=0)):
        pathj_by_score = [k[0] for k in sorted(enumerate(Yscores[i]), key=lambda k: k[1], reverse=True)]
        n_j = 0
        rank = None
        for j in pathj_by_score:
            if Y[i][j] == 1:
                rank = n_j + 1
                break
            n_j += 1
        if rank is not None:
            recipr_ranks.append(1 / float(rank))
        else:
            # we are interested in MRR just for questions that *have* a solution
            pass
            # recipr_ranks.append(0)
    return np.mean(recipr_ranks)
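# For illustration: with Y = [[0, 1, 0]] and Yscores = [[0.2, 0.5, 0.9]],
# the paths are ranked (2, 1, 0) by score and the first correct path sits
# at rank 2, so mrr_by_score returns 1/2 = 0.5.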


class VectorizedData:
    """ Simple container that holds the input dataset
    in a sklearn-friendly form, with X, Y numpy vectors.
    TODO: we ignore # of matches for each fbpath """
    def __init__(self, data, Xdict=None, Ydict=None):
        fdict = [q_to_fdict(q) for q in data]
        lset = [q_to_lset(q) for q in data]

        if Xdict is None:
            self.Xdict = DictVectorizer()
            self.X = self.Xdict.fit_transform(fdict)
        else:
            self.Xdict = Xdict
            self.X = self.Xdict.transform(fdict)

        if Ydict is None:
            self.Ydict = MultiLabelBinarizer()
            self.Y = self.Ydict.fit_transform(lset)
        else:
            self.Ydict = Ydict

            # Filter out data with unknown labels, MultiLabelBinarizer() cannot
            # handle this
            known_lset = [set([label for label in ls if label in self.Ydict.classes_]) for ls in lset]
            lset_n = sum([len(ls) for ls in lset])
            known_lset_n = sum([len(ls) for ls in known_lset])
            if known_lset_n < lset_n:
                print('dropped %d out of %d labels (not in training set)' % (lset_n - known_lset_n, lset_n), file=sys.stderr)

            self.Y = self.Ydict.transform(known_lset)

    def cfier_score(self, cfier, scorer):
        """ Measure cfier performance on this dataset.
        scorer -> lambda cfier, X: cfier.predict_proba(X)
        (or decision_function when probabilities are not predicted) """
        skl_score = cfier.score(self.X.toarray(), self.Y)

        # XXX: matched paths could also be weighted by their nMatches

        # Measure prediction performance
        Ypred = cfier.predict(self.X.toarray())
        n_q = float(np.size(self.Y, axis=0))
        # proportion of questions where all correct paths have been recalled
        recall_all = np.sum(np.sum(self.Y, axis=1) == np.sum(Ypred * self.Y, axis=1)) / n_q
        # proportion of questions where at least one correct path has been recalled
        recall_any = np.sum((np.sum(self.Y, axis=1) != 0) == (np.sum(Ypred * self.Y, axis=1) != 0)) / n_q
        # proportion of predicted *PATHS* (not questions) that are correct
        precision = np.sum((Ypred + self.Y) == 2) / float(np.sum(Ypred))

        # Measure scoring performance
        Yscores = scorer(cfier, self.X.toarray())
        # MRR of the first correct path
        mrr = mrr_by_score(self.Y, Yscores)
        # TODO: number of questions with at least one correct path recalled in the top N paths

        return {'sklScore': skl_score, 'qRecallAll': recall_all, 'qRecallAny': recall_any, 'pPrec': precision, 'qScoreMRR': mrr}
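
A hypothetical end-to-end usage sketch of these routines (the Naive Bayes choice follows the module docstring; the file names and the assumption that the records carry 'LAT', 'SV' and 'relPaths' annotations are illustrative):

    import json
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.naive_bayes import MultinomialNB
    from fbpathtrain import VectorizedData

    with open('data/train-data.json') as f:
        train = VectorizedData(json.load(f))
    with open('data/test-data.json') as f:
        # reusing the train-time Xdict/Ydict keeps the test feature and label
        # spaces aligned with training (cf. the first TODO in the README)
        test = VectorizedData(json.load(f), train.Xdict, train.Ydict)

    cfier = OneVsRestClassifier(MultinomialNB()).fit(train.X.toarray(), train.Y)
    print(test.cfier_score(cfier, lambda c, X: c.predict_proba(X)))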
