In [26]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [2]:
# Directory holding the dataset, and the three split files read from it.
data_dir = "../data/SimpleQuestions_v2_modified/"
train_path, val_path, test_path = (
    os.path.join(data_dir, filename)
    for filename in ("train.txt", "val.txt", "test.txt")
)

In [15]:
# The split files have no header row, so the column names are supplied here.
# The same layout is used for all three splits.
# NOTE(review): pandas' default quoting treats '"' as a quote character; if any
# question contains a bare double quote, rows could be merged - confirm whether
# quoting needs to be disabled for these files.
split_columns = ["lineid", "entity_mid", "relation", "object", "question"]
train_df, val_df, test_df = (
    pd.read_table(split_path, header=None, names=split_columns)
    for split_path in (train_path, val_path, test_path)
)

In [4]:
# Try the code on a small slice first: the first 2000 training rows and the
# first 200 test rows (positional slicing).
# NOTE(review): the accuracy saved at the end of the notebook (66.6297...) is not
# a multiple of 0.5, so it cannot come from a 200-row test set - the saved
# outputs appear to have been produced without this subsetting. Confirm before
# relying on either.
train_df = train_df.iloc[:2000]
test_df = test_df.iloc[:200]

In [5]:
# Show the first five training rows to check the column layout.
train_df.head(5)

Unnamed: 0,lineid,entity_mid,relation,object,question
0,train-1,fb:m.04whkz5,fb:book.written_work.subjects,fb:m.01cj3p,what is the book e about
1,train-2,fb:m.0tp2p24,fb:music.release_track.release,fb:m.0sjc7c1,to what release does the release track cardiac...
2,train-3,fb:m.04j0t75,fb:film.film.country,fb:m.07ssc,what country was the film the debt from
3,train-4,fb:m.0ftqr,fb:music.producer.tracks_produced,fb:m.0p600l,what songs have nobuo uematsu produced?
4,train-5,fb:m.036p007,fb:music.release.producers,fb:m.0677ng,Who produced eve-olution?


In [16]:
# 1st baseline - logistic regression on tf-idf weighted bag-of-words features
logreg_steps = [
    ('vect', CountVectorizer()),     # raw token counts per question
    ('tfidf', TfidfTransformer()),   # re-weight the counts with tf-idf
    ('clf', LogisticRegression()),   # classifier predicting the relation
]
text_clf = Pipeline(logreg_steps)

# Fit on the training questions, then predict a relation for each test question.
text_clf.fit(train_df['question'], train_df['relation'])
predicted = text_clf.predict(test_df['question'])

# Accuracy as a percentage: share of predictions equal to the gold relation.
is_correct = predicted == test_df['relation']
accuracy = 100.0 * np.mean(is_correct)
print(accuracy)

In [19]:
# Sanity check: the number of predictions and of gold labels, printed one per line.
n_predicted = len(predicted)
n_gold = len(test_df['relation'])
print(n_predicted)
print(n_gold)

# Peek at the first five predictions and the first rows of the test set.
print(predicted[:5])
print(test_df.head())

Unnamed: 0,lineid,entity_mid,relation,object,question
0,test-1,fb:m.01jp8ww,fb:music.album.genre,fb:m.01qzt1,Which genre of album is harder.....faster?
1,test-2,fb:m.0np6z99,fb:music.album.release_type,fb:m.02lx2r,what format is fearless
2,test-3,fb:m.0wzc58l,fb:people.person.place_of_birth,fb:m.0n2z,what city was alex golfis born in
3,test-4,fb:m.0jtw9c,fb:film.writer.film,fb:m.05szq8z,what film is by the writer phil hay?
4,test-5,fb:m.0gys2sn,fb:people.deceased_person.place_of_death,fb:m.0tzls,Where did roger marquis die


In [31]:
# 2nd baseline - linear SVM: SGDClassifier with hinge loss and an L2 penalty,
# using the same count -> tf-idf feature steps as the first baseline.
# SGDClassifier is already imported in the notebook's import cell, so it is
# not re-imported here.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # random_state is pinned so the stochastic optimiser gives repeatable results
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42)),
])

# Fit on the training questions, then predict a relation for each test question.
text_clf.fit(train_df['question'], train_df['relation'])
predicted = text_clf.predict(test_df['question'])

# Accuracy as a percentage: share of predictions equal to the gold relation.
accuracy = 100.0 * np.mean(predicted == test_df['relation'])
print(accuracy)

66.6297782081
