In [1]:
import pandas as pd
import numpy as np

import codecs
import json

import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
def unwrap_lucene_features(name, row):
    """Flatten one Lucene feature group into a flat ``{column: value}`` dict.

    ``row[name]`` must be a dict holding four score lists (``qScores``,
    ``aScores``, ``bothQAScores``, ``bothQAScoresMustHave``) and the four
    matching document-id lists (``qDocs``, ``aDocs``, ``bothQADoc``,
    ``bothQADocAMustHave``).  For every score list the result contains:

    * ``<scores>_mean``, ``<scores>_std``, ``<scores>_median``;
    * ``<scores>_doc_scores`` -- a list of ``(doc_id, score)`` pairs;
    * ``<scores>_doc_<i>``, ``<scores>_doc_<i>_up``, ``<scores>_doc_<i>_down``
      -- the i-th score (1-based) and that score plus / minus the std.

    Any other key of ``row[name]`` is passed through unchanged; the raw score
    and document-id lists themselves are left out.  Every key of the returned
    dict is prefixed with ``name + '_'``.

    The input ``row[name]`` is never modified.
    """
    doc_scores = ['qScores', 'aScores', 'bothQAScores', 'bothQAScoresMustHave']
    doc_ids = ['qDocs', 'aDocs', 'bothQADoc', 'bothQADocAMustHave']

    source = row[name]
    raw_keys = set(doc_scores + doc_ids)

    # Work on a filtered copy so the caller's dict is left untouched.
    flat = dict((key, value) for (key, value) in source.items()
                if key not in raw_keys)

    for scores, docs in zip(doc_scores, doc_ids):
        d_scores = source[scores]
        std = np.std(d_scores)
        flat[scores + '_mean'] = np.mean(d_scores)
        flat[scores + '_std'] = std
        flat[scores + '_median'] = np.median(d_scores)
        # Materialise the pairs: a bare zip() is a one-shot iterator on Python 3.
        flat[scores + '_doc_scores'] = list(zip(source[docs], d_scores))

        for i, score in enumerate(d_scores, 1):
            flat['%s_doc_%d' % (scores, i)] = score
            flat['%s_doc_%d_up' % (scores, i)] = score + std
            flat['%s_doc_%d_down' % (scores, i)] = score - std

    return dict((name + '_' + key, value) for (key, value) in flat.items())

In [96]:
lucene_features = ['wiki_ck12_ngrams_slide']

def transform(row):
    """Flatten the configured Lucene feature groups into ``row`` (in place).

    For every name in ``lucene_features`` the flattened columns produced by
    ``unwrap_lucene_features`` are merged into ``row``.  The nested
    ``'luceneFeatures'`` and ``'word2VecFeatures'`` entries are then removed.
    Returns the same ``row`` object.
    """
    for feature_name in lucene_features:
        row.update(unwrap_lucene_features(feature_name, row['luceneFeatures']))

    for nested_key in ('luceneFeatures', 'word2VecFeatures'):
        del row[nested_key]
    return row

In [97]:
# Parse the JSON-lines feature dump: one JSON object per line, flattened by transform().
with codecs.open('/home/agrigorev/git-projects/allen-qa/lucene-features-7-wikislide.json', 'r', 'utf-8') as f:
    dicts = [transform(json.loads(line)) for line in f]


In [98]:
# One DataFrame row per parsed JSON line; the intermediate list is released afterwards.
questions = pd.DataFrame(dicts)
del dicts

In [99]:
# Order the rows by question id and then by answer letter, in place.
# NOTE(review): `DataFrame.sort(columns=...)` is the legacy pandas spelling; newer pandas
# exposes this as `sort_values(by=...)` -- confirm against the pandas version in use.
questions.sort(columns=['questionId', 'answerLetter'], inplace=1)

In [100]:
# Preview the first rows of the flattened feature table.
questions.head()

Unnamed: 0,answer,answerLetter,label,ngramsAnswer,ngramsQuestion,question,questionId,rawAnswer,rawQuestion,source,type,wiki_ck12_ngrams_slide_aScores_doc_1,wiki_ck12_ngrams_slide_aScores_doc_10,wiki_ck12_ngrams_slide_aScores_doc_10_down,wiki_ck12_ngrams_slide_aScores_doc_10_up,wiki_ck12_ngrams_slide_aScores_doc_1_down,wiki_ck12_ngrams_slide_aScores_doc_1_up,wiki_ck12_ngrams_slide_aScores_doc_2,wiki_ck12_ngrams_slide_aScores_doc_2_down,wiki_ck12_ngrams_slide_aScores_doc_2_up,Unnamed: 21
60162,[tissue level],A,False,"[tissue, level, tissue level]","[athlete, exercise, heart, rate, respiration, ...","[athlete, exercise, heart rate, respiration ra...",100001,at the tissue level,"when athletes begin to exercise, their heart r...",TRAIN,USUAL,33.65089,31.992455,31.411829,32.57308,33.070265,34.231516,33.65089,33.070265,34.231516,...
60166,[organ level],B,False,"[organ, level, organ level]","[athlete, exercise, heart, rate, respiration, ...","[athlete, exercise, heart rate, respiration ra...",100001,at the organ level,"when athletes begin to exercise, their heart r...",TRAIN,USUAL,39.921066,12.742194,4.138801,21.345588,31.317673,48.52446,37.932362,29.328968,46.535755,...
60170,[system level],C,True,"[system, level, system level]","[athlete, exercise, heart, rate, respiration, ...","[athlete, exercise, heart rate, respiration ra...",100001,at the system level,"when athletes begin to exercise, their heart r...",TRAIN,USUAL,28.896399,26.15295,24.91477,27.39113,27.658218,30.134579,28.867155,27.628975,30.105335,...
60175,[cellular level],D,False,"[cellular, level, cellular level]","[athlete, exercise, heart, rate, respiration, ...","[athlete, exercise, heart rate, respiration ra...",100001,at the cellular level,"when athletes begin to exercise, their heart r...",TRAIN,USUAL,29.844318,29.844318,29.844318,29.844318,29.844318,29.844318,29.844318,29.844318,29.844318,...
60178,"[smell, air, odor]",A,False,"[smell, air, odor, smell air, air odor, smell ...","[describe, learn, behavior, dog, describe lear...","[describe, learn behavior, dog]",100002,smelling the air for odors,which example describes a learned behavior in ...,TRAIN,USUAL,12.171963,7.382993,5.365214,9.400772,10.154184,14.189742,11.81347,9.795691,13.831249,...


### Let's find which numeric features are most accurate

In [101]:
# Keep only the rows whose `source` is 'TRAIN'.
train = questions[questions.source == 'TRAIN']

In [102]:
# Names of the numeric columns of the training rows.
# NOTE(review): `_get_numeric_data` is a private pandas method.
num = train._get_numeric_data()
numeric_features = list(num.columns)

In [103]:
# For every question: position of the first row labelled 'true' ('correct') and, for each
# numeric feature, position of the row with the largest value of that feature.
results = []

for id, group in train.groupby('questionId'):
    row = {'correct': (group.label == 'true').values.argmax()}
    row.update((f, group[f].values.argmax()) for f in numeric_features)
    results.append(row)

results = pd.DataFrame(results)

In [105]:
# Accuracy of each numeric feature: share of questions where its argmax row equals 'correct'.
bf = [(f, (results[f] == results.correct).mean()) for f in numeric_features]

bf = pd.DataFrame(bf, columns=['feature', 'accuracy'])
bf.sort('accuracy', ascending=0)

Unnamed: 0,feature,accuracy
94.0,wiki_ck12_ngrams_slide_bothQAScores_doc_8_up,0.3612
83.0,wiki_ck12_ngrams_slide_bothQAScores_doc_5,0.3608
80.0,wiki_ck12_ngrams_slide_bothQAScores_doc_4,0.3600
91.0,wiki_ck12_ngrams_slide_bothQAScores_doc_7_up,0.3600
98.0,wiki_ck12_ngrams_slide_bothQAScores_mean,0.3600
75.0,wiki_ck12_ngrams_slide_bothQAScores_doc_2_down,0.3592
97.0,wiki_ck12_ngrams_slide_bothQAScores_doc_9_up,0.3588
71.0,wiki_ck12_ngrams_slide_bothQAScores_doc_10_up,0.3584
88.0,wiki_ck12_ngrams_slide_bothQAScores_doc_6_up,0.3580
86.0,wiki_ck12_ngrams_slide_bothQAScores_doc_6,0.3580


In [94]:
# Keep the features whose accuracy is at least 0.35 and show how many there are.
best_features = bf[bf.accuracy >= .35]
len(best_features)

26

In [95]:
# Frame holding only the identifying columns of `questions` (id, letter, source, label).
boolean_features = pd.DataFrame(dict(questionId=questions.questionId, 
                                     answerLetter=questions.answerLetter, 
                                     source=questions.source, 
                                     label=questions.label))

In [61]:
def chunks(l, n):
    """Yield consecutive slices of ``l`` of length ``n``.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.
    ``l`` must support ``len()`` and slicing.  Uses ``range`` so the helper
    runs on both Python 2 and Python 3.
    """
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [62]:
def to_boolean(series, group_size=4):
    """Flag the maximum value(s) inside every consecutive group of rows.

    Parameters
    ----------
    series : sequence of numbers
        Values laid out in consecutive groups of ``group_size`` entries.
    group_size : int, optional
        Number of entries per group (default 4).

    Returns
    -------
    numpy.ndarray of int
        Same length as ``series``; 1 where the value equals the maximum of
        its group, 0 elsewhere.  A group in which all ``group_size`` values
        tie carries no information and is left as all zeros.
    """
    values = np.array(series)

    res = np.zeros_like(values, dtype=int)
    for start in range(0, len(values), group_size):
        chunk = values[start:start + group_size]
        best_ids, = np.where(chunk == np.max(chunk))
        if len(best_ids) == group_size:
            # Every entry ties for the maximum: nothing to flag.
            continue
        res[start + best_ids] = 1
    return res

In [63]:
# For every selected feature add two indicator columns computed by to_boolean():
# '<f>max' from the feature values and '<f>min' from the negated values.
for f in best_features.feature:
    boolean_features[f + 'max'] = to_boolean(  questions[f])
    boolean_features[f + 'min'] = to_boolean(- questions[f])

In [64]:
# Sum of the accuracies of the selected features.
total = best_features.accuracy.sum() 

In [65]:
# Weight of each feature with accuracy >= 0.35: its accuracy divided by `total`.
feature_weights = dict((f, a / total) for (f, a) in zip(bf.feature, bf.accuracy) if a >= 0.35)
# Split the mapping into parallel lists of feature names and weights.
features, weights = zip(*feature_weights.items())
features = list(features)
weights = list(weights)

In [66]:
# Indicator column names: all '<f>max' names followed by all '<f>min' names.
all_features = [f + 'max' for f in features] + [f + 'min' for f in features]

## Training

In [21]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from scipy import sparse
from collections import Counter

In [22]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

In [23]:
from sklearn.decomposition import randomized_svd
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection

In [24]:
# Training rows of the indicator-feature table.
# NOTE: this rebinds `train`, which previously held the TRAIN rows of `questions`.
train = boolean_features[boolean_features.source == 'TRAIN']

In [25]:
# Binary target: 1 where `label` equals the string 'true', 0 otherwise.
y = (train.label == 'true').astype(int).values

In [27]:
def calc_accuracy(y, y_score, group_size=4):
    """Share of groups in which the top-scored row is the top-labelled row.

    ``y`` and ``y_score`` are read in parallel and cut into consecutive groups
    of ``group_size`` entries.  A group counts as correct when the position of
    the first maximum of its labels equals the position of the first maximum
    of its scores.

    Parameters
    ----------
    y : iterable of numbers
        Labels, one per row.
    y_score : iterable of numbers
        Scores, one per row.  If the two inputs differ in length, the extra
        tail of the longer one is ignored (``zip`` semantics).
    group_size : int, optional
        Number of rows per group (default 4).

    Returns
    -------
    float
        ``correct groups / total groups``.

    Raises
    ------
    ValueError
        If there is not a single (label, score) pair to evaluate.
    """
    # Materialise the pairs: on Python 3 zip() is an iterator without len().
    pairs = list(zip(y, y_score))
    if not pairs:
        raise ValueError('calc_accuracy() needs at least one (label, score) pair')

    total = 0
    correct = 0
    for start in range(0, len(pairs), group_size):
        y_chunk, y_score_chunk = zip(*pairs[start:start + group_size])
        correct += int(np.argmax(y_chunk) == np.argmax(y_score_chunk))
        total += 1

    return 1.0 * correct / total

In [28]:
def train_test_split(X, y, n_iter=5, test_size=0.25, random_state=None):
    """Yield random train/test splits that keep groups of 4 consecutive rows together.

    Rows ``4*g .. 4*g + 3`` are treated as group ``g``; whole groups are
    shuffled and assigned to the train or the test side, so a group is never
    divided between the two.

    Parameters
    ----------
    X : array or scipy sparse matrix
        Feature matrix; sparse input is converted to CSR so it can be
        row-indexed.
    y : array
        Targets, one per row of ``X``.  Rows beyond the last complete group
        of 4 are never selected.
    n_iter : int, optional
        Number of splits to generate.
    test_size : float, optional
        Forwarded to ``ShuffleSplit`` and applied to the number of groups.
    random_state : int, RandomState or None, optional
        Forwarded to ``ShuffleSplit``; pass a fixed value for reproducible
        splits.  The default ``None`` keeps the previous unseeded behaviour.

    Yields
    ------
    tuple
        ``(X_train, y_train, X_test, y_test)`` for each split.
    """
    if sparse.issparse(X):
        X = X.tocsr()
    # Floor division: a plain `/` yields a float on Python 3.
    n = len(y) // 4

    split = cross_validation.ShuffleSplit(n=n, n_iter=n_iter, test_size=test_size,
                                          random_state=random_state)

    for train, test in split:
        # Expand each group index g into its four row indices 4g, 4g+1, 4g+2, 4g+3.
        train_idx = np.repeat(train, 4) * 4 + np.arange(len(train) * 4) % 4
        test_idx = np.repeat(test, 4) * 4 + np.arange(len(test) * 4) % 4

        yield (X[train_idx], y[train_idx], X[test_idx], y[test_idx])

In [29]:
# Indicator-feature matrix of the training rows.
X_0 = train[all_features].values

In [30]:
# Repeated hold-out evaluation (10 splits, test_size=0.1) of LogisticRegressionCV on the
# indicator features; prints the mean and standard deviation of the per-split accuracy.
accs = []
for X_train, y_train, X_test, y_test in train_test_split(X_0, y, n_iter=10, test_size=0.1):
    clf = LogisticRegressionCV()
    clf.fit(X_train, y_train)

    # Probability of the positive class is used as the ranking score within each group.
    y_score = clf.predict_proba(X_test)[:, 1]
    acc = calc_accuracy(y_test, y_score)
    accs.append(acc)
    
mean_acc = np.mean(accs)
print '%0.3f ± %0.3f' % (mean_acc, np.std(accs))

0.362 ± 0.032


In [31]:
# Same evaluation over 50 splits, but the features are first reduced to 2 NMF components
# fitted on the training part of each split. Prints accuracy and ROC AUC per split, then
# the mean and standard deviation of the accuracy.
# NOTE: `nmf` and `clf` keep the objects from the last iteration after the loop ends.
accs = []
for X_train, y_train, X_test, y_test in train_test_split(X_0, y, n_iter=50, test_size=0.1):
    nmf = NMF(n_components=2, init='random')
    X_train_red = nmf.fit_transform(X_train)

    clf = LogisticRegressionCV()
    clf.fit(X_train_red, y_train)

    X_test_red = nmf.transform(X_test)
    y_score = clf.predict_proba(X_test_red)[:, 1]
    acc = calc_accuracy(y_test, y_score)
    accs.append(acc)
    print acc, roc_auc_score(y_test, y_score)
    
mean_acc = np.mean(accs)
print '%0.3f ± %0.3f' % (mean_acc, np.std(accs))

0.368 0.59868
0.32 0.580837333333
0.36 0.5854
0.36 0.571826666667
0.38 0.598088
0.332 0.586261333333
0.368 0.60564
0.38 0.583664
0.4 0.608442666667
0.42 0.659642666667
0.372 0.635237333333
0.388 0.626304
0.4 0.601997333333
0.336 0.58504
0.364 0.583554666667
0.348 0.602792
0.312 0.582512
0.344 0.611072
0.416 0.643592
0.384 0.611050666667
0.376 0.613874666667
0.352 0.611658666667
0.332 0.603984
0.372 0.624469333333
0.424 0.650152
0.348 0.610210666667
0.384 0.627578666667
0.368 0.585816
0.38 0.616074666667
0.392 0.628592
0.324 0.565925333333
0.42 0.63232
0.412 0.616736
0.408 0.625616
0.388 0.623101333333
0.384 0.615208
0.34 0.566205333333
0.368 0.597346666667
0.352 0.600909333333
0.376 0.624069333333
0.34 0.601752
0.356 0.626485333333
0.36 0.570874666667
0.356 0.597330666667
0.368 0.612117333333
0.368 0.591733333333
0.348 0.611562666667
0.292 0.539816
0.404 0.650656
0.368 0.605858666667
0.368 ± 0.029


## Similarity between question and answer

### LSA on q/a docs

In [67]:
# TRAIN rows of the full feature table.
questions_train = questions[questions.source == 'TRAIN']

In [68]:
def identity_analyzer(x):
    """Return ``x`` itself, unchanged."""
    passthrough = x
    return passthrough

In [69]:
from sklearn.feature_extraction import DictVectorizer

In [70]:
# {doc_id: score} dicts for the training rows: the question-side dict of every 4th row
# followed by the answer-side dict of every row.
# NOTE(review): these columns carry the prefix `wiki_full`, but the only prefix configured in
# `lucene_features` in this notebook is 'wiki_ck12_ngrams_slide' -- this cell appears to depend
# on state from an earlier session; confirm which feature group is intended.
all_docs_train = \
    list(questions_train.wiki_full_qScores_doc_scores[::4].apply(dict)) + \
    list(questions_train.wiki_full_aScores_doc_scores.apply(dict))

In [71]:
# Fit the vectorizer on the training {doc_id: score} dicts.
dv = DictVectorizer()
dv.fit(all_docs_train)

DictVectorizer(dtype=<type 'numpy.float64'>, separator='=', sort=True,
        sparse=True)

In [72]:
# Vectorize the question-side and answer-side {doc_id: score} dicts of every row (all sources).
# NOTE(review): same `wiki_full` column prefix as above -- not produced by the visible
# `lucene_features` configuration; confirm.
q_vec = dv.transform(list(questions.wiki_full_qScores_doc_scores.apply(dict)))
a_vec = dv.transform(list(questions.wiki_full_aScores_doc_scores.apply(dict)))


In [73]:
# Row-wise product q * a.T of the question and answer vectors; 0.0 when the product is falsy.
sim_vec = []
for q, a in zip(q_vec, a_vec):
    qa = (q * a.T)
    sim_vec.append(qa.data[0] if qa else 0.0)

sim_vec = np.array(sim_vec)

In [74]:
# Store the similarity as a column of `questions`.
questions['ck12_qa_doc_sim'] = sim_vec

In [75]:
# Accuracy obtained by ranking the TRAIN rows by the similarity alone.
calc_accuracy(y, sim_vec[questions.source.values == 'TRAIN'])

0.3124

In [76]:
# Training vectors: question side of every 4th TRAIN row and answer side of every TRAIN row,
# stacked into one sparse matrix.
# NOTE(review): same `wiki_full` column prefix as in the cells above -- confirm.
q_vec_train = dv.transform(list(questions_train.wiki_full_qScores_doc_scores.iloc[::4].apply(dict)))
a_vec_train = dv.transform(list(questions_train.wiki_full_aScores_doc_scores.apply(dict)))

all_docs = sparse.vstack([q_vec_train, a_vec_train])

In [77]:
# 300-component truncated SVD fitted on the stacked training vectors.
# NOTE: no random_state is passed, so the fitted components can differ between runs.
svd_doc = TruncatedSVD(n_components=300)
svd_doc.fit(all_docs)

TruncatedSVD(algorithm='randomized', n_components=300, n_iter=5,
       random_state=None, tol=0.0)

In [78]:
# Project the question and answer vectors of every row with the fitted SVD.
q_vec_svd = svd_doc.transform(q_vec)
a_vec_svd = svd_doc.transform(a_vec)

In [79]:
# Row-wise dot product of the SVD-projected question and answer vectors.
doc_sim_svd = np.array([q.dot(a) for q, a in zip(q_vec_svd, a_vec_svd)])

In [80]:
# Store the SVD-space similarity as a column of `questions`.
questions['wiki_qa_doc_sim_svd'] = doc_sim_svd

In [81]:
# Accuracy obtained by ranking the TRAIN rows by the SVD-space similarity alone.
calc_accuracy(y, doc_sim_svd[questions.source.values == 'TRAIN'])

0.3264

In [612]:
# Per-answer feature table for export; only the names listed in `columns` are kept.
# The 'source' entry previously had no value, so Python concatenated the adjacent string
# literals into a single key 'sourcelabel' and the exported 'label' column had no data.
# NOTE(review): `docsim` and `nmf_all` are not defined in the visible cells -- confirm
# where they come from.
# NOTE: this rebinds `features`, which earlier held the list of selected feature names.
features = pd.DataFrame({
              'questionId': questions.questionId, 
              'answerLetter': questions.answerLetter, 
              'source': questions.source,
              'label': questions.label, 
              'ck12_qa_doc_sim_svd': docsim.ck12_qa_doc_sim_svd_max,
              'ck12_nmf_0': nmf_all[:, 0],
              'ck12_nmf_1': nmf_all[:, 1]},
            columns=['questionId', 'answerLetter', 'label', 
                     'ck12_qa_doc_sim_svd', 'ck12_nmf_0', 'ck12_nmf_1'])

In [615]:
# Write the feature table as a tab-separated file without the index column.
features.to_csv('/home/agrigorev/git-projects/allen-qa/features/ck12_features.txt', index=0, sep='\t')

## Submission

In [42]:
# TEST rows of the indicator-feature table and of `docsim`.
# NOTE(review): the first mask comes from `questions.source`, the second from
# `boolean_features.source`; `docsim` is not defined in the visible cells -- confirm.
test = boolean_features[questions.source == 'TEST']
test_docsim = docsim[boolean_features.source == 'TEST']

In [582]:
# Indicator-feature matrix of the TEST rows and its NMF projection.
# NOTE(review): in the visible cells `nmf` is last bound inside the NMF evaluation loop, i.e. it
# would be the model fitted on the training part of the final split -- confirm that is intended.
X_0_test = test[all_features].values
X_0_test_red = nmf.transform(X_0_test)

In [583]:
# Accuracy obtained by ranking the training rows by column 1 of `X_0_red`.
# NOTE(review): `X_0_red` is not defined in the visible cells -- confirm where it comes from.
calc_accuracy(y, X_0_red[:, 1])

0.4208

In [587]:
# Use column 1 of the NMF projection of the TEST rows directly as the score.
y_score = X_0_test_red[:, 1]

In [546]:
# Concatenate the NMF projection and the `test_docsim` values column-wise.
X_2_test = np.hstack([X_0_test_red, test_docsim.values])

In [547]:
# Fit the final classifier on `X_2` and the training target.
# NOTE(review): `X_2` is not defined in the visible cells -- confirm where it comes from.
clf = LogisticRegressionCV()
clf.fit(X_2, y)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [548]:
# Positive-class probability of every TEST row.
y_score = clf.predict_proba(X_2_test)[:, 1]

In [588]:
# Table of TEST rows with a running position, question id, answer letter and score.
idx = np.arange(len(test))

val_index = pd.DataFrame({'idx': idx, 'questionId': test.questionId, 
                          'answer': test.answerLetter,
                          'score': y_score})
val_index.head(n=8)

Unnamed: 0,answer,idx,questionId,score
52382,A,0,102501,0.0
52383,B,1,102501,0.0
52386,C,2,102501,1.22432
52388,D,3,102501,0.0
52390,A,4,102502,0.0
52393,B,5,102502,0.383347
52394,C,6,102502,0.172953
52397,D,7,102502,0.670249


In [589]:
# For every question pick the answer letter of the first row with the highest score.
result = []

for id, group in val_index.groupby('questionId'):
    best_position = group.score.values.argmax()
    result.append((id, group.answer.values[best_position]))

In [590]:
# Write the (id, correctAnswer) pairs as a CSV file without the index column.
res = pd.DataFrame(result, columns=['id', 'correctAnswer'])
res.to_csv('/home/agrigorev/git-projects/allen-qa/test_result.csv', index=0)

### Majority Vote

In [192]:
# One row per TEST question: for every weighted feature, the answer letter of the first row
# whose '<feature>max' indicator is largest within the question.
# The feature names are read from `feature_weights` rather than from `features`, because the
# ck12 export cell rebinds `features` to a DataFrame.
result = []

for id, group in test.groupby('questionId'):
    row = {'questionId': id}
    for f in feature_weights:
        answer_idx = group[f + 'max'].values.argmax()
        answer = group.answerLetter.values[answer_idx]
        row[f] = answer

    result.append(row)

result = pd.DataFrame(result)

def majority_vote(row, vote_weights=None):
    """Return the answer that collects the largest total feature weight.

    Parameters
    ----------
    row : mapping
        Maps each feature name to the answer that feature voted for
        (for example one row of the per-question vote table).
    vote_weights : mapping, optional
        Maps feature name to its weight.  Defaults to the module-level
        ``feature_weights``.

    Returns
    -------
    The answer with the highest summed weight.

    Raises
    ------
    ValueError
        If ``vote_weights`` is empty, since there is then no vote to return.
    """
    if vote_weights is None:
        vote_weights = feature_weights

    # A Counter already treats missing keys as 0, so no default needs to be registered.
    # The former `cnt.setdefault(<function>)` call stored the function itself as a key with
    # the value None, which makes most_common() fail on Python 3.
    cnt = Counter()
    for feature, weight in vote_weights.items():
        cnt[row[feature]] += weight

    (lab, _), = cnt.most_common(1)
    return lab

# Apply the weighted vote to every row of the per-question vote table.
mvote = result.apply(majority_vote, axis=1)

In [198]:
# Write the majority-vote answers as (id, correctAnswer) pairs to a CSV file without the index.
mvote_res = pd.DataFrame({'id': result.questionId, 'correctAnswer': mvote}, columns=['id', 'correctAnswer'])
mvote_res.to_csv('/home/agrigorev/git-projects/allen-qa/test_result_mvote.csv', index=0)