In [1]:
import pandas as pd
import numpy as np
import codecs 
import json
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
dicts = []
with codecs.open('/home/agrigorev/git-projects/allen-qa/lucene-features-4-no-w2v.json', 'r', 'utf-8') as f:
    for line in f:
        #if line.contains('"source":"VALIDATION"'):
        #    continue
        dicts.append(json.loads(line))


In [3]:
len(dicts)

42528

In [3]:
questions = pd.DataFrame(dicts)
questions.columns

Index([u'answer', u'answerLetter', u'ck12EbookAnswerResult', u'ck12EbookBothQADoc', u'ck12EbookBothQADocAMustHave', u'ck12EbookBothQADocCount', u'ck12EbookBothQADocCountAMustHave', u'ck12EbookBothQAScores', u'ck12EbookBothQAScoresMustHave', u'ck12EbookQuestionResult', u'ck12WikiAnswerResult', u'ck12WikiBothQADoc', u'ck12WikiBothQADocAMustHave', u'ck12WikiBothQADocCount', u'ck12WikiBothQADocCountAMustHave', u'ck12WikiBothQAScores', u'ck12WikiBothQAScoresMustHave', u'ck12WikiQuestionResult', u'label', u'ngramsAnswer', u'ngramsCk12EbookAnswerResult', u'ngramsCk12EbookBothQADoc', u'ngramsCk12EbookBothQADocCount', u'ngramsCk12EbookBothQADocCountAMustHave', u'ngramsCk12EbookBothQADocMustHave', u'ngramsCk12EbookBothQAScores', u'ngramsCk12EbookBothQAScoresMustHave', u'ngramsCk12EbookQuestionResult', u'ngramsCk12WikiAnswerResult', u'ngramsCk12WikiBothQADoc', u'ngramsCk12WikiBothQADocCount', u'ngramsCk12WikiBothQADocCountAMustHave', u'ngramsCk12WikiBothQADocMustHave', u'ngramsCk12WikiBothQAScore

In [4]:
def extract_doc_features(docs):
    top = docs[:3]
    names = [d['title'] for d in top]
    all_scores = [d['score'] for d in docs]
    if not all_scores:
        all_scores = [0]

    top_scores = [d['score'] for d in top]
    if not top_scores:
        top_scores = [0]

    median_score = np.median(all_scores)
    mean_score = np.mean(all_scores)
    min_score = np.min(all_scores)

    score_1 = np.max(top_scores)
    score_2 = np.median(top_scores)
    score_3 = np.min(top_scores)

    return pd.Series((names, median_score, mean_score, min_score, score_1, score_2, score_3))

def names(pref):
    res = ['names', 'median_score', 'mean_score', 'min_score', 'score_1', 'score_2', 'score_3']
    return [pref + '_' + n for n in res]

In [5]:
questions[names('ck_12_answer')] = questions.ck12EbookAnswerResult.apply(extract_doc_features)
questions[names('ck_12_question')] = questions.ck12EbookQuestionResult.apply(extract_doc_features)

questions[names('wiki_answer')] = questions.ck12WikiAnswerResult.apply(extract_doc_features)
questions[names('wiki_question')] = questions.ck12WikiQuestionResult.apply(extract_doc_features)

questions[names('ck_12_ngram_answer')] = questions.ngramsCk12EbookAnswerResult.apply(extract_doc_features)
questions[names('ck_12_ngram_question')] = questions.ngramsCk12EbookQuestionResult.apply(extract_doc_features)

questions[names('wiki_ngram_answer')] = questions.ngramsCk12WikiAnswerResult.apply(extract_doc_features)
questions[names('wiki_ngram_question')] = questions.ngramsCk12WikiQuestionResult.apply(extract_doc_features)

In [42]:
questions.drop(['word2vecAnswer', 'word2vecQuestion', 'word2vecCosine', 'word2vecMissing'], inplace=1, axis=1)

ValueError: labels ['word2vecAnswer' 'word2vecQuestion' 'word2vecCosine'] not contained in axis

    questions.word2vecAnswer = questions.word2vecAnswer.apply(lambda x: np.array(x).astype('float'))
    questions.word2vecQuestion = questions.word2vecQuestion.apply(lambda x: np.array(x).astype('float'))
    questions.word2vecCosine = questions.word2vecCosine.astype('float')

    def NaNs_to_zeros(vec):
        if np.isnan(vec).all():
            return np.zeros_like(vec)
        else:
            return vec

    questions.word2vecAnswer = questions.word2vecAnswer.apply(NaNs_to_zeros)
    questions.word2vecQuestion = questions.word2vecQuestion.apply(NaNs_to_zeros)
    questions.word2vecCosine[questions.word2vecCosine.isnull()] = 0.0

In [6]:
score_features = [u'ck12EbookBothQAScores', u'ck12EbookBothQAScoresMustHave',
                    u'ck12WikiBothQAScores', u'ck12WikiBothQAScoresMustHave',
                    u'ngramsCk12EbookBothQAScores', u'ngramsCk12EbookBothQAScoresMustHave',
                    u'ngramsCk12WikiBothQAScores', u'ngramsCk12WikiBothQAScoresMustHave']

for f in score_features:
    questions[f] = questions[f].apply(lambda x: np.array(x).astype('float'))

In [8]:
def score_names(pref):
    return ['%s_both_score_%d' % (pref, i) for i in range(1, 4)]

def top_scores(scores):
    return pd.Series(scores[:3])

for f in score_features:
    questions[score_names(f)] = questions[f].apply(top_scores)


In [121]:
def chunks(l, n):
    for i in xrange(0, len(l), n):
        yield l[i:i+n]

In [127]:
def to_boolean(series):
    series = np.array(series)

    res = np.zeros_like(series, dtype=int)    
    for i, chunk in enumerate(chunks(series, 4)):
        max = np.max(chunk)
        max_ids, = np.where(chunk == max)
        for id in max_ids:
            res[4 * i + id] = 1
    return res

In [130]:
num = questions._get_numeric_data()
benchmark_features = list(num.columns)
benchmark_features

[u'ck12EbookBothQADocCount',
 u'ck12EbookBothQADocCountAMustHave',
 u'ck12WikiBothQADocCount',
 u'ck12WikiBothQADocCountAMustHave',
 u'ngramsCk12EbookBothQADocCount',
 u'ngramsCk12EbookBothQADocCountAMustHave',
 u'ngramsCk12WikiBothQADocCount',
 u'ngramsCk12WikiBothQADocCountAMustHave',
 'ck_12_answer_median_score',
 'ck_12_answer_mean_score',
 'ck_12_answer_min_score',
 'ck_12_answer_score_1',
 'ck_12_answer_score_2',
 'ck_12_answer_score_3',
 'ck_12_question_median_score',
 'ck_12_question_mean_score',
 'ck_12_question_min_score',
 'ck_12_question_score_1',
 'ck_12_question_score_2',
 'ck_12_question_score_3',
 'wiki_answer_median_score',
 'wiki_answer_mean_score',
 'wiki_answer_min_score',
 'wiki_answer_score_1',
 'wiki_answer_score_2',
 'wiki_answer_score_3',
 'wiki_question_median_score',
 'wiki_question_mean_score',
 'wiki_question_min_score',
 'wiki_question_score_1',
 'wiki_question_score_2',
 'wiki_question_score_3',
 'ck_12_ngram_answer_median_score',
 'ck_12_ngram_answer_mea

In [131]:
for f in benchmark_features:
    questions[f + '_ismax'] = to_boolean(questions[f])

In [9]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from scipy import sparse
from collections import Counter

In [132]:
train = questions[questions.source == 'TRAIN']

In [12]:
results = []

for id, group in train.groupby('questionId'):
    row = {}
    row['correct'] = (group.label == 'true').values.argmax()

    for f in benchmark_features:
        row[f] = group[f].values.argmax()
    
    results.append(row)

results = pd.DataFrame(results)

In [13]:
bf = []
for f in benchmark_features:
    accucary = (results[f] == results.correct).mean()
    bf.append((f, accucary)) 
    
bf = pd.DataFrame(bf, columns=['feature', 'accuracy'])
bf.sort('accuracy', ascending=0)

Unnamed: 0,feature,accuracy
70.0,ngramsCk12EbookBothQAScores_both_score_1,0.3896
71.0,ngramsCk12EbookBothQAScores_both_score_2,0.3848
59.0,ck12EbookBothQAScores_both_score_2,0.3788
73.0,ngramsCk12EbookBothQAScoresMustHave_both_score_1,0.3772
58.0,ck12EbookBothQAScores_both_score_1,0.3760
74.0,ngramsCk12EbookBothQAScoresMustHave_both_score_2,0.3732
72.0,ngramsCk12EbookBothQAScores_both_score_3,0.3724
61.0,ck12EbookBothQAScoresMustHave_both_score_1,0.3704
60.0,ck12EbookBothQAScores_both_score_3,0.3620
75.0,ngramsCk12EbookBothQAScoresMustHave_both_score_3,0.3616


In [31]:
def calc_accuracy(y, y_score):
    questions = chunks(zip(y, y_score),  4)

    total = 0
    correct = 0
    for chunk in questions:
        y_chunk, y_score_chunk = zip(*chunk)
        correct = correct + (np.argmax(y_chunk) == np.argmax(y_score_chunk))
        total = total + 1
    
    return 1.0 * correct / total

calc_accuracy(y, questions.ngramsCk12EbookBothQAScores_both_score_1.values)

0.3896

In [32]:
def train_test_split(X, y, n_iter=5, test_size=0.25):
    if sparse.issparse(X):
        X = X.tocsr()
    n = len(y)
    qidx = np.arange(n) / 4
    qidx = qidx[::4]

    split = cross_validation.ShuffleSplit(n=len(qidx), n_iter=n_iter, test_size=test_size)

    for train, test in split:
        train_qidx = qidx[train]
        test_qidx = qidx[test]
        
        train_idx = np.repeat(train_qidx, 4) * 4 + np.arange(len(train_qidx) * 4) % 4
        test_idx  = np.repeat(test_qidx, 4) * 4  + np.arange(len(test_qidx) * 4) % 4
        
        yield (X[train_idx], y[train_idx], X[test_idx], y[test_idx])

In [29]:
y = (train.label == 'true').astype(int)

In [144]:
X_0 = train[best_features].values

In [154]:
best_features = bf[bf.accuracy >= 0.30].feature
best_features = best_features.apply(lambda x: x + '_ismax').values

In [171]:
from sklearn.neighbors import KNeighborsClassifier

In [173]:
accs = []
for X_train, y_train, X_test, y_test in train_test_split(X_0, y):
    clf = KNeighborsClassifier(n_neighbors=1)
    clf.fit(X_train, y_train)
    
    y_score = clf.predict_proba(X_test)[:, 1]
    acc = calc_accuracy(y_test, y_score)
    accs.append(acc)
    print acc, roc_auc_score(y_test, y_score)

np.mean(accs), np.std(accs)

0.304 0.555466666667
0.28 0.537333333333
0.2768 0.530933333333
0.2336 0.495466666667
0.3072 0.5472


(0.28032000000000001, 0.026376231724793434)

In [27]:
train_f1 = list(train.apply(lambda x:'%s' % (x['question']),axis=1))
train_f2 = list(train.apply(lambda x:'%s' % (x['answer']),axis=1))
train_f3 = list(train.apply(lambda x:'%s' % (x['type']),axis=1))

tfv1 = TfidfVectorizer(input=u'content', encoding=u'utf-8', decode_error=u'strict', 
                       strip_accents=None, lowercase=True, preprocessor=None, 
                       tokenizer=None, analyzer=u'word', stop_words=None, 
                       token_pattern=u'(?u)\\b\\w\\w+\\b', ngram_range=(1, 2), max_df=1.0, min_df=1)
train_f1 = tfv1.fit_transform(train_f1)

tfv2 = TfidfVectorizer(input=u'content', encoding=u'utf-8', decode_error=u'strict', 
                       strip_accents=None, lowercase=True, preprocessor=None, 
                       tokenizer=None, analyzer=u'word', stop_words=None, 
                       token_pattern=u'(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1)
train_f2 = tfv2.fit_transform(train_f2)

tfv3 = TfidfVectorizer(input=u'content', encoding=u'utf-8', decode_error=u'strict', 
                       strip_accents=None, lowercase=True, preprocessor=None, 
                       tokenizer=None, analyzer=u'word', stop_words=None, 
                       token_pattern=u'(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1)
train_f3 = tfv3.fit_transform(train_f3)



In [28]:
# X_w2v = np.column_stack((np.vstack(train.word2vecQuestion), 
#                      np.vstack(train.word2vecAnswer),
#                      train.word2vecCosine))

# X = sparse.hstack((X_w2v, train_f1, train_f2, train_f3))
X = sparse.hstack((train_f1, train_f2, train_f3))

In [33]:
accs = []
for X_train, y_train, X_test, y_test in train_test_split(X, y):
    clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    clf.fit(X_train, y_train)
    
    y_score = clf.predict_proba(X_test)[:, 1]
    acc = calc_accuracy(y_test, y_score)
    accs.append(acc)
    print acc, roc_auc_score(y_test, y_score)

np.mean(accs), np.std(accs)

0.2976 0.552881066667
0.2848 0.54693376
0.3264 0.564781653333
0.2864 0.54079104
0.2976 0.555137706667


(0.29855999999999999, 0.014927236850803981)

In [34]:
from sklearn.decomposition import randomized_svd
from sklearn.decomposition import NMF
from sklearn.random_projection import GaussianRandomProjection

from sklearn.preprocessing import Normalizer

In [35]:
X_12 = sparse.hstack((train_f1, train_f2))
X_12.shape

(10000, 25508)

In [36]:
def svd(X, K):
    normalizer = Normalizer(copy=False)
    _, _, Vt = randomized_svd(X, n_components=K)
    X_red = X.dot(Vt.T)
    X_red = normalizer.fit_transform(X_red)
    return Vt, normalizer, X_red

In [37]:
def apply_svd(X, Vt, normalizer):
    X_red = X.dot(Vt.T)
    X_red = normalizer.transform(X_red)
    return X_red

In [38]:
_, _, X_12_lsa = svd(X_12, 100)

In [39]:
X_2 = sparse.hstack((train_f1, train_f2, train_f3, X_12_lsa))

In [40]:
accs = []

for X_train, y_train, X_test, y_test in train_test_split(X_2, y):
    clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    clf.fit(X_train, y_train)
    
    y_score = clf.predict_proba(X_test)[:, 1]
    acc = calc_accuracy(y_test, y_score)
    accs.append(acc)
    print acc, roc_auc_score(y_test, y_score)

np.mean(accs), np.std(accs)

0.2704 0.52239872
0.3312 0.56574592
0.288 0.549410133333
0.2912 0.537565013333
0.2608 0.510480213333


(0.28832000000000002, 0.02418912152187426)

In [44]:
def identity_analyzer(x):
    return x

In [46]:
tf_ck12_d = TfidfVectorizer(analyzer=identity_analyzer, min_df=1)
ck12_docs_train = tf_ck12_d.fit_transform(train.ck12EbookBothQADoc)

tf_ck12_dmh = TfidfVectorizer(analyzer=identity_analyzer, min_df=1)
ck12_docs_mh_train = tf_ck12_dmh.fit_transform(train.ck12EbookBothQADocAMustHave)

In [48]:
X_3 = sparse.hstack((ck12_docs_train, ck12_docs_mh_train))

In [49]:
accs = []

for X_train, y_train, X_test, y_test in train_test_split(X_3, y):
    clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    clf.fit(X_train, y_train)
    
    y_score = clf.predict_proba(X_test)[:, 1]
    acc = calc_accuracy(y_test, y_score)
    accs.append(acc)
    print acc, roc_auc_score(y_test, y_score)

np.mean(accs), np.std(accs)

0.272 0.528690346667
0.2944 0.54183296
0.2928 0.526051413333
0.3072 0.537538133333
0.296 0.534621866667


(0.29247999999999996, 0.011421803710447823)

In [77]:
ck12_Vt, ck12_norm, ck12_doc_lsa = svd(X_3, 100)

In [51]:
accs = []

for X_train, y_train, X_test, y_test in train_test_split(ck12_doc_lsa, y):
    clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    clf.fit(X_train, y_train)
    
    y_score = clf.predict_proba(X_test)[:, 1]
    acc = calc_accuracy(y_test, y_score)
    accs.append(acc)
    print acc, roc_auc_score(y_test, y_score)

np.mean(accs), np.std(accs)

0.2912 0.5165632
0.2816 0.524692053333
0.2784 0.526139733333
0.2864 0.530528426667
0.272 0.51839616


(0.28192, 0.0065892032902316782)

In [57]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [53]:
ck_12_num_feature_names = names('ck_12_question')[1:] + names('ck_12_answer')[1:] + \
        score_names(u'ck12EbookBothQAScores') + score_names(u'ck12EbookBothQAScoresMustHave')
ck_12_num_features = train[ck_12_num_feature_names]

In [78]:
X_5 = np.hstack([ck_12_num_features.values])

accs = []

for X_train, y_train, X_test, y_test in train_test_split(X_5, y):
    clf = LogisticRegressionCV()
    clf.fit(X_train, y_train)
    
    y_score = clf.predict_proba(X_test)[:, 1]
    acc = calc_accuracy(y_test, y_score)
    accs.append(acc)
    print acc, roc_auc_score(y_test, y_score)

np.mean(accs), np.std(accs)

0.4112 0.587826773333
0.3664 0.5839232
0.3728 0.595456853333
0.3776 0.5882432
0.3792 0.593838506667


(0.38144, 0.015532366207374847)

In [79]:
X_5 = np.hstack([ck12_doc_lsa, ck_12_num_features.values])

accs = []

for X_train, y_train, X_test, y_test in train_test_split(X_5, y):
    clf = LogisticRegressionCV()
    clf.fit(X_train, y_train)
    
    y_score = clf.predict_proba(X_test)[:, 1]
    acc = calc_accuracy(y_test, y_score)
    accs.append(acc)
    print acc, roc_auc_score(y_test, y_score)

np.mean(accs), np.std(accs)

0.376 0.569581653333
0.4208 0.596921173333
0.3696 0.573149866667
0.352 0.589038506667
0.392 0.594828373333


(0.38207999999999992, 0.023217097148437841)

In [76]:
accs = []

for X_train, y_train, X_test, y_test in train_test_split(X_5, y):
    clf = LogisticRegressionCV(penalty='l1', solver='liblinear')
    clf.fit(X_train, y_train)
    
    y_score = clf.predict_proba(X_test)[:, 1]
    acc = calc_accuracy(y_test, y_score)
    accs.append(acc)
    print acc, roc_auc_score(y_test, y_score)

np.mean(accs), np.std(accs)

0.2128 0.47862784
0.24 0.49056896
0.2 0.480704


KeyboardInterrupt: 

In [113]:
ck12_lucene = train[['ck12EbookBothQADocCount', 'ck12EbookBothQADocCountAMustHave', 
                    'ck12EbookKendallTauCorr', 'ck12EbookSpearmanCorr']]

In [114]:
X_6 = sparse.hstack([train_f1, train_f2, ck12_doc_lsa, ck_12_num_features.values, ck12_lucene.values])

accs = []

for X_train, y_train, X_test, y_test in train_test_split(X_5, y):
    clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    clf.fit(X_train, y_train)
    
    y_score = clf.predict_proba(X_test)[:, 1]
    acc = calc_accuracy(y_test, y_score)
    accs.append(acc)
    print acc, roc_auc_score(y_test, y_score)

np.mean(accs), np.std(accs)

0.3072 0.543762773333
0.2912 0.540124586667
0.272 0.54571776
0.3248 0.543277653333
0.2704 0.524720213333


(0.29311999999999999, 0.020822142060796713)

array([[  6.7411747 ],
       [  6.85592365],
       [  6.7411747 ],
       ..., 
       [  2.63126969],
       [  0.        ],
       [ 21.66536331]])

In [102]:
X_7 = train.ngramsCk12EbookBothQAScoresMustHave_both_score_1.values.reshape((-1, 1))
accs = []

for X_train, y_train, X_test, y_test in train_test_split(X_7, y):
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    
    y_score = clf.predict_proba(X_test)[:, 1]
    acc = calc_accuracy(y_test, y_score)
    accs.append(acc)
    print acc, roc_auc_score(y_test, y_score)

np.mean(accs), np.std(accs)

0.368 0.554702933333
0.3584 0.560657493333
0.344 0.5491392
0.3616 0.558532693333
0.3888 0.55894016


(0.36415999999999993, 0.014615279675736623)

In [120]:
X_7 = train.ngramsCk12EbookBothQAScoresMustHave_both_score_1.values.reshape((-1, 1))
accs = []

for X_train, y_train, X_test, y_test in train_test_split(X_7, y):
    #clf = LogisticRegression()
    #clf.fit(X_train, y_train)
    
    y_score = X_test[:, 0]
    acc = calc_accuracy(y_test, y_score)
    accs.append(acc)
    print acc, roc_auc_score(y_test, y_score)

np.mean(accs), np.std(accs)

 0.3712 0.550682026667
0.3792 0.560206933333
0.368 0.552768426667
0.3632 0.554533546667
0.384 0.557558186667


(0.37311999999999995, 0.0075319054694014797)

## Submission

In [159]:
validation = questions[questions.source == 'VALIDATION']

In [101]:
validation_f1 = list(validation.apply(lambda x:'%s' % (x['question']),axis=1))
validation_f2 = list(validation.apply(lambda x:'%s' % (x['answer']),axis=1))

validation_f1 = tfv1.transform(validation_f1)
validation_f2 = tfv2.transform(validation_f2)


In [103]:
# ck12_doc_features = sparse.hstack([train_ck12_wiki_ans_docs, train_ck12_wiki_q_docs])
# ck12_Vt, ck12_norm, ck12_doc_lsa = svd(ck12_doc_features, 100)

val_ck12_wiki_ans_docs = tf_ck12_wiki_a.transform(validation.ck_12_answer_names)
val_ck12_wiki_q_docs = tf_ck12_wiki_q.transform(validation.ck_12_question_names)

In [104]:
ck12_doc_lsa_val = apply_svd(sparse.hstack([val_ck12_wiki_ans_docs, val_ck12_wiki_q_docs]), 
                             ck12_Vt, ck12_norm)

In [102]:
ck_12_num_features_validation = validation[ck_12_num_feature_names]

In [111]:
X_5 = sparse.hstack([train_f1, train_f2, ck12_doc_lsa, ck_12_num_features.values])

clf = RandomForestClassifier(n_jobs=-1, n_estimators=1000, max_features=60)
clf.fit(X_5, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=60, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [112]:
import pickle

In [114]:
with open('rf_1000_60_tfidf_ck12_doc.bin', 'w') as f:
    pickle.dump([clf, ck12_Vt, ck12_norm], f)

In [116]:
X_5_val = sparse.hstack([validation_f1, validation_f2, ck12_doc_lsa_val, ck_12_num_features_validation.values])


In [160]:
X_0_val = validation[best_features]


In [161]:
clf = LogisticRegressionCV()
clf.fit(X_0, y)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [81]:
ck_12_num_features_val = validation[ck_12_num_feature_names]

In [162]:
y_score = clf.predict_proba(X_0_val)[:, 1]

In [163]:
idx = np.arange(len(validation))

val_index = pd.DataFrame({'idx': idx, 'questionId': validation.questionId, 
                          'answer': validation.answerLetter,
                          'score': y_score})
val_index.head(n=8)

Unnamed: 0,answer,idx,questionId,score
10000,A,0,102501,0.174482
10001,B,1,102501,0.162006
10002,C,2,102501,0.435921
10003,D,3,102501,0.19768
10004,A,4,102502,0.174482
10005,B,5,102502,0.19986
10006,C,6,102502,0.221377
10007,D,7,102502,0.341383


In [165]:
result = []

for id, group in val_index.groupby('questionId'):
    answer_idx = group.score.values.argmax()
    answer = group.answer.values[answer_idx]
    result.append((id, answer))

In [166]:
res = pd.DataFrame(result, columns=['id', 'correctAnswer'])
res.to_csv('/home/agrigorev/git-projects/allen-qa/validation_result.csv', index=0)

Wiki