In [1]:
import pandas as pd
import numpy as np

import codecs
import json

import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
dicts = []


In [None]:
with codecs.open('lucene-features-6-processed.json', 'r', 'utf-8') as f:
    for line in f:
        row = json.loads(line)
        dicts.append(row)

In [41]:
questions = pd.DataFrame(dicts)
del dicts

In [42]:
questions.sort(columns=['questionId', 'answerLetter'], inplace=1)

In [43]:
questions.head()

Unnamed: 0,answer,answerLetter,ck12_ebook_aScores_doc_1,ck12_ebook_aScores_doc_10,ck12_ebook_aScores_doc_10_down,ck12_ebook_aScores_doc_10_up,ck12_ebook_aScores_doc_1_down,ck12_ebook_aScores_doc_1_up,ck12_ebook_aScores_doc_2,ck12_ebook_aScores_doc_2_down,ck12_ebook_aScores_doc_2_up,ck12_ebook_aScores_doc_3,ck12_ebook_aScores_doc_3_down,ck12_ebook_aScores_doc_3_up,ck12_ebook_aScores_doc_4,ck12_ebook_aScores_doc_4_down,ck12_ebook_aScores_doc_4_up,ck12_ebook_aScores_doc_5,ck12_ebook_aScores_doc_5_down,ck12_ebook_aScores_doc_5_up,Unnamed: 21
7344,[tissue level],A,7.024899,0.0,-3.441484,3.441484,3.583416,10.466383,7.024899,3.583416,10.466383,7.024899,3.583416,10.466383,7.024899,3.583416,10.466383,0.0,-3.441484,3.441484,...
7345,[organ level],B,7.535725,0.0,-3.01429,3.01429,4.521435,10.550015,7.535725,4.521435,10.550015,0.0,-3.01429,3.01429,0.0,-3.01429,3.01429,0.0,-3.01429,3.01429,...
7346,[system level],C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...
7347,[cellular level],D,7.024899,0.0,-3.441484,3.441484,3.583416,10.466383,7.024899,3.583416,10.466383,7.024899,3.583416,10.466383,7.024899,3.583416,10.466383,0.0,-3.441484,3.441484,...
7348,"[smell, air, odor]",A,20.842577,4.753002,-0.872465,10.378468,15.21711,26.468044,20.197306,14.571839,25.822772,9.281975,3.656508,14.907441,8.380727,2.75526,14.006194,8.253588,2.628121,13.879054,...


### Let's find what numeric features are most accurate

In [44]:
train = questions[questions.source == 'TRAIN']

In [45]:
num = train._get_numeric_data()
numeric_features = list(num.columns)

In [46]:
results = []

for id, group in train.groupby('questionId'):
    row = {}
    row['correct'] = (group.label == 'true').values.argmax()

    for f in numeric_features:
        row[f] = group[f].values.argmax()
    
    results.append(row)

results = pd.DataFrame(results)

In [47]:
bf = []
for f in numeric_features:
    accucary = (results[f] == results.correct).mean()
    bf.append((f, accucary)) 
    
bf = pd.DataFrame(bf, columns=['feature', 'accuracy'])
bf.sort('accuracy', ascending=0)

Unnamed: 0,feature,accuracy
91.0,ck12_ebook_bothQAScores_doc_7_up,0.3916
85.0,ck12_ebook_bothQAScores_doc_5_up,0.3864
97.0,ck12_ebook_bothQAScores_doc_9_up,0.3856
71.0,ck12_ebook_bothQAScores_doc_10_up,0.3848
165.0,ck12_ebook_ngrams_bothQAScoresMustHave_doc_9_up,0.3844
150.0,ck12_ebook_ngrams_bothQAScoresMustHave_doc_4_up,0.3832
162.0,ck12_ebook_ngrams_bothQAScoresMustHave_doc_8_up,0.3832
88.0,ck12_ebook_bothQAScores_doc_6_up,0.3816
98.0,ck12_ebook_bothQAScores_mean,0.3800
64.0,ck12_ebook_bothQAScoresMustHave_doc_9_up,0.3800


In [48]:
best_features = bf[bf.accuracy >= .35]

In [49]:
boolean_features = pd.DataFrame(dict(questionId=questions.questionId, 
                                     answerLetter=questions.answerLetter, 
                                     source=questions.source, 
                                     label=questions.label))

In [50]:
def chunks(l, n):
    for i in xrange(0, len(l), n):
        yield l[i:i+n]

In [51]:
def to_boolean(series):
    series = np.array(series)

    res = np.zeros_like(series, dtype=int)    
    for i, chunk in enumerate(chunks(series, 4)):
        max = np.max(chunk)
        max_ids, = np.where(chunk == max)
        if len(max_ids) == 4:
            continue
        for id in max_ids:
            res[4 * i + id] = 1
    return res

In [52]:
for f in best_features.feature:
    boolean_features[f] = to_boolean(questions[f])

In [53]:
total = best_features.accuracy.sum() 

In [54]:
feature_weights = dict((f, a / total) for (f, a) in zip(bf.feature, bf.accuracy) if a >= 0.35)
features, weights = zip(*feature_weights.items())
features = list(features)
weights = list(weights)

## Training

Let's find uncorrelated features

In [141]:
def select_uncorrelated(size=3):
    selected = []

    feature1, = np.random.choice(features, p=weights, size=1, replace=0)
    selected.append(feature1)

    pair_wise = boolean_features[features].corr()
    pw1 = pair_wise[feature1]
    pw1_names = pd.Series(pw1.index, index=pw1.index)
    feature2 = pw1[pw1_names.apply(lambda c: c not in selected)].argmin()
    selected.append(feature2)

    n = size - 2    
    while n > 0:
        pair_wise = boolean_features[features].corr()
        pw2 = pair_wise[feature2]
        pw12 = pw1 + pw2
        pw12_names = pd.Series(pw12.index, index=pw12.index)
        feature3 = pw1[pw1_names.apply(lambda c: c not in selected)].argmin()
        selected.append(feature3)
        pw1 = pw12
    
        n = n - 1

    return selected

In [116]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from scipy import sparse
from collections import Counter

In [119]:
y = (train.label == 'true').astype(int).values

In [120]:
def calc_accuracy(y, y_score):
    questions = chunks(zip(y, y_score),  4)

    total = 0
    correct = 0
    for chunk in questions:
        y_chunk, y_score_chunk = zip(*chunk)
        correct = correct + (np.argmax(y_chunk) == np.argmax(y_score_chunk))
        total = total + 1
    
    return 1.0 * correct / total

calc_accuracy(y, train.ck12_ebook_bothQAScores_doc_7_up.values)

0.3916

In [121]:
def train_test_split(X, y, n_iter=5, test_size=0.25):
    if sparse.issparse(X):
        X = X.tocsr()
    n = len(y) / 4

    split = cross_validation.ShuffleSplit(n=n, n_iter=n_iter, test_size=test_size)

    for train, test in split:
        train_idx = np.repeat(train, 4) * 4 + np.arange(len(train) * 4) % 4
        test_idx  = np.repeat(test, 4) * 4  + np.arange(len(test) * 4) % 4
        
        yield (X[train_idx], y[train_idx], X[test_idx], y[test_idx])

In [150]:
selected = select_uncorrelated(5)
selected

['ck12_ebook_ngrams_bothQAScoresMustHave_doc_5_up',
 'wiki_full_bothQAScoresMustHave_doc_1',
 'wiki_full_bothQAScores_doc_9_up',
 'ck12_ebook_bothQAScores_std',
 'ck12_ebook_bothQAScores_doc_2_down']

In [151]:
accs = []

X_0 = train[selected].values

for X_train, y_train, X_test, y_test in train_test_split(X_0, y):
    clf = LogisticRegressionCV()
    clf.fit(X_train, y_train)

    y_score = clf.predict_proba(X_test)[:, 1]
    acc = calc_accuracy(y_test, y_score)
    accs.append(acc)
    print acc, roc_auc_score(y_test, y_score)

mean_acc = np.mean(accs)
print mean_acc, np.std(accs)

0.3984 0.6357056
0.3984 0.6473728
0.4048 0.637325653333
0.3904 0.631365546667
0.3664 0.608250453333
0.39168 0.01344


## Submission

In [159]:
test = boolean_features[questions.source == 'TEST']

In [153]:
clf = LogisticRegressionCV().fit(X_0, y)

In [157]:
X_test = test[selected].values

In [158]:
y_score = clf.predict_proba(X_test)[:, 1]

ValueError: Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required.

In [257]:
idx = np.arange(len(test))

val_index = pd.DataFrame({'idx': idx, 'questionId': test.questionId, 
                          'answer': test.answerLetter,
                          'score': y_score})
val_index.head(n=8)

Unnamed: 0,answer,idx,questionId,score
70363,A,0,102501,0.243561
70367,B,1,102501,0.243561
70371,C,2,102501,0.271181
70376,D,3,102501,0.243561
70381,A,4,102502,0.243561
70384,B,5,102502,0.243561
70388,C,6,102502,0.248903
70393,D,7,102502,0.265529


In [258]:
result = []

for id, group in val_index.groupby('questionId'):
    answer_idx = group.score.values.argmax()
    answer = group.answer.values[answer_idx]
    result.append((id, answer))

In [259]:
res = pd.DataFrame(result, columns=['id', 'correctAnswer'])
res.to_csv('/home/agrigorev/git-projects/allen-qa/test_result.csv', index=0)

Wiki