In [8]:
import pandas as pd
import numpy as np

from collections import Counter

import matplotlib.pyplot as plt 
%matplotlib inline

In [102]:
# Load the tab-separated feature table used for training and evaluation below.
features_path = '/home/agrigorev/git-projects/allen-qa/features.txt'
features = pd.read_csv(features_path, sep='\t')


In [103]:
def string_to_vec(s):
    """Parse a comma-separated string of numbers into a 1-D float array.

    Parameters
    ----------
    s : str
        Text such as ``'0.1,0.2,0.3'``.

    Returns
    -------
    numpy.ndarray
        Float array with one element per comma-separated field.

    Raises
    ------
    ValueError
        If any field cannot be parsed by ``float``.
    """
    # Materialise a list before calling np.array: where ``map`` is lazy
    # (Python 3) passing it directly produces a 0-d object array that wraps
    # the iterator instead of a numeric vector.
    return np.array([float(field) for field in s.split(',')])

# Replace the comma-separated text in both vector columns with numeric arrays.
features['question_vec'] = features['question_vec'].apply(string_to_vec)
features['answer_vec'] = features['answer_vec'].apply(string_to_vec)

In [174]:
def apply_row(row):
    """Build the feature vector for a single question/answer row.

    Parameters
    ----------
    row : object
        Anything exposing ``question_vec``, ``answer_vec`` and
        ``question_answer_cos_sim`` as attributes (for example a row yielded
        by ``DataFrame.iterrows``). The two vectors may be NumPy arrays or
        pandas Series and must have the same length.

    Returns
    -------
    numpy.ndarray
        The element-wise ``|question_vec - answer_vec|`` followed by the
        row's ``question_answer_cos_sim`` value as the last element.
    """
    # string_to_vec stores plain ndarrays in these cells, and an ndarray has
    # no ``.values`` attribute; np.asarray handles ndarrays and Series alike.
    q = np.asarray(row.question_vec)
    a = np.asarray(row.answer_vec)
    cos = row.question_answer_cos_sim
    return np.concatenate([np.abs(q - a), [cos]])

def get_X(df):
    """Return the feature matrix for ``df``: one ``apply_row`` vector per row."""
    feature_rows = []
    for _, record in df.iterrows():
        feature_rows.append(apply_row(record))
    return np.array(feature_rows)

def get_y(df):
    """Return a boolean array that is True where ``df.correct`` equals 'CORRECT'."""
    is_correct = df['correct'] == 'CORRECT'
    return is_correct.values

In [128]:
# Class balance of the label column.
Counter(features['correct'])

Counter({'CORRECT': 2440, 'NOT_CORRECT': 7320})

In [121]:
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score

In [111]:
# Per-row question ids, and the sorted distinct ids that the folds are drawn over.
question_ids = features['question_id']
uniq_ids = np.unique(question_ids.values)


In [184]:
# Baseline: for every question pick the candidate answer with the highest
# question_answer_cos_sim and count how often it is the one marked CORRECT.
correct = 0
total = 0
# The id variable is named ``qid`` so the builtin ``id`` is not rebound at
# notebook scope. ``group`` keeps its name on purpose: a later cell calls
# predict_ensemble(models, group) and relies on it leaking from a loop.
for qid, group in features.groupby('question_id'):
    scores = group.question_answer_cos_sim.values
    true_answer = group.answer[group.correct == 'CORRECT'].iloc[0]
    best_answer = group.answer.iloc[scores.argmax()]
    correct += true_answer == best_answer
    total += 1

# Single pre-formatted string: prints the same text whether ``print`` is a
# statement or a function.
print('%s %s %s' % (correct, total, 1.0 * correct / total))

781 2440 0.320081967213


In [191]:
def predict_answer(model, group):
    """Return the answer in ``group`` that ``model`` scores highest.

    ``model`` must provide ``predict_proba``; column 1 of its output is used
    as the score, and the ``answer`` value at the arg-max position is returned.
    """
    probabilities = model.predict_proba(get_X(group))
    positive_scores = probabilities[:, 1]
    best_position = np.argmax(positive_scores)
    return group.answer.iloc[best_position]

In [243]:
# 10-fold cross-validation over unique question ids, so that all rows that
# share a question_id land in the same fold.
# KFold(n=..., n_folds=...) is the legacy sklearn.cross_validation interface.
kf = KFold(n=len(uniq_ids), n_folds=10)

# One fitted model per fold; predict_ensemble() votes over this list later.
models = []

for train, test in kf:
    train_ids = uniq_ids[train]
    test_ids = uniq_ids[test]

    # Select each fold's rows once and reuse the frame for both X and y.
    train_rows = features[question_ids.isin(train_ids)]
    test_rows = features[question_ids.isin(test_ids)]

    X_train = get_X(train_rows)
    y_train = get_y(train_rows)

    # Despite the name ``lr`` this is a random forest; the previously tried
    # alternative was LogisticRegression(C=0.01, penalty='l1').
    # No random_state is passed, so the scores differ from run to run.
    lr = RandomForestClassifier(n_estimators=15)
    lr.fit(X_train, y_train)

    X_test = get_X(test_rows)
    y_test = get_y(test_rows)
    score = lr.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, score)
    print(auc)

    # Optional per-question accuracy check (disabled):
    # acc = []
    # for qid, group in test_rows.groupby('question_id'):
    #     correct_answer = group.answer[group.correct == 'CORRECT'].iloc[0]
    #     predicted_answer = predict_answer(lr, group)
    #     acc.append(correct_answer == predicted_answer)
    # print(np.mean(acc))

    # Keep the fold model. Without this ``models`` stays empty and every
    # later predict_ensemble(models, ...) call fails on an empty vote count.
    models.append(lr)

 0.488715511063 0.47568417988 0.503882804802 0.520939711547 0.519937516797 0.493796470483 0.508675423273 0.466482464391 0.540978007704 0.492458344531


In [212]:
def predict_ensemble(models, group):
    """Majority vote: return the answer predicted most often across ``models``.

    Each model votes through ``predict_answer(model, group)``. An empty
    ``models`` list raises IndexError because there is no vote to pick.
    """
    votes = Counter()
    for model in models:
        votes[predict_answer(model, group)] += 1
    top_answer, _vote_count = votes.most_common(1)[0]
    return top_answer

In [213]:
# Smoke test of the ensemble vote. Relies on notebook state: ``models`` must
# already hold fitted models, and ``group`` is whatever an earlier
# ``for ..., group in ....groupby('question_id')`` loop left bound.
predict_ensemble(models, group)

'A'

In [194]:
# Load the tab-separated feature table for the validation questions.
validation_features_path = '/home/agrigorev/git-projects/allen-qa/validation_features.txt'
features_validation = pd.read_csv(validation_features_path, sep='\t')


In [235]:
# Predict one answer per validation question with the fold ensemble and
# collect (question id, answer) pairs in ``result``.
result = []
# enumerate replaces the hand-maintained counter, and ``qid`` avoids rebinding
# the builtin ``id`` at notebook scope.
for i, (qid, group) in enumerate(features_validation.groupby('question_id')):
    if i % 500 == 0:
        print('iteration %d' % i)

    # Drop rows whose cosine similarity is missing before building features.
    # .copy() makes ``good`` an independent frame, so the column assignments
    # below do not write into a slice of ``group`` (SettingWithCopyWarning).
    good = group[group.question_answer_cos_sim.notnull()].copy()
    if not good.empty:
        good['question_vec'] = good['question_vec'].apply(string_to_vec)
        good['answer_vec'] = good['answer_vec'].apply(string_to_vec)
        result.append((qid, predict_ensemble(models, good)))
    else:
        # No usable row for this question: fall back to answer 'A'.
        result.append((qid, 'A'))

iteration 0
iteration 500
iteration 1000
iteration 1500
iteration 2000
iteration 2500
iteration 3000
iteration 3500
iteration 4000
iteration 4500
iteration 5000
iteration 5500
iteration 6000
iteration 6500
iteration 7000
iteration 7500
iteration 8000


In [239]:
# Save the predictions as a two-column CSV (id, correctAnswer) without the index.
output_path = '/home/agrigorev/git-projects/allen-qa/validation_result.csv'
res = pd.DataFrame(result, columns=['id', 'correctAnswer'])
res.to_csv(output_path, index=False)