In [None]:
import pandas as pd
import numpy as np
import glob
import json

from itertools import chain
from urllib2 import urlopen

from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Download the codebook JSON and load it into a DataFrame indexed by its
# 'code' column. The response is closed explicitly so the HTTP connection
# is not left open for the lifetime of the kernel.
# NOTE(review): assumes the payload is a list of records that each carry a
# 'code' field (getCodeDescription also reads 'description') -- confirm.
response = urlopen('https://codalab.fragilefamilieschallenge.org/f/api/codebook/')
try:
    codebook_records = json.loads(response.read())
finally:
    response.close()
code_book = pd.DataFrame(codebook_records).set_index('code')

In [None]:
# Collect the per-file match CSVs. glob.glob returns paths in arbitrary
# (filesystem-dependent) order, so sort them to keep runs reproducible.
m_files = sorted(glob.glob('../output/m*year*year*.csv'))
f_files = sorted(glob.glob('../output/f*year*year*.csv'))

In [None]:
def findBestMatches(x):
    """Return the best-scoring candidate(s) for one question group.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows for a single question, indexed by a (question, candidate)
        MultiIndex and carrying 'distance score' and 'threshold' columns.

    Returns
    -------
    list of [question, candidate, score]
        One entry per row whose 'distance score' equals the group minimum,
        provided at least one row scores strictly below its threshold.
        Rows containing NaN in any column are excluded from that result.
        If no row beats its threshold, a single ``[question, nan, nan]``
        placeholder is returned instead.
    """
    x = x.drop_duplicates()
    scores = x['distance score']

    # No row is below its threshold: report the question as unmatched.
    if not (scores < x['threshold']).any():
        return [[x.index[0][0], np.nan, np.nan]]

    best = x[scores == scores.min()].dropna()
    # Take the score straight from the filtered rows instead of looking each
    # label up again with .loc: a repeated (question, candidate) label would
    # make that lookup return a Series rather than a scalar.
    return [[idx[0], idx[1], score]
            for idx, score in zip(best.index, best['distance score'])]
    
def getMatchList(f):
    """Load one match CSV and reduce it to the best match per question.

    The CSV is expected to provide a 'relation' column of the form
    ``question:candidate`` and a 'distance score' column. For every question
    a threshold of ``mean - 2 * std`` of its distance scores is computed and
    handed, together with the rows, to ``findBestMatches``.

    Parameters
    ----------
    f : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'question', with 'match' and 'score' columns.
    """
    frame = pd.read_csv(f)

    # Split "question:candidate" and use the two parts as a MultiIndex.
    pairs = [relation.split(':') for relation in frame.relation]
    question, candidate = zip(*pairs)
    frame.index = pd.MultiIndex.from_arrays(
        [question, candidate], names=['question', 'candidate'])

    # Per-question threshold: two standard deviations below the mean score.
    per_question = frame['distance score'].groupby(by='question')
    thr = (per_question.mean() - per_question.std() * 2).to_frame('threshold')

    frame = frame.merge(thr, left_index=True, right_index=True)

    # findBestMatches returns a list of [question, match, score] per group;
    # flatten those lists and transpose them into three parallel tuples.
    results = frame.groupby(by='question').apply(findBestMatches)
    idx, match, score = zip(*chain.from_iterable(results.values))

    bestMatches = pd.DataFrame({'match': match, 'score': score}, index=idx)
    bestMatches.index.name = 'question'
    return bestMatches

In [None]:
def smush(X):
    """Collapse a group of matches into a sorted list of (code, score) pairs.

    Parameters
    ----------
    X : pandas.DataFrame
        Rows with 'match' and 'score' columns; the index holds the question.

    Returns
    -------
    list of tuple
        ``(match, score)`` for every row plus one ``(question, '-')`` entry
        for the index label of the last row, sorted ascending. An empty
        frame yields an empty list.
    """
    if len(X) == 0:
        # Nothing to group, and no index label to add a placeholder for.
        return []

    out = [(row.match, row.score) for row in X.itertuples()]
    # Add the question itself, with '-' standing in for a score. The label is
    # read from the index rather than from a leaked loop variable.
    out.append((X.index[-1], '-'))
    out.sort()
    return out

def getCodeDescription(x):
    """Look up the description of code ``x`` in the module-level ``code_book``.

    Returns the UTF-8 encoded 'description' of the matching row, or the
    marker string 'SUSPECT: No Description!' when the code is not present in
    the code book's index.
    """
    is_known = code_book.index.isin([x]).any()
    if not is_known:
        return 'SUSPECT: No Description!'
    return code_book.loc[x].description.encode('utf-8')

In [None]:
# Build the best-match table for every father-survey file and concatenate
# once. Concatenating inside the loop onto an empty seed frame is quadratic
# and depends on pandas discarding that empty seed to keep the 'question'
# index name. pd.concat raises ValueError if f_files is empty.
matches = pd.concat([getMatchList(f) for f in f_files])

# Drop questions that found no match (match/score are NaN for those).
matches = matches.dropna()
grp = matches.groupby('question')
# One sorted list of (code, score) tuples per question; the question labels
# themselves are discarded by the reset_index.
question_groups = grp.apply(smush).reset_index(drop=True)

In [None]:
def scoreKey(x, threshold=26):
    """Return the flag character shown in front of a score.

    Parameters
    ----------
    x : number or '-'
        A match score, or the '-' placeholder that ``smush`` stores for the
        question's own entry.
    threshold : number, optional
        Scores strictly greater than this value are flagged (default 26).

    Returns
    -------
    str
        '*' when ``x`` is a score above ``threshold``, otherwise ' '.
    """
    # Test for the placeholder first: comparing '-' with a number only worked
    # through Python 2's cross-type ordering and raises TypeError on Python 3.
    if x != '-' and x > threshold:
        return '*'
    return ' '
    
# Write one text block per question group: a row for every (code, score)
# entry -- flag, code, score, description -- followed by a dashed separator.
# Each block is also echoed to the notebook output.
row_template = '{:2}{:10}{:<7} {}'
with open('../output/question-group-validation-fatherSurvey-v2.txt', 'wb') as wFile:

    for _, val in question_groups.iteritems():
        rows = []
        for i in val:
            rows.append(row_template.format(scoreKey(i[1]), i[0], i[1], getCodeDescription(i[0])))
        line = '{}\n\n---------------------------\n'.format('\n'.join(rows))

        print(line)
        wFile.write(line)