In [2]:
def friendly_assert(condition, msg):
    """Check ``condition`` without aborting: report a failure and carry on.

    Parameters
    ----------
    condition : object
        Value tested for truthiness.
    msg : str
        Text printed to stdout when ``condition`` is falsy.

    Returns
    -------
    bool
        ``True`` when the condition held, ``False`` otherwise (after ``msg``
        has been printed).
    """
    # A plain truth test instead of ``assert``/``except AssertionError``:
    # assert statements are removed when Python runs with -O, which would
    # silently turn every check into a pass.
    if not condition:
        print(msg)
        return False
    return True

In [7]:
import re
import numpy as np
import pandas

In [5]:
# Raise the display limit so frames with up to 347 columns are shown without truncation.
pandas.set_option('display.max_columns', 347)
# Load the clinic table. convert_categoricals=False keeps the stored values
# instead of converting value-labelled columns to pandas Categoricals.
data = pandas.read_stata('data/clinic_data.dta', convert_categoricals=False)

In [36]:
# Sanity-check the OCR'ed id list against the clinic table in `data`.
#
# The file is read line by line:
#   * a line containing 'TP' starts a new TP block and resets the position counter,
#   * blank lines are skipped,
#   * every other line is one cylinder slot (an id such as '123-1', or an
#     'ingen sylinder' placeholder for an empty slot).
# Slots are checked in groups of three consecutive lines. Problems are printed
# through friendly_assert instead of raised, so one pass lists all of them.

last_pasient_ids = []  # ids collected for the current group (up to three)
prev_pasient_id = 0    # numeric pasient id of the previous group
prev_pasient_ids = []  # raw ids of the previous group, only used in messages

# `with` guarantees the file is closed even if a check below raises.
with open('data/ids/OCR_all.txt') as f:
    for line in f:
        line = line.strip()
        if 'TP' in line:
            # line with TP[0-9]+
            tp = line
            TP = float(re.findall('[0-9]+', tp)[0])
            i = 0  # slot counter within this TP block
            continue
        if not line:
            # empty line
            continue

        last_pasient_ids.append(line)
        if len(last_pasient_ids) == 3:
            # remove empty sylinders
            last_pasient_ids = [id_ for id_ in last_pasient_ids if 'ingen sylinder' not in id_]

            # row, col of the last slot in the group (9 slots per row, 1-based)
            r = i//9 + 1
            c = i%9 + 1
            pre = '%4s, row %2s, col %2s - ' % (tp, r, c)

            # A group made up of empty sylinders only has nothing to check;
            # without this guard p_ids.pop() below would raise KeyError.
            if last_pasient_ids:
                # ids should be splitted with an -
                for id_ in last_pasient_ids:
                    friendly_assert('-' in id_, pre + 'pasient id should be formatted 123-1: %s' % id_)

                p_ids = set([x.split('-')[0] for x in last_pasient_ids])
                # all pasient ids should be the same
                friendly_assert(len(p_ids) == 1, pre + "pasient id not the same: %s" % last_pasient_ids)

                # Sample part of every id; '' for an id without '-' (already
                # reported above). Computed fresh for each group so a malformed
                # id can never leave s_ids undefined or stale from the last group.
                s_ids = [x.split('-')[1] if '-' in x else '' for x in last_pasient_ids]
                # sample id should be 1,2,3
                res = friendly_assert(len(s_ids) == 3, pre + 'there should be 3 samples: %s' % last_pasient_ids)
                if res:
                    friendly_assert('1' in s_ids and '2' in s_ids and '3' in s_ids, pre + "sample ids not 1,2,3:    %s" % last_pasient_ids)

                # pasient id should increment
                p_id = p_ids.pop()
                p_id = next(iter(re.findall(r'[0-9]+', p_id)), 0) # in case not formatted correctly
                p_id = int(p_id)
                friendly_assert(p_id >= prev_pasient_id, pre + "pasient id did not increment: %s < %s" % (last_pasient_ids, prev_pasient_ids))

                # pasient should be in database
                res = friendly_assert(np.any(data.ID_deltaker == p_id), pre + "pasient id missing in db: %s" % p_id)
                if res:
                    # exists: compare the TP number of the first matching row
                    TP_db = next(iter(data.loc[data.ID_deltaker == p_id, 'TP_nr']), np.nan)
                    if np.isfinite(TP_db):
                        friendly_assert(TP == TP_db, pre + "id %s, wrong TP_nr in db: %s != %s" % (p_id, TP, TP_db))
                    else:
                        print(pre + "TP_nr not registered in db for ID_deltaker %s" % p_id)

                # remember this group for the increment check of the next one
                prev_pasient_id = p_id
                prev_pasient_ids = last_pasient_ids

            # start collecting the next group
            last_pasient_ids = []

        # every non-blank, non-TP line occupies one slot
        i += 1

 TP2, row  3, col  6 - pasient id missing in db: 66
 TP3, row  1, col  3 - pasient id did not increment: ['68-1', '68-2', '68-3'] < ['102b-1', '102b-2', '102b-3']
 TP3, row  1, col  3 - id 68, wrong TP_nr in db: 3.0 != 2.0
 TP3, row  9, col  6 - TP_nr not registered in db for ID_deltaker 140
 TP4, row  1, col  3 - pasient id did not increment: ['162a-1', '162a-2', '162a-3'] < ['163-1', '163-2', '163-3']
 TP5, row  9, col  9 - TP_nr not registered in db for ID_deltaker 251
 TP6, row  1, col  3 - pasient id did not increment: ['209-1', '209-2', '209-3'] < ['268-1', '268-2', '268-3']
 TP6, row  1, col  3 - id 209, wrong TP_nr in db: 6.0 != 4.0
 TP6, row  1, col  6 - id 221, wrong TP_nr in db: 6.0 != 5.0
 TP6, row  1, col  9 - pasient id missing in db: 222
 TP9, row 10, col  9 - there should be 3 samples: ['467a-1']
 TP9, row 11, col  3 - there should be 3 samples: ['467b-1', '467b-2']
 TP9, row 12, col  6 - there should be 3 samples: ['471a-1', '471a-2']
 TP9, row 12, col  9 - there shoul

In [37]:
# Scratch frame: empty, with only the two column names defined.
df = pandas.DataFrame(columns=['TP_nr', 'ID_deltaker'])

In [38]:
# Show df with one extra row (index label 1). The result is only displayed, not
# assigned, so df itself stays unchanged. pandas.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
pandas.concat([df, pandas.DataFrame({'TP_nr': 23, 'ID_deltaker': 2}, index=[1])])

Unnamed: 0,ID_deltaker,TP_nr
1,2,23


In [39]:
# Scratch: build a one-row frame from a dict of equal-length lists (result is only displayed).
pandas.DataFrame({'TP_nr': [1], 'id': ['asdf']})

Unnamed: 0,TP_nr,id
0,1,asdf


In [40]:
# Display df: still empty, because the frame with the extra row built two cells
# up was only displayed and never assigned back to df.
df

Unnamed: 0,TP_nr,ID_deltaker


In [41]:
import numpy as np

In [42]:
# Empty frame that fixes the column names and their order for the location table filled in below.
locations = pandas.DataFrame(columns=['person_id', 'ID_deltaker', 'TMA_navn', 'TP_nr', 'TP_rad', 'TP_kolonne'])

In [43]:
# Build the location table: one row per cylinder in the OCR'ed id list, with its
# TP number, its row/column inside the TP block (9 slots per row, 1-based) and
# the person_id looked up in `data` (NaN when the id has no match there).

rows = []
# `with` guarantees the file is closed even if a malformed line raises below.
with open('data/ids/OCR_all.txt') as f:
    for line in f:
        line = line.strip()
        if 'TP' in line:
            # line with TP[0-9]+
            tp = line
            TP = float(re.findall('[0-9]+', tp)[0])
            i = 0  # slot counter within this TP block
            continue
        if not line:
            # empty line
            continue
        if 'ingen sylinder' in line:
            # Empty sylinder: no row is recorded, but the slot still advances the
            # counter. Skipping the increment here would shift the row/column of
            # every later cylinder in the block.
            i += 1
            continue

        # row, col
        r = i//9 + 1
        c = i%9 + 1

        # Explicit checks instead of assert, which is stripped under python -O.
        d_id = int(re.findall('[0-9]+', line)[0]) # not including a/b, eg 162a -> 162
        if d_id <= 0:
            raise ValueError('%s: ID_deltaker should be positive: %s' % (tp, line))
        try:
            s_id = int(line.split('-')[1])
        except (IndexError, ValueError):
            s_id = None
        if s_id not in (1, 2, 3):
            raise ValueError('%s: sample id should be 1, 2 or 3: %s' % (tp, line))

        # first matching person_id in the clinic data, NaN when there is none
        person_id = next(iter(data.loc[data.ID_deltaker == d_id, 'person_id']), np.nan)

        rows.append({
                'person_id': person_id,
                'ID_deltaker': d_id,
                'TMA_navn': line,
                'TP_nr': TP,
                'TP_rad': r,
                'TP_kolonne': c
            })
        i += 1

# Build the frame in one go with the column order declared for `locations`.
# DataFrame.append was removed in pandas 2.0, and appending also duplicated all
# rows whenever this cell was re-run.
locations = pandas.DataFrame(rows, columns=locations.columns)

In [44]:
# Scratch comparison of ways to ask "is 1101 among the ID_deltaker values?":
#   np.any(series == x)  -> tests the values (the form used in the loops above)
#   x in series          -> `in` on a Series tests the index labels, not the values
#   len(series == x)     -> length of the boolean mask, i.e. the number of rows, not the number of matches
np.any(data.ID_deltaker == 1101), 1101 in data.ID_deltaker, 1101.0 in data.ID_deltaker, len(data.ID_deltaker == 1101)

(True, False, False, 1056)

In [48]:
# Spot check: show the location rows recorded for ID_deltaker 1.
locations[locations['ID_deltaker'] == 1]

Unnamed: 0,person_id,ID_deltaker,TMA_navn,TP_nr,TP_rad,TP_kolonne
0,83522,1,1-1,1,1,1
1,83522,1,1-2,1,1,2
2,83522,1,1-3,1,1,3


In [46]:
# Rows whose ID_deltaker had no match in the clinic data (person_id left as NaN).
# Series.isnull() is used rather than np.isnan because np.isnan raises TypeError
# on an object-dtype column, while isnull() works for any dtype.
locations.loc[locations.person_id.isnull()]

Unnamed: 0,person_id,ID_deltaker,TMA_navn,TP_nr,TP_rad,TP_kolonne
147,,66,66-1,2,3,4
148,,66,66-2,2,3,5
149,,66,66-3,2,3,6
636,,222,222-1,6,1,7
637,,222,222-2,6,1,8
638,,222,222-3,6,1,9


In [47]:
# Write the location table to a Stata file.
# NOTE(review): to_stata writes the frame's index as an extra column by default
# (write_index=True) — confirm that column is wanted in the output file.
locations.to_stata('data/ids/locations.dta')