# Import itab module

In [1]:
import itab

# Simple example

The data file looks like this:

In [2]:
%%bash
# Print the four leading header lines (the '#' comments and the '## schema=' line) as they are.
head -n 4 simple.itab.tsv
# Render everything from line 5 onwards as a table; the input is tab-delimited.
tail -n +5 simple.itab.tsv | csvlook --tabs

# 
# Simple example file
# 
## schema=./simple.itab.schema.tsv
|----------------------+---------+---------|
|  DATE                | INTEGER | FLOAT   |
|----------------------+---------+---------|
|  2015-05-22 10:39:27 | 23      | 3.4     |
|  avui                | 22.3    | 3.4e10  |
|  2015/05/22 10:39:27 | 4       | 2.4     |
|  2012-05-22 10:45:22 | 12      | 2.5     |
|----------------------+---------+---------|


The schema data file looks like this:

In [3]:
%%bash
# Print the three leading '#' comment lines of the schema file as they are.
head -n 3 simple.itab.schema.tsv
# Render everything from line 4 onwards as a table; the input is tab-delimited.
tail -n +4 simple.itab.schema.tsv | csvlook --tabs

#
# This is a simple example of an iTab schema file.
#
|----------+------------------------------+----------------|
|  header  | reader                       | validator      |
|----------+------------------------------+----------------|
|  DATE    | date(x, '%Y-%m-%d %H:%M:%S') | x.month == 5   |
|  INTEGER | int(x)                       | x > 10         |
|  FLOAT   | float(x)                     | 2.3 < x < 2.6  |
|----------+------------------------------+----------------|


### Basic reader that returns each row as a list of parsed cell values

In [4]:
# Each iteration yields the parsed cell values of one line together with
# the list of error messages collected for that line.
reader = itab.reader('simple.itab.tsv')

for row, errors in reader:
    if errors:
        # At least one cell failed to parse or validate: list every message.
        report = "\nLine {}. ERRORS: \n\t{}".format(reader.line_num, '\n\t'.join(errors))
    else:
        # Clean line: show the parsed values separated by tabs.
        report = "\nLine {}. VALUES: \n\t{}".format(reader.line_num, '\t'.join(map("{}".format, row)))
    print(report)


Line 6. ERRORS: 
	Validation error at line 6 column 2: FLOAT. [value:'3.4' validator:'2.3 < x < 2.6']

Line 7. ERRORS: 
	Reading error at line 7 column 1: DATE. [value:'avui' reader:'date(x, '%Y-%m-%d %H:%M:%S')']
	Reading error at line 7 column 2: INTEGER. [value:'22.3' reader:'int(x)']
	Validation error at line 7 column 2: FLOAT. [value:'3.4e10' validator:'2.3 < x < 2.6']

Line 8. ERRORS: 
	Reading error at line 8 column 1: DATE. [value:'2015/05/22 10:39:27' reader:'date(x, '%Y-%m-%d %H:%M:%S')']
	Validation error at line 8 column 1: INTEGER. [value:'4' validator:'x > 10']

Line 9. VALUES: 
	2012-05-22 10:45:22	12	2.5


### Reader that returns each row as a dictionary

In [5]:
# Same file as before, but every row now comes back as a dictionary.
reader = itab.DictReader('simple.itab.tsv')

for row, errors in reader:
    # Only error-free lines are printed in this example.
    if errors:
        continue
    print("\nLine {}. VALUES: \n\t{}".format(reader.line_num, row))


Line 9. VALUES: 
	{'FLOAT': 2.5, 'DATE': datetime.datetime(2012, 5, 22, 10, 45, 22), 'INTEGER': 12}


## Pass the schema as a Python dictionary

In [6]:
# Schema given inline: only the INTEGER column gets a reader expression.
inline_schema = {
    'fields': {
        'INTEGER': {'reader': 'int(x)'},
    },
}
reader = itab.DictReader('simple.itab.tsv', schema=inline_schema)

for row, errors in reader:
    # Only error-free lines are printed in this example.
    if errors:
        continue
    print("\nLine {}. VALUES: \n\t{}".format(reader.line_num, row))




Line 6. VALUES: 
	{'FLOAT': '3.4', 'DATE': '2015-05-22 10:39:27', 'INTEGER': 23}

Line 8. VALUES: 
	{'FLOAT': '2.4', 'DATE': '2015/05/22 10:39:27', 'INTEGER': 4}

Line 9. VALUES: 
	{'FLOAT': '2.5', 'DATE': '2012-05-22 10:45:22', 'INTEGER': 12}


# Advanced example

The schema file looks like this:

In [7]:
%%bash
# Print the three leading '#' comment lines of the schema file as they are.
head -n 3 mutations.itab.schema.tsv
# Render everything from line 4 onwards as a table; the input is tab-delimited.
tail -n +4 mutations.itab.schema.tsv | csvlook --tabs

#
# Mutations file schema 
#
|-------------+----------------+----------------------------------------------------|
|  header     | reader         | validator                                          |
|-------------+----------------+----------------------------------------------------|
|  CHROMOSOME | str(x).upper() | x in ([str(c) for c in range(1,23)] + ['X', 'Y'])  |
|  POSITION   | int(x)         | x > 0                                              |
|  REF        | str(x).upper() | x in "ACTG"                                        |
|  ALT        | str(x).upper() | x in "ACTG"                                        |
|  SAMPLE     | str(x)         | match("^CGP_donor_[0-9]{7}$", x)                   |
|  TYPE       | str(x).lower() | x in ['subs']                                      |
|-------------+----------------+----------------------------------------------------|


And the data file is a standard TSV file without any metadata:

In [8]:
%%bash
# Preview the first ten lines (header plus nine rows) of the tab-delimited data file.
head -n 10 mutations.itab.tsv | csvlook --tabs

|-------------+----------+-----+-----+-------------------+-------|
|  CHROMOSOME | POSITION | REF | ALT | SAMPLE            | TYPE  |
|-------------+----------+-----+-----+-------------------+-------|
|  1          | 99150    | T   | A   | CGP_donor_1397260 | subs  |
|  1          | 231793   | A   | G   | CGP_donor_1337223 | subs  |
|  1          | 404447   | C   | T   | CGP_donor_1337236 | subs  |
|  1          | 559388   | G   | T   | CGP_donor_1397282 | subs  |
|  1          | 585741   | G   | T   | CGP_donor_1353434 | subs  |
|  1          | 661926   | G   | A   | CGP_donor_1163904 | subs  |
|  1          | 717900   | T   | C   | CGP_donor_1234124 | subs  |
|  1          | 718896   | T   | A   | CGP_donor_1186990 | subs  |
|  1          | 753461   | C   | T   | CGP_donor_1397086 | subs  |
|-------------+----------+-----+-----+-------------------+-------|


In this example the tabulated file is a plain TSV file without any metadata, so we have to provide a valid schema file path or URL when we open the reader. We then load the result into a pandas DataFrame.

In [9]:
import pandas as pd

def load_mutations(file, schema='mutations.itab.schema.tsv'):
    """Yield the error-free rows of a mutations file, one dictionary per row.

    The file is read with ``itab.DictReader``. Lines for which the reader
    reports parsing or validation errors are printed and skipped; every
    other row is yielded unchanged.

    Parameters
    ----------
    file
        Passed straight to ``itab.DictReader`` (the notebook uses a path).
    schema
        Schema handed to ``itab.DictReader``. Defaults to the mutations
        schema file used in this notebook.

    Yields
    ------
    The ``row`` object produced by the reader for each line without errors.
    """
    reader = itab.DictReader(file, schema=schema)
    for row, errors in reader:

        if errors:
            # Manage here the errors of parsing and validation.
            # Every message gets the same three-space indent so that lines
            # with several errors stay aligned.
            print("\nLine {}. ERRORS: \n   {}".format(reader.line_num, '\n   '.join(errors)))
            continue

        yield row
        
# Build the frame from the stream of valid row dictionaries. The plain
# constructor accepts an iterable of records, whereas ``from_dict`` is
# documented for dict input only.
data = pd.DataFrame(load_mutations('mutations.itab.tsv'))
data.head()


Line 16. ERRORS: 
   Validation error at line 16 column 0: CHROMOSOME. [value:'chr1' validator:'x in ([str(c) for c in range(1,23)] + ['X', 'Y'])']


Unnamed: 0,ALT,CHROMOSOME,POSITION,REF,SAMPLE,TYPE
0,A,1,99150,T,CGP_donor_1397260,subs
1,G,1,231793,A,CGP_donor_1337223,subs
2,T,1,404447,C,CGP_donor_1337236,subs
3,T,1,559388,G,CGP_donor_1397282,subs
4,T,1,585741,G,CGP_donor_1353434,subs


In [10]:
# Show the dtype pandas assigned to each column of `data`.
data.dtypes

ALT           object
CHROMOSOME    object
POSITION       int64
REF           object
SAMPLE        object
TYPE          object
dtype: object