# Introduction

This notebook aims to describe the extract, transform, load (ETL) process for gene-specific data from NCBI and UniProt.

In [None]:
import gzip
import os
import shutil

def _uncompress(_path):
    with gzip.open(_path, 'rb') as _input, open('./' + os.path.splitext(os.path.basename(_path))[0], 'w') as _output:
        shutil.copyfileobj(_input, _output) 
        return os.path.basename(_output.name)
    
import pandas
    
def _read(_path):
    return pandas.read_csv(_path, delimiter = '\t')

# Methods

## Datasets

This section describes the datasets used in this project.

In [None]:
# Compressed source archives, keyed by dataset name.
_archives = {
    'Homo_sapiens.gene_info': '../datasets/NCBI/Homo_sapiens.gene_info.gz',
    'gene2go': '../datasets/NCBI/gene2go.gz'
}

# Uncompress every archive and map each dataset name to the name of the
# uncompressed file that was written to the working directory.
_datasets = {}
for _index, _archive in _archives.items():
    _datasets[_index] = _uncompress(_archive)

### NCBI

#### Homo_sapiens.gene_info

In [None]:
# Parse the uncompressed 'Homo_sapiens.gene_info' file as a tab-separated table.
_dataset = _read(_datasets['Homo_sapiens.gene_info'])

# Preview the first rows of the table.
_dataset.head()

#### gene2go

In [None]:
# Parse the uncompressed 'gene2go' file as a tab-separated table.
_dataset = _read(_datasets['gene2go'])

# Preview the first rows of the table.
_dataset.head()

The target taxon of this project is Homo sapiens (human), which has the taxon identifier 9606.

In [None]:
# Taxon identifier of the target organism (Homo sapiens); it is compared
# against the first column of each gene_info row to select the rows to keep.
taxon = 9606

### Database

In [None]:
def _replace(_text, _data):
    _text.strip()
    if _data is not None:
        for _part in _text.split('|'):
            _data.append(_part.strip())
        return _data
    if _text == '-':
        return 'NOT AVAILABLE'
    return _text

def _strings(_data, _fields, _prefix):
    _lines = []
    _line = ''
    _arrays = []
    for _index, _value in enumerate(_data):
        if isinstance(_value, list):
            _arrays.append(_index)
        if isinstance(_value, str):
            _line = _line + '\t' + _fields[_index] + '\t' + _value
        else :
            _line =  _fields[_index] + '\t ERROR'
            _arrays = []
            break
    _lines.append(_line)
    for _index in _arrays:
        for _value in _data[_index]:
                _lines.append(_prefix + '\t' + _fields[_index] + '\t' + _value)
    return _lines

import tempfile

from datetime import datetime

In [None]:
# connection

In [None]:
# Rewrite the uncompressed gene_info file so that it only holds the rows of
# the target taxon, serialised by _strings() as one text line each.
#
# NOTE: the result replaces the uncompressed input file. Re-running this cell
# without first re-running the cell that uncompresses the datasets finds no
# row with the expected number of columns and therefore writes an empty file.

# Column names of the gene_info table, in file order.
_fields = [
    'taxonomy_id',
    'gene_id',
    'symbol',
    'locus_tag',
    'synonym',
    'db_xref',
    'chromosome',
    'map_location',
    'description',
    'type_of_gene',
    'symbol_from_nomenclature_authority',
    'full_name_from_nomenclature_authority',
    'nomenclature_status',
    'other_designation',
    'modification_date',
    'feature_type'
]

with open(_datasets['Homo_sapiens.gene_info'], 'r') as _input:
    _descriptor, _path = tempfile.mkstemp()
    try:
        # Wrapping the descriptor in a file object lets us write text on both
        # Python 2 and 3 and guarantees that the descriptor gets closed.
        with os.fdopen(_descriptor, 'w') as _output:
            for _line in _input:
                # Fresh per-row template: a list marks a multi-valued
                # ('|'-separated) column, None marks a single-valued one.
                _data = [
                    None,
                    None,
                    None,
                    [],
                    [],
                    [],
                    [],
                    [],
                    None,
                    None,
                    None,
                    None,
                    None,
                    [],
                    None,
                    []
                ]
                _line = _line.strip()
                if _line.startswith('#'):
                    continue
                _parts = _line.split('\t')
                if len(_parts) != len(_fields):
                    continue
                # Exact match: a prefix test would also accept any other
                # identifier that merely starts with the same digits.
                if _parts[0] != str(taxon):
                    continue
                for _index, _part in enumerate(_parts):
                    _data[_index] = _replace(_part, _data[_index])
                # Validate the YYYYMMDD date but keep it as text: _strings()
                # reports any value that is neither str nor list as an error.
                _data[14] = datetime.strptime(_data[14], '%Y%m%d').strftime('%Y-%m-%d')
                _prefix = _fields[0] + '\t' + _data[0]
                for _record in _strings(_data, _fields, _prefix):
                    # One record per line; without the newline all records
                    # would run together.
                    _output.write(_record + '\n')
    except Exception:
        # Do not leave a half-written temporary file behind.
        os.remove(_path)
        raise

# Replace the input only once both files are closed.
shutil.move(_path, _datasets['Homo_sapiens.gene_info'])