# Introduction

This notebook aims to describe the extract, transform, load (ETL) process for gene-specific data from NCBI and UniProt.

# Methods

## Requirements

This notebook works with Python 2.7.14 or Python 3.6.4. However, some external libraries must be installed in the Python environment if they are not already available:

In [3]:
import pandas
import psycopg2

In [4]:
import gzip
import os
import shutil

# uncompressing files
def _uncompress(_path):
    with gzip.open(_path, 'rb') as _input, open('./' + os.path.splitext(os.path.basename(_path))[0], 'w') as _output:
        # moving and renaming files
        shutil.copyfileobj(_input, _output) 
        return os.path.basename(_output.name)
    
# reading files    
def _read(_path):
    return pandas.read_csv(_path, delimiter = '\t')

## Datasets

This section describes the datasets used in this project.

In [5]:
# directory that holds the compressed source files
_datasets = '../datasets/'

# path of the compressed archive of every dataset, keyed by dataset name
_datasets = {
    'Homo_sapiens.gene_info': _datasets + 'NCBI/Homo_sapiens.gene_info.gz',
    'gene2go': _datasets + 'NCBI/gene2go.gz',
    'UniProtKB': _datasets + 'UniProt/uniprot-cancer+AND+reviewed%3Ayes+AND+organism%3A%22Homo+sapiens+%28Human%29+%5B--.txt.gz'
}

# uncompressing files: each archive path is replaced by the name of the
# uncompressed copy that _uncompress writes to the current directory
for _index, _archive in list(_datasets.items()):
    _datasets[_index] = _uncompress(_archive)

TypeError: write() argument must be str, not bytes

### NCBI

#### Homo_sapiens.gene_info

In [None]:
# loading the uncompressed gene_info file as a tab-separated table
_dataset = _read(_datasets['Homo_sapiens.gene_info'])

The table below shows a sample of the contents of this dataset:

In [None]:
_dataset.head()

#### gene2go

In [None]:
# loading the uncompressed gene2go file as a tab-separated table
_dataset = _read(_datasets['gene2go'])

The table below shows a sample of the contents of this dataset:

In [None]:
_dataset.head()

The target taxon of this project is Homo sapiens (human), whose taxon identifier is 9606.

In [None]:
# taxon identifier of Homo sapiens; _skip reads this variable to discard
# input lines that belong to other taxa
taxon = 9606

### UniProt

#### UniProtKB

In [None]:
# loading the uncompressed UniProtKB file through the tab-separated reader
# NOTE(review): the parsing code further down treats this file as line-tagged
# text (ID/AC/DE/...), so a tabular read may not be meaningful — confirm
_dataset = _read(_datasets['UniProtKB'])

The table below shows a sample of the contents of this dataset:

In [None]:
_dataset.head()

### Database

#### Database schema 

The image below describes the schema of the target database:

![Database schema](../database/database.png)

#### Configuration

In [None]:
# configuration file
_configuration = {
    'host': 'localhost',
    'database': 'bioinformatics',
    'user': 'postgres',
    'password': 'postgres'
}

with open('database.ini', 'w') as _output:
    _output.write('[postgresql]' + '\n')
    # writing configuration file
    for _parameter in _configuration:
        _output.write(_parameter + '=' + _configuration[_parameter] + '\n')

#### Scripts

In [None]:
import tempfile
import itertools

from datetime import datetime

# converting any value to a string insertable into PostgreSQL
def _convert(_data):
    if isinstance(_data, int):
        return str(_data)
    if isinstance(_data, datetime):
        # converting a date value to a string insertable into PostgreSQL
        return '\'' + datetime.strftime(_data, "%Y-%m-%d") + '\''
    return '\'' + _data.strip().replace('\'', '\'\'') + '\''

# skipping useless lines
def _skip(_line, _fields):
    _line = _line.strip()
    # skipping comments
    if _line.startswith('#'):
        return None
    # skipping comments  
    _parts = _line.split('\t')
    # skipping lines with bad formatation
    if len(_parts) != len(_fields):
        return None
    # skipping lines does not satisfy conditions
    if not _parts[0].startswith(str(taxon)): 
        return None
    return _parts

# replacing values under specific conditions
def _replace(_text, _data = None):
    """Convert one raw field into SQL literal(s) according to ``_data``.

    Parameters
    ----------
    _text : str
        Raw field value.
    _data : tuple, list or None
        * ``(function, argument)``: ``function(_text, argument)`` is applied
          before the value is converted.
        * ``list``: ``_text`` is split on ``'|'`` and every converted part is
          appended to this list, which is then returned.
        * anything else: the value is converted as is.

    Returns
    -------
    str or list
        The SQL literal, or the list passed as ``_data`` filled with literals.
    """
    # str.strip() returns a new string, so the result has to be kept;
    # otherwise padded '-' markers and padded dates are not recognised
    _text = _text.strip()
    # applying user-provided function to values
    if isinstance(_data, tuple):
        _text = _data[0](_text, _data[1])
    # expanding value to a list of values
    if isinstance(_data, list):
        for _part in _text.split('|'):
            _data.append(_replace(_part))
        return _data
    # providing meaning for null values
    if _text == '-':
        _text = 'NOT AVAILABLE'
    # converting any value to a string insertable into PostgreSQL
    return _convert(_text)

# applying functions to lines
def _process(_line, _data, _fields):
    """Turn one input line into converted values.

    ``_data`` holds one conversion rule per column and is overwritten in
    place with the converted values. Returns ``_data``, or ``None`` when
    ``_skip`` rejects the line.
    """
    _parts = _skip(_line, _fields)
    if _parts is not None:
        # replacing values under specific conditions
        for _index in range(len(_parts)):
            _data[_index] = _replace(_parts[_index], _data[_index])
        return _data
    # skipping useless lines
    return None

# creating SQL INSERT INTO statements
def _strings(_data, _fields, _relation, _prefix):
    _lines = []
    _line = ''
    _arrays = []
    for _index, _value in enumerate(_data):
        # adding values to new SQL INSERT INTO statements
        if isinstance(_value, list):
            _arrays.append(_index)
        # appending values to SQL INSERT INTO statement
        elif isinstance(_value, str):
            _line = _line + ', ' + _value
        else :
            # warning the user about errors in the SQL INSERT INTO statement
            _line = '=>' + _fields[_index] + '\t ERROR (' + type(_value)
            _arrays = []
            break
    _lines.append('INSERT INTO ' + _relation + ' VALUES (' + _line[2:] + ');\n') 
    # adding new SQL INSERT INTO statements
    for _index in _arrays:
        # adding new SQL INSERT INTO statement
        for _value in _data[_index]:
            _lines.append('INSERT INTO ' + _fields[_index] + ' VALUES (' + _prefix + ', ' + _value + ');\n') 
    return _lines

# providing consistency to SQL INSERT INTO statements
def _consistency(_data, _fields, _relation):
    _line = ''
    # appending values to SQL INSERT INTO statement
    for _value in _data:
        _line = _line + ', ' + _value
    _line = 'INSERT INTO ' + _relation + ' ' + _line[2:] + ' WHERE NOT (SELECT TRUE FROM ' + _relation + ' WHERE '
    # appending clausules to SQL INSERT INTO statement
    for _field, _value in zip(_fields, _data):
        _line = _line + _field + ' = ' + _value + ' AND '
    return _line[:-5] + ');\n'

# removing duplicate lines
def _deduplicate(_dataset, *_files):
    _output, _path = tempfile.mkstemp()
    for _file in _files:
        with open(_file, 'r') as _input:
            for _line, _group in itertools.groupby(sorted(_input)):
                os.write(_output, _line)
    os.close(_output)
    shutil.move(_path, _dataset)

In [None]:
# converting the gene_info table into SQL INSERT INTO statements: the main
# statement of every record goes to the "fact" file, the statements derived
# from multi-valued columns go to the "dimension" file
with open(_datasets['Homo_sapiens.gene_info'], 'r') as _input:
    _fact_file, _fact_path = tempfile.mkstemp()
    _dimension_file, _dimension_path = tempfile.mkstemp()
    _fields = [
        'taxonomy_id',
        'gene_id',
        'symbol',
        'locus_tag',
        'synonym',
        'db_xref',
        'chromosome',
        'map_location',
        'description',
        'type_of_gene',
        'symbol_from_nomenclature_authority',
        'full_name_from_nomenclature_authority',
        'nomenclature_status',
        'other_designation',
        'modification_date',
        'feature_type'
    ]
    # text-mode wrappers around the temporary descriptors: the statements are
    # str, which os.write() rejects on Python 3, and the context manager
    # closes both descriptors even when a line fails to convert
    with os.fdopen(_fact_file, 'w') as _fact_output, \
            os.fdopen(_dimension_file, 'w') as _dimension_output:
        for _line in _input:
            # one conversion rule per column, rebuilt for every line because
            # _process overwrites the rules with the converted values
            _data = [
                (int, 10),                      # taxonomy_id
                (int, 10),                      # gene_id
                None,                           # symbol
                [],                             # locus_tag
                [],                             # synonym
                [],                             # db_xref
                [],                             # chromosome
                [],                             # map_location
                None,                           # description
                None,                           # type_of_gene
                None,                           # symbol_from_nomenclature_authority
                None,                           # full_name_from_nomenclature_authority
                None,                           # nomenclature_status
                [],                             # other_designation
                (datetime.strptime, '%Y%m%d'),  # modification_date
                []                              # feature_type
            ]
            # applying functions to lines
            _data = _process(_line, _data, _fields)
            if _data is None:
                continue
            _data.insert(0, _data.pop(1)) # gene_id
            # creating SQL INSERT INTO statements
            _lines = _strings(_data, _fields, 'ncbi', _data[0])
            _fact_output.write(_lines.pop(0))
            # writing SQL statements
            for _statement in _lines:
                _dimension_output.write(_statement)
# removing duplicate lines; this runs after the input has been closed because
# _deduplicate replaces that very file, and replacing a file that is still
# open fails on Windows
_deduplicate(_datasets['Homo_sapiens.gene_info'], _fact_path, _dimension_path)
# the intermediate files are no longer needed once they have been merged
for _temporary in (_fact_path, _dimension_path):
    os.remove(_temporary)

In [None]:
# converting the gene2go table into SQL INSERT INTO statements: the main
# statement of every record goes to the "fact" file, every other statement
# goes to the "dimension" file
with open(_datasets['gene2go'], 'r') as _input:
    _fact_file, _fact_path = tempfile.mkstemp()
    _dimension_file, _dimension_path = tempfile.mkstemp()
    _fields = [
        'taxonomy_id',
        'gene_id',
        'go_id',
        'evidence',
        'qualifier',
        'go_term',
        'pubmed',
        'category'
    ]
    # names of the values that remain in _data when _strings is called
    _labels = [
        'go_id',
        'evidence',
        'go_term',
        'category'
    ]
    # text-mode wrappers around the temporary descriptors: the statements are
    # str, which os.write() rejects on Python 3, and the context manager
    # closes both descriptors even when a line fails to convert
    with os.fdopen(_fact_file, 'w') as _fact_output, \
            os.fdopen(_dimension_file, 'w') as _dimension_output:
        for _line in _input:
            # one conversion rule per column, rebuilt for every line because
            # _process overwrites the rules with the converted values
            _data = [
                None,       # taxonomy_id
                (int, 10),  # gene_id
                None,       # go_id
                [],         # evidence
                None,       # qualifier
                [],         # go_term
                [],         # pubmed
                None        # category
            ]
            # applying functions to lines
            _data = _process(_line, _data, _fields)
            if _data is None:
                continue
            _data.pop(0) # 'taxonomy_id'
            # every evidence value is extended with the qualifier and a
            # 'NOT AVAILABLE' placeholder
            for _index, _value in enumerate(_data[2]): # 'evidence'
                _data[2][_index] = _value + ', ' + _data[3] + ', ' + _replace('-')
            _data.pop(3) # 'qualifier'
            _data.pop(4) # 'pubmed'
            gene_id = _data.pop(0) # 'gene_id'
            # creating SQL INSERT INTO statements
            _lines = _strings(_data, _labels, 'go', _data[0])
            _fact_output.write(_lines.pop(0))
            # writing SQL statements
            for _statement in _lines:
                _dimension_output.write(_statement)
            # writing the statement that links the gene to the GO identifier
            for _statement in _strings([gene_id, _data[0]], [_fields[1], _labels[0]], 'ncbi_go', None):
                _dimension_output.write(_statement)
# removing duplicate lines; this runs after the input has been closed because
# _deduplicate replaces that very file, and replacing a file that is still
# open fails on Windows
_deduplicate(_datasets['gene2go'], _fact_path, _dimension_path)
# the intermediate files are no longer needed once they have been merged
for _temporary in (_fact_path, _dimension_path):
    os.remove(_temporary)

In [None]:
'''
with open(_datasets['UniProtKB'], 'r') as _input:
    _fact_file, _fact_path = tempfile.mkstemp()
    _dimension_file, _dimension_path = tempfile.mkstemp()
    _fields = [
        'ID', 
        'AC', 
        'DE', 
        'GN', 
        'KW', 
        'DR',
    ]
    _labels = [
        'protein_id',
        'accession_number',
        None,
        None,
        'keyword'
    ]
    _id = None
    for _line in _input:        
        _line = _line.strip()
        if _line.startswith('#'):
            continue
        _parts = _line.split()
        if len(_parts) < 2:
            continue
        _field = _parts.pop(0)
        if _field not in _fields :
            continue
        _data = [_id]
        _lines = []
        _file = None
        if _line[-1] == '.':
            _line = _line[:-1]
        if _id == None or _field == _fields[0] and _convert(_parts[0]) != _id:
            _id = _convert(_parts[0])       
            # creating SQL INSERT INTO statements
            _lines = _strings([_id], [_labels[0]], 'uniprot', None)
            _file = _fact_file
        elif _field == _fields[1] or _field == _fields[4]:
            _parts = _line.split(';')
            _parts[0] = _parts[0][2:]
            _parts[0] = _parts[0].strip()
            _index = _fields.index(_field)
            _label = _labels[_index]
            for _part in _parts:
                _part = _convert(_part)
                if _part != '':
                    # creating SQL INSERT INTO statements
                    _lines = _strings([_id, _part], [_labels[0], _label], _label, None)
        elif _field == _fields[5] and _parts.pop(0) == 'GO;':
            _parts = _line.split(';')
            _parts.pop(0)
            _data = [_convert(_parts.pop(0))]
            _values = _parts.pop(0)
            _values = _values.split(':')
            _category = _values[0].strip()
            if _category == 'P':
                _data.append(_convert('Process'))
            elif _category == 'F':
                _data.append(_convert('Function'))
            elif _category == 'C':
                _data.append(_convert('Component'))
            else:
                assert False
            # providing consistency to SQL INSERT INTO statements
            _lines.append(_consistency(_data, ['go_id', 'category'], 'go'))
            _data.pop()
            _data.append(_convert(_values[1]))
            # providing consistency to SQL INSERT INTO statements
            _lines.append(_consistency(_data, ['go_id', 'go_term'], 'go_term'))        
            _data.pop()
            _values = _parts.pop(0)
            _values = _values.split(':')
            _data.append(_convert(_values[0]))
            _data.append(_replace('-'))
            _data.append(_convert(_values[1]))
            # providing consistency to SQL INSERT INTO statements
            _lines.append(_consistency(_data, ['go_id', 'evidence', 'qualifier', 'source'], 'evidence'))        
        elif _field in _fields:
            continue
        if not _file: 
            _file = _dimension_file
        # writing SQL statements
        for _line in _lines:
            os.write(_file, _line)
    os.close(_fact_file)
    # removing duplicate lines
    _deduplicate(_datasets['UniProtKB'], _fact_path, _dimension_path)
'''



In [7]:
# external helper module that processes the UniProtKB file
import unitprot_parser as upp

# NOTE(review): the same path is passed twice — presumably as input and as
# output, so the dataset file would be replaced in place; the meaning of the
# third argument is not visible here. Confirm against unitprot_parser.
upp.processUnitProtData(_datasets['UniProtKB'], _datasets['UniProtKB'], True)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 33: character maps to <undefined>

#### Connection

In [16]:
# connecting to database
def _connect(_configuration, _create = False):
    """Open an autocommit connection to PostgreSQL and report the server version.

    Parameters
    ----------
    _configuration : dict
        Keyword arguments for ``psycopg2.connect``; the ``'database'`` entry
        is also used in the progress message.
    _create : bool, optional
        When true, the script ``../database/database.sql`` is executed on the
        new connection.

    Returns
    -------
    connection
        The open psycopg2 connection. If any step after connecting fails, the
        connection is closed and the error is re-raised.
    """
    print('Connecting to the database \"' + _configuration['database'] + '\" in PostgreSQL')
    _connection = psycopg2.connect(**_configuration)
    try:
        _connection.set_session(autocommit = True)
        with _connection.cursor() as _cursor:
            print('Version of PostgreSQL:')
            _cursor.execute('SELECT version()')
            _version = _cursor.fetchone()
            print(_version)
        # creating tables
        if _create:
            # the context manager releases the file handle once it is read
            with open('../database/database.sql', 'r') as _schema:
                _script = _schema.read()
            with _connection.cursor() as _cursor:
                _cursor.execute(_script)
    except Exception:
        # the caller never receives the connection in this case, so it has to
        # be released here
        _connection.close()
        raise
    return _connection

#### Populating

In [17]:
# populating tables
def _populate(_path, _connection):
    with _connection.cursor() as _cursor:
        _cursor.execute('SELECT current_database()')
        _database = _cursor.fetchone()
        print('Populating the database \"' + _database[0] + '\" using the file \"' + _path + '\"')
        _cursor.execute(open(_path, 'r').read())

In [18]:
_connection = _connect(_configuration, _create = True)

# files to load, in the order they are executed; they are kept under a
# separate name so that `_datasets` remains a dictionary for any cell that is
# executed, or executed again, after this one
_scripts = [
    _datasets['Homo_sapiens.gene_info'],
    _datasets['gene2go'],
    _datasets['UniProtKB'],
]

# populating tables
for _dataset in _scripts:
    _populate(_dataset, _connection)

Connecting to the database "bioinformatics" in PostgreSQL


OperationalError: could not connect to server: Connection refused
	Is the server running on host "localhost" (::1) and accepting
	TCP/IP connections on port 5432?
could not connect to server: Connection refused
	Is the server running on host "localhost" (127.0.0.1) and accepting
	TCP/IP connections on port 5432?


In [None]:
# releasing the database connection once the tables have been populated
_connection.close()

In [None]:
 #
    
# dropping the project database
# The connection parameters defined earlier are reused instead of being typed
# in a second time; only the database changes, because PostgreSQL cannot drop
# the database the session is connected to.
_configuration = dict(_configuration, database = 'postgres')

_connection = _connect(_configuration, _create = False)
try:
    # the cursor is closed by its context manager
    with _connection.cursor() as _cursor:
        _cursor.execute('DROP DATABASE IF EXISTS "bioinformatics";')
finally:
    # the connection is released even when the statement fails
    _connection.close()