# Gene Expression Data Preprocessing<a id='top'></a>
**Sections:**<br>
[0) Description](#0)<br>
[1) Importing Modules and Packages](#1)<br>
[2) Configuration](#2)<br>
[3) Loading Gene Ontology](#3)<br>
[4) Loading Genes and Annotations](#4)<br>
[5) Loading Gene Expression data](#5)<br>
[6) Computing Absolute Pearson Correlation](#6)<br>
[7) Saving the Results](#7)<br>


## Description<a id='0'></a>

**Aim:** This Jupyter notebook produces the gene expression data of the species of interest (e.g., _human_ or *yeast*) with which the deepSimDEF networks are trained and evaluated.

---
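**Output file format:** (tab separated, preceded by a `Gene_1`, `Gene_2`, `Expression_Value` header line)<br>
_gene1_ _gene2_ _coexpression_correlation_ (absolute Pearson correlation after the Fisher z-transformation) <br>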
CD44 ARHGEF1 0.56<br>
POLR2G CTDP1 0.21<br>
OR5L2 SLC7A11 0.11<br>
SCNM1 CEP120 0.99<br>
... <br>

---
Files needed for this preprocessing are:

 * **Gene ontology:** ['go.obo' file](http://current.geneontology.org/ontology/go.obo)<br><br>
 
 * **Association files:** [gene association files ingested from GO Consortium members](http://current.geneontology.org/products/pages/downloads.html)
  * **Human** - [Gene Association file (Homo sapiens)](http://geneontology.org/gene-associations/goa_human.gaf.gz)
  * **Yeast** - [Gene Association file (Saccharomyces cerevisiae)](http://current.geneontology.org/annotations/sgd.gaf.gz)<br><br>
  
 * **Gene Expression data:** <br>
         
     * **Human** - [GenEx (Homo sapiens)](https://multid.se/genex/) (normalization should be applied to the data)<br>
         
     * **Yeast** - [Eisen et al. (Saccharomyces cerevisiae)](http://www.i3s.unice.fr/~pasquier/web/userfiles/downloads/datasets/EisenYeastData_Measures.txt) <br>

---
[back to top](#top)<br>

## Import<a id='1'></a>
[back to top](#top)<br>

In [None]:
import pandas as pd
import numpy as np
import os
import requests
import easydict
import linecache
import pprint
import random
import itertools

# shared pretty-printer (4-space indent) used by later cells to display nested dictionaries
pp = pprint.PrettyPrinter(indent=4)

## Configuration<a id='2'></a>
[back to top](#top)<br>

In [None]:
species = 'yeast'  # species of interest to load the data of and to save the results for

# Per-species input files. Every name is defined for every supported species so
# that later cells never hit an undefined variable.
if species == 'human':
    association_file_name = 'goa_human.gaf.gz'  # human
    association_file_url = 'http://geneontology.org/gene-associations/goa_human.gaf.gz'
    expression_file = 'WholeBlood.Gene.Filter6_20.TPM10_20.normalizedLimmaVoom.txt'
    expression_url = None  # no direct download; the file has to be provided manually
elif species == 'yeast':
    association_file_name = 'sgd.gaf.gz'  # yeast
    association_file_url = 'http://current.geneontology.org/annotations/sgd.gaf.gz'
    expression_file = 'EisenYeastData_Measures.txt'
    expression_url = 'http://www.i3s.unice.fr/~pasquier/web/userfiles/downloads/datasets/EisenYeastData_Measures.txt'
else:
    # fail early instead of with a NameError in a later cell
    raise ValueError(f"Unsupported species '{species}'; expected 'human' or 'yeast'.")

args = easydict.EasyDict({
    "go_dir": 'gene_ontology/raw/',     # directory of the Gene Ontology 'go.obo' file
    "association_file_dir": 'species/{}/association_file/raw'.format(species),  # directory of the species' association file
    "gene_expression_raw_dir": 'species/{}/gene_expression/raw'.format(species),          # directory of the raw gene expression data
    "result_gene_ontology_dir": 'species/{}/gene_expression/processed'.format(species),   # directory in which the results are saved
    "max_num_pairs": -1,            # maximum number of pairs randomly chosen (-1 means all)
    "download_gene_ontology": True,    # download the latest version of gene ontology into the directory specified above
    "download_association_file": True,  # download the association file of the species of interest into the directory specified above
    #"threshold": 0.8,                   # absolute pearson correlations below this cutoff point would be removed
    "seed": 2021                         # seed that makes the random selection of pairs reproducible
})

os.makedirs(args.result_gene_ontology_dir, exist_ok=True)  # create 'result_gene_ontology_dir' folder (if it does not exist already)

# seed both random number generators used by this notebook
np.random.seed(args.seed)
random.seed(args.seed)

# single-letter aspect code -> sub-ontology abbreviation
subontology_map = {"C": "CC", "P": "BP", "F": "MF"}

#### asserting raw data exist

In [None]:
# display the path at which the raw gene expression file is expected
f"{args.gene_expression_raw_dir}/{expression_file}"

In [None]:
# Make sure the raw expression file is available: the yeast file is downloaded
# on demand, the human file has to be placed in the directory by the user.
expression_path = f"{args.gene_expression_raw_dir}/{expression_file}"
if species == 'yeast':
    if not os.path.exists(expression_path):
        os.makedirs(args.gene_expression_raw_dir, exist_ok=True)  # create 'gene_expression_raw_dir' folder (if it does not exist already)
        r = requests.get(expression_url, allow_redirects=True)
        r.raise_for_status()  # do not store an HTTP error page as if it were data
        with open(expression_path, 'wb') as fw:  # the handle is closed even if the write fails
            fw.write(r.content)
elif species == 'human':
    if not os.path.exists(expression_path):
        # an explicit exception (unlike 'assert') is not stripped under 'python -O'
        raise FileNotFoundError(
            f"\nYou need to download the expression file first using the link and guideline provided above! \nPut the {expression_file} file in '{args.gene_expression_raw_dir}/' directory."
        )

## Loading Gene Ontology<a id='3'></a>
[back to top](#top)<br>

In [None]:
# Optionally fetch the current 'go.obo' and report the release line of the local copy.
go_obo_path = '{}/go.obo'.format(args.go_dir)
if args.download_gene_ontology:
    os.makedirs(args.go_dir, exist_ok=True)  # create 'go_dir' folder (if it does not exist already)
    print("Downloading the latest version of Gene Ontology into '{}'...".format(args.go_dir))
    url = 'http://current.geneontology.org/ontology/go.obo'
    r = requests.get(url, allow_redirects=True)
    r.raise_for_status()  # do not overwrite the ontology with an HTTP error page
    with open(go_obo_path, 'wb') as fw:  # the handle is closed even if the write fails
        fw.write(r.content)

# linecache keeps file contents in memory; drop a stale entry in case the file
# was just re-downloaded in the same session
linecache.checkcache(go_obo_path)
print("Gene Ontology {}".format(linecache.getline(go_obo_path, 2)))  # second line of the file, e.g. releases/2020-10-09

In [None]:
"""Reading Gene Ontology to extract Terms and their Descriptive Names"""
# Load the ontology file as one string, keep only the part before the first
# "[Typedef]" marker and cut it into one chunk per "[Term]" marker.
go_obo_file = "{}/go.obo".format(args.go_dir)
with open(go_obo_file) as f:
    ontology_text = f.read()
terms_text = ontology_text.partition("[Typedef]")[0]
content = terms_text.split("[Term]")
print("Information of the last GO term in the file:\n~~~~~~~~~~~~~~~~~~~~~~~~~{}".format(content[-1]))

In [None]:
"""Going through every GO term and extract information needed ('id', 'alt_id', 'namespace', and 'is_obsolete')"""
# go_term_dict maps every GO id to a dict holding its 'alt_id' list, its
# 'namespace' and (when present) its 'is_obsolete' value as read from the file.
go_term_dict = {}
for stanza in content:
    go_id = ''
    for line in stanza.split("\n"):
        # primary identifier: opens a new entry
        if line.startswith("id: GO:"):
            go_id = line.split("id: ")[1]
            go_term_dict[go_id] = {}
        # alternative identifiers: a term may list several of them
        if line.startswith("alt_id:"):
            go_term_dict[go_id].setdefault("alt_id", []).append(line.split("alt_id: ")[1])
        # single-valued fields, stored under their own name
        for field in ("namespace", "is_obsolete"):
            if line.startswith(field + ":"):
                go_term_dict[go_id][field] = line.split(field + ": ")[1]

In [None]:
"""printing how the key:values are organized for every GO term"""
# Show (up to) the first 15 parsed GO terms. islice walks the dictionary once
# instead of rebuilding the key list twice per iteration, and it simply stops
# early when fewer than 15 terms are available.
for go_id, go_term_info in itertools.islice(go_term_dict.items(), 15):
    print(go_id, end=": ")
    pp.pprint(go_term_info)

In [None]:
"""grouping GO terms based on the sub-ontologies they belong to"""
# Key: upper-cased first letter of the second word of the namespace
# (the part after the first underscore); value: list of non-obsolete GO ids.
subontology_go_term_dict = {}
for go_id, term_info in go_term_dict.items():
    if term_info.get('is_obsolete', False):
        continue  # obsolete terms are left out
    subontology_letter = term_info['namespace'].split('_')[1][0].upper()
    subontology_go_term_dict.setdefault(subontology_letter, []).append(go_id)

In [None]:
"""including 'alt_id' into the sub-ontology's groups of GO terms"""
# Append every alternative id to the list of the sub-ontology its term belongs to.
for term_info in go_term_dict.values():
    alt_ids = term_info.get('alt_id', False)
    if not alt_ids:
        continue  # term without alternative ids
    subontology_letter = term_info['namespace'].split('_')[1][0].upper()
    subontology_go_term_dict[subontology_letter].extend(alt_ids)

In [None]:
"""printing how the key:values are organized for different sub-ontologies"""
# One summary line per sub-ontology: first three ids, total count, last three ids.
summary_template = "{} ({}):: {} <= {} GO term (with 'alt_id') => {}"
for subontology, go_terms in subontology_go_term_dict.items():
    leading_terms = " ".join(go_terms[:3])
    trailing_terms = " ".join(go_terms[-3:])
    print(summary_template.format(
        subontology,
        subontology_map[subontology],
        leading_terms,
        len(go_terms),
        trailing_terms))

## Loading Genes and Annotations<a id='4'></a>
[back to top](#top)<br>

In [None]:
# Optionally fetch the association (GAF) file of the selected species.
if args.download_association_file:
    os.makedirs(args.association_file_dir, exist_ok=True)  # create 'association_file_dir' folder (if it does not exist already)
    print("Downloading the latest version of association file into '{}'...".format(args.association_file_dir))
    r = requests.get(association_file_url, allow_redirects=True)
    r.raise_for_status()  # do not store an HTTP error page as if it were the annotation file
    association_path = '{}/{}'.format(args.association_file_dir, association_file_name)
    with open(association_path, 'wb') as fw:  # the handle is closed even if the write fails
        fw.write(r.content)
print("Done!")

In [None]:
# Load the association file ('!' lines are comments) with every field as a string.
df = pd.read_csv("{}/{}".format(args.association_file_dir, association_file_name), sep='\t', comment="!", skip_blank_lines=True, header=None, dtype=str)
# Columns kept (0-based GAF positions): 1 = DB object id, 2 = DB object symbol,
# 3 = qualifier, 4 = GO id, 6 = evidence code, 8 = aspect (sub-ontology letter).
df = df.iloc[:, [1, 2, 3, 4, 6, 8]]
if df[3].isnull().any():
    # some rows carry no qualifier: keep only those unqualified rows
    df = df[df[3].isnull()]
else:
    # every row carries a qualifier: discard the negated ("NOT") annotations
    df = df[~df[3].str.contains("NOT")]
# The qualifier column has to go BEFORE dropna(); otherwise the rows selected
# for their missing qualifier would all be discarded again.
df = df.drop(columns=[3])
df = df.dropna().reset_index(drop=True)
df

In [None]:
"""keeping track of the gene ids and their mappings"""
# Maps each value of column 2 to the value of column 1 found on the same row;
# when a column-2 value occurs on several rows the last row wins.
protein_gene_id_map = dict(zip(df[2], df[1]))

##### removing 'ND' and 'IEA' annotations

In [None]:
# Keep only the rows whose evidence code (column 6) is neither 'ND' nor 'IEA'.
excluded_evidence_codes = ['ND', 'IEA']
df = df[~df[6].isin(excluded_evidence_codes)].reset_index(drop=True)
df

In [None]:
"""protein dictionary to keep track of annotations for proteins (from each sub-ontology)"""
# Testing membership against the per-sub-ontology lists is linear in their
# length; turn each list into a set once so every row is checked in O(1).
go_term_sets = {
    subontology: set(go_terms)
    for subontology, go_terms in subontology_go_term_dict.items()
}

# gene (column 1) -> sub-ontology letter (column 8) -> set of GO ids (column 4);
# only GO ids known for the row's sub-ontology are recorded.
proteins_dict = {}
for gene, go_term_id, subontology in zip(df[1], df[4], df[8]):
    if go_term_id in go_term_sets[subontology]:
        proteins_dict.setdefault(gene, dict()).setdefault(subontology, set()).add(go_term_id)

"""printing how the key:values are organized for every gene/protein"""
# islice stops early instead of failing when fewer than 5 genes are present
for gene, annotations in itertools.islice(proteins_dict.items(), 5):
    print(gene, end=": ")
    pp.pprint(annotations)
print("\nTotal number of genes/proteins annotated:", len(proteins_dict))

#### Taking into account only fully annotated genes/proteins

In [None]:
"""keeping track of fully annotated genes/proteins"""
# A protein counts as fully annotated when it has entries for three sub-ontologies.
fully_annotated_proteins_wo_iea = [
    protein
    for protein, annotations in proteins_dict.items()
    if len(annotations) == 3
]
print("Out of {} proteins {} are (experimentally or manually) annotated by all three sub-ontologies.".format(len(proteins_dict), len(fully_annotated_proteins_wo_iea)))

## Loading Gene Expression data<a id='5'></a>
[back to top](#top)<br>

#### Loading species raw gene expression data

In [None]:
# Read the tab-separated expression table and cut every identifier of the first
# column at its first '.' (useful for human).
raw_expression_path = "{}/{}".format(args.gene_expression_raw_dir, expression_file)
df_gene_expression = pd.read_csv(raw_expression_path, sep='\t')
identifiers = df_gene_expression.iloc[:, 0]
df_gene_expression.iloc[:, 0] = [identifier.partition(".")[0] for identifier in identifiers]
df_gene_expression

#### Imputing the missing values if needed (using _fancyimpute_ package) 
https://github.com/iskandr/fancyimpute <br><br>
<code>pip install fancyimpute</code>

In [None]:
if species == 'yeast':
    from fancyimpute import KNN

    # every column except the first one (the identifiers) goes into the imputer
    XY_incomplete = df_gene_expression.to_numpy()[:, 1:]
    # fill the missing entries with KNN (k=10), then round to two decimals
    XY_filled_knn = np.round(KNN(k=10).fit_transform(XY_incomplete), 2)

In [None]:
if species == 'yeast':
    # replacing the incomplete dataframe with the imputed one
    # (only the columns after the first are overwritten; the first column is left untouched)
    df_gene_expression.iloc[:, 1:] = XY_filled_knn
df_gene_expression  # shown as the cell output

In [None]:
if species == 'human':
    import urllib.parse
    import urllib.request

    # identifier sent in the query -> identifier returned by the mapping service
    GENENAME_ids = {}

    # NOTE(review): this is UniProt's legacy 'uploadlists' endpoint; confirm that
    # it is still served before relying on this cell.
    url = 'https://www.uniprot.org/uploadlists/'

    # identifiers are cut at their first '.' before being sent
    query_ids = [name.split(".")[0] for name in df_gene_expression.Name]
    params = {
        'from': 'ENSEMBL_ID',
        'to': 'ACC',
        'format': 'tab',
        'query': " ".join(query_ids),
    }

    data = urllib.parse.urlencode(params).encode('utf-8')
    req = urllib.request.Request(url, data)
    with urllib.request.urlopen(req) as f:
        response = f.read()

    # the first line of the tab-separated answer is skipped; every other
    # line holds exactly one "<from>\t<to>" pair (later pairs overwrite earlier ones)
    mapping_lines = response.decode('utf-8').strip().split("\n")
    for mapping in mapping_lines[1:]:
        id1, id2 = mapping.split("\t")
        GENENAME_ids[id1] = id2

In [None]:
if species == 'human':
    # keep the rows whose name has a mapping (prefix the mask with ~ for the negation)
    has_mapping = df_gene_expression['Name'].isin(GENENAME_ids)
    df_gene_expression = df_gene_expression[has_mapping].reset_index(drop=True)
    # replace every remaining name with its mapped identifier
    df_gene_expression.Name = [GENENAME_ids[name] for name in df_gene_expression.Name]
df_gene_expression

#### Removing proteins without complete annotation

In [None]:
# Keep the rows whose first column is a fully annotated protein
# (prefix the mask with ~ for the negation).
is_fully_annotated = df_gene_expression.iloc[:, 0].isin(fully_annotated_proteins_wo_iea)
df_gene_expression = df_gene_expression[is_fully_annotated].reset_index(drop=True)
df_gene_expression


## Computing Absolute Pearson Correlation<a id='6'></a>
[back to top](#top)<br>

In [None]:
# np.corrcoef treats every row as one variable by default, so each row of the
# expression matrix (all columns but the first) is correlated with every other
# row; np.abs then discards the sign of the coefficients.
rpearson = np.abs(np.corrcoef(df_gene_expression.iloc[:, 1:].to_numpy()))

In [None]:
# row position in the expression table -> value of the first column of that row
seq_list = dict(enumerate(df_gene_expression.iloc[:, 0].to_numpy()))
print(seq_list)

## Saving the Results<a id='7'></a>
[back to top](#top)<br>

In [None]:
# Every unordered pair of row indices, then a random selection of them.
list_of_pairs_full = list(itertools.combinations(seq_list, 2))
num_pairs_full = len(list_of_pairs_full)
print("Full length of pairs:", num_pairs_full)

if args.max_num_pairs == -1:
    # -1 means "use every pair"
    args.max_num_pairs = num_pairs_full
elif args.max_num_pairs < 0:
    raise ValueError(f"'max_num_pairs' must be -1 or a non-negative number, got {args.max_num_pairs}.")

# Honour the configured maximum (previously every pair was selected regardless
# of 'max_num_pairs'); never ask for more pairs than exist.
num_pairs_selected = min(args.max_num_pairs, num_pairs_full)
selected_indices = np.random.choice(num_pairs_full, num_pairs_selected, replace=False)
list_of_pairs = [list_of_pairs_full[i] for i in selected_indices]

In [None]:
file = f'{species}_gene_expression.tsv'
print(f"Saving data into the file '{file}' with the binning strategy.")

# The top bin (r >= 0.8) is written completely; each of the lower bins below
# contributes at most as many pairs as the top bin did.
lower_bins = ((0.6, 0.8), (0.4, 0.6), (0.2, 0.4), (0.0, 0.2))
row_template = "{}\t{}\t{}\n"

with open(f"{args.result_gene_ontology_dir}/{file}", 'w') as fw:
    fw.write("Gene_1\tGene_2\tExpression_Value\n")

    # The stored value is the Fisher z-score of the correlation, i.e. its
    # inverse hyperbolic tangent (arctanh), rounded to 8 decimals.
    # NOTE(review): arctanh diverges at 1, so a pair with r == 1 would be
    # written as 'inf' - confirm that this is acceptable downstream.
    k = 0  # number of pairs written for the top bin
    for first, second in list_of_pairs:
        r = rpearson[first, second]
        if r >= 0.8:
            k += 1
            fw.write(row_template.format(seq_list[first], seq_list[second], np.round(np.arctanh(r), 8)))

    for lower, upper in lower_bins:
        k2 = 0  # number of pairs written for the current bin
        for first, second in list_of_pairs:
            if k2 >= k:
                break  # this bin already holds as many pairs as the top bin
            r = rpearson[first, second]
            if lower <= r < upper:
                k2 += 1
                fw.write(row_template.format(seq_list[first], seq_list[second], np.round(np.arctanh(r), 8)))


[back to top](#top)<br>

In [None]:
# Read the saved pairs back from the configured result directory (instead of a
# hard-coded copy of that path, which would go stale when the configuration
# changes); every field is kept as a string.
df = pd.read_csv(f"{args.result_gene_ontology_dir}/{species}_gene_expression.tsv", sep="\t", dtype=str)
df

In [None]:
# Every gene that occurs in either column of the saved pairs, written one per
# line in sorted order.
ge_genes = set(df.Gene_1).union(df.Gene_2)
print(f"Number of {species} genes:", len(ge_genes))
with open(f'{args.result_gene_ontology_dir}/{species}_gene_expression_genes.tsv', 'w') as fw:
    fw.writelines(f"{gene}\n" for gene in sorted(ge_genes))

---