# Association Analysis using a generalized linear model (GLM)

This notebook is a guide on how to use the library `IDEAL-GENOM` to perform a genome-wide association study (GWAS). The cornerstone of the proposed analysis is a generalized linear model (GLM).

In [None]:
import sys
import os

# Put the parent directory of the notebook on the module search path,
# unless an identical entry is already present.
library_path = os.path.abspath('..')
if sys.path.count(library_path) == 0:
    sys.path.append(library_path)

from ideal_genom.gwas.gwas_glm import GWASfixed

In the next widgets the user must input the paths and filenames needed to perform the GWAS.

1. `input_path`: folder with the input data. The pipeline assumes that the files are `.bed`, `.bim`, `.fam` files;
2. `input_name`: prefix of the `PLINK` binary files;
3. `output_path`: folder to output the results;
4. `output_name`: the prefix of the output files.

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Interactive text boxes for the input/output locations.
def _path_field(label):
    """Return an empty text box labelled *label*, laid out at half width."""
    return widgets.Text(
        value='',
        description=label,
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='50%'),
    )


input_path = _path_field('Path to input PLINK binary files:')
input_name = _path_field('Prefix of PLINK binary files:')
output_path = _path_field('Path to output files:')
output_name = _path_field('Name of the resulting files:')

# Render the four boxes, in this order, below the cell.
display(input_path, input_name, output_path, output_name)

def get_params():
    """Return the current text of the four path widgets.

    The result is a tuple ordered as
    ``(input_path, input_name, output_path, output_name)``.
    """
    fields = (input_path, input_name, output_path, output_name)
    return tuple(field.value for field in fields)

In [None]:
# Read back what was typed into the widgets and echo it for a quick check.
path_params = get_params()
for _label, _value in zip(
    ('input_path', 'input_name', 'output_path', 'output_name'),
    path_params,
):
    print(f'{_label}: ', _value)

With this information we can initialize the class `GWASfixed`.

In [None]:
# Map the four user-supplied locations onto the constructor keywords,
# then build the pipeline object.
gwas_init_kwargs = {
    'input_path': path_params[0],
    'input_name': path_params[1],
    'output_path': path_params[2],
    'output_name': path_params[3],
}
gwas_glm = GWASfixed(**gwas_init_kwargs)

In the next widgets, please provide the parameters needed to execute the pipeline.

1. `maf`: minor allele frequency;
2. `mind`: individual missing rate;
3. `hwe`: Hardy-Weinberg equilibrium;
4. `ci`: confidence interval;
5. `gtf_path`: path to alternative gtf file, default value is `None`;
6. `build`: build of the human genome, possible values are `37` and `38`;
7. `anno_source`: source for the annotations, possible values are `ensembl` or `refseq`. 

In [None]:
def _param_field(widget_cls, description, **kwargs):
    """Build a half-width widget of type *widget_cls* labelled *description*.

    Extra keyword arguments (``value``, ``options``, ...) are forwarded to the
    widget constructor unchanged.
    """
    return widget_cls(
        description=description,
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='50%'),
        **kwargs,
    )


# Numeric thresholds for the association analysis.
maf = _param_field(widgets.FloatText, 'Minor Allele Frequency:', value=0.01)
mind = _param_field(widgets.FloatText, 'Individual missing rate:', value=0.1)
hwe = _param_field(widgets.FloatText, 'Hardy-Weinberg Equilibrium:', value=5e-8)
ci = _param_field(widgets.FloatText, 'Confidence interval:', value=0.95)

# Optional path to an alternative GTF file. A Text widget only holds strings,
# so "not provided" is represented by the empty string rather than None.
gtf_path = _param_field(widgets.Text, 'Path to GTF file:', value='')

# `build` and `anno_source` only accept a closed set of values, so offer a
# drop-down instead of free text: a typo can no longer reach the pipeline.
# `.value` is still a string, exactly as with the former Text widgets.
build = _param_field(
    widgets.Dropdown,
    'Genome build:',
    options=['37', '38'],
    value='38',
)
anno_source = _param_field(
    widgets.Dropdown,
    'Annotation source:',
    options=['ensembl', 'refseq'],
    value='ensembl',
)

display(maf, mind, hwe, ci, gtf_path, build, anno_source)

def get_gwas_params():
    """Collect the current values of the GWAS parameter widgets.

    Returns
    -------
    dict
        Keys ``maf``, ``hwe``, ``mind``, ``ci``, ``gtf_path``, ``build`` and
        ``anno_source``. ``gtf_path`` is ``None`` when the text box is blank;
        every other entry is the widget's value as-is.
    """
    # A text box always yields a string, so an untouched field gives '' and
    # never None. Map a blank (or whitespace-only) entry to None, which the
    # notebook documents as the default for `gtf_path`.
    gtf_value = (gtf_path.value or '').strip() or None

    return {
        'maf': maf.value,
        'hwe': hwe.value,
        'mind': mind.value,
        'ci': ci.value,
        'gtf_path': gtf_value,
        'build': build.value,
        'anno_source': anno_source.value,
    }

In [None]:
# Snapshot the widget values now; later edits to the widgets are not picked
# up unless this cell is run again.
gwas_params = get_gwas_params()
# Bare expression on the last line of the cell, so the notebook shows the dict.
gwas_params

Execute the pipeline steps.

In [None]:
# Keyword arguments for each stage, picked out of the collected parameters.
model_kwargs = {key: gwas_params[key] for key in ('maf', 'mind', 'hwe', 'ci')}
annotation_kwargs = {
    key: gwas_params[key] for key in ('gtf_path', 'build', 'anno_source')
}

# Stage name -> (bound pipeline method, keyword arguments for that method).
gwas_steps = {
    'train_model': (gwas_glm.fixed_model_association_analysis, model_kwargs),
    'top_hits': (gwas_glm.get_top_hits, {'maf': gwas_params['maf']}),
    'annotate_hits': (gwas_glm.annotate_top_hits, annotation_kwargs),
}

# Human-readable title printed before each stage runs.
step_description = {
    'train_model': 'Train the model',
    'top_hits': 'Get top hits',
    'annotate_hits': 'Annotate top hits',
}

# Stages run in insertion order; an exception raised by one stage propagates
# and the remaining stages are not executed.
for name in gwas_steps:
    func, params = gwas_steps[name]
    # ANSI escape codes: bold on, title, reset.
    print(f"\033[1m{step_description[name]}.\033[0m")
    func(**params)