In [1]:
import os
import sys

sys.path.append('/mnt/0A2AAC152AABFBB7/CGE/luxgiant-dstream')

from luxgiant_dstream import plots
import pandas as pd
import numpy as np

In [2]:
# home path: absolute locations of the two tab-separated hg19 input tables.
# read_table uses a tab separator by default, so no explicit sep is needed.
df_gwas_top = pd.read_table('/mnt/0A2AAC152AABFBB7/data/testMiamiPlot/luxgiant_data_hg19.txt')
df_gwas_bottom = pd.read_table('/mnt/0A2AAC152AABFBB7/data/testMiamiPlot/known_results_hg19.txt')

In [3]:
def prepare_data(data_top:pd.DataFrame, data_bottom:pd.DataFrame)->pd.DataFrame:
    """Stack two tables and tag every row with the panel it came from.

    Parameters
    ----------
    data_top : pd.DataFrame
        Rows destined for the upper panel. Not modified.
    data_bottom : pd.DataFrame
        Rows destined for the lower panel. Not modified.

    Returns
    -------
    pd.DataFrame
        The rows of ``data_top`` followed by the rows of ``data_bottom``, with
        an extra ``split_by`` column holding ``'top'`` or ``'bottom'``.
        Original index labels are kept, so labels may repeat.
    """
    # assign() builds new frames instead of writing the column into the
    # caller's objects, so the raw input tables stay unchanged after the call.
    tagged_top = data_top.assign(split_by='top')
    tagged_bottom = data_bottom.assign(split_by='bottom')

    return pd.concat([tagged_top, tagged_bottom], axis=0)

def compute_relative_pos(data, chr_col='chr', pos_col='pos', p_col='p'):
    """Add a genome-wide position and a -log10(p) column to ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Table with a chromosome column, a position column and a p-value column.
    chr_col, pos_col, p_col : str
        Names of those three columns.

    Returns
    -------
    pd.DataFrame
        A new frame sorted by ``chr_col`` then ``pos_col`` with two extra
        columns: ``rel_pos`` (position shifted by the summed lengths of all
        preceding chromosomes) and ``log10p`` (``-log10`` of ``p_col``).
    """
    offset_col = 'cumulativechrlength'

    # The largest observed position on a chromosome is used as its length.
    per_chrom = data.groupby(chr_col).agg(chrlength=(pos_col, 'max')).reset_index()

    # Offset of a chromosome = running total of lengths, minus its own length.
    lengths = per_chrom['chrlength']
    per_chrom[offset_col] = lengths.cumsum() - lengths

    # Attach each row's chromosome offset, then order rows along the genome.
    ordered = (
        data
        .merge(per_chrom[[chr_col, offset_col]], on=chr_col)
        .sort_values(by=[chr_col, pos_col])
    )

    # Relative position of the probe/snp on the concatenated genome axis.
    ordered['rel_pos'] = ordered[pos_col] + ordered[offset_col]

    # The offset was only a helper; remove it before returning.
    result = ordered.drop(columns=[offset_col])

    result['log10p'] = -np.log10(result[p_col])

    return result

def find_chromosomes_center(data, chr_col='CHR', chr_pos_col='rel_pos'):
    """Compute the midpoint of each chromosome along the position axis.

    Parameters
    ----------
    data : pd.DataFrame
        Table containing ``chr_col`` and ``chr_pos_col``.
    chr_col : str
        Column identifying the chromosome of each row.
    chr_pos_col : str
        Column holding the position used to measure each chromosome's extent.

    Returns
    -------
    pd.DataFrame
        One row per chromosome, in order of first appearance in ``data``, with
        columns ``'CHR'`` (the chromosome label, always under this name
        whatever ``chr_col`` is) and ``'center'`` (mean of the smallest and
        largest ``chr_pos_col`` value of that chromosome). The index is a
        plain 0..n-1 range.
    """
    # One grouped pass replaces a full-table boolean mask per chromosome.
    # sort=False keeps first-appearance order, as iterating over unique() did.
    extent = (
        data
        .groupby(chr_col, sort=False, observed=True)[chr_pos_col]
        .agg(['min', 'max'])
    )

    # Building the frame in one go keeps numeric dtypes; filling cells one by
    # one with .loc left both columns as generic objects.
    axis_center = pd.DataFrame({
        'CHR': extent.index.to_numpy(),
        'center': ((extent['max'] + extent['min']) / 2).to_numpy(),
    })

    return axis_center

def process_miami_data(data_top:pd.DataFrame, data_bottom:pd.DataFrame,
                       chr_col:str='CHR', pos_col:str='bp', p_col:str='p')->tuple:
    """Prepare everything needed to draw a two-panel (top/bottom) plot.

    Parameters
    ----------
    data_top : pd.DataFrame
        Table for the upper panel.
    data_bottom : pd.DataFrame
        Table for the lower panel.
    chr_col, pos_col, p_col : str
        Names of the chromosome, position and p-value columns shared by both
        tables. The defaults match the previously hard-coded names.

    Returns
    -------
    tuple
        ``(df_top, df_bottom, axis_center, maxp)``: the processed rows of each
        panel, the per-chromosome axis centers, and the largest ``log10p``
        across both panels rounded up to the next integer.
    """
    # Both tables are stacked first so relative positions are computed on a
    # common axis for the two panels.
    data = prepare_data(data_top, data_bottom)

    data = compute_relative_pos(data, chr_col=chr_col, pos_col=pos_col, p_col=p_col)

    # Pass the chromosome column explicitly instead of relying on the helper's
    # default matching the name used here.
    axis_center = find_chromosomes_center(data, chr_col=chr_col)

    maxp = np.ceil(data['log10p'].max(skipna=True))

    # Split the combined table back into its two panels.
    df_top = data[data['split_by'] == 'top']
    df_bottom = data[data['split_by'] == 'bottom']

    return df_top, df_bottom, axis_center, maxp


In [4]:
# Run the full preparation on the two loaded tables: per-panel frames, the
# per-chromosome axis centers, and the rounded-up maximum of log10p.
df_top, df_bottom, axis, max_p = process_miami_data(df_gwas_top, df_gwas_bottom)

In [5]:
df_top

Unnamed: 0,CHR,bp,SNP,p,split_by,rel_pos,log10p
0,1,228748,rs1318290174,0.265150,top,228748,0.576508
1,1,228762,rs1302506942,0.489098,top,228762,0.310604
2,1,228763,rs1377009769,0.991466,top,228763,0.003722
3,1,229026,rs1190394566,0.755978,top,229026,0.121491
4,1,662622,rs61769339,0.355696,top,662622,0.448921
...,...,...,...,...,...,...,...
8395158,22,51228888,rs201882178,0.424731,top,2880079681,0.371886
8395159,22,51228910,rs80012748,0.202881,top,2880079703,0.692759
8395160,22,51229805,rs9616985,0.766148,top,2880080598,0.115687
8395161,22,51234343,rs374867791,0.216359,top,2880085136,0.664825


In [6]:
df_bottom

Unnamed: 0,CHR,bp,SNP,p,split_by,rel_pos,log10p
16996521,1,54591,chr1:54591,0.44350,bottom,54591,0.353106
10163717,1,54676,chr1:54676,0.12870,bottom,54676,0.890421
24671282,1,79188,chr1:79188,0.44590,bottom,79188,0.350763
14849379,1,82994,chr1:82994,0.49540,bottom,82994,0.305044
15189204,1,86028,chr1:86028,0.07745,bottom,86028,1.110979
...,...,...,...,...,...,...,...
13491618,22,51234163,chr22:51234163,0.36860,bottom,2880084956,0.433445
21467837,22,51234199,chr22:51234199,0.93880,bottom,2880084992,0.027427
14789004,22,51237063,chr22:51237063,0.81320,bottom,2880087856,0.089803
15234610,22,51238513,chr22:51238513,0.36150,bottom,2880089306,0.441892


In [7]:
axis

Unnamed: 0,CHR,center
0,1,124646555.5
1,2,370837124.0
2,3,591427789.0
3,4,785889859.5
4,5,971851970.5
5,6,1147857059.0
6,7,1312922684.5
7,8,1465642237.5
8,9,1609361314.0
9,10,1747719850.5


In [8]:
max_p

np.float64(58.0)