In [4]:
import os
import glob
import pandas as pd 

os.chdir('/mnt/BioHome/jreyna/jreyna-temp/projects/t1d-loop-catalog/')

In [6]:
#load all sgl data

fns = glob.glob('results/hg38/finemapping/sgls/*/*.5000.finemap_sgls.tsv')

all_data = []
for fn in fns:
    info = fn.split('/')

    genome = info[1]
    causaldb_fn = info[4]
    sample = info[5].rsplit('.', maxsplit=3)[0]

    tdf = pd.read_table(fn)
    tdf.loc[:, 'causaldb_fn'] = causaldb_fn
    tdf.loc[:, 'sample'] = sample

    all_data.append(tdf)

In [7]:
all_df = pd.concat(all_data)

In [8]:
# add meta information
causal_metadata_fn = 'workflow/qscripts/finemap/causal_db/init.gwas_study.causal_db.immune_select_samples.tsv'
causal_metadata = pd.read_table(causal_metadata_fn, header=None)

causal_metadata_mapper =  causal_metadata.iloc[:, [2, 8, 18]]
causal_metadata_mapper.columns = ['mesh_term', 'author', 'filename']

In [9]:
all_df = all_df.merge(causal_metadata_mapper, left_on='causaldb_fn', right_on='filename')

# Count the number of sgls per run

In [10]:
grps = all_df.groupby(['causaldb_fn', 'mesh_term', 'author', 'sample'])
counts = grps['chrA_loop'].count().to_frame()
counts.columns = ['num_sgls']

In [11]:
counts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,num_sgls
causaldb_fn,mesh_term,author,sample,Unnamed: 4_level_1
AT258,Psoriasis,Roslin Institute,CD4_Naive_1800-RH-1.phs001703v3p1.Homo_Sapiens.H3K27ac.b1,23
AT258,Psoriasis,Roslin Institute,CD4_Naive_1800-RH-1.phs001703v3p1.Homo_Sapiens.H3K27ac.b2,14
AT258,Psoriasis,Roslin Institute,CD4_Naive_1800-RH-1.phs001703v3p1.Homo_Sapiens.H3K27ac.b3,7
AT258,Psoriasis,Roslin Institute,CD4_Naive_1814-RH-1.phs001703v3p1.Homo_Sapiens.H3K27ac.b1,37
AT258,Psoriasis,Roslin Institute,CD4_Naive_1814-RH-1.phs001703v3p1.Homo_Sapiens.H3K27ac.b2,29
...,...,...,...,...
PH378,"Arthritis, Rheumatoid",Okada Y,Nonclassical_Monocyte_1786.phs001703v4p1.Homo_Sapiens.H3K27ac.b1,4
PH378,"Arthritis, Rheumatoid",Okada Y,Nonclassical_Monocyte_1786.phs001703v4p1.Homo_Sapiens.H3K27ac.b2,4
PH378,"Arthritis, Rheumatoid",Okada Y,Nonclassical_Monocyte_1786.phs001703v4p1.Homo_Sapiens.H3K27ac.b3,6
PH378,"Arthritis, Rheumatoid",Okada Y,Nonclassical_Monocyte_1786.phs001703v4p1.Homo_Sapiens.H3K27ac.b4,2


In [12]:
counts.describe()

Unnamed: 0,num_sgls
count,1524.0
mean,61.106955
std,112.265027
min,1.0
25%,7.0
50%,22.0
75%,58.0
max,1011.0


In [13]:
outfn = 'check.xlsx'
counts.to_excel(outfn)

# Count the number of sgls per run

In [14]:
mesh_grps = all_df.groupby(['mesh_term'])


In [15]:
mesh_df = mesh_grps.get_group('Diabetes Mellitus, Type 1')

In [16]:
mesh_df.shape

(60660, 22)

In [17]:
mesh_df.columns

Index(['chrA_loop', 'startA_loop', 'endA_loop', 'chrB_loop', 'startB_loop',
       'endB_loop', '-log10_qval_loop', 'chr_snp', 'start_snp', 'end_snp',
       'snp_anchor', 'chr_gene', 'start_gene', 'end_gene', 'genename',
       'geneid', 'strand', 'causaldb_fn', 'sample', 'mesh_term', 'author',
       'filename'],
      dtype='object')

In [18]:
mesh_df.drop_duplicates(['chrA_loop', 'chrB_loop', 'startA_loop', 'startB_loop', 'geneid', 'start_snp'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mesh_df.drop_duplicates(['chrA_loop', 'chrB_loop', 'startA_loop', 'startB_loop', 'geneid', 'start_snp'], inplace=True)


In [19]:
mesh_df.shape

(5657, 22)

## Load gene list


In [20]:
t1d_consensus_list_fn = '/mnt/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/gene_lists/consensus_gene_list.txt'
t1d_consensus_genes = pd.read_table(t1d_consensus_list_fn, header=None, names=['genename'])

print('The number of consensus genes is: {}'.format(t1d_consensus_genes.shape[0]))

The number of consensus genes is: 497


In [21]:
def print_genelist(genelist):
    print('\n'.join(list(genelist)))

In [22]:
t1d_df = mesh_grps.get_group('Diabetes Mellitus, Type 1')

In [23]:
uniq_genes = t1d_df.genename.unique()
print('The unique number of genes is: {}'.format(len(uniq_genes)))

The unique number of genes is: 591


In [31]:
print_genelist(sorted(uniq_genes))

ABT1
AC003043.1
AC003043.2
AC003071.2
AC003102.1
AC003688.1
AC003688.2
AC004264.2
AC004596.1
AC004832.2
AC004832.4
AC004832.5
AC004832.6
AC004865.2
AC004997.1
AC005593.1
AC005785.1
AC006441.1
AC006449.2
AC006449.6
AC006449.7
AC008649.2
AC008695.1
AC008755.1
AC009560.1
AC009690.1
AC016876.1
AC016876.2
AC017002.6
AC020743.1
AC020743.2
AC020765.1
AC020765.2
AC023509.3
AC026801.2
AC026954.1
AC026954.2
AC034102.3
AC034102.5
AC034102.6
AC034102.7
AC040977.2
AC055813.1
AC060780.1
AC060780.2
AC073896.1
AC073896.2
AC073896.4
AC098613.1
AC099782.2
AC104581.4
AC109326.1
AC113189.1
AC113189.4
AC113410.3
AC117382.2
AC124014.1
AC129492.1
AC137810.1
AC138894.1
AC138904.1
AC139887.2
AC145285.3
AC145285.6
ACADVL
ACAP1
ADARB1
ADPGK-AS1
ADPRHL2
AFG3L2
AGO3
AK3P5
AKAP8
AL021918.1
AL021918.3
AL022345.4
AL031777.1
AL031777.2
AL031777.3
AL109741.1
AL109809.3
AL121936.1
AL121936.2
AL121944.1
AL121972.1
AL133330.1
AL138787.1
AL139286.2
AL157402.1
AL157402.2
AL157823.2
AL353759.1
AL356215.1
AL356234.1
AL357060.

In [24]:
t1d_consensus_df = t1d_df.loc[t1d_df.genename.isin(t1d_consensus_genes.iloc[:, 0].tolist())]

In [25]:
uniq_consensus_genes = sorted(t1d_consensus_df.genename.unique().tolist())
print('The unique number of genes is: {}'.format(len(uniq_consensus_genes)))

The unique number of genes is: 35


In [26]:
print('\n'.join(uniq_consensus_genes))

ACAP1
ATP6V1G3
ATXN2L
BTN2A3P
CCR5
CCRL2
CLN3
HCP5
HLA-DMA
HLA-DMB
HLA-DPB1
HLA-DQB1
HLA-DQB1-AS1
HLA-DRA
HLA-DRB1
HLA-DRB5
HLA-DRB9
IATPR
IL27
IL7R
IRF4
ITPR3
NUPR1
POU5F1
PSMB2
PSMB8
PTPRC
RPS26
SBK1
SIRPG
SLC2A4
SOCS1
TAP1
TATDN3
TNFAIP3


## Checking Specific Genes

#### Checking IL2RA

In [27]:
il2ra_geneid = 'ENSG00000134460'
display(t1d_df.loc[t1d_df.genename=='IL2RA'])
t1d_df.loc[t1d_df.geneid==il2ra_geneid]

Unnamed: 0,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,-log10_qval_loop,chr_snp,start_snp,end_snp,...,start_gene,end_gene,genename,geneid,strand,causaldb_fn,sample,mesh_term,author,filename


Unnamed: 0,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,-log10_qval_loop,chr_snp,start_snp,end_snp,...,start_gene,end_gene,genename,geneid,strand,causaldb_fn,sample,mesh_term,author,filename


There are no SGL results for IL2RA. 

#### Checking Bach2

In [28]:
il2ra_geneid = 'ENSG00000112182'
display(t1d_df.loc[t1d_df.genename=='BACH2'])
t1d_df.loc[t1d_df.geneid==il2ra_geneid]

Unnamed: 0,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,-log10_qval_loop,chr_snp,start_snp,end_snp,...,start_gene,end_gene,genename,geneid,strand,causaldb_fn,sample,mesh_term,author,filename


Unnamed: 0,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,-log10_qval_loop,chr_snp,start_snp,end_snp,...,start_gene,end_gene,genename,geneid,strand,causaldb_fn,sample,mesh_term,author,filename


There are no SGL results for IL2RA. 