In [52]:
import numpy as np
import pandas as pd
import os
import shutil
import sklearn.decomposition
# import qtl_config_utils
import sys

In [53]:
# outdir = "/hps/nobackup/stegle/users/acuomo/all_scripts/sc_neuroseq/eqtl/timepoint_D11/input_files/celltype_FPP/"
# outdir = "/hps/nobackup/stegle/users/acuomo/all_scripts/sc_neuroseq/eqtl/timepoint_D11/input_files/celltype_P_FPP/"
outdir = "/hps/nobackup/stegle/users/acuomo/all_scripts/sc_neuroseq/eqtl/timepoint_D11/input_files/celltype_NB/"

In [54]:
# general set up
genotypes_file = '/hps/nobackup/hipsci/scratch/genotypes/imputed/REL-2018-01/Full_Filtered_Plink-f/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.recode.filtered'
kinship_file = '/hps/nobackup/hipsci/scratch/genotypes/imputed/REL-2018-01/Full_Filtered_Plink-f/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.recode.filtered.rel'
annotation_file = '/hps/nobackup/hipsci/scratch/singlecell_endodiff/data_processed/scQTLs/annos/ensembl_gene_id_annos.tsv'
#annotation_file = '/hps/nobackup/hipsci/scratch/singlecell_endodiff/data_processed/scQTLs/annos/hgnc_symbol_annos.tsv'
chunk_file = '/nfs/leia/research/stegle/mjbonder/ChunkFiles/Ensembl_75_Limix_Annotation_FC_Gene_step100.txt'
n_pcs = 15
base_dir = '/hps/nobackup/stegle/users/acuomo/all_scripts/sc_neuroseq/eqtl/timepoint_D11/'

In [55]:
sample_mapping_file = os.path.join(outdir, 'samplemapping.tsv')
phenotype_file = os.path.join(outdir, 'phenotypes.tsv')
covariates_file = os.path.join(outdir, 'covariates.tsv')
# noise_matrix_file = os.path.join(outdir, 'noise_matrix.tsv')
# config_file = os.path.join(outdir, 'qtl_config.yaml')

In [56]:
# eQTL discovery parameters
#number of top expressed genes to test
n_genes = 50000

In [57]:
if not os.path.exists(outdir):
    os.makedirs(outdir, exist_ok=True)

In [58]:
mydir = '/nfs/leia/research/stegle/dseaton/hipsci/singlecell_neuroseq/data/data_processed/'

# ## FPP
# D11 = 'pool1_13_noddd_D11/pool1_13_noddd_D11.scanpy.w_metadata.w_celltype.scanpy.celltype.FPP.groupedby.donor_id-celltype.mean.tsv'

# ## P_FPP
# D11 = 'pool1_13_noddd_D11/pool1_13_noddd_D11.scanpy.w_metadata.w_celltype.scanpy.celltype.P_FPP.groupedby.donor_id-celltype.mean.tsv'

## NB
D11 = 'pool1_13_noddd_D11/pool1_13_noddd_D11.scanpy.w_metadata.w_celltype.scanpy.celltype.NB.groupedby.donor_id-celltype.mean.tsv'

In [59]:
phenotype_input_file = mydir + D11

In [60]:
# phenotype_input_file = sys.argv[1]
# outdir = sys.argv[2]
# if len(sys.argv)>=4:
#     subset_field = sys.argv[3]
#     selected_subset = sys.argv[4]
# else:
#     subset_field = None
#     selected_subset = None

In [61]:
#load map HGNC to ensembl
mapping_df = pd.read_csv('/nfs/leia/research/stegle/dseaton/genomes/hg19/annotation/geneid_mappings/hgnc_symbol2ensembl_gene_id.txt', sep='\t')

In [62]:
# process phenotype dataframe (untreated/untreated)
groupedby_df = pd.read_csv(phenotype_input_file, sep='\t')
# groupedby_df = groupedby_df.query('treatment=="NONE"')
# groupedby_df = groupedby_df.query('treatment=="ROT"')

In [63]:
# #subset columns
# if selected_subset is not None:
#     phenotype_df = groupedby_df.query('{}==@selected_subset'.format(subset_field))
# else:
#     phenotype_df = groupedby_df.copy()

In [64]:
phenotype_df = groupedby_df.copy()

In [65]:
# should only be cells from one celltype
assert(len(phenotype_df['celltype'].drop_duplicates())==1)

In [66]:
#create merged index
phenotype_df['index'] = phenotype_df['donor_id']
cell_count_ds = phenotype_df['n_cells']
cell_count_ds.index = phenotype_df['index'].tolist()

In [67]:
#take donor and pool cols for samplemapping df
samplemapping_df = phenotype_df[['donor_id','index']]
samplemapping_df.to_csv(sample_mapping_file, sep='\t', index=False, header=False)

In [68]:
#reorganise to just be expression data indexed by the merged index
cols_to_drop = list(set(phenotype_df.columns) & {'donor_id','celltype','n_cells','treatment','pool_id'})
phenotype_df = phenotype_df.drop(cols_to_drop, axis=1).set_index('index')
phenotype_df = phenotype_df.transpose()

In [69]:
#number of top expressed genes
selected_genes = list(phenotype_df.mean(axis=1).nlargest(n_genes).index)
phenotype_df = phenotype_df.loc[selected_genes, :]

In [70]:
phenotype_list = phenotype_df.index
mapping_df = mapping_df.query('hgnc_symbol in @phenotype_list')
mapping_df = mapping_df.drop_duplicates(subset=['hgnc_symbol'])
mapping_df = mapping_df.set_index('hgnc_symbol')

In [71]:
# limit only to hgnc symbols that map to ensembl gene IDs
phenotype_df = phenotype_df.loc[mapping_df.index,:]
phenotype_df.index = mapping_df['ensembl_gene_id']

In [72]:
phenotype_df.to_csv(phenotype_file, sep='\t')

In [73]:
pc_mat = sklearn.decomposition.PCA(n_components=n_pcs).fit_transform(phenotype_df.values.transpose())
pc_df = pd.DataFrame(data=pc_mat, index=phenotype_df.columns, columns=['PC{}'.format(x) for x in range(1,n_pcs+1)])

pc_df.to_csv(covariates_file, sep='\t')

In [74]:
# noise_scaling_vector = [1/float(x) for x in cell_count_ds.tolist()]
# noise_matrix = np.diag(noise_scaling_vector)
# noise_matrix_df = pd.DataFrame(data=noise_matrix, index=cell_count_ds.index, columns=cell_count_ds.index)

In [51]:
# noise_matrix_df.to_csv(noise_matrix_file, sep='\t')
# kinship_file = noise_matrix_file

In [27]:
number_of_permutations = '1000'
minor_allele_frequency = '0.05'
hwe = '0.000001'
call_rate = '1'
window_size = '250000'
block_size = '15000'


config_dict = {'af': annotation_file,
               'pf': phenotype_file,
               'cf': covariates_file,
#                'kf': kinship_file,
               'smf': sample_mapping_file,
               'plink': genotypes_file,
               'maf': minor_allele_frequency,
               'hwe': hwe,
               'np': number_of_permutations,
               'cr': call_rate,
               'w': window_size,
               'bs': block_size,
               'chunk_file': chunk_file
}

# write config to a file
qtl_config_utils.write_config(config_dict, config_file)

NameError: name 'qtl_config_utils' is not defined