In [78]:
import pandas as pd
import numpy as np
import time

In [62]:
# Load the GPCR table, keep only the rows that have an HGNC symbol, and
# collect the distinct symbols (in order of first appearance).
gpcrs = pd.read_csv('../data/gpcr_gene_lists/gpcrs.csv', index_col=0)
has_symbol = gpcrs['HGNC_symbol'].notna()
gpcrs = gpcrs[has_symbol]
gpcr_gene_symbols = gpcrs['HGNC_symbol'].unique()

In [63]:
# Write the GPCR gene symbols to a text file, one symbol per line.
with open('../data/gene_symbols/gpcr_gene_symbols.txt', 'w') as f:
    for symbol in gpcr_gene_symbols:
        f.write(f'{symbol}\n')

In [87]:
# Draw a random control set of non-GPCR genes from the gene universe and write
# it to a text file, one symbol per line.
# NOTE: changing the seed or the sample size changes the control list, in which
# case the Biomart lookups for the control genes have to be redone.
CONTROL_SEED = 0          # arbitrary; fixed so that re-running gives the same control set
N_CONTROL_GENES = 500

universe = pd.read_csv('../data/gpcr_gene_lists/universe.csv',header=None)
all_gene_symbols = universe[0]

# Candidate pool: every universe symbol that is not a GPCR symbol. Missing
# values and repeated symbols are removed so that no 'nan' entry is written
# and every gene has the same chance of being drawn.
is_gpcr = all_gene_symbols.isin(gpcr_gene_symbols)
non_gpcr_genes = all_gene_symbols[~is_gpcr].dropna().drop_duplicates().tolist()

# replace=False samples without replacement: the default (replace=True) can
# pick the same gene several times, leaving fewer distinct control genes than
# requested. Raises ValueError if the pool holds fewer than N_CONTROL_GENES.
rng = np.random.default_rng(CONTROL_SEED)
control_gene_symbols = rng.choice(a=non_gpcr_genes, size=N_CONTROL_GENES, replace=False)

with open('../data/gene_symbols/control_gene_symbols.txt','w') as f:
    f.writelines([f'{x}\n' for x in control_gene_symbols])

Look up the GPCR and control gene symbols with Biomart (GRCh38 version) and download the Ensembl gene IDs as `../data/gene_symbols/gpcr_gene_symbols_ensembl_lookup.txt` and `../data/gene_symbols/control_gene_symbols_ensembl_lookup.txt`

In [90]:
# For each gene set, read the Biomart lookup table (tab-separated) and write
# its 'Gene stable ID' column to a text file, one ID per line.
lookup_and_output_paths = [
    ('../data/gene_symbols/gpcr_gene_symbols_ensembl_lookup.txt',
     '../data/gene_symbols/gpcr_ensembl_ids.txt'),
    ('../data/gene_symbols/control_gene_symbols_ensembl_lookup.txt',
     '../data/gene_symbols/control_ensembl_ids.txt'),
]
for lookup_path, ids_path in lookup_and_output_paths:
    df = pd.read_csv(lookup_path, sep='\t')
    with open(ids_path, 'w') as fid:
        for ensembl_id in df['Gene stable ID']:
            fid.write(f'{ensembl_id}\n')

Look up the Ensembl gene IDs with Biomart (GRCh37 version) and download the gene locations as `../data/gene_symbols/gpcr_gene_locations_Grch37.txt` and `../data/gene_symbols/control_gene_locations_Grch37.txt`

In [115]:
# Combine the GPCR and control gene locations, keep only genes on the standard
# chromosomes (1-22, X, Y), rename the columns and save the result as CSV.
chromosome_column = 'Chromosome/scaffold name'

# Read chromosome names as strings. With dtype inference, a file that happens
# to contain only numeric chromosome names is parsed as integers, and those
# rows would then never match the string names in `proper_chromosomes`.
gpcr_gene_locations = pd.read_csv(
    '../data/gene_symbols/gpcr_gene_locations_Grch37.txt',
    sep='\t',
    dtype={chromosome_column: str},
)
control_gene_locations = pd.read_csv(
    '../data/gene_symbols/control_gene_locations_Grch37.txt',
    sep='\t',
    dtype={chromosome_column: str},
)
gene_locations = pd.concat((gpcr_gene_locations, control_gene_locations))

proper_chromosomes = [str(x) for x in range(1,23)] + ['X','Y']
on_proper_chromosome = gene_locations[chromosome_column].isin(proper_chromosomes)

# Explicit copy so that renaming the columns below cannot affect `gene_locations`.
gene_locations_cleaned = gene_locations[on_proper_chromosome].copy()

# Columns are renamed by position, so this relies on the Biomart export having
# exactly these five columns in this order (TODO confirm against the export).
gene_locations_cleaned.columns = ['Ensembl_gene_id','gene_start_bp','gene_end_bp','chromosome','gene_name']
gene_locations_cleaned.to_csv('../data/gene_symbols/gene_locations_Grch37.csv')

Check whether any genes are missing after the lookup
- OPN1MW and RP11-673D15.8 are mapped to regions of chromosomes that could not be properly assembled in GRCh37, so they are removed by the chromosome filter

In [113]:
# Print every gene that is present in the combined Biomart results but was
# dropped by the chromosome filter. `gene_locations` still has the Biomart
# column names, whereas `gene_locations_cleaned` had its columns renamed, so
# the gene name column is 'gene_name' there.
retained_gene_names = set(gene_locations_cleaned['gene_name'])
for gene in gene_locations['Gene name'].unique():
    if gene not in retained_gene_names:
        print(gene)

RP11-673D15.8
OPN1MW
