# NCBI id to Ensemble id

In [2]:
import csv
import pandas as pd
import numpy as np
from Bio import Entrez

In [3]:
xls = pd.read_excel("Negative_Positive_genes.xlsx", sheet_name=1)
NCBI_id = xls["gene id"].astype(str).to_list()

In [4]:
print(f'There are {len(NCBI_id)} negative genes')

There are 1189 negative genes


In [5]:
xls.head(10)

Unnamed: 0,gene id,symbol
0,10002,NR2E3
1,10008,KCNE3
2,100133941,CD24
3,10019,SH2B3
4,10020,GNE
5,10021,HCN4
6,10046,MAMLD1
7,100506658,OCLN
8,10060,ABCC9
9,10083,USH1C


In [6]:
# check null values
xls.isnull().sum()

gene id    0
symbol     0
dtype: int64

pip install mygene


Mygene looks like a promising package to use for this task.  
- https://docs.mygene.info/projects/mygene-py/en/latest/#optional-dependencies  package documentation  
- https://www.kaggle.com/code/alexandervc/genes-info-with-mygene-python-package kaggle tutorial

In [37]:
import mygene

mg = mygene.MyGeneInfo()

g = mg.getgene(3507)
print( g.keys() )

dict_keys(['AllianceGenome', 'HGNC', 'MIM', '_id', '_version', 'accession', 'alias', 'clingen', 'entrezgene', 'generif', 'go', 'map_location', 'name', 'other_names', 'pantherdb', 'pathway', 'pharmgkb', 'pharos', 'reagent', 'refseq', 'reporter', 'retired', 'summary', 'symbol', 'taxid', 'type_of_gene', 'umls', 'unigene', 'wikipedia'])


In [38]:
print( g['ensembl'] )
print( len(g['ensembl']) )

KeyError: 'ensembl'

In [32]:
print(type(g['ensembl']))


<class 'list'>


In [35]:
if isinstance(g['ensembl'], list):
    for entry in g['ensembl']:
        print(entry['gene'])
else:
    print(g['ensembl']['gene'])

ENSG00000197822
ENSG00000273814


## 1 create a dataset with NCBI id and Ensemble id

In [39]:
NCBI_dict = {}

for gene in NCBI_id:
    try:
        g = mg.getgene(gene)
        if isinstance(g['ensembl'], list):
            gene_list = []
            for entry in g['ensembl']:
                gene_list.append(entry['gene'])
            NCBI_dict[gene] = gene_list
        else:
            NCBI_dict[gene] = g['ensembl']['gene']
    except:
        print(f'gene {gene} not found')
        NCBI_dict[gene] = 'not found'
        continue
    



gene 3507 not found
gene 6315 not found
gene 7012 not found
{'10002': 'ENSG00000278570', '10008': 'ENSG00000175538', '100133941': 'ENSG00000272398', '10019': 'ENSG00000111252', '10020': 'ENSG00000159921', '10021': 'ENSG00000138622', '10046': 'ENSG00000013619', '100506658': ['ENSG00000197822', 'ENSG00000273814'], '10060': 'ENSG00000069431', '10083': 'ENSG00000006611', '10117': 'ENSG00000132464', '10133': 'ENSG00000123240', '10142': 'ENSG00000127914', '10157': 'ENSG00000008311', '10166': 'ENSG00000102743', '10195': 'ENSG00000214160', '10210': 'ENSG00000197579', '1028': ['ENSG00000129757', 'ENSG00000273707'], '1029': 'ENSG00000147889', '10312': 'ENSG00000110719', '10397': 'ENSG00000104419', '10436': 'ENSG00000126749', '10456': 'ENSG00000143575', '10466': ['ENSG00000164597', 'ENSG00000284369'], '10483': 'ENSG00000101310', '10491': 'ENSG00000170275', '1050': 'ENSG00000245848', '10516': 'ENSG00000140092', '10535': 'ENSG00000104889', '10555': 'ENSG00000169692', '10558': 'ENSG00000090054', '10

In [44]:
NCBI_df = pd.DataFrame.from_dict(NCBI_dict, orient='index')
display(NCBI_df.head(10))
print(NCBI_df.shape)


Unnamed: 0,0
10002,ENSG00000278570
10008,ENSG00000175538
100133941,ENSG00000272398
10019,ENSG00000111252
10020,ENSG00000159921
10021,ENSG00000138622
10046,ENSG00000013619
100506658,"[ENSG00000197822, ENSG00000273814]"
10060,ENSG00000069431
10083,ENSG00000006611


(1189, 1)


## Compare the NCBI dataset with the Ensemble dataset

In [None]:
with open("transcript_seq.csv", "r") as f:
    csv_reader = csv.reader(f)
    next(csv_reader)  # Skip the header row

    for row in csv_reader: