In [2]:
import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'pandas'

In [None]:
def remove_nan_from_set(s):
    """Remove null instances from set
    
    :param s: selected set
    :return: set with no null values
    """
    return {x for x in s if pd.notna(x)}

In [None]:
def read_subset_genes_csv(location, source= str):
    """Create a df of primary gene symbol- alias symbol pairs

    :param location: file location
    :param source: representation in the source of gene records
    return: a df of gene records
    """
    
    subset_genes_xxxx_df = pd.read_csv(
        location, index_col=[0],dtype={"NCBI_ID": str,"HGNC_ID":str})
    subset_genes_xxxx_df["source"] = str(source)
    subset_genes_xxxx_df['gene_symbol'] = subset_genes_xxxx_df['gene_symbol'].str.upper()
    subset_genes_xxxx_df['alias_symbol'] = subset_genes_xxxx_df['alias_symbol'].str.upper()
    return subset_genes_xxxx_df

In [None]:
def make_col_ortholog_match(recording_df, source_df, animal= str):
    """Check for ortholog matches in the primary gene symbol- alias symbol pairs. 
    Adds a T/F column for each pair. T if the alias is an ortholog from the specified animal and F if not

    :param recording_df: df that contains the primary gene symbol- alias symbol pairs
    :param source_df: df that contains the orthologs and their associated human genes
    :param animal: the animal from with the orthologs are being checked
    return: the number of primary gene symbol- alias symbol pairs where the alias is an ortholog from the specified animal
    """
    
    recording_df[f'{animal} Match'] = recording_df.apply(lambda row: 
                            any((source_df['Gene name'] == row['gene_symbol']) 
                                & 
                                (source_df[f'{animal} gene name'] == row['alias_symbol'])), axis=1)
    animal_df = recording_df[recording_df[f'{animal} Match']]
    return len(animal_df)

## Download gene records from ENSG, HGNC, and NCBI 

#### This subset file was created in the alias-primary collision analysis notebook by the following modifications:
 - Gene records with no aliases were removed.
 - Primary gene symbol- alias symbol pairs where the alias was an exact match to the primary symbol were removed.
 - Primary gene symbol- alias symbol pairs that were duplicated were removed.

In [None]:
subset_genes_ensg_df = read_subset_genes_csv("created_files/subset_genes_ensg_df.csv", "ENSG")
subset_genes_hgnc_df = read_subset_genes_csv("created_files/subset_genes_hgnc_df.csv", "HGNC")
subset_genes_ncbi_df = read_subset_genes_csv("created_files/subset_genes_ncbi_df.csv", "NCBI")

## Combine data from all sources

In [None]:
subset_genes_df = pd.concat([subset_genes_ensg_df, subset_genes_hgnc_df, subset_genes_ncbi_df], axis=0)
subset_genes_df

Unnamed: 0,ENSG_ID,gene_symbol,alias_symbol,HGNC_ID,NCBI_ID,source
0,ENSG00000210049,MT-TF,MTTF,7481,,ENSG
1,ENSG00000210049,MT-TF,TRNF,7481,,ENSG
2,ENSG00000211459,MT-RNR1,12S,7470,,ENSG
3,ENSG00000211459,MT-RNR1,MOTS-C,7470,,ENSG
4,ENSG00000211459,MT-RNR1,MTRNR1,7470,,ENSG
...,...,...,...,...,...,...
190961,,GLTC1,GLTC,56861,131840634,NCBI
193342,,GABRA6-AS1,ARBAG,40248,132532400,NCBI
193377,,LNCARGI,ARGI,56890,133395150,NCBI
193378,,MLDHR,MP31,55481,133834869,NCBI


## Group the associated data by primary gene symbol- alias symbol pairs

#### This will ensure that there are no duplicate primary gene symbol- alias symbol pairs as well as preserving in which sources these pairs occur

In [None]:
subset_genes_df = subset_genes_df.groupby(['gene_symbol',"alias_symbol"], as_index=False).agg({
    "HGNC_ID": lambda x: set(x),
    'ENSG_ID': lambda x: set(x),
    'NCBI_ID': lambda x: set(x),
    "source": lambda x: set(x)
})
subset_genes_df

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source
0,A-GAMMA3'E,A-GAMMA-E,{nan},{nan},{109951028},{NCBI}
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI}
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI}
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI}
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI}
...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}"
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}"
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG, NCBI}"
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{HGNC, ENSG, NCBI}"


In [None]:
subset_genes_df['NCBI_ID'] = subset_genes_df['NCBI_ID'].apply(remove_nan_from_set)
subset_genes_df['ENSG_ID'] = subset_genes_df['ENSG_ID'].apply(remove_nan_from_set)
subset_genes_df['HGNC_ID'] = subset_genes_df['HGNC_ID'].apply(remove_nan_from_set)

# Ortholog Analysis

## Download an Ensembl Biomart export file with the Gene Name and the Ortholog Gene Name

In [None]:
mur_dros_ortho_df = pd.read_csv(
    "downloaded_files/ensg_mart_export_dros_murin_ortho.txt", sep=",", index_col=[0])
mur_dros_ortho_df

Unnamed: 0_level_0,Drosophila melanogaster (Fruit fly) gene name,Drosophila melanogaster (Fruit fly) gene stable ID,Mouse gene stable ID,Mouse gene name,Gene name
Gene stable ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000210049,,,,,MT-TF
ENSG00000211459,,,,,MT-RNR1
ENSG00000210077,,,,,MT-TV
ENSG00000210082,,,,,MT-RNR2
ENSG00000209082,,,,,MT-TL1
...,...,...,...,...,...
ENSG00000232679,,,,,LINC01705
ENSG00000200033,,,ENSMUSG00000088001,Gm22883,RNU6-403P
ENSG00000228437,,,,,LINC02474
ENSG00000229463,,,,,LYST-AS1


## Make all of the gene symbols all caps

Different species follow different gene nomenclature conventions. <br>
For example, mouse genes have the first letter capitalized but the rest lowercase.<br>
They need to be all caps for matching

In [None]:
mur_dros_ortho_df["Gene name"] = mur_dros_ortho_df["Gene name"].str.upper()
mur_dros_ortho_df["Drosophila melanogaster (Fruit fly) gene name"] = mur_dros_ortho_df["Drosophila melanogaster (Fruit fly) gene name"].str.upper()
mur_dros_ortho_df["Mouse gene name"] = mur_dros_ortho_df["Mouse gene name"].str.upper()

## Match aliases to orthologs!

### Drosophila melanogaster

In [None]:
ortholog_match_subset_genes_df = subset_genes_df.copy()

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, mur_dros_ortho_df,"Drosophila melanogaster (Fruit fly)")

### Mouse

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, mur_dros_ortho_df,"Mouse")

In [None]:
ortholog_match_subset_genes_df[(ortholog_match_subset_genes_df['DM Match']) & (ortholog_match_subset_genes_df['Mouse Match'])]

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,DM Match,Mouse Match
9418,CCZ1B,CCZ1,{21717},{ENSG00000146574},{221960},{NCBI},True,True
18951,EIF1AX,EIF1A,{3250},{ENSG00000173674},{1964},"{ENSG, NCBI}",True,True
59611,RAB7A,RAB7,{9788},{ENSG00000075785},{7879},"{ENSG, NCBI}",True,True


## Abingdon Island Giant Tortoise

In [None]:
tortoise_ostrich_amouse_ortho_df = pd.read_csv(
    "downloaded_files/tortoise_ostrich_amouse_export.txt", sep=",")

Unnamed: 0,Gene name,Abingdon island giant tortoise gene name,African ostrich gene name,Algerian mouse gene name
0,MT-TF,,,
1,MT-RNR1,,,
2,MT-TV,,,
3,MT-RNR2,,,
4,MT-TL1,,,
...,...,...,...,...
87088,LINC01705,,,
87089,RNU6-403P,,,Gm22883
87090,LINC02474,,,
87091,LYST-AS1,,,


In [None]:
tortoise_ostrich_amouse_ortho_df["Gene name"] = tortoise_ostrich_amouse_ortho_df["Gene name"].str.upper()
tortoise_ostrich_amouse_ortho_df["Abingdon island giant tortoise gene name"] = tortoise_ostrich_amouse_ortho_df["Abingdon island giant tortoise gene name"].str.upper()
tortoise_ostrich_amouse_ortho_df["African ostrich gene name"] = tortoise_ostrich_amouse_ortho_df["African ostrich gene name"].str.upper()
tortoise_ostrich_amouse_ortho_df["Algerian mouse gene name"] = tortoise_ostrich_amouse_ortho_df["Algerian mouse gene name"].str.upper()

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, tortoise_ostrich_amouse_ortho_df,"Abingdon island giant tortoise")

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,DM Match,Mouse Match,Abingdon island giant tortoise Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False
...,...,...,...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG, NCBI}",False,False,False
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{HGNC, ENSG, NCBI}",True,False,False


## African Ostrich

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, tortoise_ostrich_amouse_ortho_df,"African ostrich")

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,DM Match,Mouse Match,Abingdon island giant tortoise Match,African ostrich Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG, NCBI}",False,False,False,False
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{HGNC, ENSG, NCBI}",True,False,False,False


## Algerian Mouse

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, tortoise_ostrich_amouse_ortho_df,"Algerian mouse")

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,DM Match,Mouse Match,Abingdon island giant tortoise Match,African ostrich Match,Algerian mouse Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False,False
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False,False
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG, NCBI}",False,False,False,False,False
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{HGNC, ENSG, NCBI}",True,False,False,False,False


## Alpaca

In [None]:
Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df = pd.read_csv(
    "downloaded_files/Alpaca_Amarmot_Amolly_Abison_Ablackbear_mart_export.txt", sep=",")

Unnamed: 0,Gene name,Alpaca gene name,Alpine marmot gene name,Amazon molly gene name,American bison gene name,American black bear gene name
0,MT-TF,,,,,
1,MT-RNR1,,,,,
2,MT-TV,,,,,
3,MT-RNR2,,,,,
4,MT-TL1,,,,,
...,...,...,...,...,...,...
111858,LINC01705,,,,,
111859,RNU6-403P,,,,,
111860,LINC02474,,,,,
111861,LYST-AS1,,,,,


In [None]:
Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df["Gene name"] = Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df["Gene name"].str.upper()
Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df["Alpaca gene name"] = Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df["Alpaca gene name"].str.upper()
Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df["Alpine marmot gene name"] = Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df["Alpine marmot gene name"].str.upper()
Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df["Amazon molly gene name"] = Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df["Amazon molly gene name"].str.upper()
Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df["American bison gene name"] = Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df["American bison gene name"].str.upper()
Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df["American black bear gene name"] = Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df["American black bear gene name"].str.upper()

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df,"Alpaca")

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,DM Match,Mouse Match,Abingdon island giant tortoise Match,African ostrich Match,Algerian mouse Match,Alpaca Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False,False,False
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False,False,False
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG, NCBI}",False,False,False,False,False,False
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{HGNC, ENSG, NCBI}",True,False,False,False,False,False


## Alpine Marmot

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df,"Alpine marmot")

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,DM Match,Mouse Match,Abingdon island giant tortoise Match,African ostrich Match,Algerian mouse Match,Alpaca Match,Alpine marmot Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False,False,False,False
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False,False,False,False
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG, NCBI}",False,False,False,False,False,False,False
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{HGNC, ENSG, NCBI}",True,False,False,False,False,False,False


## Amazon Molly

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df,"Amazon molly")
subset_genes_df

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,DM Match,Mouse Match,Abingdon island giant tortoise Match,African ostrich Match,Algerian mouse Match,Alpaca Match,Alpine marmot Match,Amazon molly Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False,False,False,False,False
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False,False,False,False,False
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG, NCBI}",False,False,False,False,False,False,False,False
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{HGNC, ENSG, NCBI}",True,False,False,False,False,False,False,False


## American Bison

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df,"American bison")

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,DM Match,Mouse Match,Abingdon island giant tortoise Match,African ostrich Match,Algerian mouse Match,Alpaca Match,Alpine marmot Match,Amazon molly Match,American bison Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False,False,False,False,False,False
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False,False,False,False,False,False
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG, NCBI}",False,False,False,False,False,False,False,False,False
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{HGNC, ENSG, NCBI}",True,False,False,False,False,False,False,False,False


## American Black Bear

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df,"American black bear")

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,DM Match,Mouse Match,Abingdon island giant tortoise Match,African ostrich Match,Algerian mouse Match,Alpaca Match,Alpine marmot Match,Amazon molly Match,American bison Match,American black bear Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,False,False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False,False,False,False,False,False,False
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False,False,False,False,False,False,False
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG, NCBI}",False,False,False,False,False,False,False,False,False,False
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{HGNC, ENSG, NCBI}",True,False,False,False,False,False,False,False,False,False


## American Mink

In [None]:
Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df = pd.read_csv(
    "downloaded_files/Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue.txt", sep="\t")
Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df

Unnamed: 0,Gene name,American mink gene name,Arabian camel gene name,Arctic ground squirrel gene name,Argentine black and white tegu gene name,Armadillo gene name,Asian bonytongue gene name
0,MT-TF,,,,,,
1,MT-RNR1,,,,,,
2,MT-TV,,,,,,
3,MT-RNR2,,,,,,
4,MT-TL1,,,,,,
...,...,...,...,...,...,...,...
125912,LINC01705,,,,,,
125913,RNU6-403P,,,,,,
125914,LINC02474,,,,,,
125915,LYST-AS1,,,,,,


In [None]:
Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df["Gene name"] = Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df["Gene name"].str.upper()
Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df["American mink gene name"] = Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df["American mink gene name"].str.upper()
Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df["Arabian camel gene name"] = Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df["Arabian camel gene name"].str.upper()
Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df["Arctic ground squirrel gene name"] = Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df["Arctic ground squirrel gene name"].str.upper()
Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df["Argentine black and white tegu gene name"] = Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df["Argentine black and white tegu gene name"].str.upper()
Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df["Armadillo gene name"] = Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df["Armadillo gene name"].str.upper()
Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df["Asian bonytongue gene name"] = Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df["Asian bonytongue gene name"].str.upper()

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df,"American mink")

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,DM Match,Mouse Match,Abingdon island giant tortoise Match,African ostrich Match,Algerian mouse Match,Alpaca Match,Alpine marmot Match,Amazon molly Match,American bison Match,American black bear Match,American mink Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,False,False,False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False,False,False,False,False,False,False,False
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}",False,False,False,False,False,False,False,False,False,False,False
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG, NCBI}",False,False,False,False,False,False,False,False,False,False,False
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{HGNC, ENSG, NCBI}",True,False,False,False,False,False,False,False,False,False,False


## Arabian Camel

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df,"Arabian camel")

13

## Arctic Ground Squirrel

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df,"Arctic ground squirrel")

1

## Argentine Black and White Tegu

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df,"Argentine black and white tegu")

5

## Armadillo

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df,"Armadillo")

17

## Asian Bonytongue

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df,"Asian bonytongue")

91

## Atlantic Cod

In [None]:
Atlanticcod_Atlanticherring_Atlanticsalmon_Australiancrocodile_Ballanwrasse_ortho_df = pd.read_csv(
    "downloaded_files/Atlanticcod_Atlanticherring_Atlanticsalmon_Australiancrocodile_Ballanwrasse.txt", sep=",")
Atlanticcod_Atlanticherring_Atlanticsalmon_Australiancrocodile_Ballanwrasse_ortho_df

Unnamed: 0,Gene name,Barramundi perch gene name,Beluga whale gene name,Bicolor damselfish gene name,Black snub-nosed monkey gene name,Blue whale gene name
0,MT-TF,,,,,
1,MT-RNR1,,,,,
2,MT-TV,,,,,
3,MT-RNR2,,,,,
4,MT-TL1,,,,,
...,...,...,...,...,...,...
115851,LINC01705,,,,,
115852,RNU6-403P,,,,,
115853,LINC02474,,,,,
115854,LYST-AS1,,,,,


In [None]:
Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df = pd.read_csv(
    "downloaded_files/Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale.txt", sep=",")
Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df

Unnamed: 0,Gene name,Barramundi perch gene name,Beluga whale gene name,Bicolor damselfish gene name,Black snub-nosed monkey gene name,Blue whale gene name
0,MT-TF,,,,,
1,MT-RNR1,,,,,
2,MT-TV,,,,,
3,MT-RNR2,,,,,
4,MT-TL1,,,,,
...,...,...,...,...,...,...
115851,LINC01705,,,,,
115852,RNU6-403P,,,,,
115853,LINC02474,,,,,
115854,LYST-AS1,,,,,


In [None]:
Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df["Gene name"] = Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df["Gene name"].str.upper()
Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df["Barramundi perch gene name"] = Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df["Barramundi perch gene name"].str.upper()
Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df["Beluga whale gene name"] = Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df["Beluga whale gene name"].str.upper()
Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df["Bicolor damselfish gene name"] = Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df["Bicolor damselfish gene name"].str.upper()
Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df["Black snub-nosed monkey gene name"] = Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df["Black snub-nosed monkey gene name"].str.upper()
Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df["Blue whale gene name"] = Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df["Blue whale gene name"].str.upper()

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df,"Barramundi perch")

95

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df,"Beluga whale")

228

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df,"Bicolor damselfish")

93

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df,"Black snub-nosed monkey")

14

In [None]:
make_col_ortholog_match(ortholog_match_subset_genes_df, Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df,"Blue whale")

7

## Trying to load in a file about gene families?

In [None]:
from Bio import AlignIO
import pandas as pd

# Load the EMF file
input_file = "downloaded_files/Compara.112.ncrna_default.nh.emf"  # Update with the extracted filename
output_csv = "output.csv"
output_tsv = "output.tsv"

# Read the EMF file
alignment = AlignIO.read(input_file, "emf")

# Convert to a DataFrame
data = []
for record in alignment:
    data.append([record.id] + record.seq)

df = pd.DataFrame(data)

# Save to CSV
df.to_csv(output_csv, index=False)
# Save to TSV
df.to_csv(output_tsv, sep='\t', index=False)

print("Conversion complete!")


ValueError: Unknown format 'emf'

In [None]:
import pandas as pd

# Initialize lists to hold data
data = []

# Open and read the EMF file
with open("downloaded_files/Compara.112.ncrna_default.nh.emf", "r") as infile:
    for line in infile:
        # Skip comments or empty lines if necessary
        if line.startswith("#") or not line.strip():
            continue
        
        # Split line into components (adjust this based on the actual format)
        components = line.strip().split()
        
        # Append the components to the data list
        data.append(components)

# Create a DataFrame from the data list
df = pd.DataFrame(data)

# Save to CSV
df.to_csv("output.csv", index=False)
# Save to TSV
df.to_csv("output.tsv", sep='\t', index=False)

print("Conversion complete!")

Conversion complete!


In [None]:
output_df = pd.read_csv(
    "output.csv", sep=",")
output_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,SEQ,cavia_porcellus,ENSCPOT00000029270,DS562967.1,4315635,4315772,-1,ENSCPOG00000028438,
1,SEQ,cavia_porcellus,ENSCPOT00000025916,DS562884.1,16918525,16918605,1,ENSCPOG00000025829,
2,SEQ,cervus_hanglu_yarkandensis,ENSCHYT00000027762,23,34283733,34283858,1,ENSCHYG00000016461,
3,SEQ,canis_lupus_dingo,ENSCAFT00020034875,QKWQ01000243.1,12158129,12158261,-1,ENSCAFG00020023588,SNORA61
4,SEQ,canis_lupus_familiaris,ENSCAFT00845017353,2,72475732,72475864,1,ENSCAFG00845009868,
...,...,...,...,...,...,...,...,...,...
372964,SEQ,mus_caroli,MGP_CAROLIEiJ_T0095453,8,39879594,39879690,1,MGP_CAROLIEiJ_G0035515,
372965,SEQ,mus_spretus,MGP_SPRETEiJ_T0098898,8,44218146,44218242,1,MGP_SPRETEiJ_G0036433,
372966,DATA,,,,,,,,
372967,"((((((((ENSBGRT00000042759:0,ENSBGRT0000004275...",,,,,,,,


## Summarize

In [None]:
ortholog_match_subset_genes_df.to_csv(
    "created_files/ortholog_alias_match_df.csv", index=True
)

In [None]:
ortholog_match_subset_genes_df.loc[
    ortholog_match_subset_genes_df["alias_symbol"] == "CALMBP1"
]

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,DM Match,Mouse Match,Abingdon island giant tortoise Match,African ostrich Match,...,Arabian camel Match,Arctic ground squirrel Match,Argentine black and white tegu Match,Armadillo Match,Asian bonytongue Match,Barramundi perch Match,Beluga whale Match,Bicolor damselfish Match,Black snub-nosed monkey Match,Blue whale Match
4476,ASPM,CALMBP1,{19048},{ENSG00000066279},{259266},"{HGNC, ENSG, NCBI}",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


how many gene alias pairs dont match any orthologs

In [None]:
no_ortholog_match_subset_genes_df = ortholog_match_subset_genes_df[~ortholog_match_subset_genes_df.select_dtypes(include='bool').any(axis=1)]

In [None]:
no_ortholog_match_subset_genes_df.head(20)

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,DM Match,Mouse Match,Abingdon island giant tortoise Match,African ostrich Match,...,Arabian camel Match,Arctic ground squirrel Match,Argentine black and white tegu Match,Armadillo Match,Asian bonytongue Match,Barramundi perch Match,Beluga whale Match,Bicolor damselfish Match,Black snub-nosed monkey Match,Blue whale Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,A1BG-AS1,A1BG-AS,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,A1BG-AS1,A1BGAS,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,A1BG-AS1,FLJ23569,{37133},{ENSG00000268895},{503538},"{HGNC, ENSG}",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,A1BG-AS1,NCRNA00181,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,A1CF,ACF,{24086},{ENSG00000148584},{29974},"{HGNC, ENSG, NCBI}",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# HGNC Previous Symbol Analysis

## Download the HGNC custom download including the gene symbol, ID, and previous symbols

In [None]:
hgnc_previous_symbols_df = pd.read_csv(
    "downloaded_files/HGNC_previous_symbols20241010.txt", sep="\t")
hgnc_previous_symbols_df

Unnamed: 0,HGNC ID,Approved symbol,Previous symbols
0,HGNC:5,A1BG,
1,HGNC:37133,A1BG-AS1,"NCRNA00181, A1BGAS, A1BG-AS"
2,HGNC:24086,A1CF,
3,HGNC:6,A1S9T,
4,HGNC:7,A2M,
...,...,...,...
49077,HGNC:25820,ZYG11B,ZYG11
49078,HGNC:13200,ZYX,
49079,HGNC:51695,ZYXP1,
49080,HGNC:29027,ZZEF1,


## Remove all genes with no previous symbols

In [None]:
hgnc_previous_symbols_df = hgnc_previous_symbols_df.dropna(subset=["Previous symbols"])
hgnc_previous_symbols_df

Unnamed: 0,HGNC ID,Approved symbol,Previous symbols
1,HGNC:37133,A1BG-AS1,"NCRNA00181, A1BGAS, A1BG-AS"
6,HGNC:23336,A2ML1,CPAMD9
9,HGNC:8,A2MP1,A2MP
12,HGNC:30005,A3GALT2,A3GALT2P
13,HGNC:18149,A4GALT,P1
...,...,...,...
49063,HGNC:23528,ZSWIM8,KIAA0913
49065,HGNC:34495,ZSWIM9,C19orf68
49066,HGNC:21224,ZUP1,"C6orf113, ZUFSP"
49071,HGNC:13197,ZWS1,ZWS


## Explode the previous symbols so that it is only one symbol per row

In [None]:
hgnc_previous_symbols_df["previous_symbol"] = hgnc_previous_symbols_df['Previous symbols'].str.split(',').apply(lambda x: [s.strip() for s in x])
hgnc_previous_symbols_df = hgnc_previous_symbols_df.explode('previous_symbol')
hgnc_previous_symbols_df = hgnc_previous_symbols_df.drop(columns=['Previous symbols'])
hgnc_previous_symbols_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hgnc_previous_symbols_df["previous_symbol"] = hgnc_previous_symbols_df['Previous symbols'].str.split(',').apply(lambda x: [s.strip() for s in x])


Unnamed: 0,HGNC ID,Approved symbol,previous_symbol
1,HGNC:37133,A1BG-AS1,NCRNA00181
1,HGNC:37133,A1BG-AS1,A1BGAS
1,HGNC:37133,A1BG-AS1,A1BG-AS
6,HGNC:23336,A2ML1,CPAMD9
9,HGNC:8,A2MP1,A2MP
...,...,...,...
49065,HGNC:34495,ZSWIM9,C19orf68
49066,HGNC:21224,ZUP1,C6orf113
49066,HGNC:21224,ZUP1,ZUFSP
49071,HGNC:13197,ZWS1,ZWS


## Make all of the gene symbols all caps

Different species follow different gene nomenclature conventions. <br>
For example, mouse genes have the first letter capitalized but the rest lowercase.<br>
They need to be all caps for matching

In [None]:
hgnc_previous_symbols_df["Approved symbol"] = hgnc_previous_symbols_df["Approved symbol"].str.upper()
hgnc_previous_symbols_df["previous_symbol"] = hgnc_previous_symbols_df["previous_symbol"].str.upper()

## Match aliases to previous symbols!

In [None]:
previous_symbol_match_subset_genes_df = subset_genes_df.copy()
previous_symbol_match_subset_genes_df

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI}
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI}
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI}
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI}
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI}
...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}"
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}"
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG, NCBI}"
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{HGNC, ENSG, NCBI}"


In [None]:
previous_symbol_match_subset_genes_df["Previous Symbol Match"] = previous_symbol_match_subset_genes_df.apply(lambda row: 
                        any((hgnc_previous_symbols_df['Approved symbol'] == row['gene_symbol']) 
                            & 
                            (hgnc_previous_symbols_df["previous_symbol"] == row['alias_symbol'])), axis=1)
previous_symbol_match_df = previous_symbol_match_subset_genes_df[previous_symbol_match_subset_genes_df["Previous Symbol Match"]]
previous_symbol_match_df

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Previous Symbol Match
5,A1BG-AS1,A1BG-AS,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",True
6,A1BG-AS1,A1BGAS,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",True
8,A1BG-AS1,NCRNA00181,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",True
18,A2ML1,CPAMD9,{23336},{ENSG00000166535},{144568},"{ENSG, NCBI}",True
22,A2MP1,A2MP,{8},"{ENSG00000291190, ENSG00000256069}",{3},"{ENSG, NCBI}",True
...,...,...,...,...,...,...,...
86738,ZSWIM8,KIAA0913,{23528},{ENSG00000214655},{23053},"{ENSG, NCBI}",True
86739,ZSWIM9,C19ORF68,{34495},{ENSG00000185453},{374920},"{ENSG, NCBI}",True
86741,ZUP1,C6ORF113,{21224},{ENSG00000153975},{221302},"{ENSG, NCBI}",True
86744,ZUP1,ZUFSP,{21224},{ENSG00000153975},{221302},"{ENSG, NCBI}",True


In [None]:
previous_symbol_match_subset_genes_df.head(20)

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Previous Symbol Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False
5,A1BG-AS1,A1BG-AS,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",True
6,A1BG-AS1,A1BGAS,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",True
7,A1BG-AS1,FLJ23569,{37133},{ENSG00000268895},{503538},"{HGNC, ENSG}",False
8,A1BG-AS1,NCRNA00181,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",True
9,A1CF,ACF,{24086},{ENSG00000148584},{29974},"{HGNC, ENSG, NCBI}",False


# Clone Names Analysis

In [None]:
with open("downloaded_files/Conversion_table_FLJ(1).txt", 'r') as file:
    for i, line in enumerate(file):
        print(f"Line {i}: {line.strip()}")

In [None]:
clone_symbols_df = pd.read_csv(
    "downloaded_files/Conversion_table_FLJ(1).txt", 
    sep="\t", 
    skiprows=3
)
clone_symbols_df

Unnamed: 0,Accesion No,FLJ ID,Clone ID,Sequence ID,Another Sequence ID
0,AK075326,PSEC0001(FLJ91001),NT2RM1000066,C-NT2RM1000066,
1,AK172724,PSEC0002(FLJ91002),NT2RM1000295,C-NT2RM1000295,
2,AK075327,PSEC0003(FLJ91003),NT2RM1000361,C-NT2RM1000361,
3,AK075328,PSEC0004(FLJ91004),NT2RM1000558,C-NT2RM1000558,
4,AK075329,PSEC0005(FLJ91005),NT2RM1000566,C-NT2RM1000566,
...,...,...,...,...,...
30321,AK057825,FLJ25096,CBR00778,C-CBR00778,
30322,AK000479,FLJ20472,KAT07023,C-KAT07023,
30323,AK125921,FLJ43933,TESTI4013685,C-TESTI4013685,
30324,AK125959,FLJ43971,TESTI4017901,C-TESTI4017901,


In [None]:
extracted_ids = clone_symbols_df['FLJ ID'].str.extract(r'([^()]+)\((.+?)\)')

# Create a DataFrame to hold the results
result_rows = []

# Iterate over the original DataFrame and extract values
for index, row in clone_symbols_df.iterrows():
    flj_id = row['FLJ ID']
    if pd.notnull(extracted_ids.iloc[index, 0]):  # Check if there is a match
        result_rows.append({'Accesion No': row['Accesion No'], 'ID': extracted_ids.iloc[index, 0]})  # First part
        result_rows.append({'Accesion No': row['Accesion No'], 'ID': extracted_ids.iloc[index, 1]})  # Second part
    else:
        result_rows.append({'Accesion No': row['Accesion No'], 'ID': flj_id})  # Keep the original if no match

# Create the final result DataFrame
result_df = pd.DataFrame(result_rows)

# Display the result
result_df

Unnamed: 0,Accesion No,ID
0,AK075326,PSEC0001
1,AK075326,FLJ91001
2,AK172724,PSEC0002
3,AK172724,FLJ91002
4,AK075327,PSEC0003
...,...,...
30581,AK057825,FLJ25096
30582,AK000479,FLJ20472
30583,AK125921,FLJ43933
30584,AK125959,FLJ43971


In [None]:
result_df.loc[
    result_df["ID"] == "FLJ25179"
]

Unnamed: 0,Accesion No,ID
11650,AK057908,FLJ25179


In [None]:
result_df["ID"] = result_df["ID"].str.strip()

In [None]:
previous_symbol_match_subset_genes_df = subset_genes_df.copy()
previous_symbol_match_subset_genes_df

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI}
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI}
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI}
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI}
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI}
...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}"
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}"
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG, NCBI}"
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{HGNC, ENSG, NCBI}"


In [None]:
previous_symbol_match_subset_genes_df["Clone Symbol Match"] = previous_symbol_match_subset_genes_df['alias_symbol'].isin(result_df['ID'])

df = previous_symbol_match_subset_genes_df[previous_symbol_match_subset_genes_df["Clone Symbol Match"]]
previous_symbol_match_subset_genes_df.head(20)

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Clone Symbol Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False
5,A1BG-AS1,A1BG-AS,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",False
6,A1BG-AS1,A1BGAS,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",False
7,A1BG-AS1,FLJ23569,{37133},{ENSG00000268895},{503538},"{HGNC, ENSG}",True
8,A1BG-AS1,NCRNA00181,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",False
9,A1CF,ACF,{24086},{ENSG00000148584},{29974},"{HGNC, ENSG, NCBI}",False


In [None]:
df

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Clone Symbol Match
7,A1BG-AS1,FLJ23569,{37133},{ENSG00000268895},{503538},"{HGNC, ENSG}",True
19,A2ML1,FLJ25179,{23336},{ENSG00000166535},{144568},"{HGNC, ENSG}",True
45,AACS,FLJ12389,{21298},{ENSG00000081760},{65985},"{HGNC, ENSG}",True
56,AAGAB,FLJ11506,{25662},{ENSG00000103591},{79719},"{HGNC, ENSG}",True
65,AAMDC,FLJ21035,{30205},{ENSG00000087884},{28971},"{HGNC, ENSG}",True
...,...,...,...,...,...,...,...
86729,ZSWIM4,FLJ12221,{25704},"{ENSG00000132003, ENSG00000288360}",{65249},"{HGNC, ENSG}",True
86747,ZWILCH,FLJ10036,{25468},{ENSG00000174442},{55055},"{HGNC, ENSG}",True
86760,ZXDC,FLJ13861,{28160},{ENSG00000070476},{79364},"{HGNC, ENSG}",True
86764,ZYG11B,FLJ13456,{25820},{ENSG00000162378},{79699},"{HGNC, ENSG}",True


# Gene Family Analysis

In [None]:
hgnc_genefamilies_df = pd.read_csv(
    "downloaded_files/hgnc_genefamily.csv", sep=",")
hgnc_genefamilies_df

NameError: name 'pd' is not defined