In [2]:
import pandas as pd
import numpy as np

In [3]:
def remove_nan_from_set(s):
    """Remove null instances from set
    
    :param s: selected set
    :return: set with no null values
    """
    return {x for x in s if pd.notna(x)}

In [4]:
def read_subset_genes_csv(location, source= str):
    """Create a df of primary gene symbol- alias symbol pairs

    :param location: file location
    :param source: representation in the source of gene records
    return: a df of gene records
    """
    
    subset_genes_xxxx_df = pd.read_csv(
        location, index_col=[0],dtype={"NCBI_ID": str,"HGNC_ID":str})
    subset_genes_xxxx_df["source"] = str(source)
    subset_genes_xxxx_df['gene_symbol'] = subset_genes_xxxx_df['gene_symbol'].str.upper()
    subset_genes_xxxx_df['alias_symbol'] = subset_genes_xxxx_df['alias_symbol'].str.upper()
    return subset_genes_xxxx_df

In [5]:
def convert_all_columns_to_uppercase(df):
    """Convert gene symbols to all-caps. Diffferent species have differing capitalization requirements and this will standardize.

    :param df: DataFrame containing gene symbols of unknown capitalizations
    :return: a DataFrame with all gene symbols all-caps
    """
    for column in df.columns:
        if df[column].dtype == 'object':  # Check if the column type is object
            df[column] = df[column].str.upper()

    return df

In [6]:
def combine_rows(series):
    """Combine duplicate rows.

    :param series: a Pandas Series containing values from a DataFrame column.
                    this Series may contain NaN values, and the function will
                    return the first non-null value, or None if all values are NaN.
    :return: combined value from the Series, or None if the Series is empty or contains only NaNs.
    """
    return series.ffill().bfill().drop_duplicates().values[0] if not series.dropna().empty else None

In [7]:
def make_col_ortholog_match(recording_df, source_df, animal= str):
    """Check for ortholog matches in the primary gene symbol- alias symbol pairs. 
    Adds a T/F column for each pair. T if the alias is an ortholog from the specified animal and F if not

    :param recording_df: df that contains the primary gene symbol- alias symbol pairs
    :param source_df: df that contains the orthologs and their associated human genes
    :param animal: the animal from with the orthologs are being checked
    return: the number of primary gene symbol- alias symbol pairs where the alias is an ortholog from the specified animal
    """
    recording_df[f'{animal} Match'] = recording_df.apply(lambda row: 
                            any((source_df['Gene name'] == row['gene_symbol']) 
                                & 
                                (source_df[f'{animal} gene name'] == row['alias_symbol'])), axis=1)
    print(f"Added column: {animal} Match")
    return recording_df

In [8]:
def match_alias_to_ortholog(og_recording_df, source_df):
    """Apply the make_col_ortholog_match function to all animal columns in the DataFrame.

    :param og_recording_df: DataFrame containing the primary gene symbol- alias symbol pairs
    :param source_df: DataFrame containing the orthologs and their associated human genes
    :return: a DataFrame with all match columns added
    """
    source_df = source_df.dropna(subset=['Gene name'])

    source_df = convert_all_columns_to_uppercase(source_df)
    
    source_df = source_df.groupby('Gene name', as_index=False).agg(combine_rows)

    recording_df = og_recording_df.copy()
    recording_df.columns = recording_df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)
    source_df.columns = source_df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)

    animal_columns = [col for col in source_df.columns if 'gene name' in col and col != 'Gene name']
    
    true_counts = {}

    for animal in animal_columns:
        # Extract the animal name
        animal_name = animal.replace(' gene name', '')
        recording_df = make_col_ortholog_match(recording_df, source_df, animal_name)

        true_count = recording_df[f'{animal_name} Match'].sum()
        true_counts[animal_name] = true_count
    return recording_df, true_counts

## Download gene records from ENSG, HGNC, and NCBI 

#### This subset file was created in the alias-primary collision analysis notebook by the following modifications:
 - Gene records with no aliases were removed.
 - Primary gene symbol- alias symbol pairs where the alias was an exact match to the primary symbol were removed.
 - Primary gene symbol- alias symbol pairs that were duplicated were removed.

In [9]:
subset_genes_ensg_df = read_subset_genes_csv("created_files/subset_genes_ensg_df.csv", "ENSG")
subset_genes_hgnc_df = read_subset_genes_csv("created_files/subset_genes_hgnc_df.csv", "HGNC")
subset_genes_ncbi_df = read_subset_genes_csv("created_files/subset_genes_ncbi_df.csv", "NCBI")

## Combine data from all sources

In [10]:
subset_genes_df = pd.concat([subset_genes_ensg_df, subset_genes_hgnc_df, subset_genes_ncbi_df], axis=0)
subset_genes_df

Unnamed: 0,ENSG_ID,gene_symbol,alias_symbol,HGNC_ID,NCBI_ID,source
0,ENSG00000210049,MT-TF,MTTF,7481,,ENSG
1,ENSG00000210049,MT-TF,TRNF,7481,,ENSG
2,ENSG00000211459,MT-RNR1,12S,7470,,ENSG
3,ENSG00000211459,MT-RNR1,MOTS-C,7470,,ENSG
4,ENSG00000211459,MT-RNR1,MTRNR1,7470,,ENSG
...,...,...,...,...,...,...
190961,,GLTC1,GLTC,56861,131840634,NCBI
193342,,GABRA6-AS1,ARBAG,40248,132532400,NCBI
193377,,LNCARGI,ARGI,56890,133395150,NCBI
193378,,MLDHR,MP31,55481,133834869,NCBI


## Group the associated data by primary gene symbol- alias symbol pairs

#### This will ensure that there are no duplicate primary gene symbol- alias symbol pairs as well as preserving in which sources these pairs occur

In [11]:
subset_genes_df = subset_genes_df.groupby(['gene_symbol',"alias_symbol"], as_index=False).agg({
    "HGNC_ID": lambda x: set(x),
    'ENSG_ID': lambda x: set(x),
    'NCBI_ID': lambda x: set(x),
    "source": lambda x: set(x)
})
subset_genes_df

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source
0,A-GAMMA3'E,A-GAMMA-E,{nan},{nan},{109951028},{NCBI}
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI}
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI}
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI}
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI}
...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{ENSG, HGNC}"
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{ENSG, HGNC}"
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{ENSG, NCBI, HGNC}"
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{ENSG, NCBI, HGNC}"


In [12]:
subset_genes_df['NCBI_ID'] = subset_genes_df['NCBI_ID'].apply(remove_nan_from_set)
subset_genes_df['ENSG_ID'] = subset_genes_df['ENSG_ID'].apply(remove_nan_from_set)
subset_genes_df['HGNC_ID'] = subset_genes_df['HGNC_ID'].apply(remove_nan_from_set)

# Ortholog Analysis

## Download an Ensembl Biomart export file with the Gene Name and the Ortholog Gene Name

In [13]:
mur_dros_ortho_df = pd.read_csv(
    "downloaded_files/ensg_mart_export_dros_murin_ortho.txt", sep=",", index_col=[0])
mur_dros_ortho_df

Unnamed: 0_level_0,Drosophila melanogaster (Fruit fly) gene name,Drosophila melanogaster (Fruit fly) gene stable ID,Mouse gene stable ID,Mouse gene name,Gene name
Gene stable ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000210049,,,,,MT-TF
ENSG00000211459,,,,,MT-RNR1
ENSG00000210077,,,,,MT-TV
ENSG00000210082,,,,,MT-RNR2
ENSG00000209082,,,,,MT-TL1
...,...,...,...,...,...
ENSG00000232679,,,,,LINC01705
ENSG00000200033,,,ENSMUSG00000088001,Gm22883,RNU6-403P
ENSG00000228437,,,,,LINC02474
ENSG00000229463,,,,,LYST-AS1


## Make all of the gene symbols all caps

Different species follow different gene nomenclature conventions. <br>
For example, mouse genes have the first letter capitalized but the rest lowercase.<br>
They need to be all caps for matching

In [14]:
mur_dros_ortho_df = convert_all_columns_to_uppercase(mur_dros_ortho_df)

## Match aliases to orthologs!

### Drosophila melanogaster

In [15]:
fruitfly_df = make_col_ortholog_match(subset_genes_df, mur_dros_ortho_df,"Drosophila melanogaster (Fruit fly)")
print(len(fruitfly_df))

### Mouse

In [None]:
mouse_df = make_col_ortholog_match(subset_genes_df, mur_dros_ortho_df,"Mouse")
print(len(mouse_df))

NameError: name 'make_col_ortholog_match' is not defined

In [None]:
fruitfly_mouse_df = pd.merge(fruitfly_df, mouse_df, how='inner', 
                  left_on=["gene_symbol", "alias_symbol", "HGNC_ID", "ENSG_ID","NCBI_ID", "source"],
                  right_on=["gene_symbol", "alias_symbol", "HGNC_ID", "ENSG_ID","NCBI_ID", "source"])
fruitfly_mouse_df[(fruitfly_mouse_df['DM Match']) & (fruitfly_mouse_df['Mouse Match'])]

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,DM Match,Mouse Match
9418,CCZ1B,CCZ1,{21717},{ENSG00000146574},{221960},{NCBI},True,True
18951,EIF1AX,EIF1A,{3250},{ENSG00000173674},{1964},"{ENSG, NCBI}",True,True
59611,RAB7A,RAB7,{9788},{ENSG00000075785},{7879},"{ENSG, NCBI}",True,True


### (1)Abingdon island giant tortoise
### (1)African ostrich
### (1)Algerian mouse

In [141]:
tortoise_ostrich_amouse_ortho_df = pd.read_csv(
    "downloaded_files/tortoise_ostrich_amouse_export.txt", sep=",")

In [91]:
ortholog_analysis_one_df, ortholog_analysis_one_counts = match_alias_to_ortholog(subset_genes_df, tortoise_ostrich_amouse_ortho_df)
print(ortholog_analysis_one_counts)

Added column: Abingdon island giant tortoise Match
Added column: African ostrich Match
Added column: Algerian mouse Match
{'Abingdon island giant tortoise': 1, 'African ostrich': 2, 'Algerian mouse': 167}
      gene_symbol  alias_symbol  HGNC_ID            ENSG_ID      NCBI_ID  \
0      A-GAMMA3'E     A-GAMMA-E       {}                 {}  {109951028}   
1            A1BG           A1B      {5}  {ENSG00000121410}          {1}   
2            A1BG           ABG      {5}  {ENSG00000121410}          {1}   
3            A1BG           GAB      {5}  {ENSG00000121410}          {1}   
4            A1BG      HYST2477      {5}  {ENSG00000121410}          {1}   
...           ...           ...      ...                ...          ...   
86768       ZZEF1      FLJ10821  {29027}  {ENSG00000074755}      {23140}   
86769       ZZEF1      KIAA0399  {29027}  {ENSG00000074755}      {23140}   
86770       ZZEF1          ZZZ4  {29027}  {ENSG00000074755}      {23140}   
86771        ZZZ3         ATAC1  {2

In [96]:
ortholog_analysis_one_df.head()

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Abingdon island giant tortoise Match,African ostrich Match,Algerian mouse Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False
...,...,...,...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{ENSG, HGNC}",False,False,False
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{ENSG, HGNC}",False,False,False
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{ENSG, HGNC, NCBI}",False,False,False
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{ENSG, HGNC, NCBI}",False,False,False


### (2)Alpaca
### (2)Alpine marmot
### (2)Amazon molly
### (2)American bison
### (2)American black bear

In [97]:
Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df = pd.read_csv(
    "downloaded_files/Alpaca_Amarmot_Amolly_Abison_Ablackbear_mart_export.txt", sep=",")

In [98]:
ortholog_analysis_two_df, ortholog_analysis_two_counts = match_alias_to_ortholog(subset_genes_df, Alpaca_Amarmot_Amolly_Abison_Ablackbear_ortho_df)
print(ortholog_analysis_two_counts)

Added column: Alpaca Match
Added column: Alpine marmot Match
Added column: Amazon molly Match
Added column: American bison Match
Added column: American black bear Match
{'Alpaca': 13, 'Alpine marmot': 8, 'Amazon molly': 101, 'American bison': 1, 'American black bear': 1}


In [99]:
ortholog_analysis_two_df.head()

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Alpaca Match,Alpine marmot Match,Amazon molly Match,American bison Match,American black bear Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False


### (3)American Mink
### (3)Arabian camel
### (3)Arctic ground squirrel
### (3)Argentine black and white tegu
### (3)Armadillo
### (3)Asian bonytongue

In [100]:
Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df = pd.read_csv(
    "downloaded_files/Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue.txt", sep="\t")

In [101]:
ortholog_analysis_three_df, ortholog_analysis_three_counts = match_alias_to_ortholog(subset_genes_df, Americanmink_Arabiancamel_Arcticsquirrel_Argentinetegu_Armadillo_Asianbonytongue_ortho_df)
print(ortholog_analysis_three_counts)

Added column: American mink Match
Added column: Arabian camel Match
Added column: Arctic ground squirrel Match
Added column: Argentine black and white tegu Match
Added column: Armadillo Match
Added column: Asian bonytongue Match
{'American mink': 182, 'Arabian camel': 13, 'Arctic ground squirrel': 1, 'Argentine black and white tegu': 5, 'Armadillo': 17, 'Asian bonytongue': 91}


In [102]:
ortholog_analysis_three_df.head()

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,American mink Match,Arabian camel Match,Arctic ground squirrel Match,Argentine black and white tegu Match,Armadillo Match,Asian bonytongue Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False,False


### (4)Atlantic Cod
### (4)Atlantic herring
### (4)Atlantic salmon
### (4)Australian saltwater crocodile
### (4)Ballan wrasse

In [203]:
Atlanticcod_Atlanticherring_Atlanticsalmon_Australiancrocodile_Ballanwrasse_ortho_df = pd.read_csv(
    "downloaded_files/Atlanticcod_Atlanticherring_Atlanticsalmon_Australiancrocodile_Ballanwrasse.txt", sep=",")
Atlanticcod_Atlanticherring_Atlanticsalmon_Australiancrocodile_Ballanwrasse_ortho_df.head(50)

  Atlanticcod_Atlanticherring_Atlanticsalmon_Australiancrocodile_Ballanwrasse_ortho_df = pd.read_csv(


Unnamed: 0,Gene name,Atlantic cod gene name,Atlantic herring gene name,Atlantic salmon gene name,Australian saltwater crocodile gene name,Ballan wrasse gene name
0,MT-TF,,,,,
1,MT-RNR1,,,,,
2,MT-TV,,,,,
3,MT-RNR2,,,,,
4,MT-TL1,,,,,
5,MT-ND1,,ND1,,ND1,
6,MT-ND1,,ND1,,ND1,
7,MT-TI,,,,,
8,MT-TQ,,,,,
9,MT-TM,,,,,


In [205]:
Atlanticcod_Atlanticherring_Atlanticsalmon_Australiancrocodile_Ballanwrasse_ortho_df = Atlanticcod_Atlanticherring_Atlanticsalmon_Australiancrocodile_Ballanwrasse_ortho_df.groupby('Gene name', as_index=False).agg(combine_rows)

In [207]:
ortholog_analysis_four_df, ortholog_analysis_four_counts = match_alias_to_ortholog(subset_genes_df, Atlanticcod_Atlanticherring_Atlanticsalmon_Australiancrocodile_Ballanwrasse_ortho_df)
print(ortholog_analysis_four_counts)

Added column: Atlantic cod Match
Added column: Atlantic herring Match
Added column: Atlantic salmon Match
Added column: Australian saltwater crocodile Match
Added column: Ballan wrasse Match
{'Atlantic cod': 49, 'Atlantic herring': 211, 'Atlantic salmon': 194, 'Australian saltwater crocodile': 12, 'Ballan wrasse': 85}


In [208]:
ortholog_analysis_four_df.head()

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Atlantic cod Match,Atlantic herring Match,Atlantic salmon Match,Australian saltwater crocodile Match,Ballan wrasse Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False


### (5)Barramundi perch
### (5)Beluga whale
### (5)Bicolor damselfish
### (5)Black snub-nosed monkey
### (5)Blue whale


In [209]:
Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df = pd.read_csv(
    "downloaded_files/Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale.txt", sep=",")

In [115]:
ortholog_analysis_five_df, ortholog_analysis_five_counts = match_alias_to_ortholog(subset_genes_df, Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df)
print(ortholog_analysis_five_counts)

Added column: Barramundi perch Match
Added column: Beluga whale Match
Added column: Bicolor damselfish Match
Added column: Black snub-nosed monkey Match
Added column: Blue whale Match
{'Barramundi perch': 95, 'Beluga whale': 228, 'Bicolor damselfish': 93, 'Black snub-nosed monkey': 14, 'Blue whale': 7}


In [216]:
Barramundiperch_Belugawhale_Bicolordamselfish_Blacksnubnosedmonkey_Bluewhale_ortho_df

Unnamed: 0,Gene name,Barramundi perch gene name,Beluga whale gene name,Bicolor damselfish gene name,Black snub-nosed monkey gene name,Blue whale gene name
0,MT-TF,,,,,
1,MT-RNR1,,,,,
2,MT-TV,,,,,
3,MT-RNR2,,,,,
4,MT-TL1,,,,,
...,...,...,...,...,...,...
115851,LINC01705,,,,,
115852,RNU6-403P,,,,,
115853,LINC02474,,,,,
115854,LYST-AS1,,,,,


In [116]:
ortholog_analysis_five_df.head()

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Barramundi perch Match,Beluga whale Match,Bicolor damselfish Match,Black snub-nosed monkey Match,Blue whale Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False


### (6) Blue-ringed sea krait
### (6) Burton's mouthbrooder
### (6) C.intestinalis
### (6) C.savignyi
### (6) Caenorhabditis elegans (Nematode, N2)

In [237]:
Blueringedseakrait_Burtonmouthbrooder_Cintestinalis_Csavignyi_Caenorhabditiselegans_ortho_df = pd.read_csv(
    "downloaded_files/Blueringedseakrait_Burtonmouthbrooder_Cintestinalis_Csavignyi_Caenorhabditiselegans.txt", sep=",")

  Blueringedseakrait_Burtonmouthbrooder_Cintestinalis_Csavignyi_Caenorhabditiselegans_ortho_df = pd.read_csv(


In [238]:
ortholog_analysis_six_df, ortholog_analysis_six_counts = match_alias_to_ortholog(subset_genes_df, Blueringedseakrait_Burtonmouthbrooder_Cintestinalis_Csavignyi_Caenorhabditiselegans_ortho_df)
print(ortholog_analysis_six_counts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.upper()


Added column: Blue-ringed sea krait Match
Added column: Burton's mouthbrooder Match
Added column: C.intestinalis Match
Added column: C.savignyi Match
Added column: Caenorhabditis elegans (Nematode, N2) Match
{'Blue-ringed sea krait': 2, "Burton's mouthbrooder": 88, 'C.intestinalis': 34, 'C.savignyi': 9, 'Caenorhabditis elegans (Nematode, N2)': 101}


In [239]:
ortholog_analysis_six_df.head()

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Blue-ringed sea krait Match,Burton's mouthbrooder Match,C.intestinalis Match,C.savignyi Match,"Caenorhabditis elegans (Nematode, N2) Match"
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False


### (7) Cat
### (7) Chacoan peccary
### (7) Channel bull blenny
### (7) Channel catfish
### (7) Chicken

In [242]:
Cat_Chacoanpeccary_Channelbullblenny_Channelcatfish_Chicken_ortho_df = pd.read_csv(
    "downloaded_files/Cat_Chacoanpeccary_Channelbullblenny_Channelcatfish_Chicken.txt", sep=",")

In [243]:
ortholog_analysis_seven_df, ortholog_analysis_seven_counts = match_alias_to_ortholog(subset_genes_df, Cat_Chacoanpeccary_Channelbullblenny_Channelcatfish_Chicken_ortho_df)
print(ortholog_analysis_seven_counts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.upper()


Added column: Cat Match
Added column: Chacoan peccary Match
Added column: Channel bull blenny Match
Added column: Channel catfish Match
Added column: Chicken Match
{'Cat': 30, 'Chacoan peccary': 10, 'Channel bull blenny': 129, 'Channel catfish': 172, 'Chicken': 154}


In [244]:
ortholog_analysis_seven_df.head()

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Cat Match,Chacoan peccary Match,Channel bull blenny Match,Channel catfish Match,Chicken Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,False


## Summarize

In [247]:
print(globals().keys())

dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__builtin__', '__builtins__', '_ih', '_oh', '_dh', 'In', 'Out', 'get_ipython', 'exit', 'quit', 'open', '_', '__', '___', '__vsc_ipynb_file__', '_i', '_ii', '_iii', '_i1', '_i2', 'pd', 'np', '_i3', 'remove_nan_from_set', '_i4', 'read_subset_genes_csv', '_i5', 'make_col_ortholog_match', '_i6', '_i7', '_i8', '_i9', '_i10', 'subset_genes_ensg_df', 'subset_genes_hgnc_df', 'subset_genes_ncbi_df', '_i11', 'subset_genes_df', '_11', '_i12', '_12', '_i13', '_i14', 'mur_dros_ortho_df', '_14', '_i15', '_i16', 'ortholog_match_subset_genes_df', '_i17', '_17', '_i18', '_i19', '_i20', '_i21', '_21', '_i22', '_22', '_i23', '_i24', '_24', '_i25', '_i26', 'fruitfly_df', '_i27', 'Atlanticcod_Atlanticherring_Atlanticsalmon_Australiancrocodile_Ballanwrasse_ortho_df', '_27', '_i28', '_28', '_i29', '_i30', '_i31', '_31', '_i32', '_i33', '_33', '_i34', '_i35', '_i36', '_36', '_i37', '_37', '_i38', '_i39', '_39', '_i40', '_40', '_i41', 

In [248]:
list_of_ortholog_analysis_dfs = [value for key, value in globals().items() 
                if key.startswith('ortholog_analysis') 
                and key.endswith('_df') 
                and isinstance(value, pd.DataFrame)]
list_of_ortholog_analysis_dfs

[      gene_symbol  alias_symbol  HGNC_ID            ENSG_ID      NCBI_ID  \
 0      A-GAMMA3'E     A-GAMMA-E       {}                 {}  {109951028}   
 1            A1BG           A1B      {5}  {ENSG00000121410}          {1}   
 2            A1BG           ABG      {5}  {ENSG00000121410}          {1}   
 3            A1BG           GAB      {5}  {ENSG00000121410}          {1}   
 4            A1BG      HYST2477      {5}  {ENSG00000121410}          {1}   
 ...           ...           ...      ...                ...          ...   
 86768       ZZEF1      FLJ10821  {29027}  {ENSG00000074755}      {23140}   
 86769       ZZEF1      KIAA0399  {29027}  {ENSG00000074755}      {23140}   
 86770       ZZEF1          ZZZ4  {29027}  {ENSG00000074755}      {23140}   
 86771        ZZZ3         ATAC1  {24523}  {ENSG00000036549}      {26009}   
 86772        ZZZ3  DKFZP564I052  {24523}  {ENSG00000036549}      {26009}   
 
                    source  Abingdon island giant tortoise Match  \
 0    

In [249]:
ortholog_analysis_df = subset_genes_df.copy()
for df in list_of_ortholog_analysis_dfs:
    ortholog_columns = df.iloc[:, 6:]
    ortholog_analysis_df = pd.concat([ortholog_analysis_df, ortholog_columns], axis=1)

ortholog_analysis_df.reset_index(drop=True, inplace=True)

In [250]:
ortholog_analysis_df

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Abingdon island giant tortoise Match,African ostrich Match,Algerian mouse Match,Alpaca Match,...,Blue-ringed sea krait Match,Burton's mouthbrooder Match,C.intestinalis Match,C.savignyi Match,"Caenorhabditis elegans (Nematode, N2) Match",Cat Match,Chacoan peccary Match,Channel bull blenny Match,Channel catfish Match,Chicken Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{ENSG, HGNC}",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{ENSG, HGNC}",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{ENSG, HGNC, NCBI}",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{ENSG, HGNC, NCBI}",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [251]:
ortholog_analysis_df.shape

(86773, 40)

In [252]:
ortholog_analysis_df.to_csv(
    "created_files/ortholog_analysis_df.csv", index=True
)

In [253]:
ortholog_analysis_df.loc[
    ortholog_analysis_df["alias_symbol"] == "CALMBP1"
]

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Abingdon island giant tortoise Match,African ostrich Match,Algerian mouse Match,Alpaca Match,...,Blue-ringed sea krait Match,Burton's mouthbrooder Match,C.intestinalis Match,C.savignyi Match,"Caenorhabditis elegans (Nematode, N2) Match",Cat Match,Chacoan peccary Match,Channel bull blenny Match,Channel catfish Match,Chicken Match
4476,ASPM,CALMBP1,{19048},{ENSG00000066279},{259266},"{ENSG, HGNC, NCBI}",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


how many gene alias pairs dont match any orthologs

In [254]:
no_ortholog_match_genes_df = ortholog_analysis_df[~ortholog_analysis_df.select_dtypes(include='bool').any(axis=1)]

In [256]:
no_ortholog_match_genes_df

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Abingdon island giant tortoise Match,African ostrich Match,Algerian mouse Match,Alpaca Match,...,Blue-ringed sea krait Match,Burton's mouthbrooder Match,C.intestinalis Match,C.savignyi Match,"Caenorhabditis elegans (Nematode, N2) Match",Cat Match,Chacoan peccary Match,Channel bull blenny Match,Channel catfish Match,Chicken Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{ENSG, HGNC}",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{ENSG, HGNC}",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{ENSG, HGNC, NCBI}",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{ENSG, HGNC, NCBI}",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# HGNC Previous Symbol Analysis

## Download the HGNC custom download including the gene symbol, ID, and previous symbols

In [None]:
hgnc_previous_symbols_df = pd.read_csv(
    "downloaded_files/HGNC_previous_symbols20241010.txt", sep="\t")
hgnc_previous_symbols_df

Unnamed: 0,HGNC ID,Approved symbol,Previous symbols
0,HGNC:5,A1BG,
1,HGNC:37133,A1BG-AS1,"NCRNA00181, A1BGAS, A1BG-AS"
2,HGNC:24086,A1CF,
3,HGNC:6,A1S9T,
4,HGNC:7,A2M,
...,...,...,...
49077,HGNC:25820,ZYG11B,ZYG11
49078,HGNC:13200,ZYX,
49079,HGNC:51695,ZYXP1,
49080,HGNC:29027,ZZEF1,


## Remove all genes with no previous symbols

In [None]:
hgnc_previous_symbols_df = hgnc_previous_symbols_df.dropna(subset=["Previous symbols"])
hgnc_previous_symbols_df

Unnamed: 0,HGNC ID,Approved symbol,Previous symbols
1,HGNC:37133,A1BG-AS1,"NCRNA00181, A1BGAS, A1BG-AS"
6,HGNC:23336,A2ML1,CPAMD9
9,HGNC:8,A2MP1,A2MP
12,HGNC:30005,A3GALT2,A3GALT2P
13,HGNC:18149,A4GALT,P1
...,...,...,...
49063,HGNC:23528,ZSWIM8,KIAA0913
49065,HGNC:34495,ZSWIM9,C19orf68
49066,HGNC:21224,ZUP1,"C6orf113, ZUFSP"
49071,HGNC:13197,ZWS1,ZWS


## Explode the previous symbols so that it is only one symbol per row

In [None]:
hgnc_previous_symbols_df["previous_symbol"] = hgnc_previous_symbols_df['Previous symbols'].str.split(',').apply(lambda x: [s.strip() for s in x])
hgnc_previous_symbols_df = hgnc_previous_symbols_df.explode('previous_symbol')
hgnc_previous_symbols_df = hgnc_previous_symbols_df.drop(columns=['Previous symbols'])
hgnc_previous_symbols_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hgnc_previous_symbols_df["previous_symbol"] = hgnc_previous_symbols_df['Previous symbols'].str.split(',').apply(lambda x: [s.strip() for s in x])


Unnamed: 0,HGNC ID,Approved symbol,previous_symbol
1,HGNC:37133,A1BG-AS1,NCRNA00181
1,HGNC:37133,A1BG-AS1,A1BGAS
1,HGNC:37133,A1BG-AS1,A1BG-AS
6,HGNC:23336,A2ML1,CPAMD9
9,HGNC:8,A2MP1,A2MP
...,...,...,...
49065,HGNC:34495,ZSWIM9,C19orf68
49066,HGNC:21224,ZUP1,C6orf113
49066,HGNC:21224,ZUP1,ZUFSP
49071,HGNC:13197,ZWS1,ZWS


## Make all of the gene symbols all caps

Different species follow different gene nomenclature conventions. <br>
For example, mouse genes have the first letter capitalized but the rest lowercase.<br>
They need to be all caps for matching

In [None]:
hgnc_previous_symbols_df["Approved symbol"] = hgnc_previous_symbols_df["Approved symbol"].str.upper()
hgnc_previous_symbols_df["previous_symbol"] = hgnc_previous_symbols_df["previous_symbol"].str.upper()

## Match aliases to previous symbols!

In [None]:
previous_symbol_match_subset_genes_df = subset_genes_df.copy()
previous_symbol_match_subset_genes_df

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI}
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI}
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI}
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI}
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI}
...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}"
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}"
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG, NCBI}"
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{HGNC, ENSG, NCBI}"


In [None]:
previous_symbol_match_subset_genes_df["Previous Symbol Match"] = previous_symbol_match_subset_genes_df.apply(lambda row: 
                        any((hgnc_previous_symbols_df['Approved symbol'] == row['gene_symbol']) 
                            & 
                            (hgnc_previous_symbols_df["previous_symbol"] == row['alias_symbol'])), axis=1)
previous_symbol_match_df = previous_symbol_match_subset_genes_df[previous_symbol_match_subset_genes_df["Previous Symbol Match"]]
previous_symbol_match_df

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Previous Symbol Match
5,A1BG-AS1,A1BG-AS,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",True
6,A1BG-AS1,A1BGAS,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",True
8,A1BG-AS1,NCRNA00181,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",True
18,A2ML1,CPAMD9,{23336},{ENSG00000166535},{144568},"{ENSG, NCBI}",True
22,A2MP1,A2MP,{8},"{ENSG00000291190, ENSG00000256069}",{3},"{ENSG, NCBI}",True
...,...,...,...,...,...,...,...
86738,ZSWIM8,KIAA0913,{23528},{ENSG00000214655},{23053},"{ENSG, NCBI}",True
86739,ZSWIM9,C19ORF68,{34495},{ENSG00000185453},{374920},"{ENSG, NCBI}",True
86741,ZUP1,C6ORF113,{21224},{ENSG00000153975},{221302},"{ENSG, NCBI}",True
86744,ZUP1,ZUFSP,{21224},{ENSG00000153975},{221302},"{ENSG, NCBI}",True


In [None]:
previous_symbol_match_subset_genes_df.head(20)

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Previous Symbol Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False
5,A1BG-AS1,A1BG-AS,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",True
6,A1BG-AS1,A1BGAS,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",True
7,A1BG-AS1,FLJ23569,{37133},{ENSG00000268895},{503538},"{HGNC, ENSG}",False
8,A1BG-AS1,NCRNA00181,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",True
9,A1CF,ACF,{24086},{ENSG00000148584},{29974},"{HGNC, ENSG, NCBI}",False


# Clone Names Analysis

## Download the FLJ database file including FLJ IDs
- https://flj.lifesciencedb.jp/top/sys_info/02_about_database/accession_no/download_v032.html 
- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2780955/ 
- https://www.ncbi.nlm.nih.gov/nuccore/AK027222?report=GenBank

In [None]:
with open("downloaded_files/Conversion_table_FLJ(1).txt", 'r') as file:
    for i, line in enumerate(file):
        print(f"Line {i}: {line.strip()}")

In [None]:
clone_symbols_df = pd.read_csv(
    "downloaded_files/Conversion_table_FLJ(1).txt", 
    sep="\t", 
    skiprows=3
)
clone_symbols_df

Unnamed: 0,Accesion No,FLJ ID,Clone ID,Sequence ID,Another Sequence ID
0,AK075326,PSEC0001(FLJ91001),NT2RM1000066,C-NT2RM1000066,
1,AK172724,PSEC0002(FLJ91002),NT2RM1000295,C-NT2RM1000295,
2,AK075327,PSEC0003(FLJ91003),NT2RM1000361,C-NT2RM1000361,
3,AK075328,PSEC0004(FLJ91004),NT2RM1000558,C-NT2RM1000558,
4,AK075329,PSEC0005(FLJ91005),NT2RM1000566,C-NT2RM1000566,
...,...,...,...,...,...
30321,AK057825,FLJ25096,CBR00778,C-CBR00778,
30322,AK000479,FLJ20472,KAT07023,C-KAT07023,
30323,AK125921,FLJ43933,TESTI4013685,C-TESTI4013685,
30324,AK125959,FLJ43971,TESTI4017901,C-TESTI4017901,


In [None]:
extracted_ids = clone_symbols_df['FLJ ID'].str.extract(r'([^()]+)\((.+?)\)')

result_rows = []

# Extract values from the FLJ column so that there is only one ID per row
for index, row in clone_symbols_df.iterrows():
    flj_id = row['FLJ ID']
    if pd.notnull(extracted_ids.iloc[index, 0]):
        result_rows.append({'Accesion No': row['Accesion No'], 'ID': extracted_ids.iloc[index, 0]})  # First part
        result_rows.append({'Accesion No': row['Accesion No'], 'ID': extracted_ids.iloc[index, 1]})  # Second part
    else:
        result_rows.append({'Accesion No': row['Accesion No'], 'ID': flj_id})  # Keep the original if no match

result_df = pd.DataFrame(result_rows)

result_df

Unnamed: 0,Accesion No,ID
0,AK075326,PSEC0001
1,AK075326,FLJ91001
2,AK172724,PSEC0002
3,AK172724,FLJ91002
4,AK075327,PSEC0003
...,...,...
30581,AK057825,FLJ25096
30582,AK000479,FLJ20472
30583,AK125921,FLJ43933
30584,AK125959,FLJ43971


In [None]:
result_df.loc[
    result_df["ID"] == "FLJ25179"
]

Unnamed: 0,Accesion No,ID
11650,AK057908,FLJ25179


In [None]:
result_df["ID"] = result_df["ID"].str.strip()

In [None]:
previous_symbol_match_subset_genes_df = subset_genes_df.copy()
previous_symbol_match_subset_genes_df

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI}
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI}
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI}
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI}
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI}
...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}"
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG}"
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{HGNC, ENSG, NCBI}"
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{HGNC, ENSG, NCBI}"


In [None]:
previous_symbol_match_subset_genes_df["Clone Symbol Match"] = previous_symbol_match_subset_genes_df['alias_symbol'].isin(result_df['ID'])

df = previous_symbol_match_subset_genes_df[previous_symbol_match_subset_genes_df["Clone Symbol Match"]]
previous_symbol_match_subset_genes_df.head(20)

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Clone Symbol Match
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI},False
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI},False
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI},False
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI},False
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI},False
5,A1BG-AS1,A1BG-AS,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",False
6,A1BG-AS1,A1BGAS,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",False
7,A1BG-AS1,FLJ23569,{37133},{ENSG00000268895},{503538},"{HGNC, ENSG}",True
8,A1BG-AS1,NCRNA00181,{37133},{ENSG00000268895},{503538},"{ENSG, NCBI}",False
9,A1CF,ACF,{24086},{ENSG00000148584},{29974},"{HGNC, ENSG, NCBI}",False


In [None]:
df

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Clone Symbol Match
7,A1BG-AS1,FLJ23569,{37133},{ENSG00000268895},{503538},"{HGNC, ENSG}",True
19,A2ML1,FLJ25179,{23336},{ENSG00000166535},{144568},"{HGNC, ENSG}",True
45,AACS,FLJ12389,{21298},{ENSG00000081760},{65985},"{HGNC, ENSG}",True
56,AAGAB,FLJ11506,{25662},{ENSG00000103591},{79719},"{HGNC, ENSG}",True
65,AAMDC,FLJ21035,{30205},{ENSG00000087884},{28971},"{HGNC, ENSG}",True
...,...,...,...,...,...,...,...
86729,ZSWIM4,FLJ12221,{25704},"{ENSG00000132003, ENSG00000288360}",{65249},"{HGNC, ENSG}",True
86747,ZWILCH,FLJ10036,{25468},{ENSG00000174442},{55055},"{HGNC, ENSG}",True
86760,ZXDC,FLJ13861,{28160},{ENSG00000070476},{79364},"{HGNC, ENSG}",True
86764,ZYG11B,FLJ13456,{25820},{ENSG00000162378},{79699},"{HGNC, ENSG}",True


# Gene Family Analysis

In [314]:
hgnc_genefamilies_df = pd.read_csv(
    "downloaded_files/hgnc_genefamily.csv", sep=",")
hgnc_genefamilies_df

Unnamed: 0,id,abbreviation,name,external_note,pubmed_ids,desc_comment,desc_label,desc_source,desc_go,typical_gene
0,3,FSCN,Fascin family,,21618240,,,,,FSCN1
1,4,ABHD,Abhydrolase domain containing,,23328280,,,,,ABHD1
2,6,ZYG11,ZYG11 cell cycle regulator family,,,,,,,
3,8,ZP,Zona pellucida glycoproteins,,15760956,There are four major zona pellucida glycoprote...,Zona pellucida glycoproteins,Wikipedia|https://en.wikipedia.org/wiki/Zona p...,,ZP1
4,10,VNN,Vanin family,,22155241,,,,,VNN1
...,...,...,...,...,...,...,...,...,...,...
1810,3338,,WICH complex,,21326359,Chromatin remodeling complex required for main...,WICH complex,Complex portal|https://www.ebi.ac.uk/complexpo...,,
1811,3339,,NoRC complex,,,NoRC remodels nucleosomes at the rDNA promoter...,NoRC complex,Complex Portal|https://www.ebi.ac.uk/complexpo...,,
1812,3340,,RSF complex,,,A nucleosome remodeling complex that participa...,RSF complex,Complex Portal|https://www.ebi.ac.uk/complexpo...,,
1813,3341,,ATP-dependent chromatin remodeling complexes,,19355820,,,,,


In [315]:
hgnc_genefamilies_df = hgnc_genefamilies_df[["id","abbreviation"]]
hgnc_genefamilies_df

Unnamed: 0,id,abbreviation
0,3,FSCN
1,4,ABHD
2,6,ZYG11
3,8,ZP
4,10,VNN
...,...,...
1810,3338,
1811,3339,
1812,3340,
1813,3341,


In [316]:
hgnc_genefamilies_df.rename(columns={'id': 'Gene group ID'}, inplace=True)
hgnc_genefamilies_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hgnc_genefamilies_df.rename(columns={'id': 'Gene group ID'}, inplace=True)


Unnamed: 0,Gene group ID,abbreviation
0,3,FSCN
1,4,ABHD
2,6,ZYG11
3,8,ZP
4,10,VNN
...,...,...
1810,3338,
1811,3339,
1812,3340,
1813,3341,


In [317]:
hgnc_gene_groupid_df = pd.read_csv(
    "downloaded_files/hgnc_id_symbol_genegroupid.txt", sep="\t")
hgnc_gene_groupid_df

Unnamed: 0,HGNC ID,Approved symbol,Gene group ID
0,HGNC:5,A1BG,594
1,HGNC:37133,A1BG-AS1,1987
2,HGNC:24086,A1CF,725
3,HGNC:6,A1S9T,
4,HGNC:7,A2M,2148
...,...,...,...
49078,HGNC:25820,ZYG11B,6|1492
49079,HGNC:13200,ZYX,1402|1691
49080,HGNC:51695,ZYXP1,
49081,HGNC:29027,ZZEF1,91|863


In [318]:
hgnc_gene_groupid_df['HGNC ID'] = hgnc_gene_groupid_df['HGNC ID'].str.replace('^HGNC:', '', regex=True)

In [319]:
hgnc_gene_groupid_df['Gene group ID'] = hgnc_gene_groupid_df['Gene group ID'].str.split('|')
hgnc_gene_groupid_df = hgnc_gene_groupid_df.explode('Gene group ID')

In [320]:
hgnc_gene_groupid_df = hgnc_gene_groupid_df.dropna(subset=['Gene group ID'])

In [321]:
hgnc_gene_groupid_df['Gene group ID'] = hgnc_gene_groupid_df['Gene group ID'].astype(int)

In [322]:
hgnc_gene_group_root_df = hgnc_gene_groupid_df.merge(hgnc_genefamilies_df, on='Gene group ID', how='left')

In [323]:
hgnc_gene_group_root_df

Unnamed: 0,HGNC ID,Approved symbol,Gene group ID,abbreviation
0,5,A1BG,594,
1,37133,A1BG-AS1,1987,
2,24086,A1CF,725,RBM
3,7,A2M,2148,
4,27057,A2M-AS1,1987,
...,...,...,...,...
31399,29027,ZZEF1,91,ZZZ
31400,29027,ZZEF1,863,
31401,24523,ZZZ3,91,ZZZ
31402,24523,ZZZ3,532,


In [324]:
hgnc_gene_group_root_df = hgnc_gene_group_root_df.dropna(subset=['abbreviation'])

In [325]:
hgnc_gene_group_root_df["abbreviation"] = hgnc_gene_group_root_df["abbreviation"].str.upper()
hgnc_gene_group_root_df["Approved symbol"] = hgnc_gene_group_root_df["Approved symbol"].str.upper()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hgnc_gene_group_root_df["abbreviation"] = hgnc_gene_group_root_df["abbreviation"].str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hgnc_gene_group_root_df["Approved symbol"] = hgnc_gene_group_root_df["Approved symbol"].str.upper()


In [326]:
gene_group_analysis_df = subset_genes_df.copy()
gene_group_analysis_df

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source
0,A-GAMMA3'E,A-GAMMA-E,{},{},{109951028},{NCBI}
1,A1BG,A1B,{5},{ENSG00000121410},{1},{NCBI}
2,A1BG,ABG,{5},{ENSG00000121410},{1},{NCBI}
3,A1BG,GAB,{5},{ENSG00000121410},{1},{NCBI}
4,A1BG,HYST2477,{5},{ENSG00000121410},{1},{NCBI}
...,...,...,...,...,...,...
86768,ZZEF1,FLJ10821,{29027},{ENSG00000074755},{23140},"{ENSG, HGNC}"
86769,ZZEF1,KIAA0399,{29027},{ENSG00000074755},{23140},"{ENSG, HGNC}"
86770,ZZEF1,ZZZ4,{29027},{ENSG00000074755},{23140},"{ENSG, HGNC, NCBI}"
86771,ZZZ3,ATAC1,{24523},{ENSG00000036549},{26009},"{ENSG, HGNC, NCBI}"


In [329]:
gene_group_analysis_df["Gene Group Symbol Match"] = gene_group_analysis_df.apply(lambda row: 
                        any((hgnc_gene_group_root_df['Approved symbol'] == row['gene_symbol']) 
                            & 
                            (hgnc_gene_group_root_df["abbreviation"] == row['alias_symbol'])), axis=1)
gene_group_match_df = gene_group_analysis_df[gene_group_analysis_df["Gene Group Symbol Match"]]
gene_group_match_df

Unnamed: 0,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,NCBI_ID,source,Gene Group Symbol Match
219,ABCC1,ABCC,{51},{ENSG00000103222},{4363},{NCBI},True
451,ABL1,ABL,{76},{ENSG00000097007},{25},"{ENSG, NCBI}",True
817,ACSS2,ACS,{15814},{ENSG00000131069},{55902},"{ENSG, HGNC, NCBI}",True
846,ACTBL2,ACT,{17780},{ENSG00000169067},{345651},{NCBI},True
856,ACTG1,ACT,{144},{ENSG00000184009},{71},{NCBI},True
...,...,...,...,...,...,...,...
86152,ZNF569,ZNF,{24737},{ENSG00000196437},{148266},{NCBI},True
86235,ZNF629,ZNF,{29008},{ENSG00000102870},{23361},{NCBI},True
86433,ZNF763,ZNF,{27614},{ENSG00000197054},{284390},{NCBI},True
86763,ZYG11A,ZYG11,{32058},{ENSG00000203995},{440590},"{ENSG, HGNC, NCBI}",True


In [None]:
gene_group_analysis_df["Gene Group Symbol Match"] = gene_group_analysis_df.apply(lambda row: 
                        any((hgnc_gene_group_root_df['Approved symbol'] == row['gene_symbol']) 
                            & 
                            (hgnc_gene_group_root_df["abbreviation"].str.extract(r'([A-Za-z]+)')[0] == row['alias_symbol'].str.extract(r'([A-Za-z]+)')[0])), axis=1)
gene_group_match_df = gene_group_analysis_df[gene_group_analysis_df["Gene Group Symbol Match"]]
gene_group_match_df