In [59]:
import pandas as pd
import mygene
import json

In [60]:
def load_and_filter_data(file_path):
    """
    Load data from an Excel file and filter rows based on specific criteria.

    This function filters out rows where 'Gbkey' is not 'Gene | mRNA', ensures gene symbols are uppercase, and start with a letter.

    Parameters:
    file_path (str): Path to the Excel file containing the data.

    Returns:
    DataFrame: A pandas DataFrame with filtered data and 'Gbkey' column dropped.
    """
    # Load data from the provided Excel file
    df = pd.read_excel(file_path)
    
    # Filter data where 'Gbkey' equals 'Gene | mRNA' and first character of 'Gene_Symbol' is a letter
    df = df[df['Gbkey'] == 'Gene | mRNA']
    df = df[df['Gene_Symbol'].str[0].str.isalpha()]
    
    # Convert 'Gene_Symbol' to uppercase
    df['Gene_Symbol'] = df['Gene_Symbol'].str.upper()
    
    # Drop the 'Gbkey' column
    df.drop(columns=['Gbkey'])
    
    # Initialize a new column for Human_GeneID, Human_Symbol and reordering the columns
    df['GeneID'] = None
    df['Human_Symbol'] = None
    df = df[['GeneID', 'Gene_Symbol','Human_Symbol', 'Score (T vs N)', 'Score (500nM vs T)', 'Score (10uM vs T)', 'P-value (T vs N)']]

    return df 

In [61]:
import pandas as pd

def convert_mouse_symbol_to_human_id(mouse_df, human_df_path):
    """
    Update the mouse DataFrame with human gene IDs and symbols based on a mapping from human gene names to IDs. 

    Parameters:
    mouse_df (DataFrame): A DataFrame containing mouse gene symbols.
    human_df_path (str): Path to the Excel file containing human gene symbols, IDs, and names.

    Returns:
    DataFrame: The input mouse DataFrame with updated human gene IDs and symbols where matches are found.
    """
    # Read human_df from the provided Excel file
    human_df = pd.read_excel(human_df_path)

    # Create a dictionary from the human DataFrame mapping human gene names to IDs and symbols
    human_gene_dict = dict(zip(human_df['Gene_Symbol'], zip(human_df['GeneID'], human_df['Human_Name'])))

    # Initialize new columns for GeneID and Human_Symbol in mouse_df
    mouse_df['GeneID'] = None
    mouse_df['Human_Symbol'] = None

    # Update the mouse DataFrame with gene IDs and symbols from the human gene dictionary
    for index, row in mouse_df.iterrows():
        mouse_gene_symbol = row['Gene_Symbol']
        if mouse_gene_symbol in human_gene_dict:
            human_gene_id, human_symbol = human_gene_dict[mouse_gene_symbol]
            mouse_df.at[index, 'GeneID'] = human_gene_id
            mouse_df.at[index, 'Human_Symbol'] = human_symbol

    return mouse_df

In [62]:
def convert_symbol_to_id(mouse_df, sapiens_path):
    """
    Update the mouse DataFrame with human gene IDs and symbols based on a mapping from human gene symbols to IDs.

    Parameters:
    mouse_df (DataFrame): A DataFrame containing mouse gene symbols, with some missing or 'None' human gene IDs.
    sapiens_path (str): Path to the JSON file containing gene ID mappings.

    Returns:
    DataFrame: The input DataFrame with updated gene IDs and human symbols where matches are found.
    """
    # Load gene ID mappings from the specified JSON file
    with open(sapiens_path, 'r') as file:
        gene_dict = json.load(file)

    # Update the DataFrame with gene IDs and symbols from the mapping file
    for index, row in mouse_df.iterrows():
        gene_symbol = row['Gene_Symbol']
        if gene_symbol in gene_dict and (pd.isna(row['GeneID']) or row['GeneID'] == 'None'):
            mouse_df.at[index, 'GeneID'] = gene_dict[gene_symbol]
            mouse_df.at[index, 'Human_Symbol'] = gene_symbol

    return mouse_df

In [63]:
def convert_mouse_symb_human_id(mouse_df):
    """
    Convert mouse gene symbols to human gene IDs using MyGeneInfo service for rows where 'Human_Symbol' is empty.
    Also, count and print the number of values being sent to MyGeneInfo and print a list of all queries where no data was found.

    Parameters:
    mouse_df (DataFrame): A DataFrame containing mouse gene symbols.

    Returns:
    DataFrame: The input DataFrame with an added column 'GeneID' for human gene IDs.
    """
    mg = mygene.MyGeneInfo()

    # Filter the DataFrame to include only rows where 'Human_Symbol' is empty
    query_df = mouse_df[mouse_df['Human_Symbol'].isna() | (mouse_df['Human_Symbol'] == '')]

    # Count the number of values being sent to MyGeneInfo
    num_values_sent = len(query_df)
    print(f"Number of values being sent to MyGeneInfo: {num_values_sent}")

    # Query MyGeneInfo with filtered mouse gene symbols to get human gene IDs
    query_results = mg.querymany(query_df['Gene_Symbol'].tolist(), scopes='symbol', species='mouse', fields='name,symbol,homologene', returnall=True, verbose=True)

    no_data_found = []  # List to keep track of queries with no data found

    # Process the query results and update the original DataFrame with human gene IDs
    for result in query_results['out']:
        if 'notfound' not in result and 'homologene' in result:
            gene_symbol = result.get('symbol', '').upper()
            human_gene_id = next((gene[1] for gene in result['homologene'].get('genes', []) if gene[0] == 9606), None)
            if human_gene_id:
                mouse_df.loc[mouse_df['Gene_Symbol'] == gene_symbol, 'GeneID'] = human_gene_id
            else:
                print(f"No human homologene found for {gene_symbol}")
        else:
            no_data_found.append(result.get('query'))

    # Print the list of queries where no data was found
    if no_data_found:
        # print ach value then /n 
        print("\n".join(no_data_found))

    return mouse_df

In [64]:
def convert_human_id_human_symb(df):
    """
    Convert human gene IDs to human gene symbols using MyGeneInfo service. 
    Handles 'GeneID' that are in a floating-point format.

    Parameters:
    df (DataFrame): A DataFrame containing human gene IDs.

    Returns:
    DataFrame: The input DataFrame with an added column 'Human_Symbol' for human gene symbols.
    """
    mg = mygene.MyGeneInfo()
    
    # Create a copy of the DataFrame to avoid SettingWithCopyWarning
    mouse_df = df.copy()

    # Convert 'GeneID' to integer where not NaN and not non-numeric, then to string
    mouse_df['GeneID'] = mouse_df['GeneID'].apply(lambda x: str(int(float(x))) if pd.notnull(x) and x != '' else x)
    
    # Query mygene.info for human symbols
    query_results = mg.querymany(mouse_df['GeneID'].tolist(), scopes='entrezgene', species='human', fields='symbol', returnall=True)
    for result in query_results['out']:
        if 'notfound' not in result:
            gene_id = result.get('query')
            gene_symbol = result.get('symbol', '').upper()
            mouse_df.loc[mouse_df['GeneID'] == gene_id, 'Human_Symbol'] = gene_symbol
        else:
            print(f"No data found for {result.get('query')}")

    return mouse_df

In [65]:
# Usage example: loading, filtering, and processing data
dir_path = 'Inputs/experiments_data/Parkinson/'
data_file_path = dir_path + 'Parkinson_0prefilter.xlsx'

# Load and filter data
filter_one_df = load_and_filter_data(data_file_path)
filter_one_df.to_excel(dir_path + 'Parkinson_1rna_filtered.xlsx', index=False)
print(filter_one_df.shape)
print(filter_one_df.head())

(14994, 7)
  GeneID Gene_Symbol Human_Symbol  Score (T vs N)  Score (500nM vs T)  \
0   None        SNCA         None        3.052472            0.138576   
1   None       ALDOC         None        1.655778           -0.133590   
2   None        BDNF         None       -2.310176            1.141288   
3   None        SCG2         None       -2.239593            1.033283   
4   None    SERPINE1         None       -2.451058            0.221729   

   Score (10uM vs T)  P-value (T vs N)  
0           0.323060      3.169900e-65  
1           0.233202      6.806014e-28  
2          -0.549896      2.636830e-27  
3          -0.607833      5.272165e-27  
4          -0.598311      1.887658e-24  


In [66]:
# Load and filter data
filter_two_df = convert_mouse_symbol_to_human_id(filter_one_df, 'Data/H_sapiens/gene_names/mouse_id_map.xlsx')
print(filter_two_df.shape)
print(filter_two_df.head())

(14994, 7)
  GeneID Gene_Symbol Human_Symbol  Score (T vs N)  Score (500nM vs T)  \
0   None        SNCA         None        3.052472            0.138576   
1   None       ALDOC         None        1.655778           -0.133590   
2   None        BDNF         None       -2.310176            1.141288   
3   None        SCG2         None       -2.239593            1.033283   
4   None    SERPINE1         None       -2.451058            0.221729   

   Score (10uM vs T)  P-value (T vs N)  
0           0.323060      3.169900e-65  
1           0.233202      6.806014e-28  
2          -0.549896      2.636830e-27  
3          -0.607833      5.272165e-27  
4          -0.598311      1.887658e-24  


In [67]:
# print how many rows have missing gene IDs
print (filter_one_df['Human_Symbol'].isna().sum())
filter_three_df = convert_symbol_to_id(filter_one_df, 'Data/H_sapiens/gene_names/H_sapiens.gene_info')
print (filter_three_df['Human_Symbol'].isna().sum())
filter_three_df.to_excel(dir_path + 'Parkinson_2idmap_converted.xlsx', index=False)

14810
1229


In [68]:
# Convert mouse gene symbols to human gene IDs
filter_four_df = convert_mouse_symb_human_id(filter_three_df)
filter_four_df.to_excel(dir_path + 'Parkinson_3mg_converted.xlsx', index=False)
# print how many rows have missing gene IDs
print (filter_four_df['GeneID'].isna().sum())
filter_four_df = filter_four_df.dropna(subset=['GeneID'])

Number of values being sent to MyGeneInfo: 1229


73 input query terms found dup hits:	[('GM33869', 2), ('GM39469', 2), ('GM2102', 2), ('GM3636', 2), ('GM5454', 2), ('PPP1CCB', 2), ('ZFP9
107 input query terms found no hit:	['METTL7A1', 'GM2115', 'LOC114841036', 'CHIL1', 'FCRLS', 'TMEM250-PS', 'D830031N03RIK', 'LOC11856824


No human homologene found for SCD2
No human homologene found for CYP2J9
No human homologene found for D430019H16RIK
No human homologene found for ARXES1
No human homologene found for NPCD
No human homologene found for ALDOART1
No human homologene found for ZFP871
No human homologene found for RPS3A1
No human homologene found for ZFP954
No human homologene found for TNFRSF26
No human homologene found for ZFP932
No human homologene found for ARF2
No human homologene found for SLFN2
No human homologene found for ZFP955B
No human homologene found for GM3667
No human homologene found for ZFP933
No human homologene found for SAA3
No human homologene found for CCL27A
No human homologene found for LY6A
No human homologene found for SERPINB6B
No human homologene found for SERPINB9B
No human homologene found for ZFP882
No human homologene found for GM2102
No human homologene found for CES2E
No human homologene found for GM3636
No human homologene found for TRIM30A
No human homologene found for A

In [69]:
filter_five_df = convert_human_id_human_symb(filter_four_df)
post_dict_df = filter_five_df.drop_duplicates(subset=['GeneID'])

69 input query terms found dup hits:	[('3105', 6), ('54578', 2), ('4017', 2), ('57016', 2), ('56171', 3), ('3159', 2), ('3133', 2), ('235


In [70]:
print(post_dict_df[post_dict_df['Gene_Symbol'] != post_dict_df['Human_Symbol']])

       GeneID Gene_Symbol Human_Symbol  Score (T vs N)  Score (500nM vs T)  \
91       7018         TRF           TF       -1.103645           -0.105890   
108    160622       GRASP      TAMALIN       -1.260488            0.552426   
134      9368    SLC9A3R1       NHERF1        0.952768           -0.104327   
143      9444          QK          QKI        0.730468           -0.032190   
172      3105       H2-K1        HLA-A       -0.895425            0.348087   
...       ...         ...          ...             ...                 ...   
27166    6476         SIS           SI        0.019836            0.593135   
27184    2086        ERV3       ERV3-1        0.015481            0.455169   
27200   55311      ZFP444       ZNF444       -0.001548           -0.090584   
27258  440957       SMIM4        UQCC5       -0.002533           -0.413108   
27314  157773    AI429214      C8ORF48        0.001308           -0.233816   

       Score (10uM vs T)  P-value (T vs N)  
91             -0.

In [71]:
# Create a new DataFrame to avoid SettingWithCopyWarning
post_dict_df = post_dict_df.copy()

# Convert 'GeneID' to integer
post_dict_df['GeneID'] = post_dict_df['GeneID'].astype(int)

# drop gene_symbol column
post_dict_df = post_dict_df.drop(columns=['Gene_Symbol'])
# Save to Excel
post_dict_df.to_excel(dir_path + 'Parkinson_4finished.xlsx', index=False)