In [201]:
import pandas as pd
import mygene
import json

In [202]:
def load_and_filter_data(file_path):
    """Load an Excel table and keep only rows with a usable gene symbol.

    A row is kept when its 'Gbkey' value equals 'Gene | mRNA' and its
    'Gene_Symbol' is a string whose first character is a letter. Rows whose
    symbol is missing, empty, or not a string are dropped instead of making
    the boolean filter fail.

    Args:
        file_path: Location of the workbook, passed straight to
            ``pd.read_excel``.

    Returns:
        A new DataFrame with 'Gene_Symbol' upper-cased and the 'Gbkey'
        column removed. The original row index is retained.
    """
    df = pd.read_excel(file_path)
    df = df[df['Gbkey'] == 'Gene | mRNA']

    # The .str accessors yield NaN for missing, empty, or non-string cells.
    # Comparing with True turns those NaN entries into False, so the mask is
    # strictly boolean and safe to index with.
    starts_with_letter = df['Gene_Symbol'].str[0].str.isalpha().eq(True)

    # Explicit copy: the column assignment below must not target a slice of
    # the frame that was read from disk.
    df = df[starts_with_letter].copy()
    df['Gene_Symbol'] = df['Gene_Symbol'].str.upper()
    return df.drop(columns=['Gbkey'])

In [203]:
def convert_mouse_symb_human_id(mouse_df):
    """Add a 'GeneID' column holding the human homolog of each mouse symbol.

    Every distinct value of 'Gene_Symbol' is sent once to
    ``mygene.MyGeneInfo().querymany`` (symbol scope, mouse species). For each
    hit that carries a 'homologene' record, the gene id paired with taxonomy
    id 9606 (human) is taken as the human GeneID.

    Args:
        mouse_df: DataFrame with a 'Gene_Symbol' column. It is not modified.

    Returns:
        A copy of ``mouse_df`` with an object-dtype 'GeneID' column: the
        human gene id where one was found, ``None`` otherwise. Symbols
        without a hit, without a homologene record, or without a human entry
        are reported on stdout.
    """
    human_taxid = 9606  # NCBI taxonomy id for Homo sapiens

    mg = mygene.MyGeneInfo()

    # Work on a copy so the caller's frame is left untouched.
    result_df = mouse_df.copy()

    # Each distinct symbol only needs to be queried once.
    symbols = result_df['Gene_Symbol'].drop_duplicates().tolist()
    query_results = mg.querymany(symbols, scopes='symbol', species='mouse', fields='name,symbol,homologene', returnall=True)

    # Collect query -> human id first and write the column once at the end,
    # instead of scanning the whole column for every single hit.
    human_id_by_symbol = {}
    for result in query_results['out']:
        if 'notfound' in result or 'homologene' not in result:
            print(f"No data found for {result.get('query')}")
            continue

        # Key on the submitted query: it is exactly the value stored in
        # 'Gene_Symbol'. Fall back to the returned symbol if it is absent.
        gene_symbol = result.get('query') or result.get('symbol', '').upper()

        # The code treats each 'genes' entry as a (taxonomy id, gene id) pair.
        human_gene_id = next((gene[1] for gene in result['homologene'].get('genes', []) if gene[0] == human_taxid), None)
        if human_gene_id:
            # With several hits for one query, the last one with a human id wins.
            human_id_by_symbol[gene_symbol] = human_gene_id
        else:
            print(f"No human homologene found for {gene_symbol}")

    # Build the column as object dtype so integer ids are not upcast to float
    # by the None placeholders.
    result_df['GeneID'] = pd.Series(
        [human_id_by_symbol.get(symbol) for symbol in result_df['Gene_Symbol']],
        index=result_df.index,
        dtype=object,
    )
    return result_df

In [204]:
def convert_human_id_human_symb(df):
    """Add a 'Human_Symbol' column by looking up each 'GeneID'.

    Every distinct, non-missing GeneID is sent once to
    ``mygene.MyGeneInfo().querymany`` (entrezgene scope, human species) and
    the returned symbol is stored upper-cased.

    Args:
        df: DataFrame with a 'GeneID' column. It is not modified.

    Returns:
        A copy of ``df`` in which 'GeneID' holds string ids (missing values
        stay ``None``) and 'Human_Symbol' holds the upper-cased symbol, or
        ``None`` where the id was missing or not found. Ids that were not
        found are reported on stdout.
    """
    mg = mygene.MyGeneInfo()

    def _as_query_id(value):
        # Missing ids stay missing rather than becoming the text 'nan'/'None'.
        if pd.isna(value):
            return None
        # An id that was upcast to float (e.g. 6622.0) must be sent as '6622'.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    # Work on a copy so the caller's frame is left untouched.
    mouse_df = df.copy()

    gene_ids = [_as_query_id(value) for value in mouse_df['GeneID']]
    mouse_df['GeneID'] = pd.Series(gene_ids, index=mouse_df.index, dtype=object)

    # Query each distinct id once, preserving first-seen order.
    unique_ids = list(dict.fromkeys(gene_id for gene_id in gene_ids if gene_id is not None))

    symbol_by_id = {}
    if unique_ids:
        # Query mygene.info for human symbols
        query_results = mg.querymany(unique_ids, scopes='entrezgene', species='human', fields='symbol', returnall=True)
        for result in query_results['out']:
            if 'notfound' in result:
                print(f"No data found for {result.get('query')}")
                continue
            # With several hits for one id, the last one wins.
            symbol_by_id[result.get('query')] = result.get('symbol', '').upper()

    # Write the column once instead of masking the frame for every hit.
    mouse_df['Human_Symbol'] = pd.Series(
        [symbol_by_id.get(gene_id) for gene_id in gene_ids],
        index=mouse_df.index,
        dtype=object,
    )
    return mouse_df


In [205]:
def load_idmap(mouse_df, file_path):
    """Fill missing 'GeneID' values from a JSON symbol-to-id mapping.

    Args:
        mouse_df: DataFrame with 'Gene_Symbol' and 'GeneID' columns. Its
            'GeneID' column is replaced in place.
        file_path: Path to a JSON file that decodes to a mapping keyed by
            gene symbol.

    Returns:
        The same ``mouse_df`` object. Rows that already had a GeneID keep
        it; rows without one receive the mapped value when their symbol is
        a key of the mapping, and stay missing otherwise.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(file_path, 'r', encoding='utf-8') as file:
        gene_dict = json.load(file)

    def _fill_missing(row):
        current = row['GeneID']
        if not pd.isna(current):
            return current
        # Fall back to the (missing) current value when the symbol is unmapped.
        return gene_dict.get(row['Gene_Symbol'], current)

    mouse_df['GeneID'] = mouse_df.apply(_fill_missing, axis=1)
    return mouse_df

In [206]:
# Usage
# Stage 1: read the raw workbook, apply the row filter, and save the result.
dir_path = 'Inputs/experiments_data/Parkinson/'
data_file_path = dir_path + 'Parkinson_0prefilter.xlsx'
filtered_file_path = dir_path + 'Parkinson_1filtered.xlsx'

temp_data_file = load_and_filter_data(data_file_path)
temp_data_file.to_excel(filtered_file_path, index=False)

# Show (rows, columns) and then the first rows of the filtered table.
print(temp_data_file.shape, temp_data_file.head(), sep='\n')

(14994, 5)
  Gene_Symbol  Score (T vs N)  Score (500nM vs T)  Score (10uM vs T)  \
0        SNCA        3.052472            0.138576           0.323060   
1       ALDOC        1.655778           -0.133590           0.233202   
2        BDNF       -2.310176            1.141288          -0.549896   
3        SCG2       -2.239593            1.033283          -0.607833   
4    SERPINE1       -2.451058            0.221729          -0.598311   

   P-value (T vs N)  
0      3.169900e-65  
1      6.806014e-28  
2      2.636830e-27  
3      5.272165e-27  
4      1.887658e-24  


In [207]:
# Stage 2: attach a human GeneID to each mouse gene symbol and save the table.
mouse_to_human_df = convert_mouse_symb_human_id(temp_data_file)
converted_file_path = dir_path + 'Parkinson_2converted.xlsx'
mouse_to_human_df.to_excel(converted_file_path, index=False)

# Show (rows, columns) and then the first rows of the converted table.
print(mouse_to_human_df.shape, mouse_to_human_df.head(), sep='\n')

87 input query terms found dup hits:	[('DDIT3', 2), ('GM33869', 2), ('GM39469', 2), ('GM2102', 2), ('GM3636', 2), ('CDR1', 2), ('GM5454',
287 input query terms found no hit:	['GRASP', 'SLC9A3R1', 'QK', 'NARS', 'GARS', 'SARS', 'FAM126A', 'CARS', 'YARS', 'GRAMD3', 'PNMAL2', '


No human homologene found for NEFM
No data found for GRASP
No data found for DDIT3
No data found for SLC9A3R1
No human homologene found for TPM4
No data found for QK
No human homologene found for SCD2
No human homologene found for CYP2J9
No human homologene found for TPM1
No data found for NARS
No human homologene found for D430019H16RIK
No data found for GARS
No human homologene found for SIK1
No data found for SARS
No human homologene found for POU3F1
No human homologene found for CALD1
No human homologene found for BEX1
No human homologene found for ARXES1
No data found for CRCT1
No data found for FAM126A
No data found for CARS
No data found for YARS
No data found for GRAMD3
No data found for PNMAL2
No human homologene found for NRXN3
No data found for AKAP17A
No human homologene found for TBC1D30
No human homologene found for NPCD
No human homologene found for PVR
No data found for YJEFN3
No human homologene found for FOLH1
No data found for SOGA1
No human homologene found for ALDO

In [208]:
# Number of genes without a human GeneID before the fallback lookup.
print (mouse_to_human_df['GeneID'].isna().sum())
# Post-process DataFrame
# load_idmap rewrites the 'GeneID' column of the frame it receives, so hand it
# a copy: mouse_to_human_df stays as produced by the previous cell and this
# cell prints the same two numbers every time it is re-run.
human_to_human_df = load_idmap(mouse_to_human_df.copy(), 'Data/H_sapiens/gene_names/H_sapiens.gene_info')
# Number of genes still without a human GeneID after the fallback lookup.
print (human_to_human_df['GeneID'].isna().sum())

1227
838


In [209]:
# Stage 4: discard rows that still have no GeneID, then look up the human
# symbol for the remaining ids.
rows_with_gene_id = human_to_human_df.dropna(subset=['GeneID'])
human_to_human_df = convert_human_id_human_symb(rows_with_gene_id)

# Several rows can share one GeneID; keep only the first row for each id.
post_dict_df = human_to_human_df.drop_duplicates(subset=['GeneID'])

68 input query terms found dup hits:	[('3105', 6), ('54578', 2), ('57016', 2), ('55859', 2), ('797', 2), ('56171', 3), ('3159', 2), ('313


In [210]:
# Put the identifier columns first, followed by the score and p-value columns.
column_order = ['GeneID', 'Gene_Symbol','Human_Symbol', 'Score (T vs N)', 'Score (500nM vs T)', 'Score (10uM vs T)', 'P-value (T vs N)']
post_dict_df = post_dict_df[column_order]

# Show the rows whose 'Gene_Symbol' is not equal to their 'Human_Symbol'.
symbol_differs = post_dict_df['Gene_Symbol'] != post_dict_df['Human_Symbol']
print(post_dict_df[symbol_differs])

       GeneID Gene_Symbol Human_Symbol  Score (T vs N)  Score (500nM vs T)  \
9        6372       CXCL5        CXCL6        2.693580           -0.328975   
36       2949       GSTM1        GSTM5        0.983349           -0.000898   
91       7018         TRF           TF       -1.103645           -0.105890   
172      3105       H2-K1        HLA-A       -0.895425            0.348087   
234     54578     UGT1A6A       UGT1A6        1.120641            0.246288   
...       ...         ...          ...             ...                 ...   
27166    6476         SIS           SI        0.019836            0.593135   
27189   57055        DAZL         DAZ2       -0.006210           -0.632314   
27200   55311      ZFP444       ZNF444       -0.001548           -0.090584   
27203    8364       H4C14         H4C3        0.013046            0.526665   
27314  157773    AI429214      C8ORF48        0.001308           -0.233816   

       Score (10uM vs T)  P-value (T vs N)  
9              -0.

In [211]:

# Cast GeneID to int before saving. assign() builds a new frame, so the cast
# never writes into a frame that was derived by selection from another one
# (which is what triggers pandas' SettingWithCopyWarning).
post_dict_df = post_dict_df.assign(GeneID=post_dict_df['GeneID'].astype(int))

# Final output of the pipeline.
post_dict_df.to_excel(dir_path + 'Parkinson_3finished.xlsx', index=False)