### Aggregation analysis for diseases

In [11]:
import requests
import pandas as pd
import math

# Diseases (EFO terms) whose ChEMBL drug evidence is downloaded and aggregated below.
diseases = [
    {
        'efo_id': 'EFO_0000400',
        'efo_label': 'diabetes mellitus'
    },
    {
        'efo_id': 'EFO_0000305',
        'efo_label': 'breast carcinoma'
    },
    {
        'efo_id': 'EFO_0003060',
        'efo_label': 'non small cell lung carcinoma'
    },
    {
        'efo_id': 'EFO_0003843',
        'efo_label': 'pain'
    },
    # Currently disabled entry, kept for reference:
#     {
#         'efo_id': 'EFO_0000616',
#         'efo_label': 'neoplasm',
#     },
    {
        'efo_id': 'EFO_0000270',
        'efo_label': 'asthma'
    },
    {
        'efo_id': 'EFO_0003086',
        'efo_label': 'kidney disease'
    },
    {
        'efo_id': 'EFO_0003885',
        'efo_label': 'multiple sclerosis'
    },
    {
        'efo_id': 'EFO_0003767',
        'efo_label': 'inflammatory bowel disease'
    },
    {
        'efo_id': 'EFO_0000341',
        'efo_label': 'chronic obstructive pulmonary disease'
    }
]

# Page size requested from the API (sent verbatim as the `size` query parameter).
size = '10000'

EVIDENCE_FILTER_URL = 'https://platform-api.opentargets.io/v3/platform/public/evidence/filter'

# Columns of the flattened evidence table, and the two groupings compared below.
EVIDENCE_COLUMNS = ['target', 'disease', 'drug', 'phase', 'status']
GROUP_COLS_EXCL_TARGETS = ['disease', 'drug', 'phase', 'status']
GROUP_COLS_INCL_TARGETS = GROUP_COLS_EXCL_TARGETS + ['target']


def fetch_chembl_evidence_for_disease(efo_id, efo_label):
    """Download all ChEMBL evidence strings for one disease, following pagination.

    Args:
        efo_id: EFO identifier sent as the `disease` filter.
        efo_label: Human-readable label, used only in progress messages.

    Returns:
        A `(total, entries)` tuple: `total` is the `total` field of the first
        response and `entries` is the concatenation of every page's `data` list.

    Raises:
        requests.HTTPError: if any request returns an error status.
        KeyError: if a response lacks the `total` or `data` fields.
    """
    query = {
        'size': size,
        'datasource': 'chembl',
        'fields': ['disease.efo_info', 'drug', 'evidence', 'target', 'access_level'],
        'disease': efo_id,
        'expandefo': 'true',
    }

    response = requests.get(EVIDENCE_FILTER_URL, params=query)
    response.raise_for_status()
    page = response.json()

    # Remember the total from the first page; later pages overwrite `page`.
    total = page['total']
    entries = list(page['data'])

    # Keep paging until everything is retrieved. Stop early if the API returns an
    # empty page or no `next` token, so a stale token is never re-sent (which
    # would re-download the same page and add duplicate rows).
    while len(entries) < total and page['data'] and 'next' in page:
        next_params = [page['next'][0], page['next'][1]]
        print('Running additional API call for %s with these parameters - %s' % (efo_label, next_params))
        response = requests.get(
            EVIDENCE_FILTER_URL,
            params=dict(query, next=[str(param) for param in next_params]),
        )
        response.raise_for_status()
        page = response.json()
        entries.extend(page['data'])

    return total, entries


def count_evidence_groups(evidence_df, group_cols):
    """Count evidence strings per unique combination of `group_cols`, largest first."""
    return (
        evidence_df
        .groupby(group_cols)
        .size()
        .reset_index(name='num_ev_strings')
        .sort_values(by='num_ev_strings', ascending=False)
    )


summary_report_data = []

for disease in diseases:

    print('*****')

    print('Running aggregation for %s' % (disease['efo_label']))

    total_evidence_strings, all_drugs_data = fetch_chembl_evidence_for_disease(
        disease['efo_id'], disease['efo_label']
    )

    print('Total number of evidence strings retrieved from API = %s' % len(all_drugs_data))

    # Flatten each nested evidence string into the handful of fields we aggregate on.
    drugs_data_simple = [
        {
            'target': entry['target']['gene_info']['symbol'],
            'disease': entry['disease']['efo_info']['label'],
            'drug': entry['drug']['molecule_name'],
            'phase': entry['evidence']['drug2clinic']['clinical_trial_phase']['numeric_index'],
            # Not every evidence string carries a trial status.
            'status': entry['evidence']['drug2clinic'].get('status', 'n/a'),
        }
        for entry in all_drugs_data
    ]

    # Explicit columns so that an empty API result still has the grouping columns.
    drugs_df = pd.DataFrame(drugs_data_simple, columns=EVIDENCE_COLUMNS)

    aggregated_drugs_df_excl_targets = count_evidence_groups(drugs_df, GROUP_COLS_EXCL_TARGETS)

    print('Number of evidence strings after aggregation EXCLUDING targets = %s' % len(aggregated_drugs_df_excl_targets))

    aggregated_drugs_df_incl_targets = count_evidence_groups(drugs_df, GROUP_COLS_INCL_TARGETS)

    print('Number of evidence strings after aggregation INCLUDING targets = %s' % len(aggregated_drugs_df_incl_targets))

    summary_report_row = {
        'disease': disease['efo_label'],
        'efo_id': disease['efo_id'],
        'evidence_strings_all': total_evidence_strings,
        'evidence_strings_aggregated_excluding_targets': len(aggregated_drugs_df_excl_targets),
        'evidence_strings_aggregated_including_targets': len(aggregated_drugs_df_incl_targets),
        'number_of_drugs': aggregated_drugs_df_excl_targets['drug'].nunique()
    }

    summary_report_data.append(summary_report_row)

summary_report_df = pd.DataFrame(summary_report_data)

# NOTE: the `reports/` directory must already exist; `to_csv` does not create it.
summary_report_file_name = 'reports/summary_report_all_diseases.csv'

summary_report_df.to_csv(summary_report_file_name, index=False, encoding='utf-8')
        

*****
Running aggregation for diabetes mellitus
Running additional API call for diabetes mellitus with these parameters - [1.0, '8b3bdcaf4d3c0fe9e1737f194bd78702']
Running additional API call for diabetes mellitus with these parameters - [1.0, '15a5fccbf741d28a4302ffd5a2169736']
Running additional API call for diabetes mellitus with these parameters - [0.7, '697f740c02d03d266e6e1b895679e326']
Running additional API call for diabetes mellitus with these parameters - [0.2, '266fd9b25118880cf1f374b101340ed2']
Total number of evidence strings retrieved from API = 46053
Number of evidence strings after aggregation EXCLUDING targets = 1315
Number of evidence strings after aggregation INCLUDING targets = 4942
*****
Running aggregation for breast carcinoma
Running additional API call for breast carcinoma with these parameters - [0.2, '798a436e4ff7b70f2ba9f58abfbccc6d']
Total number of evidence strings retrieved from API = 19440
Number of evidence strings after aggregation EXCLUDING targets = 1

### Aggregation analysis for targets

In [46]:
import requests
import pandas as pd
import math

# Targets (Ensembl gene IDs) whose ChEMBL drug evidence is downloaded and aggregated below.
targets = [
    {
        'ensembl_id': 'ENSG00000113580',
        'symbol': 'NR3C1'
    },
    {
        'ensembl_id': 'ENSG00000146648',
        'symbol': 'EGFR'
    },
    {
        'ensembl_id': 'ENSG00000073756',
        'symbol': 'PTGS2'
    },
    {
        'ensembl_id': 'ENSG00000095303',
        'symbol': 'PTGS1'
    },
    {
        'ensembl_id': 'ENSG00000196230',
        'symbol': 'TUBB'
    },
    {
        'ensembl_id': 'ENSG00000131747',
        'symbol': 'TOP2A'
    },
    {
        'ensembl_id': 'ENSG00000091831',
        'symbol': 'ESR1'
    },
]

# Page size requested from the API (sent verbatim as the `size` query parameter).
size = '10000'

EVIDENCE_FILTER_URL = 'https://platform-api.opentargets.io/v3/platform/public/evidence/filter'

# Columns of the flattened evidence table; also the grouping key for aggregation.
TARGET_EVIDENCE_COLUMNS = ['target', 'disease', 'drug', 'phase', 'status']


def fetch_chembl_evidence_for_target(ensembl_id, symbol):
    """Download all ChEMBL evidence strings for one target, following pagination.

    Args:
        ensembl_id: Ensembl gene identifier sent as the `target` filter.
        symbol: Gene symbol, used only in progress messages.

    Returns:
        A `(total, entries)` tuple: `total` is the `total` field of the first
        response and `entries` is the concatenation of every page's `data` list.

    Raises:
        requests.HTTPError: if any request returns an error status.
        KeyError: if a response lacks the `total` or `data` fields.
    """
    query = {
        'size': size,
        'datasource': 'chembl',
        'fields': ['disease.efo_info', 'drug', 'evidence', 'target', 'access_level'],
        # Every page must be filtered by the target. The previous code switched to
        # a leftover `disease` variable for the follow-up pages, which mixed in
        # unrelated evidence (or raised NameError on a fresh kernel).
        'target': ensembl_id,
        'expandefo': 'true',
    }

    response = requests.get(EVIDENCE_FILTER_URL, params=query)
    response.raise_for_status()
    page = response.json()

    # Remember the total from the first page; later pages overwrite `page`.
    total = page['total']
    entries = list(page['data'])

    # Keep paging until everything is retrieved. Stop early if the API returns an
    # empty page or no `next` token, so a stale token is never re-sent.
    while len(entries) < total and page['data'] and 'next' in page:
        next_params = [page['next'][0], page['next'][1]]
        print('Running additional API call for %s' % (symbol))
        print(next_params)
        response = requests.get(
            EVIDENCE_FILTER_URL,
            params=dict(query, next=[str(param) for param in next_params]),
        )
        response.raise_for_status()
        page = response.json()
        entries.extend(page['data'])

    return total, entries


summary_report_data = []

for target in targets:

    print('*****')
    print('Running aggregation for %s' % (target['symbol']))

    total_evidence_strings, all_drugs_data = fetch_chembl_evidence_for_target(
        target['ensembl_id'], target['symbol']
    )

    print('Total number of evidence strings from API: %s' % total_evidence_strings)

    # Flatten each nested evidence string into the handful of fields we aggregate on.
    drugs_data_simple = [
        {
            'target': entry['target']['gene_info']['symbol'],
            'disease': entry['disease']['efo_info']['label'],
            'drug': entry['drug']['molecule_name'],
            'phase': entry['evidence']['drug2clinic']['clinical_trial_phase']['numeric_index'],
            # Not every evidence string carries a trial status.
            'status': entry['evidence']['drug2clinic'].get('status', 'n/a'),
        }
        for entry in all_drugs_data
    ]

    print('Number of evidence strings in list with simple data: %s' % len(drugs_data_simple))

    # Show one example record as a sanity check (skipped when nothing came back).
    if drugs_data_simple:
        print(drugs_data_simple[0])

    # Explicit columns so that an empty API result still has the grouping columns.
    drugs_df = pd.DataFrame(drugs_data_simple, columns=TARGET_EVIDENCE_COLUMNS)

    cols_to_aggregate = ['disease', 'drug', 'phase', 'status', 'target']

    aggregated_drugs_df = (
        drugs_df
        .groupby(cols_to_aggregate)
        .size()
        .reset_index(name='num_ev_strings')
        .sort_values(by='num_ev_strings', ascending=False)
    )

    print('Number of evidence strings aggregated: %s' % len(aggregated_drugs_df))

    # NOTE: the `reports/` directory must already exist; `to_csv` does not create it.
    file_name = 'reports/aggregated_drug_evidence_' + target['symbol'] + '.csv'

    aggregated_drugs_df.to_csv(file_name, index=False, encoding='utf-8')

    # Relative change in row count caused by aggregation (negative = fewer rows).
    # Undefined when no evidence was retrieved, so report NaN instead of dividing by zero.
    if all_drugs_data:
        percentage_difference = round(((len(aggregated_drugs_df) - len(all_drugs_data)) / len(all_drugs_data) * 100), 2)
    else:
        percentage_difference = float('nan')

    summary_report_row = {
        'symbol': target['symbol'],
        'ensembl_id': target['ensembl_id'],
        'evidence_strings_all': total_evidence_strings,
        'evidence_strings_api_response': len(all_drugs_data),
        'evidence_strings_aggregated': len(aggregated_drugs_df),
        'percentage_difference': percentage_difference
    }

    summary_report_data.append(summary_report_row)

summary_report_df = pd.DataFrame(summary_report_data)

summary_report_file_name = 'reports/summary_report_all_targets.csv'

summary_report_df.to_csv(summary_report_file_name, index=False, encoding='utf-8')
        

*****
Running aggregation for NR3C1
Total number of evidence strings from API: 6696
Number of evidence strings in list with simple data: 6696
{'target': 'NR3C1', 'disease': 'rheumatoid arthritis', 'drug': 'PREDNISONE', 'phase': 4, 'status': 'n/a'}
Number of evidence strings aggregated: 2743
*****
Running aggregation for EGFR
Total number of evidence strings from API: 2626
Number of evidence strings in list with simple data: 2626
{'target': 'EGFR', 'disease': 'metastatic colorectal cancer', 'drug': 'CETUXIMAB', 'phase': 4, 'status': 'Unknown status'}
Number of evidence strings aggregated: 1174
*****
Running aggregation for PTGS2
Total number of evidence strings from API: 5339
Number of evidence strings in list with simple data: 5339
{'target': 'PTGS2', 'disease': 'pain', 'drug': 'ETORICOXIB', 'phase': 4, 'status': 'Completed'}
Number of evidence strings aggregated: 1576
*****
Running aggregation for PTGS1
Total number of evidence strings from API: 4436
Number of evidence strings in list