In [1]:
#importing libraries
import json
import pandas as pd
import numpy as np
import traceback

In [2]:
#show dataset summary
def show_dataset_summary(file_name):
    """Print headline counts for a JSON therapy dataset.

    Reads ``file_name`` as UTF-8 JSON and prints the number of entries under
    the top-level ``Conditions``, ``Patients`` and ``Therapies`` keys, plus
    the total number of trials summed over every patient's ``trials`` list.

    Args:
        file_name: Path to the JSON dataset file.

    Returns:
        None. The summary is written to standard output.
    """
    with open(file_name, encoding="utf8") as handle:
        dataset = json.load(handle)

    # All figures are gathered before anything is printed, so a missing key
    # fails without leaving a half-written summary.
    summary_rows = [
        ('Total Condition : ', len(dataset['Conditions'])),
        ('Total Patient : ', len(dataset['Patients'])),
        ('Total Therapy : ', len(dataset['Therapies'])),
        ('Total Trial : ', sum(len(person['trials']) for person in dataset['Patients'])),
    ]

    banner = '-----------Dataset Summary-----------'
    print(banner)
    for label, value in summary_rows:
        print(label, value)
    print(banner)

In [3]:
def get_condition_id(dataset, patient_condition_id):
    """Return the condition kind recorded for one patient-condition entry.

    The dataset's ``Patients`` list is flattened so that every element of a
    patient's ``conditions`` list becomes one row (record fields prefixed with
    ``_``), and the row whose ``_id`` equals ``patient_condition_id`` is
    looked up.

    Args:
        dataset: Path to the JSON dataset file.
        patient_condition_id: Identifier of the patient-condition record
            (the ``id`` field inside a patient's ``conditions`` list).

    Returns:
        The ``kind`` value of the first matching condition record.

    Raises:
        IndexError: If no condition record has the requested id.
    """
    # Read as UTF-8, like every other reader in this file. The previous
    # "unicode_escape" codec decodes the bytes as Latin-1 and expands
    # backslash escapes before the JSON parser sees them, which corrupts
    # non-ASCII text and breaks any string containing \" or \n.
    with open(dataset, encoding="utf8") as f:
        data = json.load(f)

    # One row per patient condition; columns: _id, _kind, ... plus patient id.
    df_conditions = pd.json_normalize(data['Patients'], "conditions", "id", errors='ignore', record_prefix='_')
    matching_kinds = df_conditions.loc[df_conditions['_id'] == patient_condition_id, '_kind']

    if matching_kinds.empty:
        # Same exception type the old `.values[0]` lookup produced, but with
        # a message that says what was actually missing.
        raise IndexError('no patient condition with id {!r} in {}'.format(patient_condition_id, dataset))

    return matching_kinds.iloc[0]

In [4]:
def get_similar_neighbour(CTI_matrix, n):
    """Rank, for every row, the column labels holding the largest values.

    Args:
        CTI_matrix: DataFrame whose rows are ranked independently (the caller
            passes a condition x therapy matrix of mean trial outcomes).
        n: Number of top columns wanted per row. Values larger than the
            number of columns are clamped to the number of columns.

    Returns:
        DataFrame with the same index as ``CTI_matrix`` and columns
        ``top_1`` ... ``top_k`` (``k = min(n, number of columns)``), each cell
        holding a column label of ``CTI_matrix``. Values are sorted in
        descending order with NaN placed last, so labels of missing cells can
        still appear in the trailing positions of a row.
    """
    # A row cannot yield more labels than there are columns; without the clamp
    # the label list is longer than the ranked index and pandas raises
    # ValueError.
    top_n = min(n, CTI_matrix.shape[1])
    rank_labels = ['top_{}'.format(i) for i in range(1, top_n + 1)]

    def _rank_row(row):
        # Column labels of this row, best value first.
        best_first = row.sort_values(ascending=False).iloc[:top_n].index
        return pd.Series(best_first, index=rank_labels)

    return CTI_matrix.apply(_rank_row, axis=1)

In [5]:
def get_top_therapy(dataset, condition_id, count):
    """List the best-performing therapies for one condition kind.

    Every trial in the dataset is joined to the patient condition it was run
    for, a condition-kind x therapy matrix of mean ``successful`` values is
    built, and the ``count`` highest-valued therapies of the requested
    condition's row are returned with their mean outcome.

    Args:
        dataset: Path to the JSON dataset file.
        condition_id: Condition kind to rank therapies for. When ``None``
            nothing is ranked and ``None`` is returned.
        count: Maximum number of therapies to return.

    Returns:
        DataFrame with columns ``Therapy`` and ``Success``, best therapy
        first, or ``None`` when ``condition_id`` is ``None``. The frame has
        fewer than ``count`` rows (possibly zero) when the condition has
        trials for fewer therapies.

    Raises:
        KeyError: If ``condition_id`` does not occur among the trial data.
    """
    with open(dataset, encoding="utf8") as f:
        data = json.load(f)

    # One row per trial, carrying the owning patient's id and name.
    trials_data = pd.json_normalize(data['Patients'], "trials", ["id", "name"], errors='ignore', record_prefix='_')

    # One row per patient condition, carrying the owning patient's id.
    conditions_data = pd.json_normalize(data['Patients'], "conditions", ["id"], errors='ignore', record_prefix='_')

    # Attach the condition kind to each trial (matched on patient id plus
    # patient-condition id).
    merge_trials_with_conditions = pd.merge(trials_data, conditions_data, how='left',
                                            left_on=['_condition', 'id'], right_on=['_id', 'id'])

    # Condition-therapy interaction matrix: mean outcome per (kind, therapy).
    therapy_matrix_CTI = merge_trials_with_conditions.pivot_table(index='_kind', columns='_therapy', values='_successful')

    # Per condition kind, the therapy labels with the highest mean outcome.
    similar_neighbour = get_similar_neighbour(therapy_matrix_CTI, count)

    if condition_id is None:
        return None

    condition_trials = merge_trials_with_conditions[merge_trials_with_conditions['_kind'] == condition_id]
    success_by_therapy = condition_trials.groupby('_therapy')['_successful'].mean()

    neighbours_data = similar_neighbour.loc[condition_id, :]

    # Collect rows in a list and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and the old bare `except` hid that failure by
    # silently returning an empty result.
    ranked_rows = []
    for therapy in neighbours_data:
        if therapy not in success_by_therapy.index:
            # The ranking puts missing (NaN) cells last, so the first therapy
            # without trials for this condition ends the useful part.
            break
        ranked_rows.append({'Therapy': therapy, 'Success': success_by_therapy.loc[therapy]})

    return pd.DataFrame(ranked_rows, columns=['Therapy', 'Success'])

In [6]:
def therapy_recommendation(patient_id, condition_id, patient_condition_id, dataset, min_trials=1800):
    """Recommend therapies for a patient condition and save/display the top five.

    The best therapy for ``condition_id`` (as ranked by ``get_top_therapy``)
    is used as a seed. Every therapy's per-patient mean outcome is correlated
    with the seed's across patients, therapies with too few trials are
    dropped, and the five most correlated therapies are written to
    ``result_<patient_id>_<patient_condition_id>.csv`` in the working
    directory and shown with ``display``. The seed correlates perfectly with
    itself, so it is normally the first row.

    Args:
        patient_id: Patient identifier (string); used only in the CSV name.
        condition_id: Condition kind to recommend for. When ``None`` nothing
            is recommended, written or displayed.
        patient_condition_id: Patient-condition identifier (string); used
            only in the CSV name.
        dataset: Path to the JSON dataset file.
        min_trials: A therapy is kept only if it has strictly more than this
            many recorded trial outcomes. Defaults to 1800, the value that
            was previously hard-coded.

    Returns:
        None.

    Raises:
        KeyError: If no therapy could be ranked for ``condition_id``.
    """
    with open(dataset, encoding="utf8") as f:
        data = json.load(f)

    # One row per trial, carrying the owning patient's id and name.
    trials_data = pd.json_normalize(data['Patients'], "trials", ["id", "name"], errors='ignore', record_prefix='_')

    # One row per patient condition, carrying the owning patient's id.
    conditions_data = pd.json_normalize(data['Patients'], "conditions", ["id"], errors='ignore', record_prefix='_')

    # Attach the condition kind to each trial.
    merge_trials_with_conditions = pd.merge(trials_data, conditions_data, how='left',
                                            left_on=['_condition', 'id'], right_on=['_id', 'id'])
    merged_data = merge_trials_with_conditions[['id', '_kind', '_therapy', '_successful']]

    # Number of non-null outcomes per therapy. The column name is kept for
    # output compatibility even though it counts all trials, not only
    # successful ones.
    trial_counts = merged_data.groupby('_therapy')['_successful'].count().rename('number_of_successful_trials')

    # Patient-therapy interaction matrix: mean outcome per (patient, therapy).
    therapy_matrix_PTI = merged_data.pivot_table(index='id', columns='_therapy', values='_successful')

    if condition_id is None:
        return

    top_therapy = get_top_therapy(dataset, condition_id, 5)
    if top_therapy.empty:
        # Previously this surfaced as a bare `KeyError: 0` from indexing an
        # empty column; same exception type, clearer message.
        raise KeyError('no ranked therapy available for condition {!r}'.format(condition_id))
    seed_therapy = top_therapy['Therapy'].iloc[0]

    # Correlate every therapy column with the seed therapy across patients.
    correlation_with_seed = (
        therapy_matrix_PTI.corrwith(therapy_matrix_PTI[seed_therapy])
        .to_frame('Correlation')
        .dropna()
        .join(trial_counts)
    )

    # Drop sparsely-trialled therapies, then rank by similarity to the seed.
    well_sampled = correlation_with_seed['number_of_successful_trials'] > min_trials
    recommended_therapy = correlation_with_seed[well_sampled].sort_values(by='Correlation', ascending=False)

    # Attach the human-readable therapy name.
    df_therapies = pd.json_normalize(data['Therapies'], errors='ignore', record_prefix='_')
    merge_recommended_therapy = pd.merge(recommended_therapy, df_therapies, how='left',
                                         left_on=['_therapy'], right_on=['id'])

    top_rows = merge_recommended_therapy[['id', 'name', 'Correlation', 'number_of_successful_trials']].head(5)
    top_rows.to_csv(r'result_' + patient_id + '_' + patient_condition_id + '.csv', index=False)
    display(top_rows)
        
    

In [7]:
#defining main function
def main(dataset='C:\\Users\\HP\\Downloads\\data-mining\\datasetB.json',
         test_cases='C:\\Users\\HP\\Downloads\\data-mining\\datasetB_cases.txt'):
    """Produce a therapy recommendation for every case in a test-case file.

    The first line of ``test_cases`` is treated as a header and skipped.
    Every following line is expected to hold a patient id and a
    patient-condition id separated by whitespace. For each case the condition
    kind is resolved and ``therapy_recommendation`` is run; a failure in one
    case is reported and does not stop the remaining cases.

    Args:
        dataset: Path to the JSON dataset. Defaults to the original
            hard-coded location; pass another path (for example a smaller
            sample file for debugging) to override it.
        test_cases: Path to the whitespace-separated test-case file.

    Returns:
        None.
    """
    # show_dataset_summary(dataset) can be called here for a quick overview.

    with open(test_cases) as cases:
        # Skip the header line; the default keeps an empty file from raising
        # StopIteration.
        next(cases, None)

        for row in cases:
            fields = row.strip().split(None, 1)
            if not fields:
                # Blank line (e.g. trailing newline at end of file).
                continue
            if len(fields) < 2:
                # Used to raise IndexError outside the try block and abort
                # the whole run.
                print('Skipping malformed test case line : ', row.strip())
                continue
            patient_id, patient_condition_id = fields

            try:
                condition_id = get_condition_id(dataset, patient_condition_id)
                print('Recommedation for PatientID x PatientConID : ', patient_id, 'x', patient_condition_id)
                therapy_recommendation(patient_id, condition_id, patient_condition_id, dataset)
            except Exception:
                # `Exception` rather than a bare except, so Ctrl-C /
                # kernel interrupt can still stop a long batch.
                print(traceback.format_exc())
                print('Data not found for the condition')
                
            
# Entry point: run the batch of test cases only when this code is executed
# as the top-level program, not when it is imported as a module.
if __name__ == "__main__":
    main()

Recommedation for PatientID x PatientConID :  6 x pc32


Unnamed: 0,id,name,Correlation,number_of_successful_trials
0,Th24,intravenous immunoglobulin,1.0,18333
1,Th41,protein therapy,0.035117,18343
2,Th51,water cure (therapy),0.034352,18237
3,Th1,abortive therapy,0.026512,18592
4,Th21,hippotherapy,0.022125,18499


Recommedation for PatientID x PatientConID :  51345 x pc277636


Unnamed: 0,id,name,Correlation,number_of_successful_trials
0,Th24,intravenous immunoglobulin,1.0,18333
1,Th41,protein therapy,0.035117,18343
2,Th51,water cure (therapy),0.034352,18237
3,Th1,abortive therapy,0.026512,18592
4,Th21,hippotherapy,0.022125,18499


Recommedation for PatientID x PatientConID :  82486 x pc445475


Unnamed: 0,id,name,Correlation,number_of_successful_trials
0,Th50,Waon therapy,1.0,18350
1,Th51,water cure (therapy),0.028041,18237
2,Th2,antibody therapy,0.025607,18714
3,Th23,induction therapy,0.021529,18364
4,Th42,rehydration therapy,0.018754,18405


Recommedation for PatientID x PatientConID :  51348 x pc277652


Unnamed: 0,id,name,Correlation,number_of_successful_trials
0,Th21,hippotherapy,1.0,18499
1,Th11,definitive therapy,0.03915,18523
2,Th18,gold standard therapy,0.034954,18510
3,Th27,leech therapy,0.028497,18640
4,Th24,intravenous immunoglobulin,0.022125,18333


Recommedation for PatientID x PatientConID :  51358 x pc277696


Unnamed: 0,id,name,Correlation,number_of_successful_trials
0,Th19,herbal therapy,1.0,18202
1,Th32,molecular chaperone therapy,0.026473,18298
2,Th49,systemic therapy,0.02646,18469
3,Th2,antibody therapy,0.02555,18714
4,Th36,neutron therapy,0.018441,18161


Recommedation for PatientID x PatientConID :  51362 x pc277711


Unnamed: 0,id,name,Correlation,number_of_successful_trials
0,Th17,exercise therapy,1.0,18657
1,Th12,dietary therapy,0.032738,18485
2,Th44,sound therapy,0.027581,18305
3,Th25,investigational therapy,0.027095,18404
4,Th22,immunosuppressive therapy,0.025704,18623


Recommedation for PatientID x PatientConID :  51366 x pc277723


Unnamed: 0,id,name,Correlation,number_of_successful_trials
0,Th13,drug therapy,1.0,18438
1,Th1,abortive therapy,0.025253,18592
2,Th34,mud therapy,0.023146,18637
3,Th40,prophylactic therapy,0.022294,18292
4,Th45,speech therapy,0.017089,18423


Recommedation for PatientID x PatientConID :  51387 x pc277825


Unnamed: 0,id,name,Correlation,number_of_successful_trials
0,Th29,magnetic resonance therapy,1.0,18305
1,Th4,aurotherapy,0.052433,18323
2,Th2,antibody therapy,0.029202,18714
3,Th14,electromagnetic therapy (alternative medicine),0.018239,18109
4,Th50,Waon therapy,0.018039,18350


Recommedation for PatientID x PatientConID :  51416 x pc277986


Unnamed: 0,id,name,Correlation,number_of_successful_trials
0,Th30,medical nutrition therapy,1.0,18291
1,Th9,crystal therapy,0.026575,18354
2,Th36,neutron therapy,0.024541,18161
3,Th31,medical therapy,0.021384,18319
4,Th6,chrysotherapy,0.01754,18476


Recommedation for PatientID x PatientConID :  51453 x pc278191


Unnamed: 0,id,name,Correlation,number_of_successful_trials
0,Th27,leech therapy,1.0,18640
1,Th10,curative therapy,0.03725,18579
2,Th44,sound therapy,0.032689,18305
3,Th21,hippotherapy,0.028497,18499
4,Th11,definitive therapy,0.021212,18523
