In [1]:
#importing libraries
import json
import pandas as pd
import numpy as np
import traceback

In [2]:
#show dataset summary
def show_dataset_summary(file_name):
    """Print how many conditions, patients, therapies and trials a dataset holds.

    Args:
        file_name: Path to a UTF-8 encoded JSON file. The function reads its
            'Conditions', 'Patients' and 'Therapies' entries, plus the
            'trials' entry of every patient.

    Returns:
        None. The summary is written to standard output.
    """
    with open(file_name, encoding="utf8") as handle:
        dataset = json.load(handle)

    # Collect every figure before printing so that a missing key fails
    # before any part of the report has been emitted.
    condition_count = len(dataset['Conditions'])
    patient_count = len(dataset['Patients'])
    therapy_count = len(dataset['Therapies'])
    trial_count = sum(len(person['trials']) for person in dataset['Patients'])

    report_lines = (
        ('Total Condition : ', condition_count),
        ('Total Patient : ', patient_count),
        ('Total Therapy : ', therapy_count),
        ('Total Trial : ', trial_count),
    )

    print('//-----------Dataset Summary------------')
    for label, value in report_lines:
        print(label, value)
    print('----------- Dataset Summary-----------//')

In [3]:
def get_condition_id(dataset, patient_condition_id):
    """Return the condition kind recorded for one patient-condition entry.

    Args:
        dataset: Path to the JSON dataset file (UTF-8 encoded).
        patient_condition_id: Value of the ``id`` field of an entry inside
            some patient's ``conditions`` list.

    Returns:
        The ``kind`` value of the first condition entry whose ``id`` equals
        ``patient_condition_id``.

    Raises:
        IndexError: If no patient has a condition entry with that id.
    """
    # Read as UTF-8. The "unicode_escape" codec used previously rewrote
    # JSON's own backslash escapes (for example \") before the parser saw
    # them and mis-decoded non-ASCII text, so valid datasets could fail to
    # load or load with garbled strings.
    with open(dataset, encoding="utf8") as f:
        data = json.load(f)

    # One row per condition entry. Record fields get a '_' prefix so the
    # condition's own id ('_id') does not clash with the patient id ('id').
    df_conditions = pd.json_normalize(data['Patients'], "conditions", "id", errors='ignore', record_prefix='_')

    if '_id' in df_conditions.columns:
        row_cond = df_conditions.loc[df_conditions['_id'] == patient_condition_id]
    else:
        # No patient carries any condition entry, so nothing can match.
        row_cond = df_conditions.iloc[:0]

    if row_cond.empty:
        raise IndexError(
            'No patient condition with id {!r} found in {}'.format(patient_condition_id, dataset)
        )

    condition_id = row_cond['_kind'].values[0]

    return condition_id

In [4]:
def get_similar_neighbour(CTI_matrix, n):
    """Rank, for every row, the column labels by descending cell value.

    Args:
        CTI_matrix: DataFrame of scores. Missing scores (NaN) are ranked
            last, which is the default of ``Series.sort_values``.
        n: How many top-ranked column labels to keep per row. A value
            larger than the number of columns is clamped to that number
            instead of failing with a length-mismatch error.

    Returns:
        DataFrame indexed like ``CTI_matrix`` whose columns ``top_1`` ..
        ``top_k`` (k = min(n, number of columns)) hold column labels of
        ``CTI_matrix``, best first.
    """
    # Clamp so the label list always matches what a row can supply.
    keep = max(0, min(n, CTI_matrix.shape[1]))
    rank_labels = ['top_{}'.format(i) for i in range(1, keep + 1)]

    def _top_columns(row):
        # Same ordering call as before so tie-breaking is unchanged.
        ranked = row.sort_values(ascending=False)
        return pd.Series(ranked.index[:keep], index=rank_labels)

    similar_neighbour = CTI_matrix.apply(_top_columns, axis=1)
    return similar_neighbour

In [5]:
def get_top_therapy(dataset, condition_id, count):
    """Return the best-performing therapies for one condition kind.

    The dataset's trials are joined to the patient condition each trial
    refers to, a condition-by-therapy matrix of mean ``successful`` values
    is built, and the ``count`` highest-scoring therapies of the requested
    condition are reported.

    Args:
        dataset: Path to the JSON dataset file (UTF-8 encoded).
        condition_id: Condition kind to rank therapies for. When ``None``
            the function returns ``None`` without reading the dataset.
        count: Number of top therapies requested.

    Returns:
        DataFrame with columns ``Therapy`` (therapy label) and ``Success``
        (mean ``successful`` value over the trials of that therapy for the
        given condition), best first. It may hold fewer than ``count`` rows
        when fewer therapies have trials for the condition.

    Raises:
        KeyError: If the condition kind does not occur in any trial.
    """
    if condition_id is None:
        return None

    with open(dataset, encoding="utf8") as f:
        data = json.load(f)

    # One row per trial / per condition entry; record fields are prefixed
    # with '_' so they do not clash with the patient-level 'id' column.
    trials_data = pd.json_normalize(data['Patients'], "trials", ["id", "name"], errors='ignore', record_prefix='_')
    conditions_data = pd.json_normalize(data['Patients'], "conditions", ["id"], errors='ignore', record_prefix='_')

    # Attach to every trial the condition entry of the same patient.
    merge_trials_with_conditions = pd.merge(trials_data, conditions_data, how='left', left_on=['_condition', 'id'], right_on=['_id', 'id'])

    # Condition-therapy interaction matrix (mean success per cell).
    therapy_matrix_CTI = merge_trials_with_conditions.pivot_table(index='_kind', columns='_therapy', values='_successful')

    similar_neighbour = get_similar_neighbour(therapy_matrix_CTI, count)

    condition_trials = merge_trials_with_conditions[merge_trials_with_conditions['_kind'] == condition_id]
    success_by_therapy = condition_trials.groupby('_therapy')['_successful'].mean()

    neighbours_data = similar_neighbour.loc[condition_id, :]

    # Collect rows in a list and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and the bare ``except`` around it used to hide
    # that failure and return an empty frame.
    records = []
    for therapy in neighbours_data:
        if therapy not in success_by_therapy.index:
            # Untried therapies are NaN in the matrix and therefore ranked
            # last, so nothing useful follows the first one.
            break
        records.append({'Therapy': therapy, 'Success': success_by_therapy.loc[therapy]})

    top_therapy_data = pd.DataFrame(records, columns=['Therapy', 'Success'])
    return top_therapy_data

In [6]:
def therapy_recommendation(patient_id, condition_id, dataset, min_trials=200, top_n=10):
    """Display therapies whose outcomes correlate with a condition's top therapy.

    The best therapy for ``condition_id`` is looked up with
    ``get_top_therapy``. Every therapy's per-patient success column is then
    correlated with that therapy's column, and the most correlated
    therapies are displayed together with their names.

    Args:
        patient_id: Identifier of the patient the recommendation is for.
            NOTE(review): currently unused, so the result depends only on
            ``condition_id``.
        condition_id: Condition kind to recommend for. When ``None`` the
            function does nothing.
        dataset: Path to the JSON dataset file (UTF-8 encoded).
        min_trials: A therapy is considered only if it has strictly more
            than this many trials with a recorded outcome. Defaults to the
            previously hard-coded 200.
        top_n: Maximum number of therapies displayed. Defaults to the
            previously hard-coded 10.

    Returns:
        None. The result table is rendered with ``display``.

    Raises:
        KeyError: If no top therapy can be determined for the condition.
    """
    if condition_id is None:
        return

    with open(dataset, encoding="utf8") as f:
        data = json.load(f)

    # One row per trial / per condition entry; record fields are prefixed
    # with '_' so they do not clash with the patient-level 'id' column.
    trials_data = pd.json_normalize(data['Patients'], "trials", ["id", "name"], errors='ignore', record_prefix='_')
    conditions_data = pd.json_normalize(data['Patients'], "conditions", ["id"], errors='ignore', record_prefix='_')

    merge_trials_with_conditions = pd.merge(trials_data, conditions_data, how='left', left_on=['_condition', 'id'], right_on=['_id', 'id'])
    merged_data = merge_trials_with_conditions[['id', '_kind', '_therapy', '_successful']]

    # Number of trials with a recorded outcome per therapy. The column name
    # is kept for output compatibility even though it counts all such
    # trials, not only the successful ones.
    trial_counts = (
        merged_data.groupby('_therapy')['_successful']
        .count()
        .rename('number_of_successful_trials')
    )

    # Patient-therapy interaction matrix (mean success per cell).
    therapy_matrix_PTI = merged_data.pivot_table(index='id', columns='_therapy', values='_successful')

    top_therapy = get_top_therapy(dataset, condition_id, 1)
    if top_therapy is None or top_therapy.empty:
        raise KeyError('No therapy with recorded trials for condition {!r}'.format(condition_id))
    reference_therapy = top_therapy['Therapy'].iloc[0]

    # Correlate every therapy column with the reference therapy's column;
    # therapies without enough overlap yield NaN and are dropped.
    correlation_with_specific_therapy = (
        therapy_matrix_PTI.corrwith(therapy_matrix_PTI[reference_therapy])
        .to_frame(name='Correlation')
        .dropna()
        .join(trial_counts)
    )

    has_enough_trials = correlation_with_specific_therapy['number_of_successful_trials'] > min_trials
    recommended_therapy = (
        correlation_with_specific_therapy[has_enough_trials]
        .sort_values(by='Correlation', ascending=False)
        .head(top_n)
    )

    # Attach therapy names; '_therapy' is the index level of the left frame.
    df_therapies = pd.json_normalize(data['Therapies'], errors='ignore', record_prefix='_')
    merge_recommended_therapy = pd.merge(recommended_therapy, df_therapies, how='left', left_on=['_therapy'], right_on=['id'])
    columns = ['id', 'name', 'Correlation', 'number_of_successful_trials']

    display(merge_recommended_therapy[columns].head(top_n))
        
    

In [7]:
#defining main function
def main(dataset='C:\\Users\\HP\\Downloads\\data-mining\\datasetB_sample.json',
         test_cases='C:\\Users\\HP\\Downloads\\data-mining\\datasetB_cases_sample.txt'):
    """Run a therapy recommendation for every case listed in a test-case file.

    The first line of ``test_cases`` is treated as a header and skipped.
    Every following line is expected to hold a patient id and a
    patient-condition id separated by whitespace.

    Args:
        dataset: Path to the JSON dataset. Defaults to the sample dataset;
            pass the path of ``datasetB.json`` to run on the full data.
        test_cases: Path to the test-case text file. Defaults to the sample
            cases; pass the path of ``datasetB_cases.txt`` for the full run.

    Returns:
        None. Results and errors are written to the cell output.
    """
    with open(test_cases) as cases:
        # Skip the header line; the default keeps an empty file from
        # raising StopIteration.
        next(cases, None)
        for row in cases:
            row = row.strip()
            if not row:
                # Blank lines (e.g. a trailing newline) carry no case.
                continue

            fields = row.split(None, 1)
            if len(fields) < 2:
                print('Skipping malformed test case line: ', row)
                continue
            patient_id, patient_condition_id = fields

            try:
                condition_id = get_condition_id(dataset, patient_condition_id)
                therapy_recommendation(patient_id, condition_id, dataset)
            except Exception:
                # Keep processing the remaining cases, but let
                # KeyboardInterrupt / SystemExit propagate.
                print(traceback.format_exc())
                print('Data not found for the condition')
            
# Entry point: run the batch of test cases as soon as this cell executes.
main()

Unnamed: 0,id,name,Correlation,number_of_successful_trials
0,Th51,water cure (therapy),1.0,2039
1,Th37,phytotherapy,0.074726,2027
2,Th50,Waon therapy,0.063485,2030
3,Th24,intravenous immunoglobulin,0.062794,1999
4,Th20,phytotherapy,0.058191,2019
5,Th10,curative therapy,0.056998,1957
6,Th25,investigational therapy,0.052194,2064
7,Th35,neurologic music therapy,0.044043,2104
8,Th41,protein therapy,0.044039,2018
9,Th28,low level laser therapy,0.04233,2016


Unnamed: 0,id,name,Correlation,number_of_successful_trials
0,Th47,stepdown therapy,1.0,2034
1,Th49,systemic therapy,0.059311,2027
2,Th14,electromagnetic therapy (alternative medicine),0.059136,1922
3,Th32,molecular chaperone therapy,0.05588,1967
4,Th44,sound therapy,0.054425,1987
5,Th37,phytotherapy,0.047375,2027
6,Th40,prophylactic therapy,0.046073,1969
7,Th2,antibody therapy,0.043365,2055
8,Th39,preventive therapy,0.041584,2105
9,Th19,herbal therapy,0.03816,1920
