In [1]:
#importing libraries
import json
import pandas as pd
import numpy as np
import traceback
import math

In [2]:
#show dataset summary
def show_dataset_summary(file_name):
    """Print how many conditions, patients, therapies and trials a dataset holds.

    Parameters
    ----------
    file_name : str
        Path to a UTF-8 JSON file with top-level 'Conditions', 'Patients'
        and 'Therapies' lists; every patient entry must carry a 'trials' list.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    with open(file_name, encoding="utf8") as handle:
        dataset = json.load(handle)

    condition_count = len(dataset['Conditions'])
    patient_count = len(dataset['Patients'])
    therapy_count = len(dataset['Therapies'])
    # Trials are nested under each patient, so they are summed across patients.
    trial_count = sum(len(entry['trials']) for entry in dataset['Patients'])

    summary_lines = (
        ('Total Condition : ', condition_count),
        ('Total Patient : ', patient_count),
        ('Total Therapy : ', therapy_count),
        ('Total Trial : ', trial_count),
    )

    print('//-----------Dataset Details------------')
    for label, value in summary_lines:
        print(label, value)
    print('----------- Dataset Details------------//')

In [3]:
def get_data_for_evaluation(dataset, n=5, random_state=None):
    """Draw a random sample of patient trials to evaluate the recommender on.

    Every trial is joined (left join) with the patient condition it refers to,
    trials whose condition is explicitly flagged as not cured are discarded,
    and up to ``n`` of the remaining rows are returned in random order.

    Parameters
    ----------
    dataset : str
        Path to a UTF-8 JSON file with a top-level 'Patients' list; each
        patient needs 'id', 'name', 'trials' and 'conditions'.
    n : int, default 5
        Number of rows to draw. Capped at the number of rows available so a
        small dataset yields a smaller sample instead of raising.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``; pass a value for a
        reproducible sample.

    Returns
    -------
    pandas.DataFrame
        Columns 'index' (row label before sampling), 'id', '_isCured',
        '_kind', '_condition', '_therapy' and '_successful'.
    """
    # The file is plain UTF-8 JSON. Decoding it with "unicode_escape" would
    # rewrite JSON escapes such as \" before the parser sees them and would
    # mis-decode non-ASCII characters.
    with open(dataset, encoding="utf8") as f:
        data = json.load(f)

    # One row per trial, carrying the owning patient's id and name.
    trials_data = pd.json_normalize(data['Patients'], "trials", ["id", "name"], errors='ignore', record_prefix='_')

    # One row per patient condition, carrying the owning patient's id.
    conditions_data = pd.json_normalize(data['Patients'], "conditions", ["id"], errors='ignore', record_prefix='_')

    # Attach to each trial the condition it references for the same patient.
    merge_trials_with_conditions = pd.merge(
        trials_data,
        conditions_data,
        how='left',
        left_on=['_condition', 'id'],
        right_on=['_id', 'id'],
    )

    columns = ['id', '_isCured', '_kind', '_condition', '_therapy', '_successful']
    merged_data = merge_trials_with_conditions[columns]

    # Drop trials of conditions flagged as not cured. Rows whose '_isCured' is
    # missing (no matching condition in the left join) also pass this filter.
    merged_data = merged_data[merged_data['_isCured'] != False]

    sample_size = min(n, len(merged_data))
    random_rows = merged_data.sample(n=sample_size, random_state=random_state)

    # reset_index() keeps the pre-sampling row label in an 'index' column.
    return random_rows.reset_index()

    
    

In [4]:
def get_RMSE(actual_ratings, predicted_ratings):
    """Print and return the root mean square error between two rating lists.

    Parameters
    ----------
    actual_ratings : array-like of numbers
        Observed ratings.
    predicted_ratings : array-like of numbers
        Predicted ratings, one per entry of ``actual_ratings``.

    Returns
    -------
    float
        ``sqrt(mean((actual - predicted) ** 2))``. The inputs and the result
        are also printed.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    print(actual_ratings)
    print('----------------')
    print(predicted_ratings)

    actual = np.asarray(actual_ratings, dtype=float)
    predicted = np.asarray(predicted_ratings, dtype=float)
    # Without this check NumPy would broadcast e.g. 5 values against 1 and
    # report an error figure for pairs that were never given.
    if actual.shape != predicted.shape:
        raise ValueError(
            'actual_ratings and predicted_ratings must have the same shape, '
            'got {} and {}'.format(actual.shape, predicted.shape)
        )

    MSE = np.square(actual - predicted).mean()
    RMSE = math.sqrt(MSE)
    print("Root Mean Square Error:\n")
    print(RMSE)
    return RMSE

In [5]:
def get_condition_id(dataset, patient_condition_id):
    """Return the condition kind recorded for a patient-condition id.

    Parameters
    ----------
    dataset : str
        Path to a UTF-8 JSON file with a top-level 'Patients' list whose
        entries carry 'id' and a 'conditions' list.
    patient_condition_id
        Value compared against the 'id' field of the patients' conditions.

    Returns
    -------
    The 'kind' of the first matching condition, or ``None`` when no
    condition has that id.
    """
    # The file is plain UTF-8 JSON. Decoding it with "unicode_escape" would
    # rewrite JSON escapes such as \" before the parser sees them and would
    # mis-decode non-ASCII characters.
    with open(dataset, encoding="utf8") as f:
        data = json.load(f)

    # One row per patient condition, carrying the owning patient's id.
    df_conditions = pd.json_normalize(data['Patients'], "conditions", "id", errors='ignore', record_prefix='_')

    # With no condition records at all the '_id' column does not exist.
    if '_id' not in df_conditions.columns:
        return None

    row_cond = df_conditions.loc[df_conditions['_id'] == patient_condition_id]
    if row_cond.empty:
        return None

    return row_cond['_kind'].values[0]

In [6]:
def get_similar_neighbour(CTI_matrix, n):
    """For each row, list the labels of the columns holding its largest values.

    Parameters
    ----------
    CTI_matrix : pandas.DataFrame
        Matrix whose rows are ranked independently (the callers in this
        notebook pass a condition x therapy table of mean success values).
    n : int
        Number of column labels to keep per row. Capped at the number of
        columns so that asking for more than exist does not raise.

    Returns
    -------
    pandas.DataFrame
        Same row index as ``CTI_matrix``; columns 'top_1' ... 'top_k' hold
        column labels ordered by descending value, missing values last.
    """
    # A row can never yield more labels than the matrix has columns.
    n = min(n, CTI_matrix.shape[1])
    rank_labels = ['top_{}'.format(i) for i in range(1, n + 1)]

    def _top_columns(row):
        # Labels of the n largest entries of this row, best first.
        best = row.sort_values(ascending=False).iloc[:n].index
        return pd.Series(best, index=rank_labels)

    return CTI_matrix.apply(_top_columns, axis=1)

In [7]:
def get_top_therapy(dataset, condition_id, count):
    """Return the therapies with the highest mean success for one condition kind.

    Parameters
    ----------
    dataset : str
        Path to a UTF-8 JSON file with a top-level 'Patients' list; each
        patient needs 'id', 'name', 'trials' and 'conditions'.
    condition_id
        Condition kind (value of the conditions' 'kind' field) to rank
        therapies for. ``None`` yields ``None``.
    count : int
        Number of top therapies requested.

    Returns
    -------
    pandas.DataFrame or None
        Columns 'Therapy' and 'Success' (mean of 'successful' over the trials
        of that condition kind), best therapy first. It has fewer than
        ``count`` rows when fewer therapies were tried for the condition.

    Raises
    ------
    KeyError
        If no trial in the dataset refers to ``condition_id``.
    """
    if condition_id is None:
        return None

    with open(dataset, encoding="utf8") as f:
        data = json.load(f)

    # One row per trial, carrying the owning patient's id and name.
    trials_data = pd.json_normalize(data['Patients'], "trials", ["id", "name"], errors='ignore', record_prefix='_')

    # One row per patient condition, carrying the owning patient's id.
    conditions_data = pd.json_normalize(data['Patients'], "conditions", ["id"], errors='ignore', record_prefix='_')

    # Attach to each trial the condition it references for the same patient.
    merge_trials_with_conditions = pd.merge(
        trials_data,
        conditions_data,
        how='left',
        left_on=['_condition', 'id'],
        right_on=['_id', 'id'],
    )

    # Condition x therapy interaction matrix of mean success.
    therapy_matrix_CTI = merge_trials_with_conditions.pivot_table(index='_kind', columns='_therapy', values='_successful')

    # Per condition kind, the therapies ranked by mean success.
    similar_neighbour = get_similar_neighbour(therapy_matrix_CTI, count)

    # Mean success per therapy, restricted to trials of the requested kind.
    condition_trials = merge_trials_with_conditions[merge_trials_with_conditions['_kind'] == condition_id]
    mean_success = condition_trials.groupby('_therapy')['_successful'].mean()

    neighbours_data = similar_neighbour.loc[condition_id, :]

    # Collect the rows first and build the frame once: DataFrame.append no
    # longer exists in pandas 2.x, and the previous bare `except` hid that
    # failure by returning an empty frame.
    records = []
    for therapy in neighbours_data:
        # The ranking puts therapies without data for this condition last, so
        # the first therapy with no recorded trials ends the list.
        if therapy not in mean_success.index:
            break
        records.append({'Therapy': therapy, 'Success': mean_success.loc[therapy]})

    return pd.DataFrame(records, columns=['Therapy', 'Success'])

In [8]:
def therapy_recommendation(patient_id, condition_id, dataset, min_trials=100, top_k=10, sample_size=5, random_state=None):
    """Recommend therapies whose per-patient success correlates with the top therapy.

    The best therapy for ``condition_id`` is looked up with
    ``get_top_therapy``; every other therapy is then scored by the correlation
    of its per-patient success with that therapy's per-patient success.

    Parameters
    ----------
    patient_id
        Accepted for interface compatibility; the computation does not use it.
    condition_id
        Condition kind to recommend for. ``None`` yields ``None``.
    dataset : str
        Path to a UTF-8 JSON file with top-level 'Patients' and 'Therapies'.
    min_trials : int, default 100
        A therapy is only considered when it has more than this many trials
        with a recorded success value.
    top_k : int, default 10
        Number of best-correlated therapies kept before sampling.
    sample_size : int, default 5
        Number of rows drawn at random from those ``top_k`` therapies. Capped
        at the number of rows available instead of raising.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for a reproducible draw.

    Returns
    -------
    pandas.DataFrame or None
        Columns 'index', 'id', 'name', 'Correlation' and
        'number_of_successful_therapy' (the count of trials with a success
        value for that therapy).

    Raises
    ------
    KeyError
        If no top therapy can be determined for ``condition_id``.
    """
    if condition_id is None:
        return None

    with open(dataset, encoding="utf8") as f:
        data = json.load(f)

    # One row per trial, carrying the owning patient's id and name.
    trials_data = pd.json_normalize(data['Patients'], "trials", ["id", "name"], errors='ignore', record_prefix='_')

    # One row per patient condition, carrying the owning patient's id.
    conditions_data = pd.json_normalize(data['Patients'], "conditions", ["id"], errors='ignore', record_prefix='_')

    # Attach to each trial the condition it references for the same patient.
    merge_trials_with_conditions = pd.merge(
        trials_data,
        conditions_data,
        how='left',
        left_on=['_condition', 'id'],
        right_on=['_id', 'id'],
    )
    merged_data = merge_trials_with_conditions[['id', '_kind', '_therapy', '_successful']]

    # Mean success per therapy plus the number of trials backing that mean.
    success_by_therapy = merged_data.groupby('_therapy')['_successful']
    therapy_success_rate = pd.DataFrame(success_by_therapy.mean())
    therapy_success_rate['number_of_successful_therapy'] = success_by_therapy.count()

    # Patient x therapy interaction matrix of mean success.
    therapy_matrix_PTI = merged_data.pivot_table(index='id', columns='_therapy', values='_successful')

    # Best therapy for the requested condition kind.
    top_therapy = get_top_therapy(dataset, condition_id, 1)
    if top_therapy is None or top_therapy.empty:
        raise KeyError('no top therapy found for condition {!r}'.format(condition_id))
    reference_therapy = top_therapy['Therapy'].iloc[0]
    success_rate_for_a_specific_therapy = therapy_matrix_PTI[reference_therapy]

    # Correlate every therapy's per-patient success with the reference one.
    # The reference therapy correlates perfectly with itself, so it can appear
    # among the results.
    similar_to_specific_therapy = therapy_matrix_PTI.corrwith(success_rate_for_a_specific_therapy)
    correlation_with_specific_therapy = pd.DataFrame(similar_to_specific_therapy, columns=['Correlation']).dropna()

    # Attach the trial counts so thinly supported therapies can be dropped.
    correlation_with_specific_therapy = correlation_with_specific_therapy.join(
        therapy_success_rate['number_of_successful_therapy']
    )

    well_supported = correlation_with_specific_therapy['number_of_successful_therapy'] > min_trials
    recommended_therapy = (
        correlation_with_specific_therapy[well_supported]
        .sort_values(by='Correlation', ascending=False)
        .head(top_k)
    )

    # Look up the therapy names; '_therapy' is the index name of the left frame.
    df_therapies = pd.json_normalize(data['Therapies'], errors='ignore', record_prefix='_')
    merge_recommended_therapy = pd.merge(recommended_therapy, df_therapies, how='left', left_on=['_therapy'], right_on=['id'])
    named_therapies = merge_recommended_therapy[['id', 'name', 'Correlation', 'number_of_successful_therapy']]

    draw = min(sample_size, len(named_therapies))
    mrt_data = named_therapies.sample(n=draw, random_state=random_state)
    return mrt_data.reset_index()
        
    

In [9]:
#defining main function
def main(dataset='C:\\Users\\HP\\Downloads\\data-mining\\datasetB.json',
         test_cases='C:\\Users\\HP\\Downloads\\data-mining\\datasetB_cases.txt'):
    """Sample evaluation trials, run the recommender for each and print an RMSE.

    Parameters
    ----------
    dataset : str
        Path to the JSON dataset. The default is the author's local copy;
        pass another path (for example the smaller 'datasetB_sample.json')
        to run elsewhere.
    test_cases : str
        Path to the test-case file. It is accepted but not read by this
        function.

    Returns
    -------
    None
        Results are printed; a failure for one condition is reported with its
        traceback and the loop continues with the next one.
    """
    evaluation_data = get_data_for_evaluation(dataset)
    condition_ids = evaluation_data['_kind'].tolist()
    patient_ids = evaluation_data['id'].tolist()
    actual_rating = evaluation_data['_successful'].tolist()

    for patient_id, condition_id in zip(patient_ids, condition_ids):
        try:
            print('condition Id: ', condition_id)
            recommended_data = therapy_recommendation(patient_id, condition_id, dataset)
            # Correlations are scaled by 100 to sit on the same scale as the
            # success values they are compared with.
            predicted_rating = list(recommended_data['Correlation'].values * 100)
            # NOTE(review): every iteration compares the same `actual_rating`
            # list (one value per sampled trial) with the correlations of
            # randomly drawn therapies for a single condition, so predictions
            # are not paired with their own trial - confirm this is the
            # intended metric.
            get_RMSE(actual_rating, predicted_rating)
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C and SystemExit
            # still stop the run.
            print(traceback.format_exc())
            print('Data not found for the condition')


main()

ccccc Cond54
[2, 53, 100, 10, 100]
----------------
[2.5068757573875233, 1.6538884470100983, 1.5954847578955191, 1.5400465820196605, 2.390836651379563]
Root Mean Square Error:

66.21064611696694
ccccc Cond127
[2, 53, 100, 10, 100]
----------------
[2.787308934071889, 2.9641772806087587, 1.8441074001670092, 2.6597809950038314, 100.0]
Root Mean Square Error:

49.38149750194961
ccccc Cond241
[2, 53, 100, 10, 100]
----------------
[2.014433494256443, 4.756483226799876, 99.99999999999997, 2.7808909277337963, 2.446479235367691]
Root Mean Square Error:

48.777540053480806
ccccc Cond244
[2, 53, 100, 10, 100]
----------------
[2.3536095999689333, 2.046879962931134, 2.7094877365858006, 2.5703895399119365, 2.4121686346749813]
Root Mean Square Error:

65.78810409171928
ccccc Cond36
[2, 53, 100, 10, 100]
----------------
[2.1491412702536454, 2.6976712222357393, 2.5703895399119365, 2.446479235367691, 100.0]
Root Mean Square Error:

49.15268292561662
