## Medical data analysis using graphs

In [None]:
from main import *

# Raise the pandas display limits so wide/long tables are shown without truncation.
for _option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_option, 150)

# Identifier of the study analysed by the rest of the notebook.
studyId = 'nsclc_ctdx_msk_2022'

#### Getting data from local files (downloaded from https://www.cbioportal.org/)

Patient data

In [None]:
# Fetch the patient table for the selected study and render it in the notebook.
# NOTE(review): get_PatientData comes from `main`; presumably it reads the
# locally downloaded cBioPortal files for `studyId` - confirm against main.py.
patient_data = get_PatientData(studyId)
display(patient_data)

Sample data

In [None]:
# Fetch the sample table for the selected study and render it in the notebook.
# NOTE(review): get_SampleData comes from `main`; presumably it reads the
# locally downloaded cBioPortal files for `studyId` - confirm against main.py.
sample_data = get_SampleData(studyId)
display(sample_data)

Mutation data

In [None]:
# Fetch the mutation table for the selected study and render it in the notebook.
# NOTE(review): get_MutationData comes from `main`; presumably it reads the
# locally downloaded cBioPortal files for `studyId` - confirm against main.py.
mutation_data = get_MutationData(studyId)
display(mutation_data)

Full data (by merging sample and mutation data)

In [None]:
# Combine the sample and mutation tables into a single table and render it.
# NOTE(review): the join keys and join type are decided inside get_FullData
# (defined in `main`) - check there before relying on row counts.
full_data = get_FullData(sample_data, mutation_data)
display(full_data)

#### Building the green graph and working on it

In [None]:
# Build the two graphs used by the rest of the notebook.
# dip_graph is only referenced by the disabled disease/mutation cell;
# pm_graph is the graph the clustering cells work on (patient nodes whose
# neighbours are treated as mutations).
# NOTE(review): exact node/edge semantics are defined by build_DiPGraph and
# build_PMGraph in `main` - confirm there.
dip_graph = build_DiPGraph(full_data)
pm_graph = build_PMGraph(full_data, patient_data, sample_data)

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so none of it
# is executed. When enabled it would look up the degree of the disease node in
# dip_graph, pass it to getMutations_fromDisease, and display the captioned result.
'''
disease = 'Non-Small Cell Lung Cancer'
mcount = dip_graph.degree(disease)
dis_mutations = getMutations_fromDisease(dip_graph, pm_graph, disease, mcount)
dis_mutations = dis_mutations.style.set_caption(f'{disease} - {mcount} patients')
display(dis_mutations)
'''

#### Clustering

In [None]:
# Node attributes to show for each patient in the per-cluster tables.
# They are not passed to clustering(); only the display cell reads them.
cluster_attributes = ['CANCER_TYPE_DETAILED', 'OS_MONTHS', 'OS_STATUS', 'SMOKING_STATUS', 'SMOKING_HISTORY', 'DRIVERS_MUTATIONS', 'TARGET_THERAPY']
threshold = 1  # set to a decimal number between 0 and 1 (inclusive)
# `clusters` is used below as a mapping: cluster number -> sequence of patients.
clusters = clustering(pm_graph, threshold)

Display general cluster view

In [None]:
# Overview of the clustering: one bar group per cluster showing how many
# patients it contains and how many mutations its first patient has.
# Nothing is plotted when there are no clusters.
if clusters:
    cluster_view = {'Cluster': [], 'Numero pazienti': [], 'Numero mutazioni': []}
    for n, patients in clusters.items():
        # The first patient of the cluster is taken as its representative;
        # its neighbours in pm_graph are counted as the cluster's mutations.
        leader = patients[0]
        leader_mutations = list(pm_graph.neighbors(leader))
        cluster_view['Cluster'].append(n)
        cluster_view['Numero pazienti'].append(len(patients))
        cluster_view['Numero mutazioni'].append(len(leader_mutations))
    cluster_view_df = pd.DataFrame(cluster_view)

    pcount = len(get_PNodes(pm_graph))
    # round(), not int(): binary floating point makes e.g. 0.29 * 100 evaluate
    # to 28.999999999999996, which int() would truncate to 28 in the title.
    similarity_pct = round(threshold * 100)
    cluster_view_df.plot.bar(x='Cluster', rot=0, title=f'Clustering di {pcount} pazienti - Percentuale di similaritá = {similarity_pct}%', figsize=(20, 5))
    #display(cluster_view_df)

Display all clusters (or a specific cluster)

In [None]:
# Build one DataFrame per cluster: one row per patient, with a PATIENT column
# plus one column for every entry of cluster_attributes found on pm_graph.
cluster_dfs = {}
cluster_model = {'PATIENT': []}
for attr in cluster_attributes:
    # nx.get_node_attributes returns an empty (falsy) dict when no node carries
    # the attribute, so attributes absent from the graph get no column.
    if nx.get_node_attributes(pm_graph, attr):
        cluster_model[attr] = []

for n, patients in clusters.items():
    cluster_data = {'PATIENT': list(patients)}
    for attr in cluster_model:
        if attr == 'PATIENT':
            continue
        # The check above only proves that *some* node has the attribute, so
        # use .get(): a patient lacking it yields an empty (None) cell instead
        # of aborting the whole cell with a KeyError.
        cluster_data[attr] = [pm_graph.nodes[p].get(attr) for p in patients]
    cluster_df = pd.DataFrame(cluster_data)
    cluster_dfs[n] = cluster_df

show = -1    # set to the number of the cluster to inspect, or to -1 to show all of them
for n, cluster in cluster_dfs.items():
    if show != -1 and n != show:
        continue
    cluster_df = cluster.style.set_caption(f'Cluster {n}')
    # The first patient of the cluster is taken as its representative; its
    # neighbours in pm_graph are listed as the cluster's mutations.
    leader = clusters[n][0]
    leader_mutations = list(pm_graph.neighbors(leader))
    cluster_mutations = {'MUTATION': leader_mutations}
    cluster_mutations_df = pd.DataFrame(cluster_mutations)
    display(cluster_df)
    display(cluster_mutations_df)