## Medical data analysis using graphs

In [None]:
from main import *
import dataframe_image as dfi

# Let pandas render up to 150 rows and 150 columns before it truncates a displayed table.
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 150)

# Identifier of the study; passed to the patient/sample/mutation loaders below.
studyId = 'nsclc_ctdx_msk_2022'

#### Getting data from local files (downloaded from https://www.cbioportal.org/)

Patient data

In [None]:
# Load the patient data of the selected study and show it.
patient_data = get_PatientData(studyId)
display(patient_data)

Sample data

In [None]:
# Load the sample data of the selected study and show it.
sample_data = get_SampleData(studyId)
display(sample_data)

Mutation data

In [None]:
# Load the mutation data of the selected study and show it.
mutation_data = get_MutationData(studyId)
display(mutation_data)

Full data (by merging sample and mutation data)

In [None]:
# Combine the sample and mutation data into a single table; the rest of the
# notebook (value counts, graph construction) works on this merged table.
full_data = get_FullData(sample_data, mutation_data)
display(full_data)

In [None]:
# Sanity check: number of rows in the merged data for each distinct dbSNP_RS value.
full_data.value_counts('dbSNP_RS')

In [None]:
# Sanity check: number of rows in the merged data for each distinct PATIENT_ID.
full_data.value_counts('PATIENT_ID')

In [None]:
# Sanity check: number of rows in the merged data for each distinct MUTATION value.
full_data.value_counts('MUTATION')

In [None]:
# Sanity check: number of rows in the merged data for each distinct CANCER_TYPE_DETAILED value.
full_data.value_counts('CANCER_TYPE_DETAILED')

Drugs data (from file geni_farmaci.xls)

In [None]:
# Load the drugs data (the loader takes no study argument) and show it.
drugs_data = get_DrugsData()
display(drugs_data)

#### Building the green graph and working on it

In [None]:
# Build the two graphs used by every later cell.
# NOTE(review): judging by how they are queried below, dip_graph presumably links
# diseases to patients and pm_graph links patients to mutations -- confirm in main.
dip_graph = build_DiPGraph(full_data)
pm_graph = build_PMGraph(full_data, patient_data, sample_data)

In [None]:
# Rank the disease nodes by how many distinct neighbours each one has in
# dip_graph (reported as "Patient count") and show the five largest.
diseases = get_DiNodes(dip_graph)

# Walk the disease nodes once, pairing each with its distinct-neighbour count.
disease_counts = [(di, len(set(dip_graph.neighbors(di)))) for di in diseases]
dd = {
    'Disease': [name for name, _ in disease_counts],
    'Patient count': [count for _, count in disease_counts],
}

# Largest counts first, with a fresh 0..n-1 index after sorting.
ddf = (
    pd.DataFrame(dd)
    .sort_values(by='Patient count', ascending=False)
    .reset_index(drop=True)
)
ddfi = ddf.head(5)
display(ddfi)
# Uncomment to export the table as an image:
#dfi.export(ddfi, f'immagini tesi/{studyId}/malattie.png')
    

In [None]:
# Disease whose mutations are inspected in this cell.
disease = 'Lung Adenocarcinoma'

# Degree of the disease node in dip_graph; printed below as the number of patients.
mcount = dip_graph.degree(disease)
dis_mutations = getMutations_fromDisease(dip_graph, pm_graph, disease, mcount)
print(f'{disease} - {mcount} pazienti')
#dis_mutations = dis_mutations.style.set_caption(f'{disease} - {mcount} patients')

# Only the first ten rows of the result are shown.
df = dis_mutations.head(10)
display(df)
# Uncomment to export the table as an image:
#dfi.export(df, f'immagini tesi/{studyId}/panadenocarcinoma.png')

#### Clustering

In [None]:
# Node attributes looked up for each patient when the per-cluster tables are built.
cluster_attributes = ['CANCER_TYPE_DETAILED', 'SEX', 'OS_STATUS', 'OS_MONTHS']
# Similarity threshold handed to clustering() (labelled as such in the plot title below).
threshold = 1   # set to a decimal number between 0 and 1 (inclusive)
clusters = clustering(pm_graph, threshold)

Display general cluster view

In [None]:
# Overview of the clustering: one row per cluster with its number of patients
# and the number of pm_graph neighbours (mutations) of its first patient.
multi_patient_clusters = len(clusters)
if multi_patient_clusters > 0:
    summary_rows = []
    for n, patients in clusters.items():
        # The first patient of the cluster acts as its representative ("leader").
        leader = patients[0]
        leader_mutations = list(pm_graph.neighbors(leader))
        summary_rows.append((n, len(patients), len(leader_mutations)))

    cluster_view = {
        'Cluster': [row[0] for row in summary_rows],
        'Numero pazienti': [row[1] for row in summary_rows],
        'Numero mutazioni': [row[2] for row in summary_rows],
    }
    cluster_view_df = pd.DataFrame(cluster_view)

    # Patients that appear in no entry of `clusters` are counted as
    # single-patient clusters of their own.
    total_patients = len(get_PNodes(pm_graph))
    cluster_patients = cluster_view_df['Numero pazienti'].sum()
    ones = total_patients - cluster_patients
    total_clusters = len(clusters) + ones

    chart_title = f'Clustering di {total_patients} pazienti - Soglia di similaritá = {threshold}'
    cluster_view_df.plot.bar(x='Cluster', rot=0, title=chart_title, figsize=(10, 4))

    # Keep the printed table short regardless of the notebook-wide row limit.
    with pd.option_context('display.max_rows', 10):
        display(cluster_view_df)
    print(f'{total_patients} pazienti\n{total_clusters} cluster totali: {len(clusters)} cluster con piú di un paziente ({cluster_patients} pazienti), {ones} cluster con un paziente')

Display all clusters (or a specific cluster)

In [None]:
# Column layout shared by every per-cluster table: the patient id first, then
# each requested attribute that at least one node of pm_graph carries.
cluster_model = {'PATIENT' : []}
cluster_model.update(
    (attr, [])
    for attr in cluster_attributes
    if nx.get_node_attributes(pm_graph, attr)
)

# One DataFrame per cluster, keyed by the cluster's key in `clusters`.
cluster_dfs = {}
for n, patients in clusters.items():
    cluster_data = {
        column: (
            list(patients)
            if column == 'PATIENT'
            else [pm_graph.nodes[p][column] for p in patients]
        )
        for column in cluster_model
    }
    cluster_df = pd.DataFrame(cluster_data)
    cluster_dfs[n] = cluster_df

# Uncomment to export the first cluster table as an image:
#dfi.export(cluster_dfs[0], f'immagini tesi/{studyId}/cl0.png')

def show_cluster(number):
    """Display the patient table and the leader's mutations for clusters.

    number: key of the cluster to show, or -1 to show every cluster. A value
    that matches no key of ``cluster_dfs`` displays nothing.
    """
    for n, table in cluster_dfs.items():
        if not (n == number or number == -1):
            continue

        captioned = table.style.set_caption(f'Cluster {n}')

        # The first patient of the cluster is used as its representative;
        # its pm_graph neighbours are listed as the cluster's mutations.
        first_patient = clusters[n][0]
        mutations_table = pd.DataFrame(
            {'MUTATION' : list(pm_graph.neighbors(first_patient))}
        )
        #dfi.export(mutations_table, f'immagini tesi/{studyId}/cl0_mutations.png')

        display(captioned)
        display(mutations_table)

        # Optional per-attribute charts, currently disabled:
        # for attr in cluster_attributes:
        #     if attr == 'OS_MONTHS':
        #         table[attr].plot.bar(xlabel='Patient', rot=0, title=attr)
        #     else:
        #         table[attr].value_counts().plot(kind='pie', autopct='%1.0f%%', title=attr, ylabel='')
        #     plt.show()
        #table[attribute].value_counts().plot(kind='pie', autopct='%1.0f%%', title=attribute)

# -1 shows every cluster; pass a cluster key instead to show just that one.
show_cluster(-1)