# Medical data analysis using graphs

In [None]:
import os.path as op
import numpy as np      # arrays and fast numerical operations
import pandas as pd     # dataframes for the large data collection
import networkx as nx   # efficient, drawable graphs
import matplotlib.pyplot as plt # plotting

# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

# Study identifier; also used as the directory name the data files are read from.
studyId = 'nsclc_ctdx_msk_2022'

##### Patient Data

In [None]:
# Clinical patient table for the selected study. `patientdata_df` starts as an
# empty frame so the name is always defined, even when the file is missing.
path = f'{studyId}/data_clinical_patient.txt'
patientdata_df = pd.DataFrame()
if not op.isfile(path):
    display('No patient data available')
else:
    # Tab-separated file; the first four lines are skipped before the header
    # (presumably metadata rows - confirm against the study files).
    patientdata_df = pd.read_csv(path, sep='\t', skiprows=4)
    display(patientdata_df)

##### Sample Data

In [None]:
# Clinical sample table for the selected study. The fallback frame already has
# a 'SAMPLE_ID' column, which is the key the later merge joins on.
path = f'{studyId}/data_clinical_sample.txt'
sampledata_df = pd.DataFrame(columns=['SAMPLE_ID'])
if not op.isfile(path):
    display('No sample data available')
else:
    # Tab-separated file; the first four lines are skipped before the header
    # (presumably metadata rows - confirm against the study files).
    sampledata_df = pd.read_csv(path, sep='\t', skiprows=4)
    display(sampledata_df)

##### Mutation Data

In [None]:
# Mutation table for the selected study. The fallback frame already has a
# 'Tumor_Sample_Barcode' column, which is the key the later merge joins on.
mutationdata_df = pd.DataFrame(columns=['Tumor_Sample_Barcode'])
path = f'{studyId}/data_mutations.txt'
if op.isfile(path):
    # Read from `path` (the file that was just checked) rather than rebuilding
    # the same string, as the patient and sample cells do. The first two lines
    # are skipped before the header, and fully identical rows are removed.
    mutationdata_df = pd.read_csv(path, sep='\t', skiprows=2).drop_duplicates()
    display(mutationdata_df)
else:
    display('No mutation data available')

In [None]:
# Read the gene list: one entry per line, surrounding whitespace stripped.
# The array is built in a single step because np.append copies the whole array
# on every call, which made the previous line-by-line loop quadratic.
with open("GreenGraph/genes.txt") as file:
    genes = np.array([line.strip() for line in file])

##### Treatment Data

In [None]:
# Treatment timeline for the selected study. `treatdata_df` starts as an empty
# frame so the name is always defined, even when the file is missing.
treatdata_df = pd.DataFrame()
path = f'{studyId}/data_timeline_treatment.txt'
if op.isfile(path):
    # Read from `path` (the file that was just checked) rather than rebuilding
    # the same string. No rows are skipped for this file.
    treatdata_df = pd.read_csv(path, sep='\t')
    display(treatdata_df)
else:
    display('No treatment data available')

### Manipulating Data

In [None]:
# Join every sample with its mutations (inner join on the sample identifier),
# then convert all columns to strings so they can be concatenated below.
tdata = sampledata_df.merge(
    mutationdata_df,
    left_on='SAMPLE_ID',
    right_on='Tumor_Sample_Barcode',
).astype(str)

# A mutation is identified by gene symbol, chromosome and start/end position,
# joined with underscores into a single key.
tdata['MUTATION'] = tdata[
    ['Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position']
].agg('_'.join, axis='columns')
#data = tdata[['SAMPLE_ID', 'PATIENT_ID', 'CANCER_TYPE', 'CANCER_TYPE_DETAILED', 'MUTATION']]
display(tdata)

In [None]:
# Number of rows of the merged table for each CANCER_TYPE value
# (dropna=False keeps missing values as their own group).
diseasec = tdata.value_counts(subset='CANCER_TYPE', dropna=False)
display(diseasec)

In [None]:
# Number of rows of the merged table for each PATIENT_ID value
# (dropna=False keeps missing values as their own group).
patientc = tdata.value_counts(subset='PATIENT_ID', dropna=False)
display(patientc)

In [None]:
# Number of rows of the merged table for each MUTATION key
# (dropna=False keeps missing values as their own group).
mutationc = tdata.value_counts(subset='MUTATION', dropna=False)
display(mutationc)

In [None]:
# Two directed graphs built from the merged table:
#   E_DiP: cancer type -> patient
#   E_PM:  patient     -> mutation
E_DiP = nx.from_pandas_edgelist(tdata, source='CANCER_TYPE', target='PATIENT_ID', create_using=nx.DiGraph())
E_PM = nx.from_pandas_edgelist(tdata, source='PATIENT_ID', target='MUTATION', create_using=nx.DiGraph())

# Every edge of E_PM goes patient -> mutation, so patients are the nodes with
# no incoming edge and mutations are the nodes with no outgoing edge.
P = [node for node, degree in E_PM.in_degree() if degree == 0]
M = [node for node, degree in E_PM.out_degree() if degree == 0]

# Group the patients that carry exactly the same set of mutations. A frozenset
# is hashable, so it is used directly as the dictionary key instead of scanning
# all existing clusters for an equal set.
clusters = {}
for patient in P:
    mutations = frozenset(E_PM.neighbors(patient))
    clusters.setdefault(mutations, set()).add(patient)

# Largest clusters first; the sort is stable, so ties keep discovery order.
clusters = dict(sorted(clusters.items(), key=lambda item: len(item[1]), reverse=True))

# First CANCER_TYPE_DETAILED value recorded for each patient, computed once
# rather than filtering the whole table again for every patient.
disease_by_patient = (
    tdata.drop_duplicates(subset='PATIENT_ID')
         .set_index('PATIENT_ID')['CANCER_TYPE_DETAILED']
         .to_dict()
)

clusters_stats = {'Cluster': [],
                  'Numero di pazienti': []}
cluster_dfs = []
cc = 0  # running id of the clusters that are reported
for patients in clusters.values():
    if len(patients) < 2:
        continue  # only clusters shared by at least two patients are reported

    clusters_stats['Cluster'].append(cc)
    clusters_stats['Numero di pazienti'].append(len(patients))
    cc += 1

    # Sets of strings iterate in an order that can change between runs;
    # sorting keeps the per-cluster tables reproducible.
    ordered_patients = sorted(patients)
    cluster_dfs.append(pd.DataFrame({
        'Paziente': ordered_patients,
        'Malattia': [disease_by_patient[p] for p in ordered_patients],
    }))

# Summary table (cluster id, number of patients), followed by one table per
# cluster in the same order as the summary rows.
clusters_df = pd.DataFrame(clusters_stats)
display(clusters_df)

for dataframe in cluster_dfs:
    display(dataframe)