# Medical data analysis using graphs

In [None]:
import numpy as np      # array e operazioni numeriche veloci
import pandas as pd     # dataframe contenenti anche migliaia di dati
import networkx as nx   # grafi efficienti e visualizzabili
import matplotlib.pyplot as plt # plotting
import scipy as sp
pd.set_option('display.max_columns', 50)

studyId = 'nsclc_ctdx_msk_2022' # variabile usata come id per estrarre i dati dello studio che vogliamo
                                # in questo caso: Metastatic Non-Small Cell Lung Cancer (MSK, Nature Medicine 2022)

##### Patient Data

In [None]:
df = pd.read_csv(f'{studyId}/data_clinical_patient.txt', sep='\t')
patient_df = df.iloc[4:]
display(patient_df)

In [None]:
count = patient_df.value_counts('Stage at Draw', dropna=False)
display(count)

##### Sample Data

In [None]:
df = pd.read_csv(f'{studyId}/data_clinical_sample.txt', sep='\t')
sample_df = df.iloc[4:]
display(sample_df)

In [None]:
count = sample_df.value_counts(dropna=False)
display(count)

##### Mutation Data

In [None]:
mutation_df = pd.read_csv(f'{studyId}/data_mutations.txt', sep='\t', skiprows=2)
mutation_df.drop_duplicates(inplace=True)
display(mutation_df)

In [None]:
count = mutation_df.value_counts(dropna=False)
display(count)

In [None]:
genes = np.array([])
with open("GreenGraph/genes.txt") as file:
    for line in file:
        x = line.strip()
        genes = np.append(genes, x)

##### Treatment Data

In [None]:
treat_df = pd.read_csv(f'{studyId}/data_timeline_treatment.txt', sep='\t')
display(treat_df)

### Manipulating Data

In [111]:
tdata = pd.merge(sample_df, mutation_df, left_on='#Sample Identifier', right_on='Tumor_Sample_Barcode')
tdata = tdata.astype(str)
tdata['Mutation'] = tdata[['Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position']].agg('_'.join, axis='columns')
data = tdata[['#Sample Identifier', 'Patient Identifier', 'Cancer Type', 'Cancer Type Detailed', 
              'Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position', 'Mutation']]
display(data)

Unnamed: 0,#Sample Identifier,Patient Identifier,Cancer Type,Cancer Type Detailed,Hugo_Symbol,Chromosome,Start_Position,End_Position,Mutation
0,MSK-L-002-001B,P-0016223,Non-Small Cell Lung Cancer,Non-Small Cell Lung Cancer,ALK,2,29416141,29416141,ALK_2_29416141_29416141
1,MSK-L-002-001B,P-0016223,Non-Small Cell Lung Cancer,Non-Small Cell Lung Cancer,TP53,17,7578461,7578461,TP53_17_7578461_7578461
2,MSK-L-002-002,P-0016223,Non-Small Cell Lung Cancer,Non-Small Cell Lung Cancer,TP53,17,7578461,7578461,TP53_17_7578461_7578461
3,MSK-L-002-003,P-0016223,Non-Small Cell Lung Cancer,Non-Small Cell Lung Cancer,ALK,2,29416141,29416141,ALK_2_29416141_29416141
4,MSK-L-002-003,P-0016223,Non-Small Cell Lung Cancer,Non-Small Cell Lung Cancer,TP53,17,7578461,7578461,TP53_17_7578461_7578461
...,...,...,...,...,...,...,...,...,...
9638,NCI-L-199-01,NCI-L-199,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,KEAP1,19,10602781,10602781,KEAP1_19_10602781_10602781
9639,NCI-L-199-01,NCI-L-199,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,MET,7,116436016,116436016,MET_7_116436016_116436016
9640,NCI-L-199-01,NCI-L-199,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,RET,10,43622158,43622158,RET_10_43622158_43622158
9641,NCI-L-199-01,NCI-L-199,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,TP53,17,7579358,7579358,TP53_17_7579358_7579358


In [None]:
# conteggio delle malattie
diseasec = tdata.value_counts('Cancer Type', dropna=False)
display(diseasec)

In [None]:
# conteggio dei pazienti
patientc = tdata.value_counts('Patient Identifier', dropna=False)
display(patientc)

In [None]:
# conteggio deile mutazioni
mutationc = tdata.value_counts('Mutation', dropna=False)
display(mutationc)

In [118]:
E_DiP = nx.from_pandas_edgelist(tdata, source='Cancer Type', target='Patient Identifier', create_using=nx.DiGraph())
E_PM = nx.from_pandas_edgelist(tdata, source='Patient Identifier', target='Mutation', create_using=nx.DiGraph())

#Di = [n for n,d in E_DiP.in_degree() if d==0]
#pos = nx.bipartite_layout(E_DiP, Di)

#nx.draw_networkx(E_DiP)

#print(E_DiP.edges)
#print(E_PM.edges)

P = [node for node, degree in E_PM.in_degree() if degree == 0]
M = [node for node, degree in E_PM.out_degree() if degree == 0]

clusters = {}
for patient in P:
    mutations = frozenset(n for n in E_PM.neighbors(patient))

    cluster_found = False
    for c_mutations, c_patients in clusters.items():
        if mutations == c_mutations:
            c_patients.add(patient)
            cluster_found = True
            break

    if not cluster_found:
        clusters[mutations] = {patient}

clusters_stats = {'Cluster' : [], 
                 'Numero di pazienti' : []}

clusters = dict(sorted(clusters.items(), key=lambda item: len(item[1]), reverse=True))
cluster_dfs = []
cc = 0
for k, v in clusters.items():
    if len(v) > 1:
        clusters_stats['Cluster'].append(cc)
        clusters_stats['Numero di pazienti'].append(len(v))
        cc += 1

        cluster_data = {'Paziente' : [],
                        'Malattia' : []}
        for p in v:
            disease = tdata.loc[tdata['Patient Identifier'] == p, 'Cancer Type Detailed'].values[0]
            cluster_data['Paziente'].append(p)
            cluster_data['Malattia'].append(disease)
        cdf = pd.DataFrame(cluster_data)
        cluster_dfs.append(cdf)

clusters_df = pd.DataFrame(clusters_stats)
display(clusters_df)

for dataframe in cluster_dfs:
    display(dataframe)

Unnamed: 0,Cluster,Numero di pazienti
0,0,15
1,1,14
2,2,10
3,3,5
4,4,5
5,5,4
6,6,3
7,7,3
8,8,2
9,9,2


Unnamed: 0,Paziente,Malattia
0,MSK-L-474,Lung Adenocarcinoma
1,MSK-L-512,Lung Adenocarcinoma
2,MSK-L-310,Lung Adenocarcinoma
3,NCI-L-147,Lung Adenocarcinoma
4,MSK-L-173,Lung Adenocarcinoma
5,NCI-L-084,Lung Adenocarcinoma
6,MSK-L-355,Lung Adenocarcinoma
7,P-0052012,Lung Squamous Cell Carcinoma
8,NCI-L-023,Lung Adenocarcinoma
9,NCI-L-015,Lung Adenocarcinoma


Unnamed: 0,Paziente,Malattia
0,MSK-L-573,Lung Adenocarcinoma
1,P-0030308,Lung Adenocarcinoma
2,MSK-L-1158,Lung Adenocarcinoma
3,MSK-L-253,Lung Adenocarcinoma
4,MSK-L-536,Lung Adenocarcinoma
5,MSK-L-1090,Lung Adenocarcinoma
6,MSK-L-159,Lung Adenocarcinoma
7,P-0040363,Lung Adenocarcinoma
8,P-0027009,Non-Small Cell Lung Cancer
9,NCI-L-063,Lung Adenocarcinoma


Unnamed: 0,Paziente,Malattia
0,P-0025344,Lung Adenocarcinoma
1,NCI-L-038,Lung Adenocarcinoma
2,NCI-L-013,Lung Adenocarcinoma
3,MSK-L-575,Lung Adenocarcinoma
4,NCI-L-172,Lung Adenocarcinoma
5,NCI-L-056,Lung Adenocarcinoma
6,P-0041215,Lung Adenocarcinoma
7,MSK-L-528,Lung Adenocarcinoma
8,MSK-L-688,Lung Adenocarcinoma
9,MSK-L-380,Lung Adenocarcinoma


Unnamed: 0,Paziente,Malattia
0,MSK-L-031,Lung Adenocarcinoma
1,MSK-L-699,Lung Adenocarcinoma
2,MSK-L-544,Lung Adenocarcinoma
3,MSK-L-1006,Lung Adenocarcinoma
4,P-0017214,Lung Adenocarcinoma


Unnamed: 0,Paziente,Malattia
0,MSK-L-143,Lung Adenocarcinoma
1,MSK-L-195,Lung Adenocarcinoma
2,P-0052859,Lung Adenocarcinoma
3,P-0045581,Lung Adenocarcinoma
4,P-0025398,Lung Adenocarcinoma


Unnamed: 0,Paziente,Malattia
0,P-0038442,Lung Adenocarcinoma
1,MSK-L-679,Lung Adenocarcinoma
2,MSK-L-719,Lung Adenocarcinoma
3,MSK-L-172,Lung Adenocarcinoma


Unnamed: 0,Paziente,Malattia
0,P-0022101,Lung Adenocarcinoma
1,MSK-L-074,Lung Adenocarcinoma
2,MSK-L-177,Lung Adenocarcinoma


Unnamed: 0,Paziente,Malattia
0,MSK-L-262,Lung Adenocarcinoma
1,P-0036090,Lung Adenocarcinoma
2,MSK-L-1181,Lung Adenocarcinoma


Unnamed: 0,Paziente,Malattia
0,MSK-L-052,Lung Adenocarcinoma
1,MSK-L-023,Non-Small Cell Lung Cancer


Unnamed: 0,Paziente,Malattia
0,MSK-L-168,Lung Adenocarcinoma
1,NCI-L-006,Lung Adenocarcinoma


Unnamed: 0,Paziente,Malattia
0,MSK-L-261,Lung Adenocarcinoma
1,MSK-L-848,Lung Adenocarcinoma


Unnamed: 0,Paziente,Malattia
0,MSK-L-574,Non-Small Cell Lung Cancer
1,MSK-L-309,Lung Adenocarcinoma


Unnamed: 0,Paziente,Malattia
0,MSK-L-546,Non-Small Cell Lung Cancer
1,NCI-L-016,Lung Adenocarcinoma


Unnamed: 0,Paziente,Malattia
0,P-0007534,Lung Adenocarcinoma
1,MSK-L-811,Lung Adenocarcinoma


Unnamed: 0,Paziente,Malattia
0,P-0031598,Lung Adenocarcinoma
1,P-0043746,Lung Adenocarcinoma
