# Progetto CHL

## Dataset Preprocessing 

Inizialmente, il dataset era composto da diverse colonne: la prima conteneva i simboli HUGO dei geni, la seconda gli identificatori univoci dei geni, mentre le colonne successive riportavano i livelli di espressione genica di ogni gene per ciascun paziente. 

A causa della presenza eccessiva di valori NaN nella prima colonna, abbiamo deciso di eliminarla e di utilizzare l'ID del gene come identificativo principale. Inoltre, abbiamo focalizzato la nostra analisi sull'espressione genica di un singolo paziente, scartando le colonne relative agli altri pazienti.

Infine, sono state eliminate le righe che contenevano valori NaN al posto del livello di espressione genica.

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
# Load the gene conversion table (tab-separated text file).
g_df = pd.read_csv('../data/gene_list_converter.txt', delimiter='\t')
# Preview the first rows to check how the columns were parsed.
g_df.head()

Unnamed: 0,Entrez_Gene_Id,Gene Name,GENE NAME
0,53947,"alpha 1,4-galactosyltransferase",A4GALT
1,10058,ATP binding cassette subfamily B member 6,ABCB6
2,4363,ATP binding cassette subfamily C member 1,ABCC1
3,150000,ATP binding cassette subfamily C member 13,ABCC13
4,10257,ATP binding cassette subfamily C member 4,ABCC4


In [3]:
# Map every Entrez gene ID to the value of the 'GENE NAME' column of g_df.
# Pairs are consumed in row order, so a repeated ID keeps the name of its
# last row; rows with a missing name are kept too (no null filtering).
gene_dict = dict(zip(g_df['Entrez_Gene_Id'], g_df['GENE NAME']))

print(gene_dict)

{53947: 'A4GALT', 10058: 'ABCB6', 4363: 'ABCC1', 150000: 'ABCC13', 10257: 'ABCC4', 9429: 'ABCG2', 171022: 'ABHD11-AS1', 84680: 'ACCS', 390110: 'ACCSL', 43: 'ACHE', 2532: 'ACKR1', 390928: 'ACP7', 1587: 'ADAM3A', 255926: 'ADAM5', 8755: 'ADAM6', 105: 'ADARB2', 124: 'ADH1A', 125: 'ADH1B', 126: 'ADH1C', 127: 'ADH4', 128: 'ADH5', 130: 'ADH6', 131: 'ADH7', 199800: 'ADM5', 79814: 'AGMAT', 246181: 'AKR7L', 84909: 'AOPEP', 358: 'AQP1', 360: 'AQP3', 84837: 'ARHGAP5-AS1', 494470: 'ARK2C', 147339: 'ARK2N', 419: 'ART3', 420: 'ART4', 440: 'ASNS', 57194: 'ATP10A', 23120: 'ATP10B', 57205: 'ATP10D', 23200: 'ATP11B', 92270: 'ATP6AP1L', 79895: 'ATP8B4', 10079: 'ATP9A', 374868: 'ATP9B', 8706: 'B3GALNT1', 124872: 'B4GALNT2', 85319: 'BAGE2', 4059: 'BCAM', 10380: 'BPNT1', 54928: 'BPNT2', 682: 'BSG', 399815: 'C10orf88B', 720: 'C4A', 23066: 'CAND2', 339184: 'CCDC144NL', 348249: 'CCL15-CCL14', 100129055: 'CCNYL4', 641367: 'CCNYL6', 977: 'CD151', 948: 'CD36', 960: 'CD44', 1604: 'CD55', 966: 'CD59', 4267: 'CD99', 

In [4]:
# Full URLs of the dataset files in the GitHub repository; all four files
# live in the same data directory, so they share one base URL.
_DATA_BASE_URL = 'https://raw.githubusercontent.com/alesmk/CHL/main/data'
thca_url = f'{_DATA_BASE_URL}/ds_thca.txt'
lusc_url = f'{_DATA_BASE_URL}/ds_lusc.txt'
skcm_url = f'{_DATA_BASE_URL}/ds_skcm.txt'
coadread_url = f'{_DATA_BASE_URL}/ds_coadread.txt'

def processing_ds(ds_url, max_rows=100):
    """Load one expression dataset and return a cleaned, optionally truncated copy.

    The tab-separated file is read with pandas, the 'Hugo_Symbol' column is
    removed, every row that still contains a NaN is discarded and the index
    is renumbered from 0.

    Args:
        ds_url: Path, URL or file-like object accepted by ``pd.read_csv``.
        max_rows: Maximum number of rows kept after cleaning. Defaults to
            100, the limit used for trial runs; pass ``None`` to keep every
            row.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        KeyError: If the file has no 'Hugo_Symbol' column.
    """
    ds = pd.read_csv(ds_url, sep='\t')
    # `columns=` already selects the axis, so no separate `axis` is needed.
    ds = ds.drop(columns='Hugo_Symbol')
    ds = ds.dropna().reset_index(drop=True)
    if max_rows is not None:
        ds = ds.iloc[:max_rows]
    return ds

# Download and clean the four datasets, in the order thca, lusc, skcm, coadread.
ds_thca, ds_lusc, ds_skcm, ds_coadread = (
    processing_ds(url)
    for url in (thca_url, lusc_url, skcm_url, coadread_url)
)


# Patients: one identifier per dataset; each is later passed to correlation()
# as the name of the column to analyse.
thca_p, lusc_p, skcm_p, coadread_p = (
    'TCGA-4C-A93U-01',
    'TCGA-21-5787-01',
    'TCGA-D3-A1Q9-06',
    'TCGA-AA-A01P-01',
)

## Correlazione

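Per ogni coppia di geni è stata calcolata una misura di correlazione basata sulla distanza euclidea tra i rispettivi livelli di espressione nel paziente selezionato; trattandosi di un solo valore per gene, tale distanza coincide con il valore assoluto della differenza.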

In [5]:
def correlation(ds, paziente, gene_names=None):
    """Compute the pairwise expression distance between genes for one patient.

    For every unordered pair of rows (i < j) the absolute difference of the
    two expression values is computed. This is the one-dimensional Euclidean
    distance, sqrt((a - b) ** 2); despite the name of the function and of
    the output column it is a distance, not a correlation coefficient.

    Args:
        ds: DataFrame with an 'Entrez_Gene_Id' column and a column named
            ``paziente``. Rows are addressed by position, so the frame may
            carry any index.
        paziente: Name of the patient column to analyse.
        gene_names: Optional mapping from Entrez gene ID to gene name.
            Defaults to the module-level ``gene_dict``.

    Returns:
        DataFrame with one row per gene pair, ordered as (0, 1), (0, 2), ...,
        (1, 2), ..., and the columns 'gene_1', 'gene_2', 'correlazione',
        'geneName_1' and 'geneName_2'. IDs missing from the mapping are
        labelled 'NO_CORR'. An input with fewer than two rows yields an
        empty frame that still carries these columns.

    Raises:
        KeyError: If 'Entrez_Gene_Id' or ``paziente`` is not a column of ``ds``.
    """
    if gene_names is None:
        gene_names = gene_dict

    # Positional arrays: independent of the index labels of `ds`.
    gene_ids = ds['Entrez_Gene_Id'].to_numpy()
    expression = ds[paziente].to_numpy(dtype=float)

    # Index pairs of the strict upper triangle, i.e. every i < j, in the same
    # order a nested `for i ... for j in range(i + 1, n)` loop would visit.
    first, second = np.triu_indices(len(gene_ids), k=1)
    genes_1 = gene_ids[first]
    genes_2 = gene_ids[second]

    return pd.DataFrame({
        'gene_1': genes_1,
        'gene_2': genes_2,
        # |a - b| equals sqrt((a - b) ** 2) without squaring the difference.
        'correlazione': np.abs(expression[first] - expression[second]),
        'geneName_1': [gene_names.get(g, 'NO_CORR') for g in genes_1.tolist()],
        'geneName_2': [gene_names.get(g, 'NO_CORR') for g in genes_2.tolist()],
    })

In [6]:
# Pairwise gene distances for the selected patient of the thca dataset.
df_thca = correlation(ds=ds_thca, paziente=thca_p)
# Save the complete table without the row index, then show it.
df_thca.to_csv(path_or_buf='correlation_thca_completa.csv', index=False)
df_thca

Unnamed: 0,gene_1,gene_2,correlazione,geneName_1,geneName_2
0,100130426,100133144,2.9835,NO_CORR,NO_CORR
1,100130426,100134869,0.2732,NO_CORR,UBE2Q2P2
2,100130426,10357,3.9577,NO_CORR,HMGB1P1
3,100130426,10431,2.7237,NO_CORR,NO_CORR
4,100130426,155060,5.1139,NO_CORR,LOC155060
...,...,...,...,...,...
4945,10061,9619,0.2160,ABCF2,ABCG1
4946,10061,9429,0.7502,ABCF2,ABCG2
4947,55324,9619,1.8281,ABCF3,ABCG1
4948,55324,9429,0.8619,ABCF3,ABCG2


In [7]:
# Number of pairs whose first / second gene carries the 'NO_CORR' label,
# i.e. the fallback used by correlation() when an ID has no known name.
count_no_corr_1 = int((df_thca['geneName_1'] == 'NO_CORR').sum())
count_no_corr_2 = int((df_thca['geneName_2'] == 'NO_CORR').sum())

print(f"{count_no_corr_1}, {count_no_corr_2}")

878, 112


In [8]:
# Pairwise gene distances for the selected patient of the lusc dataset,
# saved without the row index.
df_lusc = correlation(ds=ds_lusc, paziente=lusc_p)
df_lusc.to_csv(path_or_buf='correlation_lusc_completa.csv', index=False)
df_lusc

# Number of pairs whose first / second gene carries the 'NO_CORR' label.
count_no_corr_1 = int(df_lusc['geneName_1'].eq('NO_CORR').sum())
count_no_corr_2 = int(df_lusc['geneName_2'].eq('NO_CORR').sum())
print(f"{count_no_corr_1}, {count_no_corr_2}")

860, 130


In [9]:
# Pairwise gene distances for the selected patient of the skcm dataset,
# saved without the row index.
df_skcm = correlation(ds=ds_skcm, paziente=skcm_p)
df_skcm.to_csv(path_or_buf='correlation_skcm_completa.csv', index=False)
df_skcm

# Number of pairs whose first / second gene carries the 'NO_CORR' label.
count_no_corr_1 = int(df_skcm['geneName_1'].eq('NO_CORR').sum())
count_no_corr_2 = int(df_skcm['geneName_2'].eq('NO_CORR').sum())
print(f"{count_no_corr_1}, {count_no_corr_2}")

793, 98


In [10]:
# Pairwise gene distances for the selected patient of the coadread dataset,
# saved without the row index.
df_coadread = correlation(ds=ds_coadread, paziente=coadread_p)
df_coadread.to_csv(path_or_buf='correlation_coadread_completa.csv', index=False)
df_coadread

# Number of pairs whose first / second gene carries the 'NO_CORR' label.
count_no_corr_1 = int(df_coadread['geneName_1'].eq('NO_CORR').sum())
count_no_corr_2 = int(df_coadread['geneName_2'].eq('NO_CORR').sum())
print(f"{count_no_corr_1}, {count_no_corr_2}")

467, 28
