### Preprocessing con sostituzione dei valori nan

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

import io
import requests

In [2]:
# URL completi dei file nel repository GitHub
thca_url = 'https://raw.githubusercontent.com/alesmk/CHL/main/data/ds_thca.txt'
lusc_url = 'https://raw.githubusercontent.com/alesmk/CHL/main/data/ds_lusc.txt'
skcm_url = 'https://raw.githubusercontent.com/alesmk/CHL/main/data/ds_skcm.txt'
coadread_url = 'https://raw.githubusercontent.com/alesmk/CHL/main/data/ds_coadread.txt'

In [3]:
def checkNaN(row):
    tmp_idx = -1
    tmp_values = []
    
    for i, value in enumerate(row[1:], start=1):
        
        if np.isnan(value):
            if tmp_idx != -1:
                return pd.Series([None]*len(row), index=row.index)
            else:
                tmp_idx = i
        else:
            tmp_values.append(value)
    
    if tmp_idx != -1:
        row.iloc[tmp_idx] = np.mean(tmp_values)
                
    return row

In [4]:
def processing_ds(ds_url):
    s = requests.get(ds_url).content
    ds = pd.read_csv(io.StringIO(s.decode('utf-8')), sep='\t')
    #ds = pd.read_csv(ds_url, sep='\t')
    
    ds = ds.drop(columns='Hugo_Symbol', axis=1)

    ds = ds[:10000]  # per fare prove 
    
    ds = ds.apply(checkNaN, axis=1) # Applica la funzione a ogni *riga* del df
    ds.dropna(inplace=True)
    
    
    file_name = ds_url.split('/')[-1].split('.')[0]       # nome del file dall'URL
    ds.to_csv(f'../data/clean_{file_name}.csv', index=False)

    # Verifica se il df contiene valori NaN lungo le righe
    if ds.isnull().any(axis=1).any():
        print("Il dataframe contiene valori NaN lungo le righe.")
    
    return ds

In [5]:
ds_thca = processing_ds(thca_url)

In [6]:
ds_lusc = processing_ds(lusc_url)

In [7]:
ds_skcm = processing_ds(skcm_url)

In [8]:
ds_coadread = processing_ds(coadread_url)

In [9]:
ds_lusc

Unnamed: 0,Entrez_Gene_Id,TCGA-18-3406-01,TCGA-18-3407-01,TCGA-18-3408-01,TCGA-18-3410-01,TCGA-18-3411-01,TCGA-18-3412-01,TCGA-18-3414-01,TCGA-18-3415-01,TCGA-18-3416-01,...,TCGA-NK-A5CT-01,TCGA-NK-A5CX-01,TCGA-NK-A7XE-01,TCGA-O2-A52N-01,TCGA-O2-A52Q-01,TCGA-O2-A52S-01,TCGA-O2-A52V-01,TCGA-O2-A52W-01,TCGA-O2-A5IB-01,TCGA-XC-AA0X-01
0,100130426.0,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,...,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669
1,100133144.0,1.8378,-0.1598,-0.7655,0.0104,-0.2851,1.2732,-0.3424,1.4537,0.7209,...,-0.4284,1.0729,0.2770,0.7214,-1.4950,0.2233,0.7159,1.4204,1.7913,1.1983
2,100134869.0,0.7621,-0.8918,-1.7254,0.4158,0.0165,0.5016,0.7771,1.3384,-0.7704,...,-2.1169,0.2563,0.1348,-1.4860,-1.6373,1.5907,0.0935,1.1266,1.4415,0.6649
3,10357.0,1.4785,0.5889,1.2334,0.6218,1.4572,2.3625,0.6860,0.0698,-0.4780,...,-1.2241,-0.2630,-0.4246,-0.4659,-0.0679,0.2929,0.4620,1.2909,0.0398,-1.8503
4,10431.0,1.2240,0.0276,1.1588,0.5041,0.4177,-0.2879,0.1943,1.8865,1.0704,...,0.5977,-0.0658,2.3068,-1.2277,0.3787,0.5783,0.7103,-0.5234,-1.9716,-0.4674
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,390595.0,-1.7636,-1.2815,-1.9448,-1.1881,-2.6786,-1.4553,-1.5040,-1.0939,-0.5796,...,1.3153,0.8373,1.0234,0.3897,0.5895,1.2049,-0.8955,0.2395,-0.4195,1.2988
9996,390858.0,-1.8176,-0.0417,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,...,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176
9997,391322.0,-1.0502,-2.4741,-2.4741,-1.3632,-0.7838,-0.6049,0.7917,-2.4741,-0.6066,...,-0.0524,0.4201,-1.8989,1.7854,0.2235,1.8909,-2.4741,-2.4741,1.3287,0.8291
9998,392196.0,-1.2273,-1.2273,-1.2273,-0.4597,-1.2273,-1.2273,-1.2273,-1.2273,-1.2273,...,-1.2273,-1.2273,-1.2273,-0.2197,-1.2273,-1.2273,-1.2273,-1.2273,-1.2273,-1.2273


### Correlation matrix per eliminaregeni con correlazione superiore a 0.9


In [10]:
def corr_matrix(ds):
    ds.set_index('Entrez_Gene_Id', inplace=True)
    
    # Calcola la matrice di correlazione tra i geni
    corr_matrix = ds.T.corr()

    return corr_matrix

In [11]:
import numpy as np
import pandas as pd

def remove_genes(ds, threshold):
    cm = corr_matrix(ds)  

    # Creiamo una copia della matrice di correlazione per lavorare sui valori
    significant_corr = cm.copy()

    # Imposta la diagonale su un valore non significativo per ignorarla
    np.fill_diagonal(significant_corr.values, 0)

    high_correlation = significant_corr[(significant_corr > threshold) | (significant_corr < -threshold)]

    # Mostra il risultato
    # print(high_correlation)
    
    # Lista per tenere traccia delle coppie di geni
    significant_pairs = []

    # Itera solo sugli elementi significativi
    for index, row in high_correlation.iterrows():
        for i in range(len(high_correlation.columns)):
            col = high_correlation.columns[i]
            if pd.notna(row.iloc[i]):
                significant_pairs.append((index, col))

    # Converti la lista in un DataFrame per una migliore gestione e visualizzazione
    # contiene gli ID dei geni ammissibili
    significant_pairs_df = pd.DataFrame(significant_pairs, columns=['Gene1', 'Gene2'])
    # print("-------")
    print(significant_pairs_df)
    # return significant_pairs_df
    
    # Scegliere casualmente tra la prima e la seconda colonna per ogni riga
    scelte_casuali = np.random.choice([0, 1], size=len(significant_pairs_df))

    # Usare np.arange() per generare un array degli indici delle righe
    indici_righe = np.arange(len(significant_pairs_df))

    # Selezionare i dati usando indici delle righe e scelte di colonna
    valori_scelti = significant_pairs_df.values[indici_righe, scelte_casuali]

    # Stampa diretta dei valori scelti (id geni da eliminare)
    print("Valori scelti casualmente tra la prima e la seconda colonna per ogni riga:")
    print(valori_scelti)  # valori_da_eliminare

    ds = delete_rows(ds, valori_scelti)  # Assuming delete_rows is defined elsewhere
    
    return ds


In [12]:
#significant_pairs_df_lusc = remove_genes(ds_lusc, 0.9)

In [13]:
'''
# Scegliere casualmente tra la prima e la seconda colonna per ogni riga
scelte_casuali = np.random.choice([0, 1], size=len(significant_pairs_df_lusc))

# Usare np.arange() per generare un array degli indici delle righe
indici_righe = np.arange(len(significant_pairs_df_lusc))

# Selezionare i dati usando indici delle righe e scelte di colonna
valori_scelti = significant_pairs_df_lusc.values[indici_righe, scelte_casuali]

# Stampa diretta dei valori scelti (id geni da eliminare)
print("Valori scelti casualmente tra la prima e la seconda colonna per ogni riga:")
print(valori_scelti)
'''
def delete_rows(df, valori_da_eliminare):
    print("LEN " , len(valori_da_eliminare), " --- ", len(df))
    df.reset_index(inplace=True)
    
    # Filtrare il DataFrame per eliminare le righe con i valori specificati in 'entrez'
    df = df[~df['Entrez_Gene_Id'].isin(valori_da_eliminare)]

    # Mostra il DataFrame filtrato
    #print(df)
    
    return df



In [14]:
ds_lusc = remove_genes(ds_lusc, 0.9)
ds_lusc

         Gene1     Gene2
0     404770.0  112401.0
1     404770.0  128861.0
2     404770.0  140856.0
3     404770.0    1444.0
4     404770.0  140881.0
...        ...       ...
1459  154761.0  285966.0
1460  255025.0  404770.0
1461  255025.0  112401.0
1462  255025.0  151300.0
1463   22997.0   22997.0

[1464 rows x 2 columns]
Valori scelti casualmente tra la prima e la seconda colonna per ogni riga:
[404770. 404770. 140856. ... 255025. 151300.  22997.]
LEN  1464  ---  9959


  df.reset_index(inplace=True)


Unnamed: 0,Entrez_Gene_Id,TCGA-18-3406-01,TCGA-18-3407-01,TCGA-18-3408-01,TCGA-18-3410-01,TCGA-18-3411-01,TCGA-18-3412-01,TCGA-18-3414-01,TCGA-18-3415-01,TCGA-18-3416-01,...,TCGA-NK-A5CT-01,TCGA-NK-A5CX-01,TCGA-NK-A7XE-01,TCGA-O2-A52N-01,TCGA-O2-A52Q-01,TCGA-O2-A52S-01,TCGA-O2-A52V-01,TCGA-O2-A52W-01,TCGA-O2-A5IB-01,TCGA-XC-AA0X-01
0,100130426.0,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,...,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669
1,100133144.0,1.8378,-0.1598,-0.7655,0.0104,-0.2851,1.2732,-0.3424,1.4537,0.7209,...,-0.4284,1.0729,0.2770,0.7214,-1.4950,0.2233,0.7159,1.4204,1.7913,1.1983
2,100134869.0,0.7621,-0.8918,-1.7254,0.4158,0.0165,0.5016,0.7771,1.3384,-0.7704,...,-2.1169,0.2563,0.1348,-1.4860,-1.6373,1.5907,0.0935,1.1266,1.4415,0.6649
3,10357.0,1.4785,0.5889,1.2334,0.6218,1.4572,2.3625,0.6860,0.0698,-0.4780,...,-1.2241,-0.2630,-0.4246,-0.4659,-0.0679,0.2929,0.4620,1.2909,0.0398,-1.8503
4,10431.0,1.2240,0.0276,1.1588,0.5041,0.4177,-0.2879,0.1943,1.8865,1.0704,...,0.5977,-0.0658,2.3068,-1.2277,0.3787,0.5783,0.7103,-0.5234,-1.9716,-0.4674
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9954,390595.0,-1.7636,-1.2815,-1.9448,-1.1881,-2.6786,-1.4553,-1.5040,-1.0939,-0.5796,...,1.3153,0.8373,1.0234,0.3897,0.5895,1.2049,-0.8955,0.2395,-0.4195,1.2988
9955,390858.0,-1.8176,-0.0417,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,...,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176
9956,391322.0,-1.0502,-2.4741,-2.4741,-1.3632,-0.7838,-0.6049,0.7917,-2.4741,-0.6066,...,-0.0524,0.4201,-1.8989,1.7854,0.2235,1.8909,-2.4741,-2.4741,1.3287,0.8291
9957,392196.0,-1.2273,-1.2273,-1.2273,-0.4597,-1.2273,-1.2273,-1.2273,-1.2273,-1.2273,...,-1.2273,-1.2273,-1.2273,-0.2197,-1.2273,-1.2273,-1.2273,-1.2273,-1.2273,-1.2273


In [15]:
ds_lusc

Unnamed: 0,Entrez_Gene_Id,TCGA-18-3406-01,TCGA-18-3407-01,TCGA-18-3408-01,TCGA-18-3410-01,TCGA-18-3411-01,TCGA-18-3412-01,TCGA-18-3414-01,TCGA-18-3415-01,TCGA-18-3416-01,...,TCGA-NK-A5CT-01,TCGA-NK-A5CX-01,TCGA-NK-A7XE-01,TCGA-O2-A52N-01,TCGA-O2-A52Q-01,TCGA-O2-A52S-01,TCGA-O2-A52V-01,TCGA-O2-A52W-01,TCGA-O2-A5IB-01,TCGA-XC-AA0X-01
0,100130426.0,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,...,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669
1,100133144.0,1.8378,-0.1598,-0.7655,0.0104,-0.2851,1.2732,-0.3424,1.4537,0.7209,...,-0.4284,1.0729,0.2770,0.7214,-1.4950,0.2233,0.7159,1.4204,1.7913,1.1983
2,100134869.0,0.7621,-0.8918,-1.7254,0.4158,0.0165,0.5016,0.7771,1.3384,-0.7704,...,-2.1169,0.2563,0.1348,-1.4860,-1.6373,1.5907,0.0935,1.1266,1.4415,0.6649
3,10357.0,1.4785,0.5889,1.2334,0.6218,1.4572,2.3625,0.6860,0.0698,-0.4780,...,-1.2241,-0.2630,-0.4246,-0.4659,-0.0679,0.2929,0.4620,1.2909,0.0398,-1.8503
4,10431.0,1.2240,0.0276,1.1588,0.5041,0.4177,-0.2879,0.1943,1.8865,1.0704,...,0.5977,-0.0658,2.3068,-1.2277,0.3787,0.5783,0.7103,-0.5234,-1.9716,-0.4674
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9954,390595.0,-1.7636,-1.2815,-1.9448,-1.1881,-2.6786,-1.4553,-1.5040,-1.0939,-0.5796,...,1.3153,0.8373,1.0234,0.3897,0.5895,1.2049,-0.8955,0.2395,-0.4195,1.2988
9955,390858.0,-1.8176,-0.0417,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,...,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176,-1.8176
9956,391322.0,-1.0502,-2.4741,-2.4741,-1.3632,-0.7838,-0.6049,0.7917,-2.4741,-0.6066,...,-0.0524,0.4201,-1.8989,1.7854,0.2235,1.8909,-2.4741,-2.4741,1.3287,0.8291
9957,392196.0,-1.2273,-1.2273,-1.2273,-0.4597,-1.2273,-1.2273,-1.2273,-1.2273,-1.2273,...,-1.2273,-1.2273,-1.2273,-0.2197,-1.2273,-1.2273,-1.2273,-1.2273,-1.2273,-1.2273


In [16]:
#ds_lusc = delete_rows(ds_lusc, valori_scelti)

### Outlier detection

In [17]:
df = ds_lusc
genes = df['Entrez_Gene_Id']
df = df.drop(columns=['Entrez_Gene_Id'])

In [18]:
df.head()

Unnamed: 0,TCGA-18-3406-01,TCGA-18-3407-01,TCGA-18-3408-01,TCGA-18-3410-01,TCGA-18-3411-01,TCGA-18-3412-01,TCGA-18-3414-01,TCGA-18-3415-01,TCGA-18-3416-01,TCGA-18-3417-01,...,TCGA-NK-A5CT-01,TCGA-NK-A5CX-01,TCGA-NK-A7XE-01,TCGA-O2-A52N-01,TCGA-O2-A52Q-01,TCGA-O2-A52S-01,TCGA-O2-A52V-01,TCGA-O2-A52W-01,TCGA-O2-A5IB-01,TCGA-XC-AA0X-01
0,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,...,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669
1,1.8378,-0.1598,-0.7655,0.0104,-0.2851,1.2732,-0.3424,1.4537,0.7209,-3.5744,...,-0.4284,1.0729,0.277,0.7214,-1.495,0.2233,0.7159,1.4204,1.7913,1.1983
2,0.7621,-0.8918,-1.7254,0.4158,0.0165,0.5016,0.7771,1.3384,-0.7704,-2.7078,...,-2.1169,0.2563,0.1348,-1.486,-1.6373,1.5907,0.0935,1.1266,1.4415,0.6649
3,1.4785,0.5889,1.2334,0.6218,1.4572,2.3625,0.686,0.0698,-0.478,0.9208,...,-1.2241,-0.263,-0.4246,-0.4659,-0.0679,0.2929,0.462,1.2909,0.0398,-1.8503
4,1.224,0.0276,1.1588,0.5041,0.4177,-0.2879,0.1943,1.8865,1.0704,1.6479,...,0.5977,-0.0658,2.3068,-1.2277,0.3787,0.5783,0.7103,-0.5234,-1.9716,-0.4674


#### LocalOutlierFactor

In [19]:
from sklearn.neighbors import LocalOutlierFactor

lof = LocalOutlierFactor(n_neighbors=20)
outliers_lof = lof.fit_predict(df) 

print(f"Sono stati trovati {len(outliers_lof[outliers_lof==-1])} outliers")
df_filtered=df[outliers_lof==1]

#print(df_filtered)

Sono stati trovati 1586 outliers


In [20]:
len(outliers_lof)

9718

In [21]:
len(df)

9718

In [22]:
len(df_filtered)

8132

#### Isolation Forest

In [23]:
from sklearn.ensemble import IsolationForest

iso_forest = IsolationForest(n_estimators = 100, max_samples='auto', random_state=42)
iso_forest.fit(df)
outliers_isoF = iso_forest.predict(df)

#print(outliers)
print(f"Sono stati trovati {len(outliers_isoF[outliers_isoF==-1])} outliers")
df_filtered=df[outliers_isoF==1]

#print(df_filtered)

Sono stati trovati 141 outliers


In [24]:
#print(outliers_lof)
#print(outliers_isoF)

In [25]:
#common_elements = [(i, outliers_lof[i]) for i in range (len(outliers_lof)) if outliers_lof[i] == outliers_isoF[i] & outliers_lof[i] == -1]

In [26]:
common = []
for i in range(len(outliers_lof)):
    if outliers_lof[i] == outliers_isoF[i] :
        if outliers_lof[i] == -1 :
            common.append(True)

print(len(common))
#print(len(common_elements))

79


#### Angle-based outliers

#### KNN

In [27]:
#pip install pyod

In [28]:
from pyod.models.knn import KNN

In [29]:
knn = KNN(method = 'mean', n_neighbors=20, metric='euclidean')
knn.fit(df)

KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='mean',
  metric='euclidean', metric_params=None, n_jobs=1, n_neighbors=20, p=2,
  radius=1.0)

In [30]:
predicted = pd.Series(knn.predict(df), index=df.index)
print('# of outliers ', predicted.sum())

# of outliers  206


In [31]:
# Trasforma la serie in un array NumPy
#outliers_knn = predicted.values

# Applica la trasformazione
outliers_knn = predicted.replace({0: 'OK', 1: 'OUTLIER'}).values
#print(outliers_knn)

outliers_knn = np.where(outliers_knn == 'OUTLIER', -1, 1)

# Stampa il risultato
print("LEN prima di filtrare: ", len(outliers_knn), "\n")
#print(outliers_knn)

print(f"Sono stati trovati {len(outliers_knn[outliers_knn==-1])} outliers")
df_filtered = df[outliers_knn==1]

print(len(df))
print(len(df_filtered))

LEN prima di filtrare:  9718 

Sono stati trovati 206 outliers
9718
9512


TODO:  
- tuning di parametri: n_neighbors, contamination (adesso settato di default), estimators in IF (consigliato 100 dal prof
- in KNN capire se la metrica di distanza influisce sul risultato
- per ogni metodo è stata mantenuta una maschera: alla fine valutare quali outlier sono comuni 
- aggiungere ABOD with default parameters

### feature selection