#### Import data

In [1]:
import pandas as pd
import numpy as np
# Bind pyplot (not the top-level matplotlib package) to `plt`: the plotting
# code in this notebook calls plt.figure / plt.scatter / plt.show.
import matplotlib.pyplot as plt

In [2]:
# Relative paths to the per-cohort input CSV files. Despite the `_url` suffix
# these are local filesystem paths, not URLs.
thca_url = '../data/dsthca_genesremoved.csv'
lusc_url = '../data/dslusc_genesremoved.csv'
skcm_url = '../data/dsskcm_genesremoved.csv'
coadread_url = '../data/dsco_genesremoved.csv'

In [3]:
# Load the THCA table from its CSV path.
ds_thca = pd.read_csv(thca_url)

In [4]:
# Load the LUSC table from its CSV path.
ds_lusc = pd.read_csv(lusc_url)

In [5]:
# Load the SKCM table from its CSV path.
ds_skcm =  pd.read_csv(skcm_url)

In [6]:
# Load the COADREAD table from its CSV path.
ds_coadread =  pd.read_csv(coadread_url)

In [7]:
# Disabled helper kept as a string literal (it is never defined or called).
# The 'Entrez_Gene_Id' drop it performed is done inside outlier_detection.
'''def prepareData(df):
    #genes = df['Entrez_Gene_Id']
    df = df.drop(columns=['Entrez_Gene_Id'])

    return df'''

"def prepareData(df):\n    #genes = df['Entrez_Gene_Id']\n    df = df.drop(columns=['Entrez_Gene_Id'])\n\n    return df"

In [8]:
# Row count of each raw cohort table, one per line
# (order: COADREAD, LUSC, SKCM, THCA).
for raw_table in (ds_coadread, ds_lusc, ds_skcm, ds_thca):
    print(len(raw_table))

10516
12720
19647
19353


#### LocalOutlierFactor

In [9]:
from sklearn.neighbors import LocalOutlierFactor

In [10]:
def detect_outliers_localOut_factor(data, k):
    """Label the rows of ``data`` with scikit-learn's Local Outlier Factor.

    Parameters
    ----------
    data : input accepted by ``LocalOutlierFactor.fit_predict``
        Samples to score.
    k : int
        Forwarded as ``n_neighbors``.

    Returns
    -------
    The label array produced by ``fit_predict``; entries equal to -1 are the
    ones counted as outliers in the printed summary.
    """
    detector = LocalOutlierFactor(n_jobs=7, metric='euclidean', n_neighbors=k)
    labels = detector.fit_predict(data)

    flagged = labels[labels == -1]
    print(f"LOF - {k} - Sono stati trovati {len(flagged)} outliers")

    return labels

#### Prova

In [11]:
################################### PROVA

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from matplotlib.legend_handler import HandlerPathCollection

def update_legend_marker_size(handle, orig):
    """Legend hook: copy ``orig`` onto ``handle``, then force a fixed marker size.

    Used as the ``update_func`` of a ``HandlerPathCollection`` so the legend
    entry does not inherit the (data-dependent) scatter marker sizes.
    """
    legend_marker_sizes = [20]
    handle.update_from(orig)  # must run first: it would overwrite the sizes otherwise
    handle.set_sizes(legend_marker_sizes)

# `df` is the DataFrame to analyse; LOF is fitted on it inside the function.
def plot_lof_outliers(df, n_neighbors, xlim=(-5, 5), ylim=(-5, 5)):
    """Scatter the first two columns of ``df`` with LOF-score circles.

    A Local Outlier Factor model is fitted on all columns of ``df``. Every
    sample is drawn as a black dot at (column 0, column 1) and surrounded by
    a red circle whose area grows with its outlier score, so the most
    anomalous samples get the largest circles.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to score; must have at least two columns (accessed via ``iloc``).
    n_neighbors : int
        Forwarded to ``LocalOutlierFactor``.
    xlim, ylim : tuple or None, default (-5, 5)
        Axis limits. Pass ``None`` to keep the automatic "tight" limits.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # A single fit is enough: it populates negative_outlier_factor_.
    # (The previous version refitted with fit_predict and discarded the result.)
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination='auto')
    lof.fit(df)

    # negative_outlier_factor_ is the opposite of the LOF score, so after
    # negation a larger value means a more anomalous sample.
    X_scores = -lof.negative_outlier_factor_

    plt.figure(figsize=(10, 6))
    plt.scatter(df.iloc[:, 0], df.iloc[:, 1], color="k", s=3.0, label="Data points")

    # Normalise scores to [0, 1] so that the highest score gets radius 1.
    # The previous formula, (max - score) / range, was written for the
    # un-negated scores and therefore gave the strongest outlier radius 0.
    score_range = X_scores.max() - X_scores.min()
    if score_range > 0:
        radius = (X_scores - X_scores.min()) / score_range
    else:
        # All scores identical: avoid 0/0 and draw no circles.
        radius = np.zeros_like(X_scores)

    scatter = plt.scatter(
        df.iloc[:, 0],
        df.iloc[:, 1],
        s=1000 * radius,
        edgecolors="r",
        facecolors="none",
        label="Outlier scores",
    )
    plt.axis("tight")
    if xlim is not None:
        plt.xlim(xlim)
    if ylim is not None:
        plt.ylim(ylim)
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.legend(
        handler_map={scatter: HandlerPathCollection(update_func=update_legend_marker_size)}
    )
    plt.title("Local Outlier Factor (LOF)")
    plt.show()

# Example usage:
# Assuming df is your DataFrame with the data
# df = pd.read_csv('your_data.csv')

# Run the function with your DataFrame and desired number of neighbors
#plot_lof_outliers(df, n_neighbors=20)


#### Isolation Forest

In [12]:
from sklearn.ensemble import IsolationForest

In [13]:
def detect_outliers_isolation_forest(data, n_estim):
    """Label the rows of ``data`` with an Isolation Forest.

    Parameters
    ----------
    data : input accepted by ``IsolationForest.fit`` / ``predict``
        Samples to score.
    n_estim : int
        Forwarded as ``n_estimators``.

    Returns
    -------
    The label array produced by ``predict``; entries equal to -1 are the ones
    counted as outliers in the printed summary.
    """
    forest = IsolationForest(
        n_estimators=n_estim,
        max_samples=1000,
        random_state=42,  # fixed seed so repeated runs give the same labels
        n_jobs=7,
    )
    labels = forest.fit(data).predict(data)

    n_flagged = len(labels[labels == -1])
    print(f"IF - {n_estim} - sono stati trovati {n_flagged} outliers")

    return labels


#### Angle-based outliers

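ABOD (Angle-Based Outlier Detection) si basa sul principio che, per un punto circondato da molti vicini (inlier), gli angoli formati con le coppie di altri punti variano molto, mentre per un punto isolato (outlier) gli altri punti si trovano tutti in direzioni simili e quindi la varianza degli angoli è bassa.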

https://blog.paperspace.com/outlier-detection-with-abod/

parametri default:
class pyod.models.abod.ABOD(contamination=0.1, n_neighbors=10, method='fast') <br>
0.1 significa che ci aspettiamo il 10% di outliers


In [14]:
#pip install pyod

In [15]:
from pyod.models.abod import ABOD

In [16]:
def detect_outliers_abod(data, k):
    """Label the rows of ``data`` with PyOD's angle-based detector (ABOD).

    Parameters
    ----------
    data : input accepted by ``ABOD.fit`` / ``predict``
        Samples to score.
    k : int
        Forwarded as ``n_neighbors``.

    Returns
    -------
    numpy.ndarray
        1 where the detector predicted 0, -1 otherwise, i.e. the same -1/1
        convention used by the other detectors in this notebook.
    """
    detector = ABOD(n_neighbors=k)
    detector.fit(data)

    predictions = detector.predict(data)
    # Remap the detector's labels: 0 -> 1, anything else -> -1.
    outlier_labels = np.where(predictions == 0, 1, -1)

    n_flagged = len(outlier_labels[outlier_labels == -1])
    print(f"ABOD - {k} - Sono stati trovati {n_flagged} outliers")

    return outlier_labels

#### KNN

In [17]:
from pyod.models.knn import KNN
#https://pyod.readthedocs.io/en/latest/pyod.html


In [18]:
def detect_outliers_knn(data, k):
    """Label the rows of ``data`` with PyOD's k-nearest-neighbours detector.

    The detector uses the mean distance to the ``k`` neighbours
    (``method='mean'``) with the Euclidean metric.

    Parameters
    ----------
    data : input accepted by ``KNN.fit`` / ``predict``
        Samples to score. A pandas object is no longer required: the result
        is built directly from the prediction array instead of going through
        a Series indexed by ``data.index``.
    k : int
        Forwarded as ``n_neighbors``.

    Returns
    -------
    numpy.ndarray
        -1 where the detector predicted 1, 1 otherwise, i.e. the same -1/1
        convention used by the other detectors in this notebook.
    """
    knn = KNN(method='mean', n_neighbors=k, metric='euclidean', n_jobs=7)
    knn.fit(data)

    # Remap the detector's labels directly (1 -> -1, anything else -> 1);
    # the earlier detour through 'OK'/'OUTLIER' strings gave the same array.
    outlier_labels = np.where(np.asarray(knn.predict(data)) == 1, -1, 1)

    print(f"KNN - {k} - Sono stati trovati {len(outlier_labels[outlier_labels==-1])} outliers")

    return outlier_labels

# Other parameter:
## method:
    # 'largest': use the distance to the kth neighbor as the outlier score   --> finds more outliers than 'mean'
    # 'mean': use the average of all k neighbors as the outlier score  --> probably the most sensible choice to keep
    # 'median': use the median of the distance to k neighbors as the outlier score  --> I would exclude this one

## Common outliers 

In [19]:
def find_common_outliers(lof, isf, abod, knn, min_votes=2):
    """Combine four detectors' -1/1 labels by voting.

    A position is labelled -1 (outlier) when at least ``min_votes`` of the
    four inputs hold -1 at that position, and 1 otherwise.

    Parameters
    ----------
    lof, isf, abod, knn : sequence or 1-D array
        Per-sample labels from each detector; -1 marks an outlier.
    min_votes : int, default 2
        Minimum number of detectors that must agree for a sample to be
        labelled an outlier.

    Returns
    -------
    list of int
        One -1/1 label per sample.

    Raises
    ------
    ValueError
        If the four inputs do not all have the same length. Previously this
        was only assumed: a shorter first input silently truncated the result
        and a longer one failed with IndexError.
    """
    label_sets = (lof, isf, abod, knn)
    lengths = [len(labels) for labels in label_sets]
    for length in lengths:
        print(length)
    if len(set(lengths)) != 1:
        raise ValueError(
            f"all label sequences must have the same length, got {lengths}"
        )
    # Only printed once the lengths have actually been verified.
    print("--> tutte le liste hanno la stessa lunghezza")

    # Count, per position, how many detectors voted "outlier" (-1).
    votes = sum((np.asarray(labels) == -1).astype(int) for labels in label_sets)
    outliers = np.where(votes >= min_votes, -1, 1).tolist()

    # Progress output; the caller's next print continues on the same line.
    print("LEN -> ", len(outliers), end = "")
    print(" ------- ", end = "")

    return outliers

# Disabled vectorised alternative kept as a string literal (never defined).
# Note that it would label outliers 1 and inliers 0, unlike the active
# find_common_outliers, which returns -1 for outliers and 1 for inliers.
'''def find_common_outliers(*args):
    binary_args = [np.where(arg == -1, 1, 0) for arg in args]
    combined = np.sum(binary_args, axis=0)
    common_outliers = np.where(combined >= 2, 1, 0)
    return common_outliers
'''


'def find_common_outliers(*args):\n    binary_args = [np.where(arg == -1, 1, 0) for arg in args]\n    combined = np.sum(binary_args, axis=0)\n    common_outliers = np.where(combined >= 2, 1, 0)\n    return common_outliers\n'

In [24]:
def outlier_detection(df, lof_k=10, if_estimators=20, abod_k=5, knn_k=10,
                      id_column='Entrez_Gene_Id'):
    """Remove the rows of ``df`` that the detectors agree are outliers.

    The four detectors (LOF, Isolation Forest, ABOD, KNN) are run on ``df``
    without its identifier column; their labels are combined by
    ``find_common_outliers`` and only rows labelled 1 are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an identifier column plus the feature columns.
    lof_k : int, default 10
        Neighbour count passed to ``detect_outliers_localOut_factor``.
    if_estimators : int, default 20
        Estimator count passed to ``detect_outliers_isolation_forest``.
    abod_k : int, default 5
        Neighbour count passed to ``detect_outliers_abod``.
    knn_k : int, default 10
        Neighbour count passed to ``detect_outliers_knn``.
    id_column : str, default 'Entrez_Gene_Id'
        Column excluded from the detectors' input; it is kept in the result.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (all columns, original index) not flagged as outliers.
    """
    # The detectors must only see the feature columns, not the identifier.
    features = df.drop(columns=[id_column])

    outliers_lof = detect_outliers_localOut_factor(features, lof_k)
    outliers_if = detect_outliers_isolation_forest(features, if_estimators)
    outliers_abod = detect_outliers_abod(features, abod_k)
    outliers_knn = detect_outliers_knn(features, knn_k)

    outliers = find_common_outliers(outliers_lof, outliers_if, outliers_abod, outliers_knn)

    # Positional boolean mask: True for the rows to keep.
    keep_mask = np.asarray(outliers) == 1
    df_filtered = df[keep_mask]

    print("len iniziale: ", len(df))
    print(f"RIMOSSI {len(df)-len(df_filtered)} outliers")
    return df_filtered


In [25]:
# Run the outlier filter on each cohort in turn; every call prints its own
# per-detector report under the cohort banner.
print("LUSC")
df_filtered_lusc = outlier_detection(ds_lusc)

print("\nSKCM")
df_filtered_skcm = outlier_detection(ds_skcm)

print("\nTHCA")
df_filtered_thca = outlier_detection(ds_thca)

print("\nCOADREAD")
df_filtered_coadread = outlier_detection(ds_coadread)

# Bare expression: shows the filtered COADREAD table as the cell output.
df_filtered_coadread

LUSC
LOF - 10 - Sono stati trovati 296 outliers
IF - 20 - sono stati trovati 218 outliers
ABOD - 5 - Sono stati trovati 1700 outliers
KNN - 10 - Sono stati trovati 529 outliers
12720
12720
12720
12720
--> tutte le liste hanno la stessa lunghezza
LEN ->  12720 ------- len iniziale:  12720
RIMOSSI 531 outliers

SKCM
LOF - 10 - Sono stati trovati 3156 outliers
IF - 20 - sono stati trovati 334 outliers
ABOD - 5 - Sono stati trovati 2986 outliers
KNN - 10 - Sono stati trovati 412 outliers
19647
19647
19647
19647
--> tutte le liste hanno la stessa lunghezza
LEN ->  19647 ------- len iniziale:  19647
RIMOSSI 1094 outliers

THCA
LOF - 10 - Sono stati trovati 4204 outliers
IF - 20 - sono stati trovati 391 outliers
ABOD - 5 - Sono stati trovati 2609 outliers
KNN - 10 - Sono stati trovati 842 outliers
19353
19353
19353
19353
--> tutte le liste hanno la stessa lunghezza
LEN ->  19353 ------- len iniziale:  19353
RIMOSSI 2137 outliers

COADREAD
LOF - 10 - Sono stati trovati 171 outliers
IF - 20 - s

Unnamed: 0,Entrez_Gene_Id,TCGA-AA-3664-01,TCGA-AA-3715-01,TCGA-AA-A01P-01,TCGA-AA-A022-01,TCGA-AA-A02R-01
0,100133144.0,2.1074,-0.6870,-1.0656,2.0404,0.2411
2,10431.0,-0.7704,-0.8834,0.5917,-0.4232,-0.4570
3,155060.0,1.0586,0.9720,-0.8701,1.2773,-0.1955
4,388795.0,-0.8328,-0.1053,-0.8213,0.2388,-1.4267
5,390284.0,-1.4870,-1.9785,-0.1300,0.3606,0.5280
...,...,...,...,...,...,...
10511,7789.0,-1.4699,-0.0467,0.3257,1.2421,0.7615
10512,158586.0,-0.9122,-1.4573,-0.5672,1.5232,0.4002
10513,79364.0,0.9300,-0.3224,-0.4270,0.4241,-0.5312
10514,440590.0,0.1100,-0.6595,-0.2214,-1.3802,-1.2224


In [26]:
# Write each outlier-filtered table to disk, without the index column.
for filtered_table, destination in (
    (df_filtered_lusc, '../data/filtered_lusc.csv'),
    (df_filtered_skcm, '../data/filtered_skcm.csv'),
    (df_filtered_thca, '../data/filtered_thca.csv'),
    (df_filtered_coadread, '../data/filtered_coadread.csv'),
):
    filtered_table.to_csv(destination, index=False)

In [27]:
# Row count of each outlier-filtered table, one per line
# (order: COADREAD, LUSC, SKCM, THCA).
for filtered_table in (df_filtered_coadread, df_filtered_lusc, df_filtered_skcm, df_filtered_thca):
    print(len(filtered_table))

9716
12189
18553
17216


In [28]:
# Bare expression: shows the filtered LUSC table as the cell output.
df_filtered_lusc

Unnamed: 0,Entrez_Gene_Id,TCGA-21-5787-01,TCGA-34-2596-01,TCGA-34-5231-01,TCGA-37-4141-01,TCGA-39-5031-01,TCGA-43-A475-01,TCGA-63-5128-01,TCGA-68-A59J-01,TCGA-77-A5G3-01,TCGA-90-A4ED-01,TCGA-98-A53B-01
0,100133144.0,-2.2769,-1.8432,0.9940,-0.0427,-0.6309,0.1270,0.9573,-0.0815,0.9524,0.8139,1.1773
1,26823.0,-1.9506,1.1385,-0.5955,-0.1582,-0.8293,1.4164,-0.4456,0.7333,-1.9506,0.6894,-1.9506
2,280660.0,-1.2958,-1.2958,-1.2958,-1.2958,-1.2958,-1.2958,-1.2958,-1.2958,-1.2958,-1.2958,-1.2958
3,340602.0,-0.9638,0.0955,-0.9638,0.2653,2.7832,0.3382,-0.9638,-0.4505,-0.1274,0.8124,-0.7206
4,390284.0,-0.2736,-0.1277,0.5600,-0.3962,-1.2135,0.5446,-0.3579,0.5679,1.0685,0.1967,-0.0818
...,...,...,...,...,...,...,...,...,...,...,...,...
12715,221302.0,-1.2575,-1.0186,-0.4702,-0.6698,-0.3214,-0.5768,-1.3740,1.4408,0.5671,-0.6161,-1.2133
12716,9183.0,-0.7842,-0.8894,0.0764,0.8365,-0.4749,-0.5232,1.0375,-0.0914,0.2553,-0.7150,0.4797
12717,55055.0,1.3453,0.9074,1.8250,0.6115,-1.0863,0.9943,0.0251,-0.8131,0.8874,-1.4305,1.0134
12718,440590.0,0.3692,-1.6668,1.5931,1.5153,0.2441,-0.7745,-1.1293,0.5757,-0.0476,-0.6978,1.3848


### Trova i geni comuni

In [41]:
# Gene ids (first column) that are present in every outlier-filtered table.
gene_id_sets = [
    set(table.iloc[:, 0])
    for table in (df_filtered_coadread, df_filtered_lusc, df_filtered_skcm, df_filtered_thca)
]
common_genes = gene_id_sets[0].intersection(*gene_id_sets[1:])

print("LEN:", len(common_genes), "\n")

# In each cohort keep only the rows whose gene id is shared by all four.
ds_coadread_filtered, ds_lusc_filtered, ds_skcm_filtered, ds_thca_filtered = (
    table[table.iloc[:, 0].isin(common_genes)]
    for table in (df_filtered_coadread, df_filtered_lusc, df_filtered_skcm, df_filtered_thca)
)

for shared_table in (ds_coadread_filtered, ds_lusc_filtered, ds_skcm_filtered, ds_thca_filtered):
    print(len(shared_table))

# Persist the gene-aligned tables, without the index column.
for shared_table, destination in (
    (ds_lusc_filtered, '../data/dslusc_genesremoved_common.csv'),
    (ds_thca_filtered, '../data/dsthca_genesremoved_common.csv'),
    (ds_skcm_filtered, '../data/dsskcm_genesremoved_common.csv'),
    (ds_coadread_filtered, '../data/dsco_genesremoved_common.csv'),
):
    shared_table.to_csv(destination, index=False)

LEN: 5256 

5256
5256
5256
5256


In [40]:
# Recompute the shared gene ids from the already-aligned tables; the size
# should match the count printed when the tables were aligned.
common_genes = (
    set(ds_lusc_filtered.iloc[:, 0])
    & set(ds_thca_filtered.iloc[:, 0])
    & set(ds_coadread_filtered.iloc[:, 0])
    & set(ds_skcm_filtered.iloc[:, 0])
)
len(common_genes)

5256

In [36]:
# Preview the first rows of the gene-aligned COADREAD table.
ds_coadread_filtered.head()

Unnamed: 0,Entrez_Gene_Id,TCGA-AA-3664-01,TCGA-AA-3715-01,TCGA-AA-A01P-01,TCGA-AA-A022-01,TCGA-AA-A02R-01
5,390284.0,-1.487,-1.9785,-0.13,0.3606,0.528
6,57714.0,1.444,0.5062,1.331,1.8555,2.179
9,1.0,-0.3238,1.1803,1.1085,1.3288,-0.1198
11,87769.0,0.5944,-1.4075,0.1305,0.3522,-1.0786
15,13.0,-0.7453,-1.505,0.2772,-0.3166,-1.505


In [37]:
# Preview the first rows of the gene-aligned LUSC table.
ds_lusc_filtered.head()

Unnamed: 0,Entrez_Gene_Id,TCGA-21-5787-01,TCGA-34-2596-01,TCGA-34-5231-01,TCGA-37-4141-01,TCGA-39-5031-01,TCGA-43-A475-01,TCGA-63-5128-01,TCGA-68-A59J-01,TCGA-77-A5G3-01,TCGA-90-A4ED-01,TCGA-98-A53B-01
4,390284.0,-0.2736,-0.1277,0.56,-0.3962,-1.2135,0.5446,-0.3579,0.5679,1.0685,0.1967,-0.0818
10,57714.0,-0.9819,-0.0271,0.8788,-0.3408,-1.869,2.5135,0.0495,2.7367,0.1474,-0.4162,0.3886
14,1.0,-0.9423,0.0759,0.3231,0.7501,1.2677,-0.7438,-0.5985,0.0658,-0.7699,1.7923,1.2799
15,87769.0,-0.4927,0.4209,1.4016,-0.133,-0.1892,1.1449,0.8826,-0.8093,-0.9387,0.9069,-0.1085
18,13.0,-1.6914,-0.0842,-1.1588,-1.499,0.9242,-0.6383,0.1106,-0.1477,-0.6543,-1.9058,0.964


In [38]:
# Preview the first rows of the gene-aligned SKCM table.
ds_skcm_filtered.head()

Unnamed: 0,Entrez_Gene_Id,TCGA-D3-A1Q5-06,TCGA-D3-A1Q6-06,TCGA-D3-A1Q7-06,TCGA-D3-A1Q9-06,TCGA-D3-A1QA-06,TCGA-D3-A2JN-06,TCGA-D3-A2JP-06,TCGA-D3-A3CB-06,TCGA-D3-A3CE-06,...,TCGA-GN-A4U7-06,TCGA-GN-A4U8-06,TCGA-GN-A9SD-06,TCGA-W3-AA1O-06,TCGA-W3-AA1Q-06,TCGA-WE-A8K5-06,TCGA-WE-A8K6-06,TCGA-WE-A8ZQ-06,TCGA-WE-AAA4-06,TCGA-YG-AA3O-06
10,390284.0,1.6731,1.0274,0.1719,0.4846,-0.1741,0.8847,1.0403,1.1577,-0.2504,...,0.2859,0.4194,0.0211,-1.0648,0.6507,-0.064,0.675,-0.9143,-1.0105,-1.3256
14,57714.0,-0.8063,-0.2988,1.3727,0.6344,1.2964,-2.2806,-0.7805,1.3611,-0.4364,...,-0.4177,-0.7025,-0.5419,-0.0269,-0.7018,-1.2095,-1.2194,-0.0673,1.9103,0.2748
22,1.0,-0.0086,-1.7494,-0.9417,-0.2275,-0.4392,1.2346,1.5045,0.1138,-1.9834,...,1.5244,-0.3929,0.2473,-0.1655,0.4806,0.063,0.6407,0.3853,0.5002,-0.1331
25,87769.0,0.4953,0.7342,0.5561,-1.3483,0.3235,-0.4325,0.0029,0.8975,0.3311,...,-0.003,0.9047,0.0998,1.828,0.1174,-0.6893,0.9018,-2.1778,-0.0095,0.0552
33,13.0,0.62,-0.4714,1.4264,-1.0408,-1.0408,-1.0408,-1.0408,-1.0408,-1.0408,...,-1.0408,-0.7463,-1.0408,-0.7752,-1.0408,-1.0408,-0.7236,-1.0408,-1.0408,-1.0408


In [39]:
# Preview the first rows of the gene-aligned THCA table.
ds_thca_filtered.head()

Unnamed: 0,Entrez_Gene_Id,TCGA-4C-A93U-01,TCGA-BJ-A0Z3-01,TCGA-BJ-A0Z9-01,TCGA-BJ-A0ZB-01,TCGA-BJ-A18Z-01,TCGA-BJ-A28R-01,TCGA-BJ-A28X-01,TCGA-BJ-A290-01,TCGA-BJ-A2NA-01,...,TCGA-J8-A42S-01,TCGA-J8-A4HY-01,TCGA-KS-A41F-01,TCGA-KS-A4I5-01,TCGA-KS-A4IB-01,TCGA-L6-A4EP-01,TCGA-L6-A4EQ-01,TCGA-L6-A4ET-01,TCGA-L6-A4EU-01,TCGA-MK-A4N9-01
10,390284.0,-0.6231,0.2288,-1.6978,0.536,0.3721,-0.5287,0.0723,1.7071,0.0331,...,1.8225,1.3865,-2.5595,0.3789,-1.0752,0.7349,0.3806,-0.588,0.2145,-0.2225
14,57714.0,1.1333,0.5616,1.0845,0.5896,0.2411,0.611,1.5505,-0.8981,0.4936,...,-0.3535,-0.6396,-0.686,-1.0741,-0.3131,-1.6452,-0.6627,-0.8,-1.8913,-0.058
23,1.0,-0.1501,-0.1305,-0.2336,-0.3905,0.6314,0.1703,-0.5927,-0.3335,0.5035,...,0.1902,-0.3908,3.3222,-0.5239,-1.7318,1.0376,0.6527,0.007,0.5323,0.8167
26,87769.0,-0.5624,-0.1534,0.2076,-0.7516,0.2746,-1.2138,-0.7097,0.1816,0.1868,...,1.483,0.9658,3.0186,-0.7494,1.5759,-0.7606,1.1,0.0581,2.3351,0.7959
35,13.0,-1.2465,-1.2465,-1.2465,-1.2465,0.3724,-1.2465,-0.3763,-0.442,-1.2465,...,-0.7286,-1.2465,-1.2465,0.1366,-1.2465,-0.3217,-1.2465,-1.2465,-1.2465,0.8049
