#### Import data

In [119]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [120]:
# URL completi dei file nel repository GitHub
thca_url = '../data/dsthca_genesremoved.csv'
lusc_url = '../data/dslusc_genesremoved.csv'
skcm_url = '../data/dsskcm_genesremoved.csv'
coadread_url = '../data/dsco_genesremoved.csv'

In [121]:
ds_thca = pd.read_csv(thca_url)

In [122]:
ds_lusc = pd.read_csv(lusc_url)

In [123]:
ds_skcm =  pd.read_csv(skcm_url)

In [124]:
ds_coadread =  pd.read_csv(coadread_url)

In [125]:
'''def prepareData(df):
    #genes = df['Entrez_Gene_Id']
    df = df.drop(columns=['Entrez_Gene_Id'])

    return df'''

"def prepareData(df):\n    #genes = df['Entrez_Gene_Id']\n    df = df.drop(columns=['Entrez_Gene_Id'])\n\n    return df"

In [126]:
print(len(ds_coadread))
print(len(ds_lusc))
print(len(ds_skcm))
print(len(ds_thca))

10516
12720
19647
19353


#### LocalOutlierFactor

In [127]:
from sklearn.neighbors import LocalOutlierFactor

In [128]:
def detect_outliers_localOut_factor(data, k):

    lof = LocalOutlierFactor(n_neighbors=k, metric='euclidean', n_jobs=7)
    outlier_labels = lof.fit_predict(data) 
    print(f"LOF - {k} - Sono stati trovati {len(outlier_labels[outlier_labels == -1])} outliers")
    
    return outlier_labels

#### Prova

In [129]:
################################### PROVA

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from matplotlib.legend_handler import HandlerPathCollection

def update_legend_marker_size(handle, orig):
    """Customize size of the legend marker"""
    handle.update_from(orig)
    handle.set_sizes([20])

# Assuming df is your DataFrame and the Local Outlier Factor has been applied
def plot_lof_outliers(df, n_neighbors):
    # Apply LOF
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination='auto')
    lof_fit = lof.fit(df)
    X_scores = -lof.negative_outlier_factor_
    outliers = lof_fit.fit_predict(df)
    #df_filtered = df[outliers == 1]
    #X = df_filtered.to_numpy()

    plt.figure(figsize=(10, 6))
    plt.scatter(df.iloc[:, 0], df.iloc[:, 1], color="k", s=3.0, label="Data points")
    
    # plot circles with radius proportional to the outlier scores
    radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
    scatter = plt.scatter(
        df.iloc[:, 0],
        df.iloc[:, 1],
        s=1000 * radius,
        edgecolors="r",
        facecolors="none",
        label="Outlier scores",
    )
    plt.axis("tight")
    plt.xlim((-5, 5))
    plt.ylim((-5, 5))
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.legend(
        handler_map={scatter: HandlerPathCollection(update_func=update_legend_marker_size)}
    )
    plt.title("Local Outlier Factor (LOF)")
    plt.show()

# Example usage:
# Assuming df is your DataFrame with the data
# df = pd.read_csv('your_data.csv')

# Run the function with your DataFrame and desired number of neighbors
#plot_lof_outliers(df, n_neighbors=20)


#### Isolation Forest

In [130]:
from sklearn.ensemble import IsolationForest

In [131]:
def detect_outliers_isolation_forest(data, n_estim):
    
    iso_forest = IsolationForest(n_estimators = n_estim, max_samples=1000, random_state=42, n_jobs=7)
    iso_forest.fit(data)
    outlier_labels = iso_forest.predict(data)
    print(f"IF - {n_estim} - sono stati trovati {len(outlier_labels[outlier_labels == -1])} outliers")
    
    return outlier_labels


#### Angle-based outliers

ABOD (Angle-Based Outlier Detection) si basa sul principio che i punti che sono circondati da molti punti vicini formano angoli più ampi rispetto ai punti isolati (outliers). 

https://blog.paperspace.com/outlier-detection-with-abod/

parametri default:
class pyod.models.abod.ABOD(contamination=0.1, n_neighbors=10, method='fast') <br>
0.1 significa che ci aspettiamo il 10% di outliers


In [132]:
#pip install pyod

In [133]:
from pyod.models.abod import ABOD

In [134]:
def detect_outliers_abod(data, k):

    clf = ABOD(n_neighbors=k) 
    clf.fit(data)

    outlier_labels = clf.predict(data)
    outlier_labels = np.array([1 if label == 0 else -1 for label in outlier_labels])
    print(f"ABOD - {k} - Sono stati trovati {len(outlier_labels[outlier_labels==-1])} outliers")
    
    return outlier_labels

#### KNN

In [135]:
from pyod.models.knn import KNN
#https://pyod.readthedocs.io/en/latest/pyod.html


In [136]:
def detect_outliers_knn(data, k):
    knn = KNN(method = 'mean', n_neighbors=k, metric='euclidean', n_jobs=7)
    knn.fit(data)
    predicted = pd.Series(knn.predict(data), index=data.index)

    outlier_labels = predicted.replace({0: 'OK', 1: 'OUTLIER'}).values
    outlier_labels = np.where(outlier_labels == 'OUTLIER', -1, 1)

    print(f"KNN - {k} - Sono stati trovati {len(outlier_labels[outlier_labels==-1])} outliers")

    return outlier_labels

#Altro parametro:
##method:
    #‘largest’: use the distance to the kth neighbor as the outlier score   --> trova più outliers rispetto a mean
    #‘mean’: use the average of all k neighbors as the outlier score  --> forse ha più senso lasciare questa
    #‘median’: use the median of the distance to k neighbors as the outlier score  --> la escluderei

## Common outliers 

In [137]:
def find_common_outliers(lof, isf, abod, knn):
    '''common = []
    for i in range(len(outliers_lof)):
        if outliers_lof[i] == outliers_isoF[i] :
            if outliers_lof[i] == -1 :
                common.append(True)'''
    outliers = []

    # Assumo che tutte le liste abbiano la stessa lunghezza
    for i in range(len(lof)):
        # Conto i -1 nelle 4 liste per l'indice corrente
        count = (lof[i] == -1) + (isf[i] == -1) + (abod[i] == -1) + (knn[i] == -1)
        
        if count >= 2:
            outliers.append(-1)
        else:
            outliers.append(1)

    # Stampa della lista di output
    print("LEN -> ", len(outliers), end = "") 
    print(" ------- ", end = "")
    print(outliers.count(1))
   # print("-------")
   # print(outliers)

    return outliers

def find_common_outliers(*args):
    binary_args = [np.where(arg == -1, 1, 0) for arg in args]
    combined = np.sum(binary_args, axis=0)
    common_outliers = np.where(combined >= 2, 1, 0)
    return common_outliers



In [138]:
def outlier_detection(df):
    #df = prepareData(df)
    df = df.drop(columns=['Entrez_Gene_Id'])

    outliers_lof = detect_outliers_localOut_factor(df, 30)
    outliers_if = detect_outliers_isolation_forest(df, 20)
    outliers_abod = detect_outliers_abod(df, 15)
    outliers_knn = detect_outliers_knn(df, 10)

    outliers = find_common_outliers(outliers_lof, outliers_if, outliers_abod, outliers_knn)
    
    # df = df[:(len(outliers))]
   # df_filtered = df[outliers == 1]
    # Convert outliers to boolean array
    outliers_boolean = outliers == 1
    
    df_filtered = df[outliers_boolean]  # Use the boolean array to filter the DataFrame
    
    return df_filtered


In [139]:
print("LUSC")
df_filtered_lusc = outlier_detection(ds_lusc)

print("\nSKCM")
df_filtered_skcm = outlier_detection(ds_skcm)

print("\nTHCA")
df_filtered_thca = outlier_detection(ds_thca)

print("\nCOADREAD")
df_filtered_coadread = outlier_detection(ds_coadread)

LUSC
LOF - 30 - Sono stati trovati 481 outliers
IF - 20 - sono stati trovati 218 outliers
ABOD - 15 - Sono stati trovati 1234 outliers
KNN - 10 - Sono stati trovati 529 outliers

SKCM
LOF - 30 - Sono stati trovati 3250 outliers
IF - 20 - sono stati trovati 334 outliers
ABOD - 15 - Sono stati trovati 1992 outliers
KNN - 10 - Sono stati trovati 412 outliers

THCA
LOF - 30 - Sono stati trovati 3660 outliers
IF - 20 - sono stati trovati 391 outliers
ABOD - 15 - Sono stati trovati 1949 outliers
KNN - 10 - Sono stati trovati 842 outliers

COADREAD
LOF - 30 - Sono stati trovati 193 outliers
IF - 20 - sono stati trovati 1018 outliers
ABOD - 15 - Sono stati trovati 1005 outliers
KNN - 10 - Sono stati trovati 663 outliers


In [140]:
df_filtered_lusc.to_csv(f'../data/filtered_lusc.csv', index=False)
df_filtered_skcm.to_csv(f'../data/filtered_skcm.csv', index=False)
df_filtered_thca.to_csv(f'../data/filtered_thca.csv', index=False)
df_filtered_coadread.to_csv(f'../data/filtered_coadread.csv', index=False)

In [141]:
print(len(df_filtered_coadread))
print(len(df_filtered_lusc))
print(len(df_filtered_skcm))
print(len(df_filtered_thca))

833
594
850
1790


### Trova i geni comuni

In [142]:
# Combina tutti i geni dai quattro dataset in un'unica Serie (prima colonna)
common_genes = set(ds_coadread.iloc[:, 0]).intersection(set(ds_lusc.iloc[:, 0]), set(ds_skcm.iloc[:, 0]), set(ds_thca.iloc[:, 0]))
#common_genes = set(ds_coadread.iloc[:, 0]) & set(ds_lusc.iloc[:, 0]) & set(ds_skcm.iloc[:, 0]) & set(ds_thca.iloc[:, 0])
# Rimuove i duplicati e converte in un array
#common_genes_unique=list(set(common_genes))

print(common_genes,"\n")
print("LEN:", len(common_genes), "\n")

# Filtra ogni dataset mantenendo solo i geni presenti in all_genes_unique
ds_coadread_filtered = ds_coadread[ds_coadread.iloc[:, 0].isin(common_genes)]
ds_lusc_filtered = ds_lusc[ds_lusc.iloc[:, 0].isin(common_genes)]
ds_skcm_filtered = ds_skcm[ds_skcm.iloc[:, 0].isin(common_genes)]
ds_thca_filtered = ds_thca[ds_thca.iloc[:, 0].isin(common_genes)]

print(len(ds_coadread_filtered))
print(len(ds_lusc_filtered))
print(len(ds_skcm_filtered))
print(len(ds_thca_filtered))

ds_lusc_filtered.to_csv('../data/dslusc_genesremoved_common.csv', index=False)
ds_thca_filtered.to_csv('../data/dsthca_genesremoved_common.csv', index=False)
ds_skcm_filtered.to_csv('../data/dsskcm_genesremoved_common.csv', index=False)
ds_coadread_filtered.to_csv('../data/dsco_genesremoved_common.csv', index=False)

{1.0, 9.0, 10.0, 13.0, 15.0, 16.0, 18.0, 22.0, 23.0, 24.0, 28.0, 31.0, 32.0, 35.0, 37.0, 39.0, 41.0, 43.0, 47.0, 48.0, 49.0, 58.0, 70.0, 72.0, 87.0, 88.0, 97.0, 98.0, 103.0, 105.0, 109.0, 111.0, 115.0, 116.0, 117.0, 123.0, 124.0, 125.0, 126.0, 133.0, 134.0, 196743.0, 141.0, 142.0, 148.0, 150.0, 152.0, 153.0, 160.0, 163.0, 175.0, 176.0, 177.0, 181.0, 183.0, 187.0, 189.0, 191.0, 203.0, 204.0, 205.0, 210.0, 214.0, 215.0, 219.0, 222.0, 224.0, 230.0, 238.0, 239.0, 245.0, 246.0, 248.0, 251.0, 257.0, 269.0, 270.0, 272.0, 273.0, 274.0, 196883.0, 164118.0, 279.0, 283.0, 284.0, 285.0, 287.0, 290.0, 306.0, 307.0, 312.0, 314.0, 316.0, 318.0, 330.0, 333.0, 334.0, 335.0, 131408.0, 339.0, 341.0, 196951.0, 344.0, 348.0, 351.0, 360.0, 363.0, 364.0, 366.0, 368.0, 369.0, 372.0, 374.0, 378.0, 382.0, 196993.0, 387.0, 388.0, 392.0, 393.0, 395.0, 398.0, 131474.0, 405.0, 408.0, 409.0, 410.0, 411.0, 412.0, 197021.0, 414.0, 415.0, 419.0, 429.0, 432.0, 433.0, 440.0, 65977.0, 443.0, 444.0, 445.0, 65982.0, 65983.0