In [1]:
import pandas as pd
import  numpy as np

In [2]:
# Per-tumor CSV inputs; each one is loaded by prepare_df() from get_complete_dataset().
coadread_url = '../data/dsco_genesremoved_common.csv'   # labelled 'COADREAD'
lusc_url = '../data/dslusc_genesremoved_common.csv'     # labelled 'LUSC'
thca_url = '../data/dsthca_genesremoved_common.csv'     # labelled 'THCA'
skcm_url = '../data/dsskcm_genesremoved_common.csv'     # labelled 'SKCM'

## Creo un dataset con tutti i pazienti di tutti i tumori

In [3]:
def prepare_df(df_url, tumor):
    """Load one tumor CSV and reshape it to one row per original CSV column.

    The file is read with ``pd.read_csv`` and transposed, so the first CSV
    column becomes the first row of the transposed frame. That row is split
    off and returned as the header; every remaining row is tagged with the
    tumor label in a ``'Tipo_Tumore'`` column.

    Parameters
    ----------
    df_url : str
        Path (or URL) of the CSV file to read.
    tumor : str
        Value stored in the ``'Tipo_Tumore'`` column of every returned row.

    Returns
    -------
    df : pandas.DataFrame
        The transposed data without its first row, plus ``'Tipo_Tumore'``.
        Its columns are still the positional labels produced by the
        transpose, followed by ``'Tipo_Tumore'``.
    header : pandas.Series
        The first row of the transposed frame followed by one extra entry,
        ``'Tipo_Tumore'``, so it lines up one-to-one with ``df.columns`` and
        can be used directly as column labels.
    """
    transposed = pd.read_csv(df_url).transpose()

    # Split the header row off *before* adding the label column; otherwise the
    # header's last entry would be the tumor name instead of a column label.
    first_row = transposed.iloc[0]
    header = pd.Series(
        list(first_row) + ['Tipo_Tumore'],
        index=list(first_row.index) + ['Tipo_Tumore'],
        name=first_row.name,
        dtype=object,
    )

    # Drop the header row; copy so that adding a column never touches a view.
    df = transposed.iloc[1:].copy()
    df['Tipo_Tumore'] = tumor

    return df, header

In [4]:
def get_complete_dataset():
    """Stack the four per-tumor datasets into one frame and save it as CSV.

    Each file is loaded through ``prepare_df``. The frames are concatenated
    row-wise on their positional column labels, and the header row of the
    first file (COADREAD) is then used as the column labels of the whole
    result. Because of that, every other file must carry exactly the same
    header row in the same order; this is checked before concatenating.

    Returns
    -------
    pandas.DataFrame
        All rows of the four datasets, with the header values as column
        labels and ``'Tipo_Tumore'`` as the last column. The same frame is
        also written to ``../data/dataset_completo.csv``.

    Raises
    ------
    ValueError
        If the header row of any file differs from the COADREAD one; applying
        a single header would otherwise silently mislabel that file's columns.
    """
    sources = [
        (coadread_url, 'COADREAD'),
        (lusc_url, 'LUSC'),
        (thca_url, 'THCA'),
        (skcm_url, 'SKCM'),
    ]

    frames = []
    header = None
    for url, tumor in sources:
        frame, frame_header = prepare_df(url, tumor)
        if header is None:
            header = frame_header
        # The last header entry belongs to the label column, so it is left
        # out of the comparison.
        elif not frame_header.iloc[:-1].equals(header.iloc[:-1]):
            raise ValueError(
                "Header row of {!r} ({}) does not match the one of {!r}; "
                "the datasets cannot share a single header".format(
                    url, tumor, sources[0][0]
                )
            )
        frames.append(frame)

    # Concatenating the DataFrames vertically
    result = pd.concat(frames, axis=0)

    # Build the final labels as a fresh Index instead of writing into
    # ``result.columns.values``: an Index is immutable and mutating its
    # backing array is unsupported.
    labels = list(header)
    labels[-1] = 'Tipo_Tumore'
    result.columns = pd.Index(labels, name=header.name)

    result.to_csv("../data/dataset_completo.csv")
    return result

# Build the combined dataset; the call also writes ../data/dataset_completo.csv.
result = get_complete_dataset()

In [5]:
# Preview the first rows of the combined dataset.
result.head()

Entrez_Gene_Id,390284.0,57714.0,1.0,87769.0,13.0,51166.0,16.0,57505.0,26574.0,18.0,...,8233.0,9753.0,221584.0,65982.0,7589.0,342945.0,54993.0,90204.0,440590.0,Tipo_Tumore
TCGA-AA-3664-01,-1.487,1.444,-0.3238,0.5944,-0.7453,0.6319,-0.6444,-0.1132,-1.2494,-1.184,...,-0.5085,-1.0939,2.5236,-1.1147,-0.4819,-0.1973,-0.3139,-0.7648,0.11,COADREAD
TCGA-AA-3715-01,-1.9785,0.5062,1.1803,-1.4075,-1.505,-1.8759,0.7678,1.3006,-1.1229,-1.1714,...,-2.6557,-2.2483,0.95,-0.5557,-2.4038,0.045,-0.895,0.3987,-0.6595,COADREAD
TCGA-AA-A01P-01,-0.13,1.331,1.1085,0.1305,0.2772,-0.099,1.0056,0.3054,0.1039,-1.2287,...,0.7692,0.3507,0.5222,0.0637,0.0201,0.5432,-0.6008,-1.4645,-0.2214,COADREAD
TCGA-AA-A022-01,0.3606,1.8555,1.3288,0.3522,-0.3166,0.6652,0.0283,0.0397,0.1054,-0.0434,...,1.0353,-1.5612,1.4304,-0.6259,0.1472,0.9336,-0.3133,-0.1778,-1.3802,COADREAD
TCGA-AA-A02R-01,0.528,2.179,-0.1198,-1.0786,-1.505,1.581,-0.2017,1.7164,-0.2208,-0.6816,...,-0.227,-1.0981,-0.1316,-0.5495,-0.6363,0.2926,-1.0888,-0.8471,-1.2224,COADREAD


In [6]:
# Reload the CSV written by get_complete_dataset(), presumably as a read-back check.
# NOTE(review): no index_col is passed, so the saved row labels presumably come back
# as an ordinary first column rather than as the index — confirm this is intended.
_df = pd.read_csv("../data/dataset_completo.csv")
#_df

## Aggiungo la colonna Sex

In [7]:
# Open the Excel workbook from which the 'Sample ID' and 'Sex' columns are read below.
pazienti_names='../data/notizie cliniche.xlsx'
xls = pd.ExcelFile(pazienti_names)

# Show the sheet names; get_sex_column() picks a sheet by its position in this list.
print(xls.sheet_names)

['lusc_tcga_pan_can_atlas_2018_cl', 'skcm_tcga_pan_can_atlas_2018', 'thca_tcga_pan_can_atlas_2018', 'coadread_tcga_pan_can_atlas']


In [8]:
def get_sex_column(sheet_number, tumor):
    """Return (sample ID, sex, tumor) triples for one sheet of the workbook.

    Parameters
    ----------
    sheet_number : int
        Position of the sheet in ``xls.sheet_names``.
    tumor : str
        Tumor label attached to every triple from this sheet.

    Returns
    -------
    list of tuple
        One ``(sample_id, sex, tumor)`` tuple per row of the sheet, taken
        from its ``'Sample ID'`` and ``'Sex'`` columns, in sheet order.

    Raises
    ------
    IndexError
        If ``sheet_number`` is not a valid position in ``xls.sheet_names``.
    KeyError
        If the sheet has no ``'Sample ID'`` or ``'Sex'`` column.
    """
    sheet_name = xls.sheet_names[sheet_number]

    # Parse from the workbook that is already open instead of re-opening and
    # re-reading the file from its path on every call.
    item = pd.read_excel(xls, sheet_name=sheet_name)

    return [
        (sample_id, sex, tumor)
        for sample_id, sex in zip(item['Sample ID'], item['Sex'])
    ]

In [9]:
def add_gender(result):
    """Add a ``'Sex'`` column to ``result`` from the clinical workbook sheets.

    The (sample ID, sex, tumor) triples of the four sheets are collected
    through ``get_sex_column``. Every triple is validated against ``result``
    before anything is written: the sample must be a row label of ``result``
    and its ``'Tipo_Tumore'`` value must equal the tumor of the sheet the
    triple came from. Only then is the column filled and the frame saved to
    ``../data/dataset_completo_sex.csv``, so a failed check can no longer
    leave a half-annotated file behind.

    Parameters
    ----------
    result : pandas.DataFrame
        Frame indexed by sample ID with a ``'Tipo_Tumore'`` column. It is
        modified in place: a ``'Sex'`` column is added (or reset).

    Returns
    -------
    pandas.DataFrame
        The same ``result`` object. Rows without a matching triple keep
        ``None`` in ``'Sex'``.

    Raises
    ------
    KeyError
        If a sample ID from the workbook is not a row label of ``result``.
    ValueError
        If the tumor recorded in ``result`` for a sample differs from the
        tumor of the sheet that lists it.
    """
    # (sheet position in the workbook, tumor label) for each cohort.
    sheets = [(0, 'LUSC'), (1, 'SKCM'), (2, 'THCA'), (3, 'COADREAD')]

    all_genders = []
    for sheet_number, tumor in sheets:
        all_genders.extend(get_sex_column(sheet_number, tumor))
    print("Numero di genders considerati: ", len(all_genders))

    # Validate everything up front so an inconsistency aborts the whole
    # operation instead of stopping midway and saving partial data.
    for paziente, _, tumore in all_genders:
        if paziente not in result.index:
            raise KeyError(
                "Sample {!r} ({}) from the clinical workbook is not in the dataset".format(
                    paziente, tumore
                )
            )
        # Scalar lookup; avoids materialising the whole row for one value.
        found = result.at[paziente, 'Tipo_Tumore']
        if found != tumore:
            raise ValueError(
                "Sample {!r} is {!r} in the dataset but {!r} in the clinical workbook".format(
                    paziente, found, tumore
                )
            )

    result['Sex'] = None
    for paziente, gender, _ in all_genders:
        result.at[paziente, 'Sex'] = gender

    result.to_csv("../data/dataset_completo_sex.csv")
    return result

# add_gender() writes the 'Sex' column into `result` itself and returns that same frame,
# so `result_sex` and `result` refer to one object.
result_sex = add_gender(result)

Numero di genders considerati:  275


In [10]:
# Preview the first rows, now including the 'Sex' column.
result_sex.head()

Entrez_Gene_Id,390284.0,57714.0,1.0,87769.0,13.0,51166.0,16.0,57505.0,26574.0,18.0,...,9753.0,221584.0,65982.0,7589.0,342945.0,54993.0,90204.0,440590.0,Tipo_Tumore,Sex
TCGA-AA-3664-01,-1.487,1.444,-0.3238,0.5944,-0.7453,0.6319,-0.6444,-0.1132,-1.2494,-1.184,...,-1.0939,2.5236,-1.1147,-0.4819,-0.1973,-0.3139,-0.7648,0.11,COADREAD,Female
TCGA-AA-3715-01,-1.9785,0.5062,1.1803,-1.4075,-1.505,-1.8759,0.7678,1.3006,-1.1229,-1.1714,...,-2.2483,0.95,-0.5557,-2.4038,0.045,-0.895,0.3987,-0.6595,COADREAD,Male
TCGA-AA-A01P-01,-0.13,1.331,1.1085,0.1305,0.2772,-0.099,1.0056,0.3054,0.1039,-1.2287,...,0.3507,0.5222,0.0637,0.0201,0.5432,-0.6008,-1.4645,-0.2214,COADREAD,Female
TCGA-AA-A022-01,0.3606,1.8555,1.3288,0.3522,-0.3166,0.6652,0.0283,0.0397,0.1054,-0.0434,...,-1.5612,1.4304,-0.6259,0.1472,0.9336,-0.3133,-0.1778,-1.3802,COADREAD,Female
TCGA-AA-A02R-01,0.528,2.179,-0.1198,-1.0786,-1.505,1.581,-0.2017,1.7164,-0.2208,-0.6816,...,-1.0981,-0.1316,-0.5495,-0.6363,0.2926,-1.0888,-0.8471,-1.2224,COADREAD,Female


In [11]:
#print(result_sex.loc['TCGA-IM-A41Y-01'])

In [12]:
# Reload the CSV written by add_gender(), presumably as a read-back check.
__df = pd.read_csv("../data/dataset_completo_sex.csv")
#__df