# In [1]:
#########################
# Program to preprocess all raw files for a cancer type
#########################
import pandas as pd
import os
import glob # Use glob to read files with wildcards

# Cancer type to preprocess. It drives the input location, the value of the
# 'CancerType' column and the output file name, so the three cannot drift
# apart when this script is copied for another cancer type.
cancer_type = "STAD"

# Location of the raw files for this cancer type
# For all files use this path
path = "/Users/annika/annika/high-school/cs/tgca-downloads/{}/*/*n450*.*txt".format(cancer_type)

# Get the list of files. glob returns matches in arbitrary order, so sort
# them to keep the row order of the output reproducible between runs.
dirList = sorted(glob.glob(path))
if not dirList:
    # Fail with a clear message instead of pd.concat's
    # "No objects to concatenate" further down.
    raise FileNotFoundError("No raw files found matching: " + path)

# One single-row dataframe per raw file
dflist = []
for d in dirList:
    # Read the next file
    df3 = pd.read_csv(d, sep="\t")
    # Sort by the 'Composite Element REF' column so that the methylation
    # sites appear in the same order for every file
    df3 = df3.sort_values(by='Composite Element REF')
    # Imputation by mean value to fill up missing beta values. Assign the
    # result back: calling fillna(inplace=True) on a column selection is a
    # chained operation that does not update df3 under pandas Copy-on-Write.
    df3['Beta_value'] = df3['Beta_value'].fillna(df3['Beta_value'].mean())
    # Build a single row holding all beta values, with one column per
    # methylation site (e.g. cg00000029). The columns are selected by name,
    # so no full transpose of the file (which would turn every value into a
    # generic object) is needed and the column position does not matter.
    df3_transposed = pd.DataFrame(
        [df3['Beta_value'].to_numpy()],
        index=['Beta_value'],
        columns=df3['Composite Element REF'].to_numpy(),
    )
    # Create a new column named 'CancerType' holding the cancer type label
    df3_transposed = df3_transposed.assign(CancerType=cancer_type)
    # Collect the row; all rows are concatenated once after the loop
    dflist.append(df3_transposed)

# Convert the list to a dataframe
dfFinal = pd.concat(dflist)
# Save the dataframe to csv file, do not save the index. Create the output
# directory first so that the write cannot fail after all files were read.
out_path = "../../data/{0}/{0}_betaValuesOnly.csv".format(cancer_type)
os.makedirs(os.path.dirname(out_path), exist_ok=True)
dfFinal.to_csv(out_path, index=False)
# Now the raw files can be deleted and only the csv files can be retained.