In [1]:
#########################
# Program to preprocess all raw files for a cancer type
#########################
import pandas as pd
import os
import glob # Use glob to read files with wildcards

# ---------------------------------------------------------------------------
# Configuration: everything specific to one cancer type is defined once here,
# so the cell can be re-run for another cohort by changing CANCER_TYPE only.
# ---------------------------------------------------------------------------
CANCER_TYPE = "LUSC"
BASE_DIR = "/Users/annika/highschool/science-fair/synopsys-2019"
PROBE_COLUMN = "Composite Element REF"  # methylation site ID, e.g. cg00000029
BETA_COLUMN = "Beta_value"              # beta value measured at that site

# Wildcard pattern for the raw LUSC files; they sit one directory level below
# the cancer-type folder.
path = os.path.join(BASE_DIR, "raw-data", CANCER_TYPE, "*", "*n450*.*txt")
# Destination of the combined table.
output_csv = os.path.join(
    BASE_DIR, "output-data", CANCER_TYPE, CANCER_TYPE + "_betaValuesOnly.csv"
)


def load_beta_row(file_path, cancer_type):
    """Read one raw tab-separated file and return its beta values as one row.

    Parameters
    ----------
    file_path : str
        Path to a tab-separated file that contains at least the columns
        'Composite Element REF' and 'Beta_value'.
    cancer_type : str
        Label written to the 'CancerType' column of the returned row.

    Returns
    -------
    pandas.DataFrame
        A single row (index label 'Beta_value') with one column per
        'Composite Element REF' value, ordered by that ID, followed by a
        'CancerType' column.

    Raises
    ------
    ValueError
        Raised by ``pd.read_csv`` if either required column is missing.
    """
    # Select the two needed columns by name instead of relying on their
    # position in the file; the other columns are never used.
    sample = pd.read_csv(file_path, sep="\t", usecols=[PROBE_COLUMN, BETA_COLUMN])
    # Index by site ID and sort so every file yields the same column order.
    beta = sample.set_index(PROBE_COLUMN)[BETA_COLUMN].sort_index()
    # Mean imputation of missing beta values. Assign the result rather than
    # using a chained inplace fillna, which does not update the frame when
    # pandas Copy-on-Write is active.
    beta = beta.fillna(beta.mean())
    # Turn the column of values into one row; drop the axis name so the
    # column header carries only the site IDs.
    row = beta.to_frame().T.rename_axis(columns=None)
    return row.assign(CancerType=cancer_type)


# Get the list of files. glob returns matches in arbitrary order, so sort them
# to make the row order of the output reproducible.
dirList = sorted(glob.glob(path))
if not dirList:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No raw files matched the pattern: " + path)

# One single-row dataframe per raw file
dflist = [load_beta_row(d, CANCER_TYPE) for d in dirList]
# Stack the rows into one dataframe (columns are aligned by site ID)
dfFinal = pd.concat(dflist)

# Save the dataframe to a csv file; the index (always 'Beta_value') is not saved
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
dfFinal.to_csv(output_csv, index=False)
# Once the csv has been written, the raw files are no longer needed by this
# notebook and can be deleted.

In [2]:
# -------------------------
# Sanity check: show the first ten rows of the combined dataframe
# -------------------------
dfFinal.head(n=10)

Unnamed: 0,cg00000029,cg00000108,cg00000109,cg00000165,cg00000236,cg00000289,cg00000292,cg00000321,cg00000363,cg00000622,...,rs798149,rs845016,rs877309,rs9292570,rs9363764,rs939290,rs951295,rs966367,rs9839873,CancerType
Beta_value,0.192233,0.444082,0.444082,0.437818,0.891653,0.686637,0.669414,0.661537,0.17093,0.0144179,...,0.0171241,0.0491737,0.967247,0.955235,0.040929,0.767885,0.509276,0.0418312,0.391726,LUSC
Beta_value,0.255612,0.443937,0.443937,0.166282,0.868745,0.628335,0.510295,0.500748,0.275229,0.0140966,...,0.0210358,0.0716497,0.963729,0.950703,0.553096,0.462415,0.525245,0.0487105,0.924722,LUSC
Beta_value,0.260794,0.431099,0.431099,0.45785,0.854843,0.658224,0.445955,0.405763,0.248956,0.0155969,...,0.964654,0.44892,0.468137,0.0261742,0.942848,0.455287,0.507145,0.474504,0.923575,LUSC
Beta_value,0.0806007,0.405027,0.405027,0.447316,0.905936,0.439351,0.274778,0.546109,0.128682,0.0121982,...,0.0162767,0.0389224,0.515944,0.965336,0.033168,0.844161,0.530504,0.0380709,0.937422,LUSC
Beta_value,0.0873921,0.425372,0.425372,0.834098,0.914908,0.628755,0.773698,0.746225,0.089895,0.00933953,...,0.0105813,0.920381,0.978404,0.965149,0.450326,0.156574,0.533733,0.950294,0.914907,LUSC
Beta_value,0.34052,0.451601,0.451601,0.238552,0.841372,0.631624,0.647576,0.381522,0.267064,0.0157424,...,0.970399,0.110295,0.966937,0.961821,0.0896893,0.955139,0.53257,0.899642,0.860347,LUSC
Beta_value,0.442577,0.46551,0.46551,0.317684,0.892931,0.734212,0.554247,0.440179,0.352198,0.0170786,...,0.0184421,0.450891,0.0173889,0.0204752,0.625978,0.967221,0.449276,0.0321134,0.947432,LUSC
Beta_value,0.17498,0.430431,0.430431,0.59306,0.898874,0.384079,0.602559,0.754391,0.12034,0.0125914,...,0.0141474,0.0697833,0.178144,0.974763,0.0915756,0.956101,0.62743,0.684369,0.098075,LUSC
Beta_value,0.0979332,0.39548,0.39548,0.441396,0.920951,0.70777,0.549316,0.62832,0.116435,0.0119118,...,0.0128162,0.487075,0.97278,0.965532,0.962402,0.226922,0.0667001,0.870676,0.965451,LUSC
Beta_value,0.188403,0.474838,0.474838,0.521536,0.898297,0.687171,0.694121,0.532692,0.272998,0.0140212,...,0.954149,0.495956,0.69662,0.952707,0.898214,0.727657,0.968195,0.266402,0.860481,LUSC
