# Load in the data and perform basic quality control

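The inputs are the path to the background covariates CSV file and an output directory.
The output is a quality-controlled copy of the background covariates, written to `background.csv` inside the output directory: only numeric columns are kept, negative values are treated as missing, and columns that are at least 50% missing are dropped.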


In [1]:
import pandas as pd
import numpy as np
import sys, os

## Set input variables

In [2]:
# Path to the background covariates CSV that will be quality controlled.
background = "data/background.csv"
# Directory that receives the cleaned output file.
output_dir = "output"

# Create the directory directly instead of testing for it first: a separate
# exists() check can race with another process, and it also passes silently
# when a regular file already occupies the path. Only an already-existing
# directory is tolerated; any other failure is re-raised.
try:
    os.makedirs(output_dir)
except OSError:
    if not os.path.isdir(output_dir):
        raise

## Read in data and perform some basic quality control

In [None]:
'''
* summary: loads the background covariates
* input:   background (path to the CSV file)
* output:  bg
'''

# read in as a pandas dataframe; low_memory=False makes pandas parse the file
# in one pass rather than in chunks
bg = pd.read_csv(background, low_memory=False)

'''
* summary: replaces row indices with `challengeID`
* input:   bg
* output:  bg, nb_samples
'''

nb_samples = bg.shape[0]

# Sanity check before re-indexing: the IDs must be exactly 1..nb_samples in
# file order. Both sides are plain lists so the comparison is positional and
# gives the same answer on Python 2 and Python 3 (a dict view never compares
# equal to a range object on Python 3).
expected_ids = list(range(1, nb_samples + 1))
assert bg['challengeID'].tolist() == expected_ids, \
    'challengeID must run consecutively from 1 to {}'.format(nb_samples)

bg = bg.set_index('challengeID')

'''
* summary: cleans background data; replaces 'missing' with -3 and keeps only
  the columns that are numeric after conversion. The statements that follow
  then treat negative values as missing and drop columns whose missingness is
  at least 50% (which includes completely missing columns).
* input:   bg
* output:  bg
'''


def _to_numeric_or_keep(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    # Same outcome as pd.to_numeric(column, errors='ignore'), written out
    # explicitly because that option is deprecated in recent pandas.
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# -3 is negative, so it is turned into NaN together with the other negative
# values further down.
bg = bg.replace('missing', -3)
bg = bg.apply(_to_numeric_or_keep)
# columns that could not be converted keep a non-numeric dtype and are dropped
bg = bg.select_dtypes(include=[np.number])

# A column is dropped when its proportion of missing rows is >= threshold.
threshold = 0.5 # lower is stricter, default to 0.5

# every negative value is treated as missing
bg[bg < 0] = np.nan

# Proportion of missing rows per column, computed in one vectorised pass
# rather than summing each column element by element in Python.
# float() keeps this a true division on Python 2 as well.
null_props = bg.isnull().sum() / float(nb_samples)
col_names_to_remove = null_props[null_props >= threshold].index.tolist()

nb_features = bg.shape[1]
# print(...) with a single argument produces the same output on Python 2 and 3
print('{}/{} features kept'.format(nb_features - len(col_names_to_remove),
                                   nb_features))
bg = bg.drop(col_names_to_remove, axis=1)
# index=True writes the index (challengeID) as the first column of the file
bg.to_csv(os.path.join(output_dir, 'background.csv'), index=True)