This is a second attempt at the chunked VIF (variance inflation factor) program. It does away with the class-based implementation in order to keep things as simple as possible.

In [1]:
import sys
import numpy as np
import pandas as pd
import statsmodels.api as smapi
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
def dt_chk(X):

    """
    FUNCTION:
    This method reads a tab-separated file and keeps only its complete, numeric data.
    It returns a DataFrame object to be read into other methods.

    INPUTS:
    X --> Path (or open file-like object) of a tab-separated file with a header row.

    RETURNS:
    df --> A DataFrame holding only the numeric columns, with surrounding whitespace
    stripped from the column names and every row that has a missing numeric value removed.
    """

    raw = pd.read_csv(X, sep="\t")
    # Headers may carry stray spaces; strip them so columns can be found by name later.
    raw.columns = raw.columns.str.strip()
    # Keep numeric columns only, using the public select_dtypes API.
    numeric = raw.select_dtypes(include="number")
    # dropna() returns a new frame rather than editing in place, so keep its result.
    # Rows are filtered after the numeric subset is taken so that a gap in a
    # discarded (non-numeric) column does not cost a sample.
    df = numeric.dropna()

    return df
    
# Run dt_chk on 'discrete.dataforml' (path is relative to the working directory)
# and keep the returned DataFrame as df for the cells that follow.
df = dt_chk('discrete.dataforml')

In [3]:
# Bank the "universal" predictors (age, sex, etc.):
def get_uni_col(df):

    """
    FUNCTION:
    This method picks out the 'universal' predictor columns of a DataFrame by position.

    INPUTS:
    df --> A DataFrame object pre-processed by dt_chk().

    RETURNS:
    A DataFrame made of the columns found at positions 1 through 4 of df
    (fewer if df has fewer than five columns).
    """
    # Position 0 is skipped and position 5 is the first column left out.
    universal_positions = slice(1, 5)
    return df.iloc[:, universal_positions]

# Keep the columns at positions 1-4 of df as col_uni.
col_uni = get_uni_col(df)

In [4]:
# Bank the 'PHENO' column (outcome):
def get_pheno_col(df):

    """
    FUNCTION:
    This method picks out the first column of a DataFrame (the outcome column).

    INPUTS:
    df --> A DataFrame object pre-processed by dt_chk().

    RETURNS:
    A single-column DataFrame (not a Series) holding the column at position 0 of df.
    """
    # Slicing (rather than indexing with 0) keeps the result two-dimensional.
    first_column_only = slice(None, 1)
    return df.iloc[:, first_column_only]

# Keep the first column of df (the 'PHENO' outcome) as df_pheno.
df_pheno = get_pheno_col(df)

In [5]:
# Bank the 'SNP' predictors:
def df_snps(df):

    """
    FUNCTION:
    This method picks out the columns of a DataFrame that are to be used in a VIF calculation.

    INPUTS:
    df --> A DataFrame object pre-processed by dt_chk().

    RETURNS:
    A DataFrame made of every column of df from position 5 onward (the SNP columns).
    """

    # The first five positions (0-4) are left out; everything after them is returned.
    first_snp_position = 5
    snp_columns = df.iloc[:, first_snp_position:]
    return snp_columns

# NOTE(review): this rebinds the name df_snps from the function to the DataFrame it
# returns, so the function can no longer be called by that name once this line has run.
df_snps = df_snps(df)

In [6]:
def frag_df(df_snps, og_chunk_delim=50):

    """
    FUNCTION:
    This method splits the columns of a DataFrame into consecutive chunks and returns
    them as a list of DataFrames. Every chunk holds og_chunk_delim columns except the
    last one, which also absorbs the remainder: it holds between og_chunk_delim and
    2 * og_chunk_delim - 1 columns, or every column when the input has fewer than
    2 * og_chunk_delim of them.

    INPUTS:
    df_snps --> A DataFrame object whose columns are the SNP predictors.
    og_chunk_delim --> Number of columns per chunk (defaults to 50). Must be at least 1.

    RETURNS:
    df_list --> A list of column-wise slices of df_snps, in the original column order.

    RAISES:
    ValueError --> If og_chunk_delim is smaller than 1 (the loop below could never finish).
    """

    if og_chunk_delim < 1:
        raise ValueError('og_chunk_delim must be at least 1.')

    df_list = []
    # SNPs are columns, so count along axis 1; len(df_snps) would count the rows instead.
    snp_count = df_snps.shape[1]
    # snp_counter / chunk_delim are the start / stop column positions of the next chunk.
    snp_counter = 0
    chunk_delim = og_chunk_delim

    # Cut full-size chunks while at least one more full chunk would still fit after
    # the current one; this guarantees the final chunk below is never empty.
    while chunk_delim + og_chunk_delim <= snp_count:
        df_list.append(df_snps.iloc[:, snp_counter:chunk_delim])
        num_chunks = len(df_list)
        print('\n' 'SNP elements added to chunk #', num_chunks,
        'which range from', snp_counter, 'to', chunk_delim)
        print('Chunk #', num_chunks, 'has been added to df_list.')
        # Move both boundaries up by one chunk width.
        snp_counter = chunk_delim
        chunk_delim += og_chunk_delim

    # End-of-file condition: whatever is left becomes the last chunk.
    df_list.append(df_snps.iloc[:, snp_counter:snp_count])
    num_chunks = len(df_list)
    print('\n' 'SNP elements added to chunk #', num_chunks,
    'which range from', snp_counter, 'to', snp_count)
    print('Chunk #', num_chunks, 'has been added to df_list.')
    print('\n' 'DataFrame fragmentation complete. Proceeding to VIF analysis ...')

    return df_list

# Split the SNP DataFrame into chunks using the default og_chunk_delim of 50.
df_list = frag_df(df_snps)


SNP elements added to chunk # 1 which range from 0 to 50
Chunk # 1 has been added to df_list.

SNP elements added to chunk # 2 which range from 50 to 100
Chunk # 2 has been added to df_list.

SNP elements added to chunk # 3 which range from 100 to 150
Chunk # 3 has been added to df_list.

SNP elements added to chunk # 4 which range from 150 to 200
Chunk # 4 has been added to df_list.

SNP elements added to chunk # 5 which range from 200 to 250
Chunk # 5 has been added to df_list.

SNP elements added to chunk # 6 which range from 250 to 300
Chunk # 6 has been added to df_list.

SNP elements added to chunk # 7 which range from 300 to 350
Chunk # 7 has been added to df_list.

SNP elements added to chunk # 8 which range from 350 to 400
Chunk # 8 has been added to df_list.

SNP elements added to chunk # 9 which range from 400 to 450
Chunk # 9 has been added to df_list.

SNP elements added to chunk # 10 which range from 450 to 500
Chunk # 10 has been added to df_list.

DataFrame fragmentati

In [11]:
def vif_calc(df_list, threshold=5.0):

    """
    FUNCTION: This method takes a list of DataFrame objects and conducts VIF analysis
    on each of them. Within a chunk, the column with the highest VIF is removed and the
    VIFs are recomputed, until no remaining column exceeds the threshold or only one
    column is left.

    INPUTS:
    df_list --> A list of DataFrame objects, e.g. the chunks produced by frag_df.
    threshold --> The VIF threshold by which columns are to be evaluated (defaults to 5.0).

    RETURNS:
    vif_list --> A new list with one DataFrame per input chunk, without the columns
    that were dropped. The DataFrames in df_list are left unmodified.
    """

    print('\n Iterating through', len(df_list), 'features of the passed list.')
    print('\n Dropping columns with a VIF threshold greater than', threshold, '... \n')

    vif_list = []
    drop_counter = 0

    for df_index, chunk in enumerate(df_list, start=1):
        kept = chunk

        # A VIF measures one column against the others, so there is nothing to
        # evaluate once fewer than two columns remain (this also covers empty chunks).
        # NOTE(review): the VIF is computed on the raw columns; no intercept column
        # is added first -- confirm that this is intended.
        while kept.shape[1] > 1:
            # Build the value matrix once per pass instead of once per column.
            values = kept.values
            vif = [variance_inflation_factor(values, var) for var in range(values.shape[1])]
            worst = max(vif)

            if not worst > threshold:
                break

            max_loc = vif.index(worst)
            g = (float("{0:.2f}".format(worst)))
            print('Dropping', kept.columns[max_loc],
                  'at index', str(max_loc + 1), 'within Chunk #', df_index,
                  'with a VIF of', g)

            # Remove the column by position and build a new frame: this leaves the
            # caller's chunk untouched and cannot remove a second column that
            # happens to share the same label.
            keep_positions = [pos for pos in range(kept.shape[1]) if pos != max_loc]
            kept = kept.iloc[:, keep_positions]
            drop_counter += 1

        vif_list.append(kept)
    print('Removed a total of', drop_counter, 'SNPs.')

    return vif_list

# Filter every chunk using a VIF threshold of 2.0 instead of the function default of 5.0.
vif_list = vif_calc(df_list, 2.0)


 Iterating through 10 features of the passed list.

 Dropping columns with a VIF threshold greater than 2.0 ... 

Removed a total of 0 SNPs.


In [12]:
def reconstitute_df(vif_list):

    """
    FUNCTION: This method joins a list of DataFrame objects side by side into one DataFrame.

    INPUTS:
    vif_list --> A list of DataFrame objects processed by vif_calc

    RETURNS:
    A single DataFrame whose columns are the columns of every list element, in list order.
    """

    # Concatenate along the column axis; rows are aligned on the index by pandas.
    return pd.concat(vif_list, axis="columns")

# Join the chunks returned by vif_calc back into a single DataFrame.
recon_df = reconstitute_df(vif_list)

In [13]:
def column_loss(df_snps, recon_df):

    """
    FUNCTION: This method returns the number of columns removed in the last VIF filter.

    INPUTS:
    df_snps --> The DataFrame of SNP columns as it was before filtering.
    recon_df --> The DataFrame processed in the last filtering iteration.

    RETURNS:
    col_loss --> The number of columns removed in the last VIF filter.
    """

    # Compare column counts (shape[1]); len() counts rows, which column-wise
    # filtering never changes, so it would always report a loss of 0.
    col_loss = df_snps.shape[1] - recon_df.shape[1]
    return col_loss
# Store the value returned by column_loss; it is passed to bootstrap_shuffle later on.
col_loss = column_loss(df_snps, recon_df)

In [10]:
def bootstrap_shuffle(recon_df, col_loss, axis=1):

    """
    FUNCTION: This method returns a copy of a DataFrame that is reordered at random
    along one axis whenever the last VIF filter removed at least one column.

    INPUTS:
    recon_df --> The DataFrame produced by the last filtering iteration.
    col_loss --> The number of columns removed in the last VIF filter. Nothing is
    shuffled when it is 0 or negative.
    axis --> 1 / 'columns' (default) reorders the columns; 0 / 'index' reorders the rows.

    RETURNS:
    shuffled_df --> A new DataFrame with the same labels and values as recon_df;
    recon_df itself is never modified.
    """

    shuffled_df = recon_df.copy()

    # NOTE(review): the previous version passed the result of np.random.shuffle (None)
    # to DataFrame.apply, which raises as soon as col_loss > 0. This version assumes the
    # goal was to reorder whole columns (or rows) so that every label keeps its own
    # values -- confirm that this is the intended shuffle.
    if col_loss > 0:
        axis_length = shuffled_df.shape[1] if axis in (1, 'columns') else shuffled_df.shape[0]
        # A single uniform permutation has the same distribution as repeating the
        # shuffle col_loss times, so one draw is enough.
        new_order = np.random.permutation(axis_length)
        shuffled_df = shuffled_df.take(new_order, axis=axis)

    return shuffled_df

# Build the shuffled copy of recon_df (axis left at its default of 1) and print it in full.
bootstrap = bootstrap_shuffle(recon_df, col_loss)
print(bootstrap)

     snp403_T  snp439_G  snp370_A  snp389_T  snp475_T  snp399_A  snp223_T  \
0           0         0         1         0         0         1         1   
1           0         0         0         1         0         1         0   
2           0         0         1         0         0         0         2   
3           0         0         0         0         0         0         1   
4           0         0         0         1         1         0         0   
5           0         0         0         0         0         1         0   
6           0         0         0         0         0         1         2   
7           0         0         0         1         0         0         0   
8           1         0         1         1         0         0         2   
9           0         0         0         0         0         0         0   
10          1         0         0         0         1         0         2   
11          0         1         1         1         0         0         1   