An attempt to recreate MIDA (Gondara and Wang, 2018) in Python, based on the original implementation written in R: https://gist.github.com/lgondara/18387c5f4d745673e9ca8e23f3d7ebd3

# 1. Loading Dataset

## 1.1. Load a dataset and introduce missingness

Dataset used: Shuttle Dataset (https://archive.ics.uci.edu/ml/datasets/Statlog+(Shuttle))

### 1.1.1. Load the dataset and store it as a dataframe (numeric)

In [1]:
import pandas as pd

In [10]:
def get_dataframe_from_csv(filename, header_row = None):
    """
    Read a whitespace-delimited text file into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        Full path of the file to read.
    header_row : int or None, optional
        0-indexed row that holds the column names; forwarded unchanged to
        ``pandas.read_csv`` as ``header``. ``None`` (the default) means the
        file has no header row, so pandas numbers the columns 0..n-1.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``filename``.

    Raises
    ------
    AssertionError
        If ``filename`` is not a string.

    Notes
    -----
    Side effect: sets the root logger's level to DEBUG and emits one INFO
    record through the module-level ``logging.info``.

    TO DO:
        -: Should the last coloumn name be replaced with "label"?
        -: Add functionality for space de-limited or comma de-limited files
        -: Improve logging, make it module specific logging
    """
    assert isinstance(filename,str), "Input complete filename as a string"
    import pandas as pd
    import logging
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # Module-level logging.info (not logger.info) is kept on purpose: it
    # installs a default handler when none is configured yet.
    logging.info("Input filename has to be space separated data")

    # Always read the file. The previous version only read it when
    # header_row was falsy, so any other header_row hit an unbound
    # `data_orig` at the return statement.
    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True.
    data_orig = pd.read_csv(filename, sep=r"\s+", header=header_row)
    return data_orig

In [11]:
# Smoke test for get_dataframe_from_csv.
# The path is relative to the notebook's working directory; the file is
# presumably the Shuttle training split (name ends in "_trn") -- TODO confirm.
filename = "data/shuttle/shuttle_trn"
train_df = get_dataframe_from_csv(filename)

INFO:root:Input filename has to be space separated data


In [12]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,50,21,77,0,28,0,27,48,22,2
1,55,0,92,0,0,26,36,92,56,4
2,53,0,82,0,52,-5,29,30,2,1
3,37,0,76,0,28,18,40,48,8,1
4,37,0,79,0,34,-26,43,46,2,1


In [13]:
train_df.dtypes

0    int64
1    int64
2    int64
3    int64
4    int64
5    int64
6    int64
7    int64
8    int64
9    int64
dtype: object

In [14]:
len(train_df)

43500

### 1.1.2. Inducing missingness

After dataset loading, start with inducing missingness. 

To start off, introduce a simple random missingness pattern (Missing Completely At Random, MCAR): sample half of the variables and set an observation in those variables to missing if an appended random uniform vector has a value less than a certain threshold. With the default threshold of 0.2, this should introduce about 20% missingness in each sampled variable.

In [119]:
def induce_missingness(dataframe, perc_variables_sampled = 0.5, threshold = 0.2, logger_level =  20):
    """
    Return a copy of ``dataframe`` with MCAR missingness induced.

    Steps:
        1. Draw one random uniform value in [0, 1) per observation (row)
        2. Decide threshold (default = 20%)
        3. Sample variables (default = 50%) from every column except the last
        4. In those variables (from 3), set every row whose random value
           (from 1) is less than the threshold (from 2) to NaN

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data. The last column is treated as the "label": it is never
        sampled and therefore never masked.
    perc_variables_sampled : float, optional
        Fraction of the non-label columns to sample; the count is truncated
        with ``int()``.
    threshold : float, optional
        Rows whose random draw falls below this value become missing in all
        sampled columns.
    logger_level : int, optional
        Level set on the root logger (20 is ``logging.INFO``).

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``dataframe`` is left untouched.

    Raises
    ------
    AssertionError
        If ``dataframe`` is not a pandas DataFrame.

    Notes
    -----
    Side effect: sets the root logger's level to ``logger_level``.

    TO DO:
        -:Add how much percentage of the new data frame is NaN

    """
    import logging
    import numpy as np
    import pandas as pd
    assert isinstance(dataframe,pd.DataFrame)
    logger = logging.getLogger()
    logger.setLevel(logger_level)

    observations_number, variables_number = dataframe.shape[0], dataframe.shape[1]-1 #-1 to account for the "label"
    # Sample column labels from everything except the trailing "label" column.
    sampled_dataframe = dataframe.iloc[:,:-1].sample(n = int(variables_number*perc_variables_sampled), axis = 1)
    sampled_variables = list(sampled_dataframe.columns)
    # assign() returns a new frame, so the random vector is never written
    # into an object that might share data with the caller's frame.
    sampled_dataframe = sampled_dataframe.assign(random = np.random.uniform(size = observations_number))
    missing_rows = (sampled_dataframe["random"] < threshold).to_numpy()

    # A real copy: `dataframe[:]` is only a slice, so writing into it could
    # leak into the caller's data or trigger chained-assignment warnings.
    new_df = dataframe.copy()
    # Series.mask inserts NaN and upcasts integer columns itself. This avoids
    # np.NAN (removed in NumPy 2.0) and the incompatible-dtype .loc write
    # that pandas has deprecated.
    for variable in sampled_variables:
        new_df[variable] = new_df[variable].mask(missing_rows)

    logging.debug(f"\n{new_df.head()}")
    logging.debug(f"\n{sampled_dataframe.head()}")
    logging.debug(f"\n{dataframe.head()}")

    # TO DO:
    #   -:logging.info how much percentage of the new data frame is NaN
    logging.info("Returning new dataframe with MCAR induced missingness")

    return new_df

In [120]:
# Take an independent copy: `train_df[:]` is only a slice, so later in-place
# edits to df1 could leak into train_df or raise chained-assignment warnings.
df1 = train_df.copy()

In [121]:
# logger_level=20 is logging.INFO. The returned frame is only displayed as
# the cell output; it is not assigned to a variable.
induce_missingness(df1,logger_level=20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,50,21,77.0,0,28.0,0,27,48.0,22.0,2
1,55,0,92.0,0,0.0,26,36,92.0,56.0,4
2,53,0,82.0,0,52.0,-5,29,30.0,2.0,1
3,37,0,76.0,0,28.0,18,40,48.0,8.0,1
4,37,0,,0,,-26,43,,,1
5,85,0,88.0,-4,6.0,1,3,83.0,80.0,5
6,56,0,,0,,11,25,,,4
7,55,-1,95.0,-3,54.0,-4,40,41.0,2.0,1
8,53,8,77.0,0,28.0,0,23,48.0,24.0,4
9,37,0,101.0,-7,28.0,0,64,73.0,8.0,1


In [87]:
list(df1.columns)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [None]:
df1.co

In [64]:
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,50.0,21.0,77.0,0.0,28.0,0.0,27.0,48.0,22.0,2.0
1,55.0,0.0,92.0,0.0,0.0,26.0,36.0,92.0,56.0,4.0
2,53.0,0.0,82.0,0.0,52.0,-5.0,29.0,30.0,2.0,1.0
3,,,,,,,,,,
4,,,,,,,,,,


In [34]:
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,random
0,50,21,77,0,28,0,27,48,22,2,0.729022
1,55,0,92,0,0,26,36,92,56,4,0.037615
2,53,0,82,0,52,-5,29,30,2,1,0.532924
3,37,0,76,0,28,18,40,48,8,1,0.251247
4,37,0,79,0,34,-26,43,46,2,1,0.059572


In [35]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,50,21,77,0,28,0,27,48,22,2
1,55,0,92,0,0,26,36,92,56,4
2,53,0,82,0,52,-5,29,30,2,1
3,37,0,76,0,28,18,40,48,8,1
4,37,0,79,0,34,-26,43,46,2,1
