# Missing Data

## Plotting missing data

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def missing_data (data, percentage):
    """Plot the percentage of missing values for every column that has any.

    Prints the number of columns containing missing values, draws a
    horizontal bar chart (one bar per such column), saves it to
    'MissingData_Updated.png' in the current working directory and shows it.

    Args:
        data: Column-indexable table (e.g. a pandas DataFrame); iterating it
            must yield column labels.
        percentage: Currently unused; kept so existing callers keep working.
    """
    col_names = []
    percentages = []
    for column in data:
        values = data[column]
        n_values = len(values)
        # Guard the division so a table with zero rows reports "no missing
        # data" instead of raising ZeroDivisionError.
        if n_values == 0:
            continue
        # Vectorized count instead of a Python-level sum over every cell.
        null_pct = pd.isnull(values).sum() / n_values * 100
        if null_pct > 0:
            col_names.append(column)
            percentages.append(null_pct)
    print("Missing data:", len(col_names))

    plt.figure(figsize=(20,20))
    plt.barh(col_names, percentages, alpha=0.5)
    plt.title('Missing Data by feature (%)')
    plt.xlabel('%')
    plt.savefig('MissingData_Updated.png', bbox_inches='tight')
    plt.show()
    
def get_columns_with_missing_data(data, percentage):
    """Return the labels of all columns that contain at least one missing value.

    Also prints how many such columns were found.

    Args:
        data: Column-indexable table (e.g. a pandas DataFrame); iterating it
            must yield column labels.
        percentage: Currently unused; kept so existing callers keep working.

    Returns:
        list: Column labels, in iteration order, whose values include at
        least one null. Empty when the table has no rows.
    """
    # A column has a non-zero missing percentage exactly when any of its
    # values is null; `.any()` is vectorized and is simply False for an empty
    # column, which avoids the former division by a zero length.
    col_names = [column for column in data if pd.isnull(data[column]).any()]
    print("Missing data:", len(col_names))
    return col_names

## Predicting missing data

In [3]:
def training_predictor_missing_data (data, n_iter, cv):
    """Fit a regressor that predicts the incomplete columns from the complete ones.

    Columns without any missing value act as features; columns with at least
    one missing value act as targets. Only rows with no missing value at all
    are used for training.

    Args:
        data: pandas DataFrame to learn from.
        n_iter: Forwarded to findBestRegressorModel.
        cv: Forwarded to findBestRegressorModel.

    Returns:
        tuple: (completed_columns, nan_columns, best_pred_model, X, y_true)
        where X / y_true are the training features / targets.
    """
    has_missing = data.isna().any()
    all_columns = data.columns
    nan_columns = all_columns[has_missing].tolist()
    completed_columns = all_columns[~has_missing].tolist()

    # Training set: only the fully observed rows.
    fully_observed = data.dropna()
    X = fully_observed[completed_columns]
    y_true = fully_observed[nan_columns]

    search = findBestRegressorModel(X, y_true, n_iter=n_iter, cv=cv)
    return (completed_columns, nan_columns, search.best_estimator_, X, y_true)

In [4]:
def predicting_missing_data (model, data, completed_columns, nan_columns):
    """Predict the incomplete columns for every row of `data`.

    Args:
        model: Object exposing `predict(features)`.
        data: pandas DataFrame holding at least `completed_columns`.
        completed_columns: Labels of the feature columns passed to the model.
        nan_columns: Labels assigned to the columns of the prediction.

    Returns:
        pandas.DataFrame: The predictions, indexed like `data`, with
        `nan_columns` as column labels.
    """
    features = data[completed_columns]
    predictions = model.predict(features)
    return pd.DataFrame(predictions, index=data.index, columns=nan_columns)

In [5]:
def training_predictor_missing_data_efficient (data_complete, completed_columns, nan_column, n_iter, cv):
    """Fit a regressor for a single target column.

    Args:
        data_complete: pandas DataFrame used as the training set.
        completed_columns: Labels of the feature columns.
        nan_column: Label of the single target column.
        n_iter: Forwarded to findBestRegressorModel.
        cv: Forwarded to findBestRegressorModel.

    Returns:
        tuple: (best_pred_model, X, y_true) where X / y_true are the training
        features / target.
    """
    print("Forecasting of", nan_column)

    y_true = data_complete[nan_column]
    X = data_complete[completed_columns]

    search = findBestRegressorModel(X, y_true, n_iter=n_iter, cv=cv)
    return (search.best_estimator_, X, y_true)

In [6]:
def predicting_missing_data_efficient (model, data, completed_columns, nan_column):
    """Predict a single target column for every row of `data`.

    Args:
        model: Object exposing `predict(features)`.
        data: pandas DataFrame holding at least `completed_columns`.
        completed_columns: Labels of the feature columns passed to the model.
        nan_column: Label given to the single prediction column.

    Returns:
        pandas.DataFrame: One-column frame of predictions, indexed like `data`.
    """
    features = data[completed_columns]
    predictions = model.predict(features)
    target_labels = [nan_column]
    return pd.DataFrame(predictions, index=data.index, columns=target_labels)

## Random Forest Regressor (for predicting missing data)

In [7]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint


def findBestRegressorModel(features, target, n_iter, cv):
    """Randomized hyperparameter search for a random-forest regressor.

    Args:
        features: Training features (anything supporting `len()` that
            scikit-learn accepts as X).
        target: Training target(s).
        n_iter: Number of parameter combinations sampled by the search.
        cv: Cross-validation setting passed to RandomizedSearchCV.

    Returns:
        The fitted RandomizedSearchCV object (use `.best_estimator_` to get
        the selected model).
    """
    n_samples = len(features)

    # Number of trees: ten candidates between a quarter and a half of the
    # number of training rows. Clamped to 1 because fewer than four rows
    # would otherwise produce 0, which is not a valid forest size.
    n_estimators = [max(1, int(x)) for x in
                    np.linspace(start=int(n_samples/4), stop=int(n_samples/2), num=10)]
    # Number of features to consider at every split. None means "all
    # features", which is what the former 'auto' option meant for regressors;
    # 'auto' itself is no longer accepted by recent scikit-learn releases.
    max_features = [None, 'sqrt']
    # Maximum number of levels in a tree
    max_depth = [int(x) for x in np.linspace(2, 5, num = 3)]
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]

    # Candidate values the randomized search samples from
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}

    # Base model to tune. The split criterion is left at its default (mean
    # squared error): its spelling changed from 'mse' to 'squared_error'
    # across scikit-learn versions, so naming it explicitly breaks on one
    # side or the other.
    rf = RandomForestRegressor(n_estimators = 50, random_state=42)

    # Sample `n_iter` combinations, evaluate each with `cv` cross-validation,
    # and use all available cores.
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = n_iter, cv = cv,
                                   verbose=1, random_state=42, n_jobs = -1)

    # fit() returns the search object itself
    trained_model = rf_random.fit(features, target)

    return trained_model

# Managing Missing Data

In [None]:
def manage_missing_data (df, dimension, percentage, n_iter=50, cv=10):
    """Drop heavily incomplete data, then impute what is left with a model.

    Step 1: for every column whose share of missing values exceeds
    `percentage`, either drop the column (`dimension == "columns"`) or drop
    the rows in which that column is missing (any other `dimension`).
    Step 2: train a regressor on the remaining complete rows and fill the
    remaining missing cells with its predictions.

    Args:
        df: pandas DataFrame to clean. It is not modified in place.
        dimension: "columns" to drop offending columns; anything else drops
            the offending rows instead.
        percentage: Threshold (0-100) of missing values above which a column
            is handled in step 1.
        n_iter: Number of hyperparameter combinations tried during training.
        cv: Cross-validation setting used during training.

    Returns:
        pandas.DataFrame: A new frame with no column above the threshold and
        with remaining missing values replaced by predictions.
    """
    # Step 1: threshold-based removal. Iteration runs over the original
    # column labels; `df` is rebound, never mutated.
    for column in df:
        values = df[column]
        n_values = len(values)
        # Guard the division: an empty frame has nothing missing.
        null_pct = values.isna().sum() / n_values * 100 if n_values else 0.0
        if null_pct > percentage:
            if dimension == "columns":
                df = df.drop(column, axis=1)
            else:
                df = df.dropna(axis=0, subset=[column])

    # Nothing left to impute: skip training, which cannot fit a model with
    # zero target columns.
    if not df.isna().any().any():
        return df.copy()

    # Step 2: predict the remaining missing data for every row, then keep the
    # predictions only where the original cell is missing.
    completed_columns, nan_columns, best_pred_model, _, _ = training_predictor_missing_data(df, n_iter=n_iter, cv=cv)
    res = predicting_missing_data(best_pred_model, df, completed_columns, nan_columns)

    df_full = df.fillna(res)
    return df_full
    