In [None]:
import pandas as pd
import numpy as np
import src.utils as utils

from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer

# Load Config File

In [None]:
# Load the project configuration once; later cells read file paths and the seed from it by key.
CONFIG_DATA = utils.config_load()
# Display the loaded configuration for inspection.
CONFIG_DATA

# Plan Preprocessing

**Summary EDA**:
- No missing values
- Features are uncorrelated
- Scale the `Time` and `Amount` features

# Load Dataset

In [None]:
def load_dataset(return_file=True):
    """Load the pickled train/valid/test splits and print their shapes.

    Paths come from ``CONFIG_DATA['<split>_set_path']``; for each split the
    entry at index 0 is loaded as the features and index 1 as the target.

    Parameters
    ----------
    return_file : bool, default True
        When True the six loaded objects are returned, otherwise None.

    Returns
    -------
    tuple or None
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)`` when
        ``return_file`` is True.
    """
    # Read every split in train -> valid -> test order, features before target.
    loaded = {}
    for split in ('train', 'valid', 'test'):
        loaded[f'X_{split}'] = utils.pickle_load(CONFIG_DATA[f'{split}_set_path'][0])
        loaded[f'y_{split}'] = utils.pickle_load(CONFIG_DATA[f'{split}_set_path'][1])

    # Report the shapes only after everything has been loaded.
    report = (
        ("X_train shape :", 'X_train'),
        ("y_train shape :", 'y_train'),
        ("X_valid shape :", 'X_valid'),
        ("y_valid shape :", 'y_valid'),
        ("X_test shape  :", 'X_test'),
        ("y_test shape  :", 'y_test'),
    )
    for label, key in report:
        print(label, loaded[key].shape)

    if not return_file:
        return None
    return (loaded['X_train'], loaded['X_valid'], loaded['X_test'],
            loaded['y_train'], loaded['y_valid'], loaded['y_test'])

In [None]:
# Load all three splits into memory (load_dataset also prints their shapes).
X_train, X_valid, X_test, y_train, y_valid, y_test = load_dataset()

# Preprocess Train

Scale the data

In [None]:
def fit_standardize(data, return_file=True, columns=['Time', 'Amount']):
    """Fit a RobustScaler on the selected columns and pickle it.

    Parameters
    ----------
    data : DataFrame-like
        Data indexed by ``columns`` to obtain the values to fit on.
    return_file : bool, default True
        When True the fitted scaler is returned, otherwise None.
    columns : list of str, default ['Time', 'Amount']
        Columns the scaler is fitted on.

    Returns
    -------
    RobustScaler or None
        The fitted scaler when ``return_file`` is True.
    """
    # fit() hands back the estimator itself, so build and fit in one expression.
    fitted_scaler = RobustScaler().fit(data[columns])

    # Persist the fitted scaler at the configured location.
    utils.pickle_dump(fitted_scaler, CONFIG_DATA['standardizer_path'])

    return fitted_scaler if return_file else None

In [None]:
# Fit the standardizer on the training features only; fit_standardize also pickles it.
standardizer = fit_standardize(data=X_train)

In [None]:
def transform_standardize(data, standardizer, columns=['Time', 'Amount']):
    """Return a copy of ``data`` with the selected columns standardized.

    The input frame is left untouched: the scaled values are written into a
    copy, so calling this function again on the same input (for example when
    a notebook cell is re-run) does not scale already-scaled values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least ``columns``.
    standardizer : object
        Fitted transformer exposing ``transform``; it is called with
        ``data[columns]``.
    columns : list of str, default ['Time', 'Amount']
        Columns to replace with their standardized values.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``data``.
    """
    columns = list(columns)

    # Keep the original index and column labels so the assignment below aligns
    # row-for-row and column-for-column.
    scaled = pd.DataFrame(
        standardizer.transform(data[columns]),
        index=data.index,
        columns=columns,
    )

    # Work on a copy so the caller's frame is never modified in place.
    data_standard = data.copy()
    data_standard[columns] = scaled
    return data_standard


In [None]:
# Scale the training features with the fitted standardizer (default columns: Time, Amount).
X_train_std = transform_standardize(data = X_train,
                                    standardizer = standardizer)

In [None]:
# Inspect the standardized training features.
X_train_std

In [None]:
# Summary statistics of the training features after scaling.
X_train_std.describe()

Balancing Data

In [None]:
# Relative class frequencies of the training target; this is where the class imbalance shows up.
y_train.value_counts(normalize=True)

We will downsample the majority class (training data only).

In [None]:
# !pip install imblearn

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
def random_undersampler(X, y):
    """Randomly under-sample ``X``/``y`` and print the class counts.

    Uses imblearn's ``RandomUnderSampler`` seeded with ``CONFIG_DATA['seed']``.

    Parameters
    ----------
    X : features passed to ``fit_resample``.
    y : target passed to ``fit_resample``; must expose ``value_counts()``.

    Returns
    -------
    tuple
        ``(X_resample, y_resample)`` as returned by ``fit_resample``.
    """
    # Seeded sampler so the resampled subset comes from a fixed random state.
    under_sampler = RandomUnderSampler(random_state=CONFIG_DATA['seed'])
    X_resample, y_resample = under_sampler.fit_resample(X, y)

    # Report the class distribution before and after, separated by a blank line.
    counts_before = y.value_counts()
    counts_after = y_resample.value_counts()
    for line in (
        'Distribution before resampling :',
        counts_before,
        "",
        'Distribution after resampling  :',
        counts_after,
    ):
        print(line)

    return X_resample, y_resample

In [None]:
# Balance the standardized training set by under-sampling (prints class counts before/after).
X_train_clean, y_train_clean = random_undersampler(X_train_std, y_train)

Dump all preprocessors

In [None]:
# Bundle the fitted preprocessing objects in one dict so they can be restored together.
preprocessor = {
    'standardizer': standardizer
}

# Persist the bundle; _preprocess_data reloads it from this same configured path.
utils.pickle_dump(preprocessor, CONFIG_DATA['preprocessor_path'])

# Preprocess all

In [None]:
def clean_data(data, standardizer):
    """Apply the cleaning steps to a feature set.

    The only step at present is standardization via
    ``transform_standardize`` with its default columns.

    Parameters
    ----------
    data : features to clean.
    standardizer : fitted scaler forwarded to ``transform_standardize``.

    Returns
    -------
    The standardized data returned by ``transform_standardize``.
    """
    return transform_standardize(data, standardizer)

In [None]:
def _preprocess_data(data):
    """Clean ``data`` using the preprocessor stored on disk.

    The preprocessor dict is unpickled from
    ``CONFIG_DATA['preprocessor_path']`` and its ``'standardizer'`` entry is
    handed to ``clean_data``.

    Parameters
    ----------
    data : features to preprocess.

    Returns
    -------
    The cleaned data returned by ``clean_data``.
    """
    # Pull the fitted scaler straight out of the persisted bundle.
    fitted_scaler = utils.pickle_load(CONFIG_DATA['preprocessor_path'])['standardizer']
    return clean_data(data, fitted_scaler)

# Preprocess all

Generate preprocessor

In [None]:
def generate_preprocessor(return_file=True):
    """Fit the preprocessing objects on the training features and pickle them.

    Only the training features are read: the scaler is fitted on them alone,
    so the target file is not loaded.

    Parameters
    ----------
    return_file : bool, default True
        When True the preprocessor dict is returned, otherwise None.

    Returns
    -------
    dict or None
        ``{'standardizer': <fitted scaler>}`` when ``return_file`` is True.
    """
    # Load the training features (index 0 of the configured train paths).
    X = utils.pickle_load(CONFIG_DATA['train_set_path'][0])

    # Generate preprocessor: standardizer
    standardizer = fit_standardize(X)

    # Dump the bundle so _preprocess_data can restore it later.
    preprocessor = {
        'standardizer': standardizer
    }
    utils.pickle_dump(preprocessor, CONFIG_DATA['preprocessor_path'])

    if return_file:
        return preprocessor
    

In [None]:
# Refit the preprocessor from the pickled training set and dump it to the configured path.
preprocessor = generate_preprocessor()

Preprocess each split (train, valid, test)

In [None]:
def preprocess_data(type='train', return_file=True):
    """Preprocess one data split and pickle the cleaned result.

    The split is read from ``CONFIG_DATA[f'{type}_set_path']``, its features
    are passed through ``_preprocess_data`` and, for the ``'train'`` split
    only, features and target are under-sampled. The outcome is written to
    ``CONFIG_DATA[f'{type}_clean_path']``.

    Parameters
    ----------
    type : str, default 'train'
        Split name used to build the config keys.
    return_file : bool, default True
        When True the cleaned features and target are returned, otherwise None.

    Returns
    -------
    tuple or None
        ``(X_clean, y_clean)`` when ``return_file`` is True.
    """
    # Read the raw split: features at index 0, target at index 1.
    raw_features = utils.pickle_load(CONFIG_DATA[f'{type}_set_path'][0])
    raw_target = utils.pickle_load(CONFIG_DATA[f'{type}_set_path'][1])

    # Only the features are transformed; the target passes through unchanged.
    X_clean, y_clean = _preprocess_data(raw_features), raw_target

    # Under-sampling applies to the training split only.
    if type == 'train':
        X_clean, y_clean = random_undersampler(X_clean, y_clean)

    print("X clean shape:", X_clean.shape)
    print("y clean shape:", y_clean.shape)

    # Persist the cleaned split next to its configured paths.
    utils.pickle_dump(X_clean, CONFIG_DATA[f'{type}_clean_path'][0])
    utils.pickle_dump(y_clean, CONFIG_DATA[f'{type}_clean_path'][1])

    return (X_clean, y_clean) if return_file else None

In [None]:
# Preprocess the training split (standardize, then under-sample) and dump the result.
X_train_clean, y_train_clean = preprocess_data(type = 'train')

In [None]:
# Preprocess the validation split (standardize only, no resampling) and dump the result.
X_valid_clean, y_valid_clean = preprocess_data(type = 'valid')

In [None]:
# Preprocess the test split (standardize only, no resampling) and dump the result.
X_test_clean, y_test_clean = preprocess_data(type = 'test')

In [None]:
# Summary statistics of the cleaned (standardized and under-sampled) training features.
X_train_clean.describe()

In [None]:
# List the feature names present in the cleaned training set.
X_train_clean.columns.tolist()