# Splitting 
My target variable (regression) is imbalanced, so I will use a stratified regression split.

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from math import ceil

In [2]:
# specifying data types for the columns to maintain formatting from original data
data_types = {
    'hospital_service_area': object, 
    'hospital_county': object,
    'operating_certificate_number': object, 
    'permanent_facility_id': object,
    'facility_name': object, 
    'age_group': object, 
    'zip_code_3_digits': object, 
    'gender': object, 
    'race': object,
    'ethnicity': object, 
    'payment_typology_1': object, 
    'payment_typology_2': object,
    'payment_typology_3': object, 
    'length_of_stay': int
}

In [3]:
all_visits = pd.read_csv('data/planned_deliveries.csv', dtype=data_types)
all_visits = all_visits.loc[:, all_visits.columns != 'Unnamed: 0']

In [6]:
y = all_visits.loc[:, 'length_of_stay']
X = all_visits.loc[:, all_visits.columns != 'length_of_stay']

In [11]:
def stratified_regression_split(X:pd.DataFrame, y:pd.Series, train_size:float, val_size:float, test_size:float, n_bins: int, random_state:int):
    '''
    Performs a stratified split of inputted data (with respect to y) into a training set, validation set, and test set to specified percentages of the data and
    performs basic error checking.

    Parameters:
    - X: a 2D pandas DataFrame, the feature matrix
    - y: a 1D pandas Series, the target variable matrix matching X
    - train_size: a float between 0 and 1, the percentage of X which should be training data
    - val_size: a float between 0 and 1, the percentage of X which should be reserved for validation
    - test_size: a float between 0 and 1, the percentage of X which should be reserved for final testing
    - n_bins: an int, the number of bins to categorize the target variable y using (in order to perform stratified split)
    - random_state: an int, the random state to split with
    Note: The sum of train_size + val_size + test_size must be 1.0 (100% of X).

    Returns:
    - (X_train) a 2D pandas DataFrame, the feature matrix of training data
    - (y_train) a 1D pandas Series, the target variable matrix for training data
    - (X_val) a 2D pandas DataFrame, the feature matrix of validation data
    - (y_val) a 1D pandas Series, the target variable matrix for validation data
    - (X_test) a 2D pandas DataFrame, the feature matrix of testing data
    - (y_test) a 1D pandas Series, the target variable matrix for testing data

    Raises:
    - ValueError for invalid input
    '''
    if ((train_size + val_size + test_size) != 1):
        raise ValueError('Your train_size + val_size + test_size must add up to 1 (100%)!')
    if (not isinstance(random_state, int)):
        raise ValueError('Your random_state must be an int!')

    bins = np.linspace(np.min(y), np.max(y), n_bins+1)
    y_binned = np.digitize(y,bins)

    X_train, X_other, y_train, y_other = train_test_split(X, y, train_size=train_size, stratify=y_binned, random_state=random_state)
    
    X_len = X.shape[0]
    val_percent_of_other = (val_size * X_len)/(X_len - (train_size * X_len))
    X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, train_size=val_percent_of_other, stratify=y_binned, random_state=random_state)
    
    if not ((X_train.shape[0] == (int)(train_size * X_len)) and (y_train.shape[0] == (int)(train_size * X_len))):
        raise ValueError(f'Training set size should be {train_size * X_len}, instead is: {X_train.shape[0]}')
    if not ((X_val.shape[0] == (int)(val_size * X_len)) and (y_val.shape[0] == (int)(val_size * X_len))):
        raise ValueError(f'Validation set size should be {val_size * X_len}, instead is: {X_val.shape[0]}')
    if not ((X_test.shape[0] == ceil(test_size * X_len)) and (y_test.shape[0] == ceil(test_size * X_len))):
        raise ValueError(f'Test set size should be {test_size * X_len}, instead is: {X_test.shape[0]}')

    return X_train, y_train, X_val, y_val, X_test, y_test

In [13]:
# stratifying length_of_stay into 17 bins corresponding to stays of 0-17 days (each bin is 1 day)
X_train, y_train, X_val, y_val, X_test, y_test = stratified_regression_split(X=X, y=y, train_size=0.6, val_size=0.2, test_size=0.2, n_bins=17, random_state=54)

bins = np.linspace(np.min(y), np.max(y), 18)   # n+1 values where n is the number of bins = 17 given stays of 0-17 days

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.