# Splitting 
My regression target (`length_of_stay`) is imbalanced, so I will split the data with a stratified split designed for continuous targets.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from math import ceil

In [2]:
# Column dtypes passed to read_csv so values keep the formatting of the original data:
# every descriptive/identifier column is read as a plain object, and only the
# target column (length_of_stay) is read as an integer.
data_types = dict.fromkeys(
    (
        'hospital_service_area',
        'hospital_county',
        'operating_certificate_number',
        'permanent_facility_id',
        'facility_name',
        'age_group',
        'zip_code_3_digits',
        'gender',
        'race',
        'ethnicity',
        'payment_typology_1',
        'payment_typology_2',
        'payment_typology_3',
    ),
    object,
)
data_types['length_of_stay'] = int

In [3]:
# Load the visits using the dtype map defined above.
all_visits = pd.read_csv('data/planned_deliveries.csv', dtype=data_types)
# Discard the 'Unnamed: 0' column when it is present (presumably a saved index - confirm).
all_visits = all_visits.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Separate the regression target from the feature columns.
y = all_visits['length_of_stay']
X = all_visits.drop(columns='length_of_stay')

In [86]:
def stratified_continuous_split(X:pd.DataFrame, y:pd.Series, train_size:float, val_size:float, test_size:float, random_state:int):
    '''
    Performs a stratified split of inputted data (with respect to y) into a training set, validation set, and test set to specified percentages 
    of the data using verstack's scsplit and performs basic error checking.

    The split is done in two passes: the training set is carved out first, and the
    remainder is then divided into validation and test sets.

    Parameters:
    - X: a 2D pandas DataFrame, the feature matrix
    - y: a 1D pandas Series, the target variable matrix matching X
    - train_size: a float strictly between 0 and 1, the percentage of X which should be training data
    - val_size: a float strictly between 0 and 1, the percentage of X which should be reserved for validation
    - test_size: a float strictly between 0 and 1, the percentage of X which should be reserved for final testing
    - random_state: an int, the random state to split with (used for both passes)
    Note: The sum of train_size + val_size + test_size must be 1.0 (100% of X), up to
    floating-point rounding (e.g. 0.7 + 0.2 + 0.1 is accepted).

    Returns:
    - (X_train) a 2D pandas DataFrame, the feature matrix of training data
    - (y_train) a 1D pandas Series, the target variable matrix for training data
    - (X_val) a 2D pandas DataFrame, the feature matrix of validation data
    - (y_val) a 1D pandas Series, the target variable matrix for validation data
    - (X_test) a 2D pandas DataFrame, the feature matrix of testing data
    - (y_test) a 1D pandas Series, the target variable matrix for testing data

    Raises:
    - ValueError for invalid input (a size outside (0, 1), sizes not summing to 1,
      a non-int random_state)
    - ValueError if any returned set does not have the expected number of rows
    '''
    from math import isclose

    # Validate the arguments first so bad input fails fast, before verstack is imported
    # or any splitting work is done.
    for size_name, size in (('train_size', train_size), ('val_size', val_size), ('test_size', test_size)):
        # A size of 0 or 1 leaves nothing to split (train_size == 1 would divide by zero below).
        if not 0 < size < 1:
            raise ValueError(f'Your {size_name} must be strictly between 0 and 1, got {size}!')
    # Compare with a tolerance: an exact "!= 1" wrongly rejects sums such as
    # 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999 in floating point.
    if not isclose(train_size + val_size + test_size, 1.0):
        raise ValueError('Your train_size + val_size + test_size must add up to 1 (100%)!')
    if (not isinstance(random_state, int)):
        raise ValueError('Your random_state must be an int!')

    from verstack.stratified_continuous_split import scsplit

    # Pass 1: training set vs. everything else.
    X_train, X_other, y_train, y_other = scsplit(X, y, stratify=y, test_size=(1-train_size), random_state=random_state)

    X_len = X.shape[0]
    # Share of the remainder that goes to the test set (test rows / non-training rows).
    test_percent_of_other = (test_size * X_len)/(X_len - (train_size * X_len))
    # The remainder is re-indexed from 0 before the second pass, so the validation/test
    # rows are not looked up by their original index labels.
    X_other = X_other.reset_index(drop=True)
    y_other = y_other.reset_index(drop=True)

    # Pass 2: validation set vs. test set.
    X_val, X_test, y_val, y_test = scsplit(X_other, y_other, stratify=y_other, test_size=test_percent_of_other, random_state=random_state)

    # basic error checking to check that split returned train, val, and test of expected sizes
    for set_name, fraction, X_part, y_part in (
        ('Training', train_size, X_train, y_train),
        ('Validation', val_size, X_val, y_val),
        ('Test', test_size, X_test, y_test),
    ):
        expected = fraction * X_len
        # The expected size is rarely a whole number, so accept either rounding direction.
        accepted_counts = (int(expected), ceil(expected))
        if X_part.shape[0] not in accepted_counts or y_part.shape[0] not in accepted_counts:
            raise ValueError(f'{set_name} set size should be approx. {expected}, instead is: {X_part.shape[0]}')

    return X_train, y_train, X_val, y_val, X_test, y_test

In [87]:
# 60% train / 20% validation / 20% test, with a fixed seed for reproducibility
X_train, y_train, X_val, y_val, X_test, y_test = stratified_continuous_split(
    X, y, train_size=0.6, val_size=0.2, test_size=0.2, random_state=26)

In [88]:
# Show the distinct target values and their counts for the train, validation,
# and test targets (one line each) to compare their distributions.
print(*(np.unique(target, return_counts=True) for target in (y_train, y_val, y_test)), sep='\n')

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 13, 14, 17]), array([1653, 5714, 3023,  692,  151,   30,    9,    4,    1,    1,    1,
          1,    1,    1], dtype=int64))
(array([ 1,  2,  3,  4,  5,  6,  7,  9, 12]), array([ 551, 1905, 1008,  231,   51,   10,    3,    1,    1], dtype=int64))
(array([ 1,  2,  3,  4,  5,  6,  7,  8, 14]), array([ 551, 1905, 1008,  231,   50,   10,    3,    2,    1], dtype=int64))


# Preprocessing
Preprocessing my train, validation, and test sets for use in ML models. Missing values in the categorical columns are imputed here; the ordinal column has no missing values, so it is only encoded.

In [67]:
# Columns of the dataset grouped by the encoding each one should receive.
# One-hot encoded columns:
onehot_ftrs = [
    'hospital_service_area',
    'hospital_county',
    'operating_certificate_number',
    'permanent_facility_id',
    'facility_name',
    'zip_code_3_digits',
    'gender',
    'race',
    'ethnicity',
    'payment_typology_1',
    'payment_typology_2',
    'payment_typology_3',
]
# Ordinal encoded column, with its categories listed in ascending age order:
ordinal_ftrs = ['age_group']
ordinal_cats = [[
    '0 to 17',
    '18 to 29',
    '30 to 49',
    '50 to 69',
    '70 or Older',
]]

In [68]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

In [71]:
# replace missing values in categorical columns with 'not reported', then one-hot encode;
# categories not seen while fitting are ignored rather than raising (handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant',fill_value='not reported')),
    # `sparse_output` is the current name of the dense/sparse switch; the old `sparse`
    # keyword was deprecated in scikit-learn 1.2 and removed in 1.4
    ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'))])

# my data has no missing values in its ordinal column, so only encoding is necessary
ordinal_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories = ordinal_cats))])

In [72]:
# Route each group of columns to its matching encoder pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('onehot', categorical_transformer, onehot_ftrs),
    ('ordinal', ordinal_transformer, ordinal_ftrs),
])

clf = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the training data ONLY; validation and test data are transformed with
# the encodings learned from the training data.
X_train_prep = clf.fit_transform(X_train)
X_val_prep, X_test_prep = (clf.transform(held_out) for held_out in (X_val, X_test))

print('Before fit_transform:', X_train.shape)
print('After fit_transform:', X_train_prep.shape)

Before fit_transform: (11282, 13)
After fit_transform: (11282, 496)


#### Non-pipelined categorical column encoding to verify results

In [78]:
# Keep every training column except the ordinal 'age_group' column.
cat_X_train = X_train.drop(columns='age_group', errors='ignore')

In [82]:
# Same constant-fill imputation as in the pipeline, applied by hand:
# fit the imputer on the categorical training columns, then fill their missing values.
si_cat = SimpleImputer(strategy='constant', fill_value='not reported').fit(cat_X_train)
cat_X_train_no_nans = si_cat.transform(cat_X_train)
# bare expression so the notebook displays the imputed array
cat_X_train_no_nans

array([['Capital/Adirond', 'Albany', '0101000', ...,
        'Managed Care, Unspecified', 'not reported', 'not reported'],
       ['New York City', 'Queens', '7003000', ..., 'Medicaid',
        'not reported', 'not reported'],
       ['Finger Lakes', 'Monroe', '2754001', ..., 'Medicaid',
        'not reported', 'not reported'],
       ...,
       ['not reported', 'not reported', 'not reported', ...,
        'Private Health Insurance', 'not reported', 'not reported'],
       ['Hudson Valley', 'Westchester', '5907001', ..., 'Medicaid',
        'not reported', 'not reported'],
       ['Long Island', 'Suffolk', '5153000', ...,
        'Managed Care, Unspecified', 'not reported', 'not reported']],
      dtype=object)

In [83]:
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# fit on the imputed training data and encode it in one step
X_train_ohe = enc.fit_transform(cat_X_train_no_nans)
# name the encoded columns after the columns that were actually fed to the encoder,
# so the names always line up with the data regardless of column order
print('feature names:',enc.get_feature_names_out(list(cat_X_train.columns)))
print('X_train transformed')
print(X_train_ohe)
# transform X_test: select the same categorical columns, then reuse the imputer and
# encoder fitted on the training data (previously the training matrix was encoded
# again here, so X_test_ohe never contained test data)
cat_X_test = X_test.loc[:, X_test.columns != 'age_group']
cat_X_test_no_nans = si_cat.transform(cat_X_test)
X_test_ohe = enc.transform(cat_X_test_no_nans)
print('X_test transformed')
print(X_test_ohe)

feature names: ['hospital_service_area_Capital/Adirond'
 'hospital_service_area_Central NY' 'hospital_service_area_Finger Lakes'
 'hospital_service_area_Hudson Valley' 'hospital_service_area_Long Island'
 'hospital_service_area_New York City'
 'hospital_service_area_Southern Tier' 'hospital_service_area_Western NY'
 'hospital_service_area_not reported' 'hospital_county_Albany'
 'hospital_county_Allegany' 'hospital_county_Bronx'
 'hospital_county_Broome' 'hospital_county_Cattaraugus'
 'hospital_county_Cayuga' 'hospital_county_Chautauqua'
 'hospital_county_Chemung' 'hospital_county_Chenango'
 'hospital_county_Clinton' 'hospital_county_Cortland'
 'hospital_county_Dutchess' 'hospital_county_Erie'
 'hospital_county_Franklin' 'hospital_county_Fulton'
 'hospital_county_Genesee' 'hospital_county_Jefferson'
 'hospital_county_Kings' 'hospital_county_Livingston'
 'hospital_county_Madison' 'hospital_county_Manhattan'
 'hospital_county_Monroe' 'hospital_county_Montgomery'
 'hospital_county_Nassau' 

In [85]:
# Sanity check on the manual encoding: X_train_ohe holds only the one-hot encoded
# categorical columns, so it should be exactly one column narrower than X_train_prep,
# which also carries the single ordinal age_group column.
print(X_train_ohe.shape)

(11282, 495)
