In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine
from IPython.display import display

In [15]:
def train_validation_test_split(X, y,
                                train_size=0.7, val_size=0.1,
                                test_size=0.2, random_state=None,
                                shuffle=True):
    """
    Split features and target into train, validation, and test subsets.

    Utility wrapper that calls Scikit-Learn's ``train_test_split`` twice: the
    first call carves off the test subset, the second call divides the
    remainder into train and validation subsets.

    Args:
        X (Numpy array or DataFrame): Feature data to split.
        y (Numpy array or DataFrame): Target data, split alongside X.
        train_size (float): Fraction of the dataset for the train split (0 to 1).
        val_size (float): Fraction of the dataset for the validation split (0 to 1).
        test_size (float): Fraction of the dataset for the test split (0 to 1).
        random_state (int): Seed forwarded to both underlying splits. With the
            default ``None`` and ``shuffle=True`` the result changes between runs.
        shuffle (bool): Whether to shuffle before splitting; forwarded to both splits.

    Returns:
        tuple: ``(X_train, X_val, X_test, y_train, y_val, y_test)``, in that order.

    Raises:
        ValueError: If any size is not positive, or if the three sizes are
            Python floats that do not sum to 1.

    Note:
        Integer sizes are not checked against each other: ``test_size`` is
        forwarded to ``train_test_split`` unchanged, while ``train_size`` and
        ``val_size`` are only used through their ratio.
    """
    sizes = {'train_size': train_size, 'val_size': val_size, 'test_size': test_size}

    # Fail early with a clear message instead of a late error from the second
    # split (or a ZeroDivisionError when train_size + val_size == 0).
    for name, size in sizes.items():
        if size <= 0:
            raise ValueError(f'{name} must be positive, got {size!r}.')

    # train_size only enters the computation through the train/validation ratio
    # below, so fractions that do not add up to 1 would silently produce
    # subsets that differ from what the caller asked for.
    if all(isinstance(size, float) for size in sizes.values()):
        total = sum(sizes.values())
        if abs(total - 1.0) > 1e-9:
            raise ValueError(
                'train_size, val_size and test_size must sum to 1, '
                f'got {train_size} + {val_size} + {test_size} = {total}.')

    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=shuffle)

    # The validation share is expressed relative to what is left after the
    # test subset has been removed.
    val_share_of_remainder = val_size / (train_size + val_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=val_share_of_remainder,
        random_state=random_state, shuffle=shuffle)

    return X_train, X_val, X_test, y_train, y_val, y_test

def print_split_summary(X_train, X_val, X_test):
    """
    Print and display a short summary of the train, validation, and test features.

    For each subset, in that order, this prints a banner and the subset's
    shape, then displays its first rows and its transposed
    ``describe(include='all')`` table, followed by a blank line.

    Args:
        X_train (DataFrame): Training features.
        X_val (DataFrame): Validation features.
        X_test (DataFrame): Test features.
    """
    sections = (
        ('######################## TRAINING DATA ########################', 'X_train', X_train),
        ('######################## VALIDATION DATA ######################', 'X_val', X_val),
        ('######################## TEST DATA ############################', 'X_test', X_test),
    )

    for banner, label, subset in sections:
        print(banner)
        print(f'{label} Shape: {subset.shape}')
        display(subset.head())
        display(subset.describe(include='all').transpose())
        print('')

In [16]:
# Load the scikit-learn wine dataset and combine its feature columns with a
# 'target' column in a single DataFrame.
raw_data = load_wine()
df = pd.DataFrame(
    raw_data['data'], columns=raw_data['feature_names']
).assign(target=raw_data['target'])
print(df.shape)

(178, 14)


In [17]:
# Split two wine features and the target using the default 70/10/20 proportions.
# No random_state is passed, so the shuffled rows differ from run to run.
X_train, X_val, X_test, y_train, y_val, y_test = train_validation_test_split(
    X=df[['alcohol', 'ash']],
    y=df['target'],
)
print(*(subset.shape for subset in (X_train, X_val, X_test)))

(124, 2) (18, 2) (36, 2)


In [18]:
# Show shape, first rows, and descriptive statistics for each feature subset.
print_split_summary(X_train=X_train, X_val=X_val, X_test=X_test)

######################## TRAINING DATA ########################
X_train Shape: (124, 2)


Unnamed: 0,alcohol,ash
91,12.0,2.42
5,14.2,2.45
105,12.42,2.27
58,13.72,2.5
76,13.03,1.71


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
alcohol,124.0,13.030968,0.804589,11.41,12.4075,13.05,13.7225,14.83
ash,124.0,2.376532,0.270711,1.7,2.2175,2.36,2.6,3.22



######################## VALIDATION DATA ######################
X_val Shape: (18, 2)


Unnamed: 0,alcohol,ash
97,12.29,1.98
160,12.36,2.38
39,14.22,2.51
6,14.39,2.45
114,12.08,2.5


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
alcohol,18.0,13.117222,0.81059,12.0,12.3375,13.195,13.7025,14.39
ash,18.0,2.296111,0.21669,1.94,2.1225,2.36,2.445,2.6



######################## TEST DATA ############################
X_test Shape: (36, 2)


Unnamed: 0,alcohol,ash
59,12.37,1.36
36,13.28,2.84
174,13.4,2.48
101,12.6,1.9
109,11.61,2.7


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
alcohol,36.0,12.837778,0.837425,11.03,12.2375,12.775,13.325,14.75
ash,36.0,2.367222,0.31204,1.36,2.23,2.36,2.54,3.23





## Refactor Functions to Class


In [33]:
class My_Data_Splitter():
    """
    Split a DataFrame into train, validation, and test subsets and summarize them.

    Attributes:
        df (DataFrame): The full dataset passed to the constructor.
        features: Column selection used to build ``X`` (``df[features]``).
        target: Column selection used to build ``y`` (``df[target]``).
        X: Feature data, ``df[features]``.
        y: Target data, ``df[target]``.
    """

    def __init__(self, df, features, target):
        """
        Store the dataset and select the feature and target columns.

        Args:
            df (DataFrame): Dataset containing both features and target.
            features: Column label(s) selecting the feature data from ``df``.
            target: Column label(s) selecting the target data from ``df``.
        """
        self.df = df
        self.features = features
        self.target = target
        self.X = df[features]
        self.y = df[target]

    def train_validation_test_split(self,
                                    train_size=0.7, val_size=0.1,
                                    test_size=0.2, random_state=None,
                                    shuffle=True):
        """
        Split ``self.X`` and ``self.y`` into train, validation, and test subsets.

        Utility wrapper that calls Scikit-Learn's ``train_test_split`` twice: the
        first call carves off the test subset, the second call divides the
        remainder into train and validation subsets.

        Args:
            train_size (float): Fraction of the dataset for the train split (0 to 1).
            val_size (float): Fraction of the dataset for the validation split (0 to 1).
            test_size (float): Fraction of the dataset for the test split (0 to 1).
            random_state (int): Seed forwarded to both underlying splits. With the
                default ``None`` and ``shuffle=True`` the result changes between runs.
            shuffle (bool): Whether to shuffle before splitting; forwarded to both splits.

        Returns:
            tuple: ``(X_train, X_val, X_test, y_train, y_val, y_test)``, in that order.

        Raises:
            ValueError: If any size is not positive, or if the three sizes are
                Python floats that do not sum to 1.

        Note:
            Integer sizes are not checked against each other: ``test_size`` is
            forwarded to ``train_test_split`` unchanged, while ``train_size`` and
            ``val_size`` are only used through their ratio.
        """
        sizes = {'train_size': train_size, 'val_size': val_size, 'test_size': test_size}

        # Fail early with a clear message instead of a late error from the
        # second split (or a ZeroDivisionError when train_size + val_size == 0).
        for name, size in sizes.items():
            if size <= 0:
                raise ValueError(f'{name} must be positive, got {size!r}.')

        # train_size only enters the computation through the train/validation
        # ratio below, so fractions that do not add up to 1 would silently
        # produce subsets that differ from what the caller asked for.
        if all(isinstance(size, float) for size in sizes.values()):
            total = sum(sizes.values())
            if abs(total - 1.0) > 1e-9:
                raise ValueError(
                    'train_size, val_size and test_size must sum to 1, '
                    f'got {train_size} + {val_size} + {test_size} = {total}.')

        X_train_val, X_test, y_train_val, y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=random_state, shuffle=shuffle)

        # The validation share is expressed relative to what is left after the
        # test subset has been removed.
        val_share_of_remainder = val_size / (train_size + val_size)
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val, test_size=val_share_of_remainder,
            random_state=random_state, shuffle=shuffle)

        return X_train, X_val, X_test, y_train, y_val, y_test

    def print_split_summary(self, X_train, X_val, X_test):
        """
        Print and display a short summary of the train, validation, and test features.

        For each subset, in that order, this prints a banner and the subset's
        shape, then displays its first rows and its transposed
        ``describe(include='all')`` table, followed by a blank line.

        Args:
            X_train (DataFrame): Training features.
            X_val (DataFrame): Validation features.
            X_test (DataFrame): Test features.
        """
        sections = (
            ('######################## TRAINING DATA ########################', 'X_train', X_train),
            ('######################## VALIDATION DATA ######################', 'X_val', X_val),
            ('######################## TEST DATA ############################', 'X_test', X_test),
        )

        for banner, label, subset in sections:
            print(banner)
            print(f'{label} Shape: {subset.shape}')
            display(subset.head())
            display(subset.describe(include='all').transpose())
            print('')

In [34]:
# Repeat the split and summary through the class-based API with default proportions.
# No random_state is passed, so the shuffled rows differ from run to run.
splitter = My_Data_Splitter(df, ['alcohol', 'ash'], 'target')
X_train, X_val, X_test, y_train, y_val, y_test = (
    splitter.train_validation_test_split()
)
splitter.print_split_summary(X_train=X_train, X_val=X_val, X_test=X_test)

######################## TRAINING DATA ########################
X_train Shape: (124, 2)


Unnamed: 0,alcohol,ash
76,13.03,1.71
111,12.52,2.17
149,13.08,2.36
85,12.67,2.24
118,12.77,1.98


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
alcohol,124.0,12.925,0.803584,11.03,12.29,12.975,13.5125,14.83
ash,124.0,2.355081,0.280802,1.36,2.2,2.35,2.545,3.23



######################## VALIDATION DATA ######################
X_val Shape: (18, 2)


Unnamed: 0,alcohol,ash
49,13.94,2.27
137,12.53,2.64
112,11.76,2.92
128,12.37,2.3
164,13.78,2.3


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
alcohol,18.0,13.164444,0.873655,11.76,12.3825,13.165,13.8625,14.37
ash,18.0,2.448889,0.277338,1.7,2.3,2.465,2.6175,2.92



######################## TEST DATA ############################
X_test Shape: (36, 2)


Unnamed: 0,alcohol,ash
61,12.64,2.02
146,13.88,2.23
56,14.22,2.3
15,13.63,2.7
63,12.37,2.16


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
alcohol,36.0,13.179167,0.791232,11.45,12.5675,13.215,13.8425,14.3
ash,36.0,2.364722,0.249416,1.7,2.2375,2.37,2.5225,2.86



