# Preface: 

We use the train_test_split function from sklearn all the time to separate our train and test data. However, in order to make adjustments to the sampling, you have to understand the underlying code and see what is going on behind the curtain.

To illustrate the underlying code of this function, I wrote two train/test split functions: one for NumPy arrays and one for pandas DataFrames.

In [54]:
#First the numpy train test split function

import numpy as np
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly split NumPy arrays ``X`` and ``y`` into train and test parts.

    The row positions of ``X`` are shuffled with ``np.random.permutation``.
    The first ``test_size`` shuffled positions form the test set and the
    remaining positions form the training set; the same positions are used
    to index ``y``.

    Parameters
    ----------
    X : np.ndarray
        Samples, indexed along the first axis.
    y : np.ndarray
        Targets, indexed with the same row positions as ``X``. Its length is
        not checked against ``X``.
    test_size : float or int, default 0.2
        A float in ``[0, 1]`` is the fraction of rows placed in the test set
        (truncated to a whole number of rows). An int in
        ``[0, X.shape[0]]`` is the absolute number of test rows.
    random_state : int or None, default None
        When not None it is passed to ``np.random.seed`` before shuffling so
        the split is reproducible. This reseeds NumPy's global generator.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``test_size`` is neither a float nor an int, or lies outside the
        valid range for its type.
    """
    n_samples = X.shape[0]

    # bool is a subclass of int; a True/False test_size is almost certainly
    # a mistake, so reject it instead of treating it as 1/0 rows.
    if isinstance(test_size, bool):
        raise ValueError("test_size must be a float or an int")

    # If test_size is a float, convert the fraction to a number of rows
    if isinstance(test_size, (float, np.floating)):
        if not 0.0 <= test_size <= 1.0:
            raise ValueError(
                "test_size as a float must be between 0.0 and 1.0"
            )
        test_index = int(n_samples * test_size)
    # If test_size is an int, use it as the number of rows
    elif isinstance(test_size, (int, np.integer)):
        # A negative value would slice from the end of the permutation and a
        # value above n_samples would silently be clamped by the slice.
        if not 0 <= test_size <= n_samples:
            raise ValueError(
                "test_size as an int must be between 0 and the number of samples"
            )
        test_index = int(test_size)
    else:
        raise ValueError("test_size must be a float or an int")

    # Seed only when requested so that random_state=None keeps using the
    # current state of the global generator.
    if random_state is not None:
        np.random.seed(random_state)

    permutation = np.random.permutation(n_samples)
    test_indices = permutation[:test_index]
    train_indices = permutation[test_index:]

    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    return X_train, X_test, y_train, y_test

In [55]:
#Test this out with numpy arrays

# Three feature rows (pulled from a random number generator) repeated 13
# times: 39 samples with 11 features each.
X = np.tile(
    np.array([[1, 1, 2, 3, 3, 4, 4, 6, 7, 8, 10],
              [1, 2, 4, 5, 2, 3, 4, 5, 2, 1, 2],
              [2, 3, 5, 3, 4, 1, 3, 4, 5, 6, 7]]),
    (13, 1),
)

#Made these up
# Exactly one label per row of X (39 values). Labels beyond the number of
# rows in X could never be selected by the split, so none are listed.
y = np.array([12, 13, 14, 15, 16, 17, 18, 19, 11, 20, 11, 11, 13, 14, 15, 13, 14, 15, 16, 13, 16, 20, 22, 12, 11, 10, 12, 12, 11,
             12, 122, 14, 15, 16, 16, 17, 17, 19, 19])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 10)

print("X_train is ", X_train, "\n")
print("X_test is ", X_test, "\n")
print("y_train is ", y_train, "\n")
print("y_test is ", y_test, "\n")

X_train is  [[ 1  1  2  3  3  4  4  6  7  8 10]
 [ 2  3  5  3  4  1  3  4  5  6  7]
 [ 1  2  4  5  2  3  4  5  2  1  2]
 [ 1  1  2  3  3  4  4  6  7  8 10]
 [ 1  2  4  5  2  3  4  5  2  1  2]
 [ 1  2  4  5  2  3  4  5  2  1  2]
 [ 1  2  4  5  2  3  4  5  2  1  2]
 [ 1  2  4  5  2  3  4  5  2  1  2]
 [ 1  1  2  3  3  4  4  6  7  8 10]
 [ 1  2  4  5  2  3  4  5  2  1  2]
 [ 2  3  5  3  4  1  3  4  5  6  7]
 [ 2  3  5  3  4  1  3  4  5  6  7]
 [ 2  3  5  3  4  1  3  4  5  6  7]
 [ 2  3  5  3  4  1  3  4  5  6  7]
 [ 2  3  5  3  4  1  3  4  5  6  7]
 [ 2  3  5  3  4  1  3  4  5  6  7]
 [ 1  2  4  5  2  3  4  5  2  1  2]
 [ 2  3  5  3  4  1  3  4  5  6  7]
 [ 1  2  4  5  2  3  4  5  2  1  2]
 [ 1  2  4  5  2  3  4  5  2  1  2]
 [ 1  1  2  3  3  4  4  6  7  8 10]
 [ 1  1  2  3  3  4  4  6  7  8 10]
 [ 1  1  2  3  3  4  4  6  7  8 10]
 [ 1  1  2  3  3  4  4  6  7  8 10]] 

X_test is  [[ 1  1  2  3  3  4  4  6  7  8 10]
 [ 1  2  4  5  2  3  4  5  2  1  2]
 [ 2  3  5  3  4  1  3  4  5  6  7]
 [

This is great and all, but usually the dataset we split is a pandas DataFrame, not a NumPy array.

The following code splits a pandas DataFrame into train and test sets.


In [56]:
#Now develop a train test split using pandas dataframe
import pandas as pd

In [57]:
# Load the diabetes dataset used by the cells below. Diabetes.csv can be found
# in the datasets folder; the relative path here is resolved against the
# current working directory.

df = pd.read_csv("Diabetes.csv")
def train_test_split_pandas(df, test_size=0.2, random_state=None):
    """Randomly split a pandas DataFrame into train and test DataFrames.

    Row positions are shuffled with ``np.random.permutation``. The first
    ``int(len(df) * test_size)`` shuffled positions form the test set and
    the remaining positions form the training set. Rows are selected with
    ``df.iloc``, so the original index labels are kept.

    Parameters
    ----------
    df : pd.DataFrame
        The data to split.
    test_size : float, default 0.2
        Fraction of the rows placed in the test set (truncated to a whole
        number of rows).
    random_state : int or None, default None
        When not None it is passed to ``np.random.seed`` before shuffling so
        the split is reproducible. This reseeds NumPy's global generator.

    Returns
    -------
    tuple
        ``(train_df, test_df)``.
    """
    # Compare against None rather than relying on truthiness: 0 is a valid
    # seed and must not be silently ignored.
    if random_state is not None:
        np.random.seed(random_state)

    shuffled_indices = np.random.permutation(len(df))
    test_set_size = int(len(df) * test_size)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]

    train_df = df.iloc[train_indices]
    test_df = df.iloc[test_indices]
    return train_df, test_df



In [58]:
# np.random.seed() returns None, which is what gets printed here. The call
# still reseeds NumPy's global random generator as a side effect.
print(np.random.seed(True))

None


In [59]:
# Try the pandas split: 30% of the rows go to the test set, seeded for
# reproducibility.
train_df, test_df = train_test_split_pandas(df, random_state=42, test_size=0.3)

# One print call with a newline separator emits the same text as printing
# each piece on its own line.
print(
    "Train DataFrame:",
    train_df,
    "\n\n\n\n\n",
    "Test DataFrame:",
    test_df,
    sep="\n",
)

Train DataFrame:
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
302            5       77             82             41       42  35.8   
334            1       95             60             18       58  23.9   
139            5      105             72             29      325  36.9   
485            0      135             68             42      250  42.3   
547            4      131             68             21      166  33.1   
18             1      103             30             38       83  43.3   
593            2       82             52             22      115  28.5   
140            3      128             78              0        0  21.1   
326            1      122             64             32      156  35.1   
266            0      138              0              0        0  36.3   
626            0      125             68              0        0  24.7   
83             0      101             65             28        0  24.6   
61             8     