# 3 Different Approaches for Training/Test Splitting of a Pandas DataFrame

## Load Dataset as a Pandas Dataframe

In [1]:
import pandas as pd

# Read the heart dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='source/heart.csv')
df.head(n=5)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [2]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(303, 14)

## 1 sklearn.model_selection.train_test_split()

Select columns for X and Y variables.

In [3]:
# Name of the target column; every other column is treated as a feature.
Y_col = 'output'
# Filter the column index directly with a boolean mask rather than slicing
# the whole frame first and then reading its columns back.
X_cols = df.columns[df.columns != Y_col]
Y_col, X_cols

('output',
 Index(['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
        'exng', 'oldpeak', 'slp', 'caa', 'thall'],
       dtype='object'))

Define the X (features) and Y (target) variables.

In [4]:
# Feature matrix: all rows, feature columns only.
X = df.loc[:, X_cols]
# Target vector: all rows of the single target column.
Y = df.loc[:, Y_col]

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, reusing the X / Y defined above.
# A fixed random_state makes the shuffle, and therefore the split, identical
# on every run (without it the partition changes each time the cell runs).
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1
)

In [6]:
# Show the (rows, columns) shape of the training and test feature sets.
tuple(part.shape for part in (X_train, X_test))

((242, 13), (61, 13))

In [7]:
# Row labels of the original frame that landed in the training features.
X_train.index

Int64Index([ 24,  51,  39, 131, 216,  34, 236,  44, 279, 229,
            ...
            167,  90, 170,  50,  79, 163, 161,   6,  10,  91],
           dtype='int64', length=242)

In [8]:
# Row labels of the original frame that landed in the test features.
X_test.index

Int64Index([ 17,  96, 110, 101, 127,  68, 230, 253, 118, 197, 133, 245, 267,
             63, 155, 244, 139, 141,  42, 186,  28,  41, 243, 120,  48, 158,
            132, 275, 241, 206,  21,   4,  92, 293, 153, 129,  54,  61, 109,
            137, 285, 207, 114, 177, 232,  99, 173, 228, 148,  46, 282, 105,
            240, 117, 213, 172, 271, 246, 164,  58, 195],
           dtype='int64')

## 2 df.sample()

In [9]:
# Draw a seeded 80% random sample of the rows for training; the rows whose
# labels were not drawn form the test set.
df_train = df.sample(frac=0.8, random_state=1)
df_test = df.loc[~df.index.isin(df_train.index)]

# Separate the feature columns from the target column for each partition.
X_train, y_train = df_train[X_cols], df_train[Y_col]
X_test, y_test = df_test[X_cols], df_test[Y_col]

In [10]:
# Show the (rows, columns) shape of the training and test feature sets.
tuple(part.shape for part in (X_train, X_test))

((242, 13), (61, 13))

In [11]:
# Row labels selected for training; they appear in the order they were sampled.
X_train.index

Int64Index([204, 159, 219, 174, 184, 295, 269, 119, 193, 154,
            ...
            190,  74,  92, 199, 260,  87, 143, 251,  10, 162],
           dtype='int64', length=242)

In [12]:
# Row labels left over for testing after the sampled training rows were dropped.
X_test.index

Int64Index([  1,   3,   7,   9,  15,  22,  23,  25,  26,  30,  32,  37,  43,
             49,  50,  57,  64,  68,  71,  72,  76,  83,  86, 100, 104, 109,
            115, 121, 125, 126, 129, 133, 136, 141, 144, 153, 155, 156, 165,
            166, 178, 195, 196, 203, 209, 215, 216, 226, 231, 235, 237, 241,
            252, 254, 255, 264, 276, 279, 281, 292, 294],
           dtype='int64')

## 3 np.random.rand()

In [13]:
import numpy as np

# Flag each row for training independently with probability 0.8, so the
# split is only approximately 80/20. Drawing from a locally seeded generator
# makes the mask identical on every run without touching NumPy's global
# random state.
mask = np.random.RandomState(1).rand(len(df)) < 0.8

df_train = df[mask]
df_test = df[~mask]
# NOTE: only df_train / df_test are reassigned here; X_train, X_test, y_train
# and y_test still hold the df.sample() split from the previous section.

In [14]:
# Show the (rows, columns) shape of the training and test frames.
tuple(part.shape for part in (df_train, df_test))

((256, 14), (47, 14))

In [15]:
# Row labels where the random mask was True (training rows).
df_train.index

Int64Index([  0,   1,   3,   4,   6,   7,   8,   9,  10,  11,
            ...
            292, 293, 294, 295, 296, 298, 299, 300, 301, 302],
           dtype='int64', length=256)

In [16]:
# Row labels where the random mask was False (test rows).
df_test.index

Int64Index([  2,   5,  16,  23,  34,  39,  46,  49,  50,  63,  68,  71,  81,
             85,  87,  90, 103, 108, 113, 115, 125, 135, 146, 151, 158, 172,
            182, 184, 191, 203, 208, 209, 225, 243, 245, 251, 252, 257, 262,
            265, 268, 273, 281, 282, 285, 289, 297],
           dtype='int64')

## Save results

In [17]:
import os

# to_csv does not create missing directories, so make sure the target folder
# exists before writing (a no-op when it is already there).
os.makedirs('output', exist_ok=True)

# Persist the current feature/target split, one CSV per piece. With the
# default settings the row index is written as the first column of each file.
X_train.to_csv('output/X_train.csv')
X_test.to_csv('output/X_test.csv')
y_train.to_csv('output/y_train.csv')
y_test.to_csv('output/y_test.csv')