In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import model_selection

[Link to dataset used](https://www.kaggle.com/andrewmvd/fetal-health-classification)

In [2]:
# Load the dataset from a CSV file located relative to the working directory.
df = pd.read_csv("fetal_health.csv")

In [3]:
def custom_split(dataset,test_size,ratio_class_1,ratio_class_2,ratio_class_3,target_col='fetal_health'):
    '''Select test-set rows so that classes 1, 2 and 3 appear in chosen ratios.

    Rows are scanned in their existing order and a row is taken whenever its
    class still has room in its quota. The selection is therefore deterministic
    and made of the earliest matching rows; shuffle the dataframe beforehand
    if a random sample is wanted.

    Parameters:

    dataset: pandas dataframe on which to perform custom split
    test_size: fraction of the dataframe to use for the test set. Ranges between 0-1
    ratio_class_1: ratio of class 1 samples in the custom split dataset
    ratio_class_2: ratio of class 2 samples in the custom split dataset
    ratio_class_3: ratio of class 3 samples in the custom split dataset
    target_col: name of the column holding the class labels (1, 2 or 3).
        Defaults to 'fetal_health'.

    Note that ratio_class_1+ratio_class_2+ratio_class_3 should equal 1.
    Each per-class quota is truncated to an integer, so the resulting test set
    can be slightly smaller than test_size*len(dataset).

    Returns:

    y_test_value: list with the class label of every selected row
    indexes: list with the index label of every selected row, in dataset order

    '''

    test_set_len = int(test_size*len(dataset)) # desired test set length

    # Per-class quota: how many samples of each class may enter the test set.
    # Negative products are clamped to 0 so a nonsensical ratio selects nothing.
    quotas = {
        1.0: max(0, int(test_set_len*ratio_class_1)),
        2.0: max(0, int(test_set_len*ratio_class_2)),
        3.0: max(0, int(test_set_len*ratio_class_3)),
    }
    taken = {label: 0 for label in quotas} # running count per class

    # Nothing more can be selected once every quota is full or the desired
    # length is reached, so the scan can stop at whichever comes first.
    total_wanted = min(test_set_len, sum(quotas.values()))

    y_test_value = []
    indexes = []

    # Every row, including the first one, goes through the same quota check.
    for index, label in dataset[target_col].items():
        if len(indexes) >= total_wanted:
            break
        if label in quotas and taken[label] < quotas[label]:
            taken[label] += 1
            y_test_value.append(label)
            indexes.append(index)
    return y_test_value,indexes

In [4]:
# Request a test split of 5% of the rows: 50% class 1, 30% class 2, 20% class 3.
y_test_value, indexes = custom_split(
    df,
    test_size=0.05,
    ratio_class_1=0.50,
    ratio_class_2=0.30,
    ratio_class_3=0.20,
)

In [5]:
# Count how many selected samples fall into each class label.
np.unique(y_test_value,return_counts=True)

(array([1., 2., 3.]), array([53, 31, 21]))

The test set now has the class ratios we specified: 53, 31 and 21 samples, i.e. roughly 50%, 30% and 20%.

We can now build the test dataframe from the indexes returned by the `custom_split` function.

In [6]:
# Keep only the rows whose index label was selected for the test split.
in_test_split = df.index.isin(indexes)
test_df = df.loc[in_test_split]

In [7]:
# Display the test dataframe (the rendered output below is truncated with "..." rows).
test_df

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.000,0.000,0.000,0.000,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.000,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.000,0.008,0.000,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,125.0,0.000,0.009,0.000,0.000,0.0,0.0,68.0,0.3,75.0,...,120.0,128.0,0.0,0.0,125.0,125.0,126.0,0.0,0.0,3.0
296,148.0,0.000,0.015,0.000,0.000,0.0,0.0,74.0,0.2,78.0,...,139.0,153.0,0.0,0.0,148.0,148.0,149.0,0.0,0.0,3.0
297,148.0,0.000,0.012,0.000,0.000,0.0,0.0,75.0,0.2,84.0,...,145.0,152.0,1.0,0.0,148.0,148.0,149.0,0.0,0.0,3.0
298,148.0,0.000,0.019,0.000,0.000,0.0,0.0,70.0,0.3,71.0,...,139.0,153.0,1.0,0.0,150.0,148.0,150.0,0.0,1.0,3.0


The dataframe above is the test dataframe.

To get the remaining dataframe, which can be used for training, we drop the rows whose indexes were selected for the test set.

In [8]:
# Everything that was not selected for the test split stays available for training.
remaining_df = df.drop(index=indexes)

In [9]:
# Display the dataframe left after removing the test rows.
remaining_df

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
77,145.0,0.007,0.009,0.002,0.0,0.0,0.0,36.0,1.4,0.0,...,57.0,176.0,3.0,1.0,148.0,150.0,150.0,12.0,1.0,1.0
78,145.0,0.003,0.003,0.001,0.0,0.0,0.0,34.0,1.7,0.0,...,57.0,174.0,6.0,1.0,150.0,147.0,150.0,11.0,1.0,1.0
79,145.0,0.005,0.010,0.005,0.0,0.0,0.0,35.0,1.9,0.0,...,56.0,196.0,5.0,0.0,148.0,150.0,151.0,12.0,1.0,1.0
80,145.0,0.000,0.002,0.002,0.0,0.0,0.0,34.0,1.7,0.0,...,57.0,166.0,5.0,1.0,150.0,147.0,150.0,10.0,1.0,1.0
81,145.0,0.002,0.008,0.003,0.0,0.0,0.0,40.0,1.4,0.0,...,56.0,196.0,9.0,1.0,148.0,148.0,149.0,7.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,140.0,0.000,0.000,0.007,0.0,0.0,0.0,79.0,0.2,25.0,...,137.0,177.0,4.0,0.0,153.0,150.0,152.0,2.0,0.0,2.0
2122,140.0,0.001,0.000,0.007,0.0,0.0,0.0,78.0,0.4,22.0,...,103.0,169.0,6.0,0.0,152.0,148.0,151.0,3.0,1.0,2.0
2123,140.0,0.001,0.000,0.007,0.0,0.0,0.0,79.0,0.4,20.0,...,103.0,170.0,5.0,0.0,153.0,148.0,152.0,4.0,1.0,2.0
2124,140.0,0.001,0.000,0.006,0.0,0.0,0.0,78.0,0.4,27.0,...,103.0,169.0,6.0,0.0,152.0,147.0,151.0,4.0,1.0,2.0
