# 1. Load Required Libraries

In [1]:
import pandas as pd
import src.util as utils
from sklearn.model_selection import train_test_split

# 2. Load Configuration File

In [2]:
# Load the project configuration through the local util helper; later cells read
# the dataset path, column lists, value ranges and output paths from this mapping.
config = utils.load_config()

# 3. Load Dataset

In [3]:
# Read the raw CSV from the location given by the "dataset_path" config entry.
dataset = pd.read_csv(config["dataset_path"])

In [4]:
# Display the freshly loaded frame to inspect its columns and row count.
dataset

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [5]:
# Reorder the columns so that Age comes first; the remaining columns keep
# their relative order and Outcome (the label) stays last.
dataset = dataset[
    [
        'Age',
        'Pregnancies',
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'BMI',
        'DiabetesPedigreeFunction',
        'Outcome',
    ]
]
dataset

Unnamed: 0,Age,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Outcome
0,50,6,148,72,35,0,33.6,0.627,1
1,31,1,85,66,29,0,26.6,0.351,0
2,32,8,183,64,0,0,23.3,0.672,1
3,21,1,89,66,23,94,28.1,0.167,0
4,33,0,137,40,35,168,43.1,2.288,1
...,...,...,...,...,...,...,...,...,...
763,63,10,101,76,48,180,32.9,0.171,0
764,27,2,122,70,27,0,36.8,0.340,0
765,30,5,121,72,23,112,26.2,0.245,0
766,47,1,126,60,0,0,30.1,0.349,1


# 4. Data Validation

In [6]:
# Print the dtype and non-null count of every column.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       768 non-null    int64  
 1   Pregnancies               768 non-null    int64  
 2   Glucose                   768 non-null    int64  
 3   BloodPressure             768 non-null    int64  
 4   SkinThickness             768 non-null    int64  
 5   Insulin                   768 non-null    int64  
 6   BMI                       768 non-null    float64
 7   DiabetesPedigreeFunction  768 non-null    float64
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [7]:
# Count missing (NaN) entries per column.
dataset.isna().sum()

Age                         0
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Outcome                     0
dtype: int64

In [8]:
# Summary statistics for every numeric column.
# NOTE(review): the recorded output shows a minimum of 0 for Glucose, BloodPressure,
# SkinThickness, Insulin and BMI - these may be placeholders for missing
# measurements rather than real readings; confirm before modelling.
dataset.describe()

Unnamed: 0,Age,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,33.240885,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,0.348958
std,11.760232,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,0.476951
min,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078,0.0
25%,24.0,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,0.0
50%,29.0,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,0.0
75%,41.0,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,1.0
max,81.0,17.0,199.0,122.0,99.0,846.0,67.1,2.42,1.0


# 5. Data Defense

In [9]:
def _require(condition, message):
    """Raise ``AssertionError(message)`` unless ``condition`` is truthy."""
    if not condition:
        raise AssertionError(message)


def check_data(input_data, config):
    """Validate the dtypes and value ranges of ``input_data`` against ``config``.

    The checks are raised explicitly instead of via ``assert`` statements so
    that they still run when Python is started with ``-O`` / ``PYTHONOPTIMIZE``,
    which strips ``assert``.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame to validate.
    config : mapping
        Must provide ``"int_columns"`` and ``"float_columns"`` (lists of column
        names) and one ``"range_*"`` entry per checked column whose first two
        items are the inclusive lower and upper bounds.

    Raises
    ------
    AssertionError
        If the int64 / float columns of ``input_data`` differ from the
        configured lists (names and order), or if any value of a checked
        column lies outside its configured range.
    """
    # Number of rows; every range check must hold for all of them.
    n_rows = len(input_data)

    # Check data types: the column lists must match exactly, order included.
    _require(
        input_data.select_dtypes("int64").columns.to_list() == config["int_columns"],
        "an error occurs in int column(s).",
    )
    _require(
        input_data.select_dtypes("float").columns.to_list() == config["float_columns"],
        "an error occurs in float column(s).",
    )

    # Check range of data. Columns are addressed by their position inside the
    # configured column lists, so the order of those lists must line up with
    # the range keys paired with them here.
    range_checks = (
        ("float_columns", 0, "range_bmi", "bmi"),
        ("float_columns", 1, "range_diabetes_pedigree_function", "diabetes pedigree function"),
        ("int_columns", 0, "range_age", "age"),
        ("int_columns", 1, "range_pregnancies", "pregnancies"),
        ("int_columns", 2, "range_glucose", "glucose"),
        ("int_columns", 3, "range_blood_pressure", "blood pressure"),
        ("int_columns", 4, "range_skin_thickness", "skin thickness"),
        ("int_columns", 5, "range_insulin", "insulin"),
        ("int_columns", 6, "range_outcome", "outcome"),
    )
    for group_key, position, range_key, label in range_checks:
        column = config[group_key][position]
        # ``between`` is inclusive on both ends; NaN never counts as in range.
        in_range = input_data[column].between(config[range_key][0], config[range_key][1])
        _require(in_range.sum() == n_rows, f"an error occurs in {label} range.")

In [10]:
# Run the dtype and range checks on the full dataset; this raises if any check
# fails and produces no output otherwise.
check_data(dataset, config)

# 6. Data Splitting

In [11]:
# Separate the feature columns (x) from the label column (y). Both are copies,
# so later changes to them do not touch `dataset`.
x = dataset[config["predictors"]].copy()
y = dataset[config["label"]].copy()

In [12]:
# Display the predictor frame.
x

Unnamed: 0,Age,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction
0,50,6,148,72,35,0,33.6,0.627
1,31,1,85,66,29,0,26.6,0.351
2,32,8,183,64,0,0,23.3,0.672
3,21,1,89,66,23,94,28.1,0.167
4,33,0,137,40,35,168,43.1,2.288
...,...,...,...,...,...,...,...,...
763,63,10,101,76,48,180,32.9,0.171
764,27,2,122,70,27,0,36.8,0.340
765,30,5,121,72,23,112,26.2,0.245
766,47,1,126,60,0,0,30.1,0.349


In [13]:
# Display the label series.
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [14]:
# Keep 70% of the rows for training and hold out 30%, stratified on the label
# so both parts keep the same class balance. The seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [15]:
# Split the held-out rows in half (stratified) into a validation and a test set.
# NOTE: x_test / y_test are both the input and the output of this cell, so
# re-running it without first re-running the previous split cell halves the
# test set again.
x_valid, x_test, y_valid, y_test = train_test_split(
    x_test,
    y_test,
    test_size=0.5,
    random_state=42,
    stratify=y_test,
)

In [16]:
# Persist the validated dataset as a whole.
utils.pickle_dump(dataset, config["dataset_cleaned_path"])

# Persist each split: the first path of every config entry receives the
# predictors, the second one the labels.
for split_x, split_y, split_key in (
    (x_train, y_train, "train_set_path"),
    (x_valid, y_valid, "valid_set_path"),
    (x_test, y_test, "test_set_path"),
):
    utils.pickle_dump(split_x, config[split_key][0])
    utils.pickle_dump(split_y, config[split_key][1])