In [173]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import pytest
import ipytest
ipytest.autoconfig()

### Quick check of the data

In [174]:
# Load the data file and prepare it for a quick look.
filepath = r'D:\DevOps\MLOpsPactices\Parkinson\parkinsons_updrs.data'

parkinson_data = pd.read_csv(filepath)

# Boolean flag: True when the row's total_UPDRS is above the median of the column.
# It is derived before the headers are renamed, so the raw column name is used here.
parkinson_data['target'] = parkinson_data['total_UPDRS'] > parkinson_data['total_UPDRS'].median()

# Replace the characters # % : ( ) , in the headers with "_" and lower-case them
# so every column can be addressed with a plain identifier-like name.
parkinson_data.columns = (
    parkinson_data.columns.str.replace('[#%:(),]', '_', regex=True).str.lower()
)


In [175]:
# Render the prepared frame; the output below is abbreviated to the first and last rows.
parkinson_data

Unnamed: 0,subject_,age,sex,test_time,motor_updrs,total_updrs,jitter___,jitter_abs_,jitter_rap,jitter_ppq5,...,shimmer_apq3,shimmer_apq5,shimmer_apq11,shimmer_dda,nhr,hnr,rpde,dfa,ppe,target
0,1,72,0,5.6431,28.199,34.398,0.00662,0.000034,0.00401,0.00317,...,0.01438,0.01309,0.01662,0.04314,0.014290,21.640,0.41888,0.54842,0.16006,True
1,1,72,0,12.6660,28.447,34.894,0.00300,0.000017,0.00132,0.00150,...,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.10810,True
2,1,72,0,19.6810,28.695,35.389,0.00481,0.000025,0.00205,0.00208,...,0.00734,0.00844,0.01458,0.02202,0.020220,23.047,0.46222,0.54405,0.21014,True
3,1,72,0,25.6470,28.905,35.810,0.00528,0.000027,0.00191,0.00264,...,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.48730,0.57794,0.33277,True
4,1,72,0,33.6420,29.187,36.375,0.00335,0.000020,0.00093,0.00130,...,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5870,42,61,0,142.7900,22.485,33.485,0.00406,0.000031,0.00167,0.00168,...,0.00973,0.01133,0.01549,0.02920,0.025137,22.369,0.64215,0.55314,0.21367,True
5871,42,61,0,149.8400,21.988,32.988,0.00297,0.000025,0.00119,0.00147,...,0.01052,0.01277,0.01904,0.03157,0.011927,22.886,0.52598,0.56518,0.12621,True
5872,42,61,0,156.8200,21.495,32.495,0.00349,0.000025,0.00152,0.00187,...,0.01371,0.01456,0.01877,0.04112,0.017701,25.065,0.47792,0.57888,0.14157,True
5873,42,61,0,163.7300,21.007,32.007,0.00281,0.000020,0.00128,0.00151,...,0.00693,0.00870,0.01307,0.02078,0.007984,24.422,0.56865,0.56327,0.14204,True


In [176]:
# First five rows, to eyeball the renamed columns and the derived `target` flag.
parkinson_data.head()

Unnamed: 0,subject_,age,sex,test_time,motor_updrs,total_updrs,jitter___,jitter_abs_,jitter_rap,jitter_ppq5,...,shimmer_apq3,shimmer_apq5,shimmer_apq11,shimmer_dda,nhr,hnr,rpde,dfa,ppe,target
0,1,72,0,5.6431,28.199,34.398,0.00662,3.4e-05,0.00401,0.00317,...,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006,True
1,1,72,0,12.666,28.447,34.894,0.003,1.7e-05,0.00132,0.0015,...,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081,True
2,1,72,0,19.681,28.695,35.389,0.00481,2.5e-05,0.00205,0.00208,...,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014,True
3,1,72,0,25.647,28.905,35.81,0.00528,2.7e-05,0.00191,0.00264,...,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277,True
4,1,72,0,33.642,29.187,36.375,0.00335,2e-05,0.00093,0.0013,...,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361,True


In [177]:
# Count of rows above (True) and not above (False) the median total_UPDRS.
parkinson_data['target'].value_counts()

target
False    2941
True     2934
Name: count, dtype: int64

In [178]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
parkinson_data.describe()

Unnamed: 0,subject_,age,sex,test_time,motor_updrs,total_updrs,jitter___,jitter_abs_,jitter_rap,jitter_ppq5,...,shimmer_db_,shimmer_apq3,shimmer_apq5,shimmer_apq11,shimmer_dda,nhr,hnr,rpde,dfa,ppe
count,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,...,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0
mean,21.494128,64.804936,0.317787,92.863722,21.296229,29.018942,0.006154,4.4e-05,0.002987,0.003277,...,0.31096,0.017156,0.020144,0.027481,0.051467,0.03212,21.679495,0.541473,0.65324,0.219589
std,12.372279,8.821524,0.465656,53.445602,8.129282,10.700283,0.005624,3.6e-05,0.003124,0.003732,...,0.230254,0.013237,0.016664,0.019986,0.039711,0.059692,4.291096,0.100986,0.070902,0.091498
min,1.0,36.0,0.0,-4.2625,5.0377,7.0,0.00083,2e-06,0.00033,0.00043,...,0.026,0.00161,0.00194,0.00249,0.00484,0.000286,1.659,0.15102,0.51404,0.021983
25%,10.0,58.0,0.0,46.8475,15.0,21.371,0.00358,2.2e-05,0.00158,0.00182,...,0.175,0.00928,0.01079,0.015665,0.02783,0.010955,19.406,0.469785,0.59618,0.15634
50%,22.0,65.0,0.0,91.523,20.871,27.576,0.0049,3.5e-05,0.00225,0.00249,...,0.253,0.0137,0.01594,0.02271,0.04111,0.018448,21.92,0.54225,0.6436,0.2055
75%,33.0,72.0,1.0,138.445,27.5965,36.399,0.0068,5.3e-05,0.00329,0.00346,...,0.365,0.020575,0.023755,0.032715,0.061735,0.031463,24.444,0.614045,0.711335,0.26449
max,42.0,85.0,1.0,215.49,39.511,54.992,0.09999,0.000446,0.05754,0.06956,...,2.107,0.16267,0.16702,0.27546,0.48802,0.74826,37.875,0.96608,0.8656,0.73173


### Setting up the classes to build a simple model

In [179]:
class ParkinsonUPDRSModel:
    """Gradient-boosting regressor for `total_updrs`, with its data preparation.

    Typical use is `ParkinsonUPDRSModel(path).run_pipeline()`, which calls
    `load_data()` and then `train_model()`.

    Attributes set by `load_data`:
        data: loaded frame with normalized column names and a boolean `target` column.
        X_train, X_test: features after column dropping, scaling and PCA.
        y_train, y_test: the matching `total_updrs` values.
        feature_names: names emitted by the column-dropping step (before scaling/PCA).
        input_columns: raw feature columns the preprocessing pipeline was fitted on.
        preprocessing_pipeline: the fitted preprocessing `Pipeline`.

    Attribute set by `train_model`:
        model: the fitted `GradientBoostingRegressor`.
    """

    # A feature column is dropped when its name contains any of these substrings.
    CORRELATED_TOKENS = (
        'jitter_rap', 'jitter_ppq5', 'jitter_ddp',
        'shimmer_db_', 'shimmer_apq3', 'shimmer_apq5', 'shimmer_apq11', 'shimmer_dda',
    )

    def __init__(self, filepath):
        """Remember the location of the CSV file; nothing is read yet."""
        self.filepath = filepath

    def load_data(self):
        """Read the CSV, split it, and fit the preprocessing pipeline.

        Returns:
            self, so calls can be chained.
        """
        data = pd.read_csv(self.filepath)
        # Derived before renaming, hence the raw `total_UPDRS` header.
        data['target'] = data['total_UPDRS'] > data['total_UPDRS'].median()

        # Replace # % : ( ) , with "_" and lower-case every header.
        data.columns = data.columns.str.replace('[#%:(),]', '_', regex=True).str.lower()
        self.data = data

        # Numeric columns only (the boolean `target` is excluded by the dtype filter),
        # minus the two UPDRS scores and `sex`.
        X = data.select_dtypes(include=['float64', 'int64']).drop(
            columns=['motor_updrs', 'total_updrs', 'sex']
        )
        y = data['total_updrs']
        self.input_columns = list(X.columns)

        # Columns flagged as highly correlated are removed inside the pipeline.
        correlated_cols = [
            col for col in X.columns
            if any(token in col for token in self.CORRELATED_TOKENS)
        ]

        preprocessing = ColumnTransformer(
            transformers=[
                ('drop_correlated_cols', 'drop', correlated_cols)
            ],
            remainder='passthrough'
        )

        pipeline = Pipeline([
            ('preprocessing', preprocessing),
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=0.90))
        ])

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Fit on the training split only, then apply the fitted steps to the test split.
        self.X_train = pipeline.fit_transform(self.X_train)
        self.X_test = pipeline.transform(self.X_test)

        # Keep the fitted pipeline so `predict` can transform raw rows later.
        self.preprocessing_pipeline = pipeline
        self.feature_names = pipeline.named_steps['preprocessing'].get_feature_names_out()
        return self

    def train_model(self):
        """Fit the gradient-boosting regressor on the transformed training split.

        Returns:
            self, so calls can be chained.
        """
        self.model = GradientBoostingRegressor(
            n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
        )
        self.model.fit(self.X_train, self.y_train)
        return self

    def predict(self, input_data):
        """Predict `total_updrs` for the given rows.

        Args:
            input_data: either already-transformed features (passed to the model
                unchanged, as before), or a pandas DataFrame of raw rows holding
                at least the columns in `input_columns`; a DataFrame is run
                through the fitted preprocessing pipeline first.

        Returns:
            Whatever the underlying model's `predict` returns.

        Raises:
            KeyError: if a DataFrame is missing one of `input_columns`.
        """
        if isinstance(input_data, pd.DataFrame):
            input_data = self.preprocessing_pipeline.transform(
                input_data[self.input_columns]
            )
        return self.model.predict(input_data)

    def get_accuracy(self):
        """Return the model's `score` on the held-out test split.

        The model is a regressor, so this is its R^2 score rather than a
        classification accuracy; the method name is kept for existing callers.
        """
        return self.model.score(X=self.X_test, y=self.y_test)

    def run_pipeline(self):
        """Load the data and train the model in one call.

        Returns:
            self, consistent with `load_data` and `train_model`.
        """
        self.load_data()
        self.train_model()
        return self

### Running the pipeline

In [180]:
filepath = r'D:\DevOps\MLOpsPactices\Parkinson\parkinsons_updrs.data'

# Deliberately not called `pipeline`: the pytest fixture defined in the testing
# section uses that name and would silently rebind this instance.
updrs_model = ParkinsonUPDRSModel(filepath)
updrs_model.run_pipeline()

# NOTE: the model is a regressor, so this value is its R^2 score on the test split.
accuracy_score = updrs_model.get_accuracy()
print(f'The Accuracy of the model is: {accuracy_score}')

The Accuracy of the model is: 0.616928781521296


# Testing

In [181]:
# Expected dtype and value range per column, one row each: (column, dtype, min, max).
_SCHEMA_ROWS = (
    ("subject_", "int64", 1, 42),
    ("age", "int64", 36, 85),
    ("sex", "int64", 0, 1),
    ("test_time", "float64", -4.2625, 215.49),
    ("motor_updrs", "float64", 5.0377, 39.511),
    ("total_updrs", "float64", 7, 54.992),
    ("jitter___", "float64", 0.0008, 0.09999),
    ("jitter_abs_", "float64", 0.000002, 0.000446),
    ("jitter_rap", "float64", 0.0003, 0.05754),
    ("jitter_ppq5", "float64", 0.0004, 0.06956),
    ("jitter_ddp", "float64", 0.0009, 0.17261),
    ("shimmer", "float64", 0.004, 0.64898),
    ("shimmer_db_", "float64", 0.02, 2.107),
    ("shimmer_apq3", "float64", 0.001, 0.16267),
    ("shimmer_apq5", "float64", 0.0019, 0.16702),
    ("shimmer_apq11", "float64", 0.0024, 0.27546),
    ("shimmer_dda", "float64", 0.004, 0.48802),
    ("nhr", "float64", 0.0002, 0.74826),
    ("hnr", "float64", 1.5, 37.875),
    ("rpde", "float64", 0.15, 0.96608),
    ("dfa", "float64", 0.51, 0.8656),
    ("ppe", "float64", 0.02, 0.73173),
)

# Expand the rows into the nested mapping the tests read, keyed by lower-cased column name.
input_data_schema = {
    column.lower(): {"dtype": dtype, "range": {"min": lowest, "max": highest}}
    for column, dtype, lowest, highest in _SCHEMA_ROWS
}

In [182]:
filepath = r'D:\DevOps\MLOpsPactices\Parkinson\parkinsons_updrs.data'


@pytest.fixture(scope="module")
def pipeline():
    """Provide a loaded and trained ParkinsonUPDRSModel to the tests.

    Module-scoped so the CSV load and the model training happen once per test
    run instead of once per test function; the tests only read from the instance.
    """
    pl = ParkinsonUPDRSModel(filepath)
    pl.run_pipeline()
    return pl

### Creating the tests

In [None]:
%%ipytest

def test_input_data_ranges(pipeline):
    max_values = pipeline.data.max()
    min_values = pipeline.data.min()
    
    for feature in pipeline.feature_names:
        clean_feature = feature.replace("remainder__", "")
        print(clean_feature)
        assert max_values[clean_feature] <= input_data_schema[clean_feature]['range']['max'] + 0.001, \
            f"Max value for {clean_feature} exceeds schema range"

def test_input_data_types(pipeline):
    data_types = pipeline.data.dtypes
    
    for feature in pipeline.feature_names:
        clean_feature = feature.replace("remainder__", "")
        print(clean_feature)
        assert data_types[clean_feature] == input_data_schema[clean_feature]['dtype']

[32m.[0m[32m.[0m[32m                                                                                           [100%][0m
[32m[32m[1m2 passed[0m[32m in 1.73s[0m[0m


# Let's make the test fail

In [184]:
%%ipytest

def test_input_data_ranges(pipeline):
    max_values = pipeline.data.max()
    min_values = pipeline.data.min()
    
    for feature in pipeline.feature_names:
        clean_feature = feature.replace("remainder__", "")
        print(clean_feature)
        assert max_values[clean_feature] < 0            
        assert min_values[clean_feature] > 1000

def test_input_data_types(pipeline):
    data_types = pipeline.data.dtypes
    
    for feature in pipeline.feature_names:
        clean_feature = feature.replace("remainder__", "")
        print(clean_feature)
        assert data_types[clean_feature] == input_data_schema[clean_feature]['dtype']

[31mF[0m[32m.[0m[31m                                                                                           [100%][0m
[31m[1m_____________________________________ test_input_data_ranges ______________________________________[0m

pipeline = <__main__.ParkinsonUPDRSModel object at 0x0000020C7C76F810>

    [0m[94mdef[39;49;00m [92mtest_input_data_ranges[39;49;00m(pipeline):[90m[39;49;00m
        max_values = pipeline.data.max()[90m[39;49;00m
        min_values = pipeline.data.min()[90m[39;49;00m
    [90m[39;49;00m
        [94mfor[39;49;00m feature [95min[39;49;00m pipeline.feature_names:[90m[39;49;00m
            clean_feature = feature.replace([33m"[39;49;00m[33mremainder__[39;49;00m[33m"[39;49;00m, [33m"[39;49;00m[33m"[39;49;00m)[90m[39;49;00m
            [96mprint[39;49;00m(clean_feature)[90m[39;49;00m
>           [94massert[39;49;00m max_values[clean_feature] < [94m0[39;49;00m[90m[39;49;00m
[1m[31mE           assert np.int64(42) <