In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split

# Creating the classes

### Data Explorer

In [27]:
class DataExplorer:
    """Stateless helpers for loading a CSV dataset and exploring it.

    Every method is a ``staticmethod``; the class is only a namespace.
    """

    @staticmethod
    def load_data(filepath):
        """Read the CSV file at ``filepath``.

        Args:
            filepath: Path (or any source accepted by ``pandas.read_csv``).

        Returns:
            pandas.DataFrame: The parsed contents of the file.
        """
        # The frame must be returned; otherwise callers receive None.
        return pd.read_csv(filepath)

    @staticmethod
    def explore_data(data):
        """Print a textual overview of ``data``.

        Shows the first five rows (transposed), ``describe()`` statistics,
        the ``info()`` report and the percentage of missing values per column.

        Args:
            data: The pandas DataFrame to summarise.
        """
        print("First 5 rows of the dataset:")
        print(data.head().T)
        print("\nStatistical summary:")
        print(data.describe())
        print("\nInformation about the dataset:")
        # DataFrame.info() writes its report itself and returns None, so
        # wrapping it in print() would emit a stray "None" line.
        data.info()
        print("\nPercentage of missing values in each column:")
        print(data.isna().mean() * 100)

    @staticmethod
    def plot_correlation_matrix(data):
        """Draw an annotated heatmap of the pairwise correlations.

        Only columns whose dtype is not ``object`` take part.

        Args:
            data: The pandas DataFrame to analyse.
        """
        non_object_columns = data.select_dtypes(exclude='object')
        correlation_matrix = non_object_columns.corr()
        plt.figure(figsize=(20, 10))
        sns.heatmap(correlation_matrix, annot=True, fmt=".2f")
        plt.title('Mapa de Correlacion')
        plt.show()

    @staticmethod
    def plot_histograms(data):
        """Plot one 15-bin histogram per column via ``DataFrame.hist``.

        Args:
            data: The pandas DataFrame to plot.
        """
        data.hist(bins=15, figsize=(15, 15))
        plt.show()

    @staticmethod
    def plot_feature_relationships(data, target):
        """Scatter-plot every feature column against ``target``.

        Args:
            data: The pandas DataFrame holding the features.
            target: Normally the label of the target column in ``data``.
                A vector of values (e.g. a Series) is also accepted; in that
                case every column of ``data`` is plotted against it.
        """
        if pd.api.types.is_scalar(target):
            # Column label: plot every *other* column, wherever the target
            # sits in the frame (it is not assumed to be the last column).
            target_label = target
            feature_columns = [col for col in data.columns if col != target]
        else:
            # Vector of target values: all columns of ``data`` are features.
            target_label = getattr(target, 'name', None)
            if target_label is None:
                target_label = 'target'
            feature_columns = list(data.columns)

        for column in feature_columns:
            plt.figure(figsize=(8, 4))
            sns.scatterplot(x=column, y=target, data=data)
            # Use the actual target name rather than a hard-coded one.
            plt.title(f'Relationship between {target_label} and {column}')
            plt.show()



### Parkinson total_UPDRS Model

In [None]:
class ParkinsonUPDRSModel:
    """Train and evaluate a gradient-boosting regressor for ``total_UPDRS``.

    ``load_data`` and ``train_model`` return the instance, so they can be
    chained. Expected call order: ``load_data`` -> ``train_model`` ->
    ``evaluate_model`` / ``cross_validate_model`` / ``predict``.

    Attributes set by ``load_data``: ``data``, ``feature_names``,
    ``pipeline``, ``X_train``, ``X_test``, ``y_train``, ``y_test``.
    ``train_model`` sets ``model``; ``evaluate_model`` sets ``metrics``;
    ``cross_validate_model`` sets ``cv_scores``.
    """

    # A feature column is dropped as "highly correlated" when its name
    # contains any of these substrings.
    CORRELATED_COLUMN_TOKENS = (
        'total_UPDRS',
        'Jitter:RAP',
        'Jitter:PPQ5',
        'Jitter:DDP',
        'Shimmer(dB)',
        'Shimmer:APQ3',
        'Shimmer:APQ5',
        'Shimmer:APQ11',
        'Shimmer:DDA',
    )

    def __init__(self, filepath):
        """Remember where the CSV dataset lives; nothing is read yet.

        Args:
            filepath: Path to the CSV file passed to ``pandas.read_csv``.
        """
        self.filepath = filepath

    @classmethod
    def _correlated_columns(cls, columns):
        """Return the names in ``columns`` that contain a correlated-column token."""
        return [
            col for col in columns
            if any(token in col for token in cls.CORRELATED_COLUMN_TOKENS)
        ]

    def load_data(self):
        """Load the CSV, split it, and fit the preprocessing pipeline.

        Features are the ``float64``/``int64`` columns minus ``motor_UPDRS``,
        ``total_UPDRS`` and ``sex``; the target is ``total_UPDRS``. The
        pipeline drops the correlated columns, standardises the rest and keeps
        enough principal components to explain 90% of the variance. It is
        fitted on the training split only and kept on ``self.pipeline`` so it
        can be re-applied to new data.

        Returns:
            ParkinsonUPDRSModel: ``self``, to allow chaining.

        Raises:
            KeyError: If one of the expected columns is missing from the CSV.
        """
        # Load the data
        self.data = pd.read_csv(self.filepath)
        # NOTE(review): any other numeric column in the CSV (including
        # identifier-like ones) is kept as a feature -- confirm this is intended.
        X = self.data.select_dtypes(include=['float64', 'int64']).drop(columns=['motor_UPDRS', 'total_UPDRS', 'sex'])
        y = self.data['total_UPDRS']
        # Remembered so predict() can select the same columns from new data.
        self.feature_names = list(X.columns)

        # Removing highly correlated columns
        correlated_cols = self._correlated_columns(X.columns)

        # Define preprocessing
        preprocessing = ColumnTransformer(
            transformers=[
                ('drop_correlated_cols', 'drop', correlated_cols)
            ],
            remainder='passthrough'
        )

        # Kept on the instance: a model trained on transformed features is
        # unusable on new data without the fitted transformers.
        self.pipeline = Pipeline([
            ('preprocessing', preprocessing),
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=0.90))
        ])

        # Train-test split
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Fit on the training split only, then apply the same transform to the test split.
        self.X_train = self.pipeline.fit_transform(self.X_train)
        self.X_test = self.pipeline.transform(self.X_test)
        return self

    def train_model(self):
        """Fit a ``GradientBoostingRegressor`` on the transformed training split.

        Returns:
            ParkinsonUPDRSModel: ``self``, to allow chaining.
        """
        # GradientBoostingRegressor model
        self.model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
        self.model.fit(self.X_train, self.y_train)
        return self

    def predict(self, data):
        """Predict ``total_UPDRS`` for new, untransformed rows.

        Args:
            data: DataFrame containing at least the columns in
                ``self.feature_names``; extra columns are ignored.

        Returns:
            numpy.ndarray: One prediction per row of ``data``.
        """
        features = data[self.feature_names]
        return self.model.predict(self.pipeline.transform(features))

    def evaluate_model(self):
        """Print MAE and R2 on the held-out test split.

        The values are also stored in ``self.metrics`` under the keys
        ``'mae'`` and ``'r2'``.
        """
        # Evaluate on the test set
        y_pred = self.model.predict(self.X_test)
        mae = mean_absolute_error(self.y_test, y_pred)
        r2 = r2_score(self.y_test, y_pred)
        self.metrics = {'mae': mae, 'r2': r2}
        print(f'Mean Absolute Error: {mae}')
        print(f'R2 Score: {r2}')

    def cross_validate_model(self, cv=5):
        """Print the mean cross-validated R2 of the model on the training split.

        The per-fold scores are also stored in ``self.cv_scores``.

        Args:
            cv: Cross-validation strategy passed to ``cross_val_score``
                (default 5).
        """
        # Cross-validate the gradient-boosting model.
        # NOTE(review): X_train was already scaled and PCA-projected using the
        # whole training split, so the folds share preprocessing statistics.
        scores = cross_val_score(self.model, self.X_train, self.y_train, cv=cv, scoring='r2')
        self.cv_scores = scores
        # The label says "Accuracy", but the value is the mean R2 across folds.
        print("Average Accuracy with CV:", np.mean(scores))

# Executing the code

In [29]:
# Absolute Windows path to the dataset; adjust it for your environment.
filepath = r'D:\DevOps\MLOpsPactices\Parkinson\parkinsons_updrs.data'

model = ParkinsonUPDRSModel(filepath)
# load_data() and train_model() each return the instance, so they chain.
model.load_data().train_model()
# Report test-set metrics, then the cross-validated score.
model.evaluate_model()
model.cross_validate_model()

Mean Absolute Error: 5.150695631804961
R2 Score: 0.616928781521296
Average Accuracy with CV: 0.588167917718408
