In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 
#Import Your Ensemble Model 
from sklearn.ensemble import VotingClassifier

# Load the dataset
# Read the Parkinsons CSV straight from the UCI archive URL into a DataFrame.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data'
df = pd.read_csv(data_url)

print("--- Data Loaded Successfully ---")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\n")

# Prepare the data (X and y)
# Features (X): every column except the 'status' label and the 'name' column.
X = df.drop(['status', 'name'], axis='columns')
# Target (y): the 'status' column.
y = df['status']

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

# Split the data into training and testing sets
# 20% of the rows are held out for testing, with a fixed seed for
# repeatability; stratify=y keeps the 'status' class proportions similar in
# both partitions.
# NOTE(review): the dropped 'name' column looks like a per-recording
# identifier; if several rows belong to the same subject, a row-level random
# split can place one subject in both partitions -- TODO confirm and consider
# a group-aware split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Scale the features
# The scaler is fitted on the training partition only; the test partition is
# then transformed with those same fitted statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n--- Features Scaled ---")

# Create the individual models
# Each classifier below acts as one "voter" in the ensemble.
model_1 = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)
# probability=True enables probability estimates on the SVC. The hard-voting
# ensemble below votes on predicted labels, so this is only here for optional
# later use (e.g. soft voting).
model_2 = SVC(probability=True, random_state=42)
model_3 = KNeighborsClassifier(n_neighbors=5)

# Create the ensemble model (voting classifier)
# voting='hard' means majority rule over the voters' predicted labels
# (e.g. if two models say '1' and one says '0', the result is '1').
ensemble_model = VotingClassifier(
    estimators=[('lr', model_1), ('svm', model_2), ('knn', model_3)],
    voting='hard',
)

# Train the ensemble model on the scaled training features
print("\n--- Starting Model Training ---")
ensemble_model.fit(X_train_scaled, y_train)
print("--- Training Complete ---")

# Evaluate the model
# Predict on the scaled test features, then compare against the true labels.
y_pred = ensemble_model.predict(X_test_scaled)

report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("\n--- Model Evaluation ---")
print(f"Overall Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:", report, sep="\n")

--- Data Loaded Successfully ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR        