In [6]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# import librosa
# import pandas as pd
# import numpy as np
# import os # To help us navigate files
# import glob # To find all your audio files

# # Example: Load a single audio file
# # Replace 'path/to/your/audio_file.wav' with a real file path
# try:
#     file_path = 'path/to/your/audio_file.wav'
#     y, sr = librosa.load(file_path)
    
#     print(f"Loaded file with {len(y)} samples at {sr} Hz.")
    
# except Exception as e:
#     print(f"Error loading file: {e}")
#     print("Please make sure you have a valid audio file and path.")

# def extract_features(file_path):
#     """
#     Extracts a set of voice features from a single audio file.
#     Returns a dictionary of features.
#     """
#     try:
#         # 1. Load the audio file
#         y, sr = librosa.load(file_path, sr=None) # sr=None preserves original sample rate
        
#         features = {}
        
#         # 2. Extract MFCCs
#         # We get a matrix of (n_mfcc, time)
#         mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
#         # We take the mean across time for each MFCC coefficient
#         mfccs_mean = np.mean(mfccs, axis=1)
#         # Store each MFCC mean as a separate feature
#         for i in range(len(mfccs_mean)):
#             features[f'mfcc_{i+1}_mean'] = mfccs_mean[i]
            
#         # 3. Extract Spectral Centroid
#         # Represents the "center of mass" of the sound spectrum
#         spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
#         features['spectral_centroid_mean'] = np.mean(spectral_centroid)
        
#         # 4. Extract Zero-Crossing Rate
#         # The rate at which the signal changes sign (from positive to negative)
#         zcr = librosa.feature.zero_crossing_rate(y)
#         features['zcr_mean'] = np.mean(zcr)
        
#         # 5. Extract Chroma Features
#         # Relates to the 12 different pitch classes
#         chroma = librosa.feature.chroma_stft(y=y, sr=sr)
#         chroma_mean = np.mean(chroma, axis=1)
#         for i in range(len(chroma_mean)):
#             features[f'chroma_{i+1}_mean'] = chroma_mean[i]

#         # Note: Calculating precise Jitter and Shimmer is complex and
#         # often requires specialized algorithms (like from Praat software).
#         # However, the MFCCs and ZCR often capture the instability 
#         # (which is what jitter/shimmer measure) effectively for ML.
            
#         return features
    
#     except Exception as e:
#         print(f"Error processing {file_path}: {e}")
#         return None

# # --- Test the function with one file ---
# # test_features = extract_features('path/to/your/audio_file.wav')
# # if test_features:
# #     print("Extracted features:")
# #     print(test_features)

# # Create a list to hold all our data
# all_data = []

# # --- Process Control (Healthy) Files ---
# # Adjust the path pattern to match your folder
# control_files = glob.glob('dataset/control/*.wav') 

# for file_path in control_files:
#     features = extract_features(file_path)
#     if features:
#         # Add the file path and the target label (0 for control)
#         features['file_path'] = file_path
#         features['target'] = 0 
#         all_data.append(features)

# # --- Process Parkinson's (Patient) Files ---
# pd_files = glob.glob('dataset/parkinsons/*.wav') 

# for file_path in pd_files:
#     features = extract_features(file_path)
#     if features:
#         # Add the file path and the target label (1 for PD)
#         features['file_path'] = file_path
#         features['target'] = 1
#         all_data.append(features)

# # --- Create the final DataFrame ---
# df_voice = pd.DataFrame(all_data)

# print(f"Successfully processed {len(df_voice)} audio files.")
# print("DataFrame head:")
# print(df_voice.head())

# --- Load the dataset --------------------------------------------------
# The CSV is fetched over the network on every run of this cell.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data'
df = pd.read_csv(data_url)

# --- Add the synthetic 'upsit_score' feature ---------------------------
# NOTE(review): this column is simulated, not measured. Each row's value is
# picked from one of two normal distributions *according to its 'status'
# label*, so the feature is derived from the target by construction. Any
# accuracy gain it produces reflects the simulation parameters below, not
# real predictive signal.
np.random.seed(42)
n_rows = len(df)

# Both arrays are drawn at full length, in this order, so the random stream
# (and therefore every generated score) is reproducible for the fixed seed.
healthy_scores = np.random.normal(loc=34, scale=4, size=n_rows)
pd_scores = np.random.normal(loc=22, scale=6, size=n_rows)

# Rows with status == 1 take the lower-mean draw, all others the higher-mean
# draw; the result is then bounded to the closed interval [0, 40].
has_pd = df['status'].eq(1)
df['upsit_score'] = np.where(has_pd, pd_scores, healthy_scores).clip(0, 40)


# --- Prepare features (X) and target (y) -------------------------------
# 'status' is the label; it and the 'name' column are removed from the
# feature matrix.
y = df['status']
X = df.drop(['status', 'name'], axis=1)

print(f"New Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

# --- Hold-out split ----------------------------------------------------
# 20% of the rows go to the test set; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- Feature scaling ---------------------------------------------------
# The scaler is fitted on the training partition only and then applied to
# both partitions, so no test-set statistics enter the transformation.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nFeatures Scaled")

# --- Base learners -----------------------------------------------------
model_1 = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)
# NOTE(review): probability=True looks unnecessary for a hard-voting
# ensemble; it is kept in case a later cell calls predict_proba -- confirm.
model_2 = SVC(probability=True, random_state=42)
model_3 = KNeighborsClassifier(n_neighbors=5)

# --- Ensemble ----------------------------------------------------------
# Hard voting: the ensemble's prediction is the majority class label among
# the three base learners.
named_estimators = [('lr', model_1), ('svm', model_2), ('knn', model_3)]
ensemble_model_v2 = VotingClassifier(estimators=named_estimators, voting='hard')

# --- Training ----------------------------------------------------------
print("\nStarting Model Training")
ensemble_model_v2.fit(X_train_scaled, y_train)
print("Training Complete")

# --- Evaluate the ensemble trained on all features ----------------------
y_pred = ensemble_model_v2.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# --- Voice-only baseline -----------------------------------------------
# The baseline is measured here rather than typed in as a literal, so it
# always corresponds to the same train/test split and the same model
# configuration as the number it is compared against. An unfitted copy of
# the ensemble is trained on every feature except 'upsit_score', with its
# own scaler fitted on the training partition only.
voice_only_columns = [col for col in X_train.columns if col != 'upsit_score']
baseline_scaler = StandardScaler().fit(X_train[voice_only_columns])
baseline_train_scaled = baseline_scaler.transform(X_train[voice_only_columns])
baseline_test_scaled = baseline_scaler.transform(X_test[voice_only_columns])

# clone() copies the hyper-parameters only, so the already-fitted
# ensemble_model_v2 is left untouched.
baseline_model = clone(ensemble_model_v2)
baseline_model.fit(baseline_train_scaled, y_train)
baseline_accuracy = accuracy_score(y_test, baseline_model.predict(baseline_test_scaled))

print(f"Baseline Accuracy (Voice Only): {baseline_accuracy * 100:.2f}%")
print(f"NEW Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(report)

New Features (X) shape: (195, 23)
Target (y) shape: (195,)

Features Scaled

Starting Model Training
Training Complete
Baseline Accuracy (Voice Only): 94.87%
NEW Accuracy: 97.44%

Classification Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        10
           1       1.00      0.97      0.98        29

    accuracy                           0.97        39
   macro avg       0.95      0.98      0.97        39
weighted avg       0.98      0.97      0.97        39

