In [68]:
# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Model
from sklearn.ensemble import RandomForestRegressor

# Evaluation
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [69]:
df=pd.read_csv('wearable_health_devices_performance_upto_26june2025.csv')

In [70]:
df.head()

Unnamed: 0,Test_Date,Device_Name,Brand,Model,Category,Price_USD,Battery_Life_Hours,Heart_Rate_Accuracy_Percent,Step_Count_Accuracy_Percent,Sleep_Tracking_Accuracy_Percent,Water_Resistance_Rating,User_Satisfaction_Rating,GPS_Accuracy_Meters,Connectivity_Features,Health_Sensors_Count,App_Ecosystem_Support,Performance_Score
0,2025-06-01,Fitbit Inspire 4,Fitbit,Inspire 4,Fitness Tracker,141.74,129.9,89.69,93.03,78.91,3ATM,6.5,,"Bluetooth, WiFi",5,Cross-platform,68.4
1,2025-06-01,Apple Watch SE 3,Apple,Watch SE 3,Smartwatch,834.64,26.5,95.92,98.2,79.76,IP68,8.3,4.9,"WiFi, Bluetooth, NFC",8,iOS,60.1
2,2025-06-01,Fitbit Versa 4,Fitbit,Versa 4,Sports Watch,145.34,161.2,92.24,96.81,74.49,IPX8,6.0,1.7,Bluetooth,7,Cross-platform,59.3
3,2025-06-01,Polar Vantage V3,Polar,Vantage V3,Smartwatch,349.53,69.4,96.77,95.56,78.06,IP68,8.0,3.2,"WiFi, Bluetooth, NFC, LTE",12,Cross-platform,61.0
4,2025-06-01,Samsung Galaxy Watch FE,Samsung,Galaxy Watch FE,Smartwatch,502.43,39.7,92.27,98.15,75.23,IPX8,8.3,1.6,"WiFi, Bluetooth, NFC, LTE",14,Android/iOS,61.2


In [71]:
df.shape

(2375, 17)

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2375 entries, 0 to 2374
Data columns (total 17 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Test_Date                        2375 non-null   object 
 1   Device_Name                      2375 non-null   object 
 2   Brand                            2375 non-null   object 
 3   Model                            2375 non-null   object 
 4   Category                         2375 non-null   object 
 5   Price_USD                        2375 non-null   float64
 6   Battery_Life_Hours               2375 non-null   float64
 7   Heart_Rate_Accuracy_Percent      2375 non-null   float64
 8   Step_Count_Accuracy_Percent      2375 non-null   float64
 9   Sleep_Tracking_Accuracy_Percent  2375 non-null   float64
 10  Water_Resistance_Rating          2375 non-null   object 
 11  User_Satisfaction_Rating         2375 non-null   float64
 12  GPS_Accuracy_Meters 

In [73]:
df.isnull().sum()

Test_Date                            0
Device_Name                          0
Brand                                0
Model                                0
Category                             0
Price_USD                            0
Battery_Life_Hours                   0
Heart_Rate_Accuracy_Percent          0
Step_Count_Accuracy_Percent          0
Sleep_Tracking_Accuracy_Percent      0
Water_Resistance_Rating              0
User_Satisfaction_Rating             0
GPS_Accuracy_Meters                632
Connectivity_Features                0
Health_Sensors_Count                 0
App_Ecosystem_Support                0
Performance_Score                    0
dtype: int64

In [74]:
df.duplicated().sum()

np.int64(0)

In [75]:
# ✅ Convert 'Test_Date' column to datetime format
df['Test_Date'] = pd.to_datetime(df['Test_Date'])

# ✅ Extract day, month, and year from the date and store them in new columns
df['Test_Date_day'] = df['Test_Date'].dt.day
df['Test_Date_month'] = df['Test_Date'].dt.month
df['Test_Date_year'] = df['Test_Date'].dt.year

# ✅ Drop original date and non-numeric/categorical columns that may not be useful for modeling
df.drop('Test_Date', axis=1, inplace=True)
df.drop('Connectivity_Features', axis=1, inplace=True)
df.drop('App_Ecosystem_Support', axis=1, inplace=True)
df.drop('Device_Name', axis=1, inplace=True)

# ✅ One-hot encode all remaining categorical (object) columns
for col in df.select_dtypes(include='object').columns:
    dummies = pd.get_dummies(df[col], prefix=col, drop_first=True).astype(int)  # Create binary columns
    df = pd.concat([df.drop(columns=[col]), dummies], axis=1)  # Replace original with dummies

# ✅ Impute missing values in 'GPS_Accuracy_Meters' using median strategy
imputer = SimpleImputer(strategy='median')
df['GPS_Accuracy_Meters'] = imputer.fit_transform(df[['GPS_Accuracy_Meters']]).ravel()  # Flatten to 1D

In [76]:
# ✅ Step 1: Split the features (X) and target (y)
x = df.drop(columns=['User_Satisfaction_Rating'], axis=1)  # All columns except the target
y = df['User_Satisfaction_Rating']  # Target column

# ✅ Step 2: Train-test split (80% training, 20% testing)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# ✅ Step 3: Feature scaling using StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Fit the scaler on training data and transform both train and test sets
x_train_sc = sc.fit_transform(x_train)
x_test_sc = sc.transform(x_test)  # Use transform (not fit_transform) to prevent data leakage

# ✅ Step 4: Initialize and train the Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(max_samples=0.75, random_state=42)
rf.fit(x_train_sc, y_train)

# ✅ Step 5: Make predictions on the test set
y_pred = rf.predict(x_test_sc)

# ✅ Step 6: Evaluate model using R² Score
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)  # (true values, predicted values)
print("R² Score:", r2)


R² Score: 0.8660917518116604


In [77]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'max_features': ['auto', 'sqrt'],
    'min_samples_split': [2, 5]
}

In [67]:
from sklearn.model_selection import GridSearchCV

# ✅ Step 1: Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'max_features': ['auto', 'sqrt']
}

# ✅ Step 2: Set up GridSearchCV with 5-fold cross-validation and R² as the scoring metric
grid_search = GridSearchCV(
    estimator=rf,              # RandomForestRegressor instance
    param_grid=param_grid,     # Dictionary of hyperparameters to try
    cv=5,                      # 5-fold cross-validation
    scoring='r2',              # Evaluation metric: R² score
    n_jobs=-1,                 # Use all available CPU cores for parallel processing
    verbose=1                  # Print progress during search
)

# ✅ Step 3: Run the grid search on the scaled training data
grid_search.fit(x_train_sc, y_train)

# ✅ Step 4: Get the best model and parameters found during the search
best_model = grid_search.best_estimator_              # Best trained model
print("Best Parameters:", grid_search.best_params_)   # Best hyperparameters
print("Best R² Score from GridSearchCV:", grid_search.best_score_)  # Best cross-validated R² score

Fitting 5 folds for each of 24 candidates, totalling 120 fits


60 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
34 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\clash\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\clash\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 1356, in wrapper
    estimator._validate_params()
  File "C:\Users\clash\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\Lo

Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}
Best R² Score from GridSearchCV: 0.7919355753034667
