# Import required libraries

In [1]:
import pandas as pd

# Load the dataset

In [2]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
dataset_path = r"D:\knee_ml_model\dataset\updated_data.csv"
df = pd.read_csv(dataset_path)

In [3]:
# Preview the first rows of the loaded table
df.head()

Unnamed: 0,Patient_Type,Flexion_Angle,Flexion_Category,Recovery_Time_Estimate (Weeks),pain_curability_percent
0,Normal,176,fully_stretched,0,99.0
1,Normal,101,partially_bent,0,84.15
2,Normal,43,fully_bent,0,69.3
3,Normal,177,fully_stretched,0,99.0
4,Normal,92,partially_bent,0,84.15


In [4]:
# List the column labels of the table
df.columns

Index(['Patient_Type', 'Flexion_Angle', 'Flexion_Category',
       'Recovery_Time_Estimate (Weeks)', 'pain_curability_percent'],
      dtype='object')

In [5]:
# Report each column's dtype and non-null count, plus memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 5 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Patient_Type                    3000 non-null   object 
 1   Flexion_Angle                   3000 non-null   int64  
 2   Flexion_Category                3000 non-null   object 
 3   Recovery_Time_Estimate (Weeks)  3000 non-null   int64  
 4   pain_curability_percent         3000 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 117.3+ KB


### Check summary statistics

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0,Flexion_Angle,Recovery_Time_Estimate (Weeks),pain_curability_percent
count,3000.0,3000.0,3000.0
mean,110.774667,17.661667,50.629343
std,50.972109,14.213768,26.684044
min,30.0,0.0,4.2
25%,70.0,6.75,31.975
50%,97.0,17.0,52.49
75%,176.0,28.0,70.425
max,180.0,46.0,99.0


### Check the number of unique values in each column

In [7]:
# Count the distinct values in every column
df.nunique()

Patient_Type                       4
Flexion_Angle                     97
Flexion_Category                   3
Recovery_Time_Estimate (Weeks)    10
pain_curability_percent           17
dtype: int64

### Check which categories are present in the categorical columns

In [8]:
# Show the distinct labels of each categorical column, one array per line
for column in ("Patient_Type", "Flexion_Category"):
    print(df[column].unique())

['Normal' 'Mild' 'Moderate' 'Severe']
['fully_stretched' 'partially_bent' 'fully_bent']


### Preprocessing and Model performance

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Load data
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
dataset_path = r"D:\knee_ml_model\dataset\updated_data.csv"
data = pd.read_csv(dataset_path)

# Preprocessing: the three descriptor columns are the features (X),
# the `pain_curability_percent` column is the target (y)
feature_columns = ['Patient_Type', 'Flexion_Angle', 'Flexion_Category']
target_column = 'pain_curability_percent'
X = data[feature_columns]
y = data[target_column]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Preprocessing pipeline
# Column groups, each routed to its own transformer
numerical_features = ['Flexion_Angle']
categorical_features = ['Patient_Type', 'Flexion_Category']

# Standardize the numeric column; one-hot encode the categorical columns,
# dropping the first level of each
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(drop='first'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Candidate regressors, keyed by the label used in the printed report.
# Insertion order is the order in which they are evaluated.
RANDOM_STATE = 42
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE),
    'Support Vector Regressor': SVR(),
}

# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """Fit one model behind the shared preprocessor and report its test error.

    A two-step pipeline is built from the module-level ``preprocessor`` and
    ``model``, fitted on the training split, and scored on the test split.
    The metrics are printed (two decimals) followed by a 30-dash rule.

    Parameters
    ----------
    name : str
        Label shown in the report header.
    model : estimator
        Regressor used as the final ``'model'`` step of the pipeline.
    X_train, y_train
        Features and target used for fitting.
    X_test, y_test
        Features and target used for scoring.

    Returns
    -------
    tuple
        ``(mae, rmse)`` measured on the test split.
    """
    steps = [('preprocessor', preprocessor), ('model', model)]
    fitted = Pipeline(steps=steps).fit(X_train, y_train)
    predictions = fitted.predict(X_test)

    # RMSE is the square root of the mean squared error
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))

    report = (
        f"{name} Results:",
        f"MAE: {mae:.2f}",
        f"RMSE: {rmse:.2f}",
        "-" * 30,
    )
    for line in report:
        print(line)

    return mae, rmse

# Score every candidate and file its metrics under the model's label
results = {}
for name, model in models.items():
    mae, rmse = evaluate_model(name, model, X_train, X_test, y_train, y_test)
    results[name] = dict(MAE=mae, RMSE=rmse)

# Recap all models side by side
print("Model Performance Summary:")
for model_name, metrics in results.items():
    summary_line = f"{model_name} - MAE: {metrics['MAE']:.2f}, RMSE: {metrics['RMSE']:.2f}"
    print(summary_line)


Linear Regression Results:
MAE: 2.02
RMSE: 3.06
------------------------------
Decision Tree Results:
MAE: 0.00
RMSE: 0.00
------------------------------
Random Forest Results:
MAE: 0.00
RMSE: 0.00
------------------------------
Support Vector Regressor Results:
MAE: 1.20
RMSE: 2.80
------------------------------
Model Performance Summary:
Linear Regression - MAE: 2.02, RMSE: 3.06
Decision Tree - MAE: 0.00, RMSE: 0.00
Random Forest - MAE: 0.00, RMSE: 0.00
Support Vector Regressor - MAE: 1.20, RMSE: 2.80


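### *Result*: We can observe that Decision Tree and Random Forest both have zero error metrics, but we decided to choose Random Forest due to its ensemble capabilities.

**Caveat:** an error of exactly zero most likely means the target is fully determined by the input features and that test rows duplicate training rows (the data has 3,000 rows but at most 4 × 97 × 3 = 1,164 distinct feature combinations), so these scores should not be read as evidence that the model generalizes to unseen patients.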