In [None]:
# Optional one-time setup: uncomment the line below to install PyCaret if it
# is not already available in this kernel's environment.
#!pip install pycaret

## Load Data

In [75]:
import pandas  as pd
from pycaret.regression import  *

# Read the cash-value table from the Excel workbook (the path is relative to
# the notebook's working directory) and preview the first rows.
df = pd.read_excel('TG_T_CashValues_Rel.xlsx')
df.head()


Unnamed: 0,Gender,Age,Dur,PPV
0,Male,0,20,15.198437
1,Male,0,21,15.741015
2,Male,0,22,16.267189
3,Male,0,23,16.777448
4,Male,0,24,17.272269


## Init Setup
This step initializes the environment by preprocessing the dataset, automatically handling missing values, encoding categorical features, scaling numeric features, and splitting the data into training and testing sets, ensuring the data is ready for machine learning model training.

In [76]:
# Initialise the PyCaret regression experiment with PPV as the target.
# Feature types are declared explicitly rather than left to inference;
# train_size=0.8 keeps 80% of the rows for training, and session_id pins the
# seed used by the experiment.
s = setup(
    df,
    target='PPV',
    train_size=0.8,
    session_id=100,
    numeric_features=['Age', 'Dur'],
    categorical_features=['Gender'],
)


Unnamed: 0,Description,Value
0,Session id,100
1,Target,PPV
2,Target type,Regression
3,Original data shape,"(5270, 4)"
4,Transformed data shape,"(5270, 4)"
5,Transformed train set shape,"(4216, 4)"
6,Transformed test set shape,"(1054, 4)"
7,Numeric features,2
8,Categorical features,1
9,Preprocess,True


## Compare Models 
The compare_models() function compares multiple models based on performance metrics and returns the best-performing model.



In [77]:
# Score the candidate regressors, keep the top-ranked one, and print that
# estimator's repr.
best_model = compare_models()
print(best_model)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0084,0.0006,0.0234,1.0,0.0018,0.0006,0.136
rf,Random Forest Regressor,0.0325,0.0034,0.0582,0.9999,0.0047,0.0026,0.143
lightgbm,Light Gradient Boosting Machine,0.0322,0.004,0.063,0.9999,0.0044,0.0025,0.116
dt,Decision Tree Regressor,0.0472,0.0073,0.0855,0.9998,0.0058,0.0033,0.014
gbr,Gradient Boosting Regressor,0.0769,0.016,0.1261,0.9997,0.0111,0.0078,0.073
knn,K Neighbors Regressor,0.1544,0.0499,0.223,0.999,0.038,0.0243,0.016
ada,AdaBoost Regressor,0.5595,0.4369,0.6605,0.991,0.0921,0.0749,0.065
lar,Least Angle Regression,1.0915,1.7202,1.311,0.9646,0.1913,0.1782,0.013
br,Bayesian Ridge,1.0915,1.7202,1.311,0.9646,0.1913,0.1782,0.014
ridge,Ridge Regression,1.0915,1.7202,1.311,0.9646,0.1913,0.1782,0.012



Styler.applymap has been deprecated. Use Styler.map instead.



ExtraTreesRegressor(n_jobs=-1, random_state=100)


## Evaluate Model
The evaluate_model() function provides a detailed analysis of the chosen model's performance, including visualizations and metrics.

In [78]:
# Open PyCaret's interactive evaluation widget for the selected model.
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

## Predict on Test Set
In this part, the model is tested on the held-out 20% of the data that it was not trained on.

A small Mean Absolute Error (MAE) means that the average difference between predicted and actual values is minimal, i.e. the model predicts PPV values very accurately.

In [79]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Score the hold-out split. The returned frame carries the true target in
# 'PPV' and the model's output in 'prediction_label'.
test = predict_model(best_model)

test_mae = mean_absolute_error(test['PPV'], test['prediction_label'])
# Take the square root of the MSE explicitly: the `squared=False` keyword is
# deprecated since scikit-learn 1.4 and removed in 1.6, so relying on it
# breaks this cell on newer versions.
test_rmse = mean_squared_error(test['PPV'], test['prediction_label']) ** 0.5

print("MAE(Mean Absolute Error): ", test_mae)
print("RMSE(Root Mean Squared Error): ", test_rmse)
test.head(10)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,0.0086,0.0008,0.0279,1.0,0.0023,0.0006


MAE(Mean Absolute Error):  0.008552081735355807
RMSE(Mean Squared Error):  0.02789045343241796



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Unnamed: 0,Gender,Age,Dur,PPV,prediction_label
4560,Female,43,32,19.737616,19.771211
1564,Male,35,11,9.445037,9.443396
1645,Male,37,1,1.0,1.0
3355,Female,18,11,9.482568,9.48266
652,Male,16,40,23.24806,23.241538
4034,Female,31,40,22.924696,22.934663
1617,Male,36,18,13.872947,13.868043
4887,Female,53,24,15.931975,15.904194
1267,Male,29,8,7.199136,7.199047
3222,Female,15,22,16.252451,16.252783


## Predict on Data

In [80]:
# Remove the target column so the frame looks like unseen input, then score
# every row. DataFrame.drop returns a new frame, so `df` keeps its PPV column.
new_df = df.drop(columns=['PPV'])
predictions = predict_model(best_model, data=new_df)
print("Data with prediction PPV")
predictions.head()


Data with prediction PPV


Unnamed: 0,Gender,Age,Dur,prediction_label
0,Male,0,20,15.198438
1,Male,0,21,15.741014
2,Male,0,22,16.267189
3,Male,0,23,16.777449
4,Male,0,24,17.272268


In [81]:
# Show the original rows (with the true PPV) for comparison with the
# predictions displayed by the previous cell.
print("Data with orginal PPV")
df.head()

Data with orginal PPV


Unnamed: 0,Gender,Age,Dur,PPV
0,Male,0,20,15.198437
1,Male,0,21,15.741015
2,Male,0,22,16.267189
3,Male,0,23,16.777448
4,Male,0,24,17.272269


## Save the Model
We save the model so that it can be reused later without training it again.

In [82]:
# Persist the fitted preprocessing pipeline together with the model under the
# given name so it can be reloaded later with load_model.
save_model(best_model, 'TG_T_CashValues_Rel_pipeline')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['Age', 'Dur'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['Gender'],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('ordinal_encoding',
                  TransformerWrapper(include=['Gender'],
                                     transformer=OrdinalEncoder(cols=['Gender'],
                                                                handle_missing='return_nan',
                                                                mapping=[{'col': 'Gender',
                                                                          'data_type': dtype('O'),
                                                                          'mapping': Female    0
 Male      1
 NaN      -1
 dtype: int64}]))),


## Load the Saved model and Use it

In [83]:
# Reload the pipeline that was saved under this name and score one
# hand-built record with it.
loaded_model = load_model('TG_T_CashValues_Rel_pipeline')

# A single input row using the same feature columns as the training data.
new_data = pd.DataFrame(
    [{'Gender': 'Male', 'Age': 0, 'Dur': 15.7}]
)

predictions = predict_model(loaded_model, data=new_data)
print(predictions)



Transformation Pipeline and Model Successfully Loaded


  Gender  Age   Dur  prediction_label
0   Male    0  15.7         12.633543
