# **PyCaret**
Inspired by [Greg Hogg](https://www.youtube.com/watch?v=NbBoZQZ3bxo).

[PyCaret](https://pycaret.gitbook.io/docs/get-started/tutorials) promises **low-code** machine learning.

In [None]:
# Uncomment the line below to install PyCaret if it is not already available.
# Restart the runtime after running it, before executing the cells that follow!
# !pip install --pre pycaret

## Regression
Regression is used to predict a continuous numeric target (e.g. housing prices, insurance premiums) from a given set of features.

In [4]:
import pandas as pd

# Load the California housing training split into a DataFrame.
df = pd.read_csv(filepath_or_buffer='sample_data/california_housing_train.csv')

# Summarise column names, dtypes and non-null counts before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           17000 non-null  float64
 1   latitude            17000 non-null  float64
 2   housing_median_age  17000 non-null  float64
 3   total_rooms         17000 non-null  float64
 4   total_bedrooms      17000 non-null  float64
 5   population          17000 non-null  float64
 6   households          17000 non-null  float64
 7   median_income       17000 non-null  float64
 8   median_house_value  17000 non-null  float64
dtypes: float64(9)
memory usage: 1.2 MB


In [None]:
# The wildcard import brings setup, compare_models, finalize_model,
# evaluate_model and predict_model into the notebook namespace.
from pycaret.regression import *

# Initialise the regression experiment on `df`, predicting `median_house_value`.
# session_id pins the random seed so the train/test split and the model
# results are reproducible across runs; 1387 is the seed of the recorded run
# whose outputs are shown in this notebook.
s = setup(df, target='median_house_value', session_id=1387)

Unnamed: 0,Description,Value
0,Session id,1387
1,Target,median_house_value
2,Target type,Regression
3,Original data shape,"(17000, 9)"
4,Transformed data shape,"(17000, 9)"
5,Transformed train set shape,"(11900, 9)"
6,Transformed test set shape,"(5100, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Compare the candidate regressors on the experiment configured by setup()
# and keep the top-ranked model of the resulting table as `best`.
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,32402.5344,2319350236.0541,48100.5203,0.8299,0.2385,0.1826,0.244
rf,Random Forest Regressor,32903.7636,2526022021.1891,50192.6894,0.8148,0.2407,0.1833,5.069
et,Extra Trees Regressor,36395.395,2954449469.3326,54293.2882,0.7835,0.2565,0.2027,2.161
gbr,Gradient Boosting Regressor,38213.8924,3017523527.6638,54851.8079,0.7789,0.2728,0.2174,1.597
lr,Linear Regression,51079.0773,4911021209.6,70016.359,0.6401,0.4176,0.3044,0.45
lasso,Lasso Regression,51079.1523,4911044352.0,70016.5035,0.6401,0.4176,0.3044,0.09
ridge,Ridge Regression,51079.0426,4911042534.4,70016.491,0.6401,0.4174,0.3044,0.035
llar,Lasso Least Angle Regression,51104.6984,4911055360.0,70016.5641,0.6401,0.4188,0.3046,0.03
br,Bayesian Ridge,51078.1213,4911038285.1471,70016.4529,0.6401,0.4163,0.3044,0.031
dt,Decision Tree Regressor,45697.021,5080477101.1218,71189.6204,0.6274,0.3298,0.247,0.109


Processing:   0%|          | 0/77 [00:00<?, ?it/s]

In [None]:
# finalize_model() returns a new pipeline refit on the whole dataset
# (training + hold-out rows); it does not modify `best` in place, so the
# result is bound to a name instead of being discarded.
final_model = finalize_model(best)

# Display the fitted pipeline as the cell output.
final_model

Pipeline(memory=Memory(location=/tmp/joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=['longitude', 'latitude',
                                             'housing_median_age',
                                             'total_rooms', 'total_bedrooms',
                                             'population', 'households',
                                             'median_income'],
                                    transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=[],
                                    transformer=SimpleImputer(strategy='most_frequent'))),
                ('actual_estimator', LGBMRegressor(random_state=1387))])

In [None]:
# Open the interactive widget for browsing the diagnostic plots of `best`
# (plot type is chosen with the toggle buttons it renders).
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [None]:
# With no data argument, predict_model scores `best` on the hold-out split
# created by setup() and returns those rows with a `prediction_label` column.
predict_model(best)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,32190.4141,2272011650.1058,47665.6234,0.8254,0.2341,0.1789


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,prediction_label
11900,-118.389999,34.080002,52.0,3759.0,464.0,1407.0,422.0,15.0001,500001.0,508020.723735
11901,-122.190002,37.820000,32.0,1835.0,264.0,635.0,263.0,8.3170,365900.0,432283.450416
11902,-117.129997,32.790001,35.0,1458.0,262.0,723.0,257.0,4.2098,174100.0,152596.943713
11903,-120.690002,39.119999,19.0,1048.0,262.0,493.0,184.0,2.2917,118200.0,123429.621424
11904,-122.250000,37.770000,52.0,2650.0,566.0,1468.0,567.0,3.0161,215700.0,214855.520594
...,...,...,...,...,...,...,...,...,...,...
16995,-118.540001,34.220001,35.0,1664.0,300.0,1000.0,309.0,4.6731,224100.0,205861.968523
16996,-118.180000,33.950001,42.0,2608.0,610.0,2062.0,616.0,3.5341,167500.0,166886.677923
16997,-118.239998,33.950001,21.0,1260.0,342.0,1167.0,310.0,0.9708,107600.0,101403.294953
16998,-119.480003,36.500000,32.0,3451.0,625.0,1968.0,574.0,2.9554,110300.0,74629.402614


In [None]:
# Load the separate test CSV; this file was not part of the frame given to setup().
test_df = pd.read_csv(filepath_or_buffer='sample_data/california_housing_test.csv')

# Score `best` on the test rows; the returned frame carries the input columns
# plus a `prediction_label` column and is displayed as the cell output.
predict_model(best, data=test_df)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,32792.8603,2404968118.5276,49040.4743,0.812,0.2464,0.1855


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,prediction_label
0,-122.050003,37.369999,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0,416797.177627
1,-118.300003,34.259998,43.0,1510.0,310.0,809.0,277.0,3.5990,176500.0,196190.140082
2,-117.809998,33.779999,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0,287986.351385
3,-118.360001,33.820000,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0,349727.567163
4,-119.669998,36.330002,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0,77331.185268
...,...,...,...,...,...,...,...,...,...,...
2995,-119.860001,34.419998,23.0,1450.0,642.0,1258.0,607.0,1.1790,225000.0,235243.326043
2996,-118.139999,34.060001,27.0,5257.0,1082.0,3496.0,1036.0,3.3906,237200.0,215361.385945
2997,-119.699997,36.299999,10.0,956.0,201.0,693.0,220.0,2.2895,62000.0,63741.247765
2998,-117.120003,34.099998,40.0,96.0,14.0,46.0,14.0,3.2708,162500.0,157214.040078
