In [1]:
import numpy as np
import pandas as pd
from statsmodels.genmod import families
from statsmodels.othermod.betareg import BetaModel
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Short alias for the link-function namespace; used further down to pick the
# link for the precision parameter of the beta model.
links = families.links

# Seed NumPy's global RNG so the synthetic data set is reproducible.
# The order of the draws below matters: reordering them changes every value.
np.random.seed(0)

# Independent variables
n = 1000  # number of samples
Months_on_Book = np.random.normal(50, 10, n)  # normal, mean 50, standard deviation 10
utilisation_ratio = np.random.normal(0.6, 0.1, n)  # normal, mean 0.6, standard deviation 0.1

# Dependent variable: a linear combination of both covariates plus a small
# amount of uniform noise, so CCF is correlated with each of them.
noise = np.random.uniform(-0.02, 0.02, n)
CCF = utilisation_ratio * 0.5 + Months_on_Book * 0.002 + noise

# Keep CCF strictly inside the open interval (0, 1). The beta log-likelihood
# evaluates log(y) and log(1 - y), so an exact 0 or 1 is not a valid response;
# clipping to [0, 1] would let such values through, hence the small margin.
CCF_EPS = 1e-6
CCF = np.clip(CCF, CCF_EPS, 1 - CCF_EPS)

# Collect response and covariates in one frame for the formula interface.
df = pd.DataFrame({'CCF': CCF, 'utilisation_ratio': utilisation_ratio, 'Months_on_Book': Months_on_Book})

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
df_train, df_test = train_test_split(df, random_state=42, test_size=0.2)

# Beta regression of CCF on both covariates, estimated on the training rows
# only. The precision parameter is modelled with an identity link.
model = "CCF ~ utilisation_ratio + Months_on_Book"
mod = BetaModel.from_formula(
    formula=model,
    data=df_train,
    link_precision=links.identity(),
)
res = mod.fit()

# Coefficient table and fit statistics.
print(res.summary())

                              BetaModel Results                               
Dep. Variable:                    CCF   Log-Likelihood:                 2441.1
Model:                      BetaModel   AIC:                            -4874.
Method:            Maximum Likelihood   BIC:                            -4855.
Date:                Sat, 17 Jun 2023                                         
Time:                        19:29:49                                         
No. Observations:                 800                                         
Df Residuals:                     796                                         
Df Model:                           3                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            -2.0915      0.014   -147.702      0.000      -2.119      -2.064
utilisation_ratio     2.1106   

In [2]:
# Score the held-out rows with the fitted model. `assign` returns a new frame
# instead of writing a column into the object handed back by train_test_split,
# which avoids pandas' SettingWithCopyWarning on versions that flag the split
# result as a possible copy, and keeps the cell safe to re-run.
df_test = df_test.assign(pred=res.predict(df_test))

# Out-of-sample R^2 between observed and predicted CCF.
r2 = r2_score(df_test['CCF'], df_test['pred'])
print("R^2: ", r2)

R^2:  0.9480404105243195
