In [None]:
# Measuring the model's accuracy

import pandas as pd
import numpy as np
from prophet import Prophet
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load the monthly payment history, parse the month column, keep only the
# modelling columns with complete rows, and order the rows chronologically.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = (
    pd.read_csv("C:/GitHub/Machine-Learning/data/monthly payment.csv")
    .assign(ACC_MONTH=lambda frame: pd.to_datetime(frame['ACC_MONTH']))
    .loc[:, ['ACC_MONTH', 'TOT_PAID', 'EST_COST', 'NBR_CASE']]
    .dropna()
    .sort_values('ACC_MONTH')
    .reset_index(drop=True)
)

# Add 12-row lags of the two drivers, then drop the leading rows that have no lag value.
# NOTE(review): shift(12) lags by 12 rows; that equals 12 months only if the frame
# has exactly one row per month with no gaps after the dropna above — confirm.
df = (
    df.assign(
        EST_COST_LAG12=lambda frame: frame['EST_COST'].shift(12),
        NBR_CASE_LAG12=lambda frame: frame['NBR_CASE'].shift(12),
    )
    .dropna()
    .reset_index(drop=True)
)

# Prophet expects the timestamp in 'ds' and the target in 'y'; the lagged
# columns ride along as extra regressors.
df_prophet = (
    df[['ACC_MONTH', 'TOT_PAID', 'EST_COST_LAG12', 'NBR_CASE_LAG12']]
    .rename(columns={'ACC_MONTH': 'ds', 'TOT_PAID': 'y'})
)

# Function to forecast future values of a regressor
def forecast_regressor(data, regressor_name, periods=18):
    """Fit a standalone Prophet model to one regressor column and predict it forward.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'ds' column and the column named by ``regressor_name``.
    regressor_name : str
        Column to forecast; it is presented to Prophet as the target 'y'.
    periods : int, default 18
        Number of month-start ('MS') steps requested from ``make_future_dataframe``.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'ds' and '<regressor_name>_forecast' (Prophet's ``yhat``),
        one row per date in the frame built by ``make_future_dataframe``.
    """
    column_map = {regressor_name: 'y'}
    history = data[['ds', regressor_name]].rename(columns=column_map)

    regressor_model = Prophet()
    regressor_model.fit(history)

    horizon = regressor_model.make_future_dataframe(periods=periods, freq='MS')
    predicted = regressor_model.predict(horizon)

    output_name = f'{regressor_name}_forecast'
    return predicted[['ds', 'yhat']].rename(columns={'yhat': output_name})

# Forecast each lagged regressor on its own so the main model has inputs
# for the months beyond the observed history.
forecast_est = forecast_regressor(df_prophet, 'EST_COST_LAG12')
forecast_case = forecast_regressor(df_prophet, 'NBR_CASE_LAG12')

# The 18 month-start dates that follow the last observed month.
last_date = df_prophet['ds'].max()
future_dates = pd.date_range(
    start=last_date + pd.offsets.MonthBegin(1),
    periods=18,
    freq='MS',
)

# Attach the forecasted regressor values to those dates and give them the
# column names the main model was trained with.
# NOTE(review): a left merge leaves NaN where a date has no matching regressor
# forecast — confirm the 'ds' values line up before predicting.
future_df = (
    pd.DataFrame({'ds': future_dates})
    .merge(forecast_est, on='ds', how='left')
    .merge(forecast_case, on='ds', how='left')
    .rename(columns={
        'EST_COST_LAG12_forecast': 'EST_COST_LAG12',
        'NBR_CASE_LAG12_forecast': 'NBR_CASE_LAG12',
    })
)

# Main model: TOT_PAID explained by trend/seasonality plus the two lagged regressors.
model = Prophet()
model.add_regressor('EST_COST_LAG12')
model.add_regressor('NBR_CASE_LAG12')
model.fit(df_prophet)

# Prediction frame = observed history (actual regressors) followed by the
# future months (forecasted regressors).
full_future = pd.concat(
    [df_prophet.loc[:, ['ds', 'EST_COST_LAG12', 'NBR_CASE_LAG12']], future_df],
    ignore_index=True,
)

# Forecast TOT_PAID over history + future
forecast = model.predict(full_future)

# The last 18 rows are the future months appended above.
forecast_out = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].iloc[-18:]
print("\n📈 18-Month Forecast of TOT_PAID:")
print(forecast_out)

# -------------------------------
# 🔍 Model Evaluation on Training
# -------------------------------
# In-sample check: compare the fitted values with the actuals the model was
# trained on (this is not a hold-out accuracy estimate).
merged = pd.merge(df_prophet[['ds', 'y']], forecast[['ds', 'yhat']], on='ds')

# RMSE is computed as sqrt(MSE). The previous `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError;
# sqrt(MSE) gives the same value on every version.
rmse = np.sqrt(mean_squared_error(merged['y'], merged['yhat']))
mae = mean_absolute_error(merged['y'], merged['yhat'])

# MAPE is undefined where the actual is 0 (division by zero -> inf/NaN), so it
# is averaged over the non-zero actuals only. With no zero actuals the value is
# identical to the plain formula.
nonzero_actuals = merged['y'] != 0
abs_pct_error = np.abs(
    (merged.loc[nonzero_actuals, 'y'] - merged.loc[nonzero_actuals, 'yhat'])
    / merged.loc[nonzero_actuals, 'y']
)
mape = np.mean(abs_pct_error) * 100

print("\n📊 Model Accuracy on Training Data:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE : {mae:.2f}")
print(f"MAPE: {mape:.2f}%")

# Optional: Plot full forecast
model.plot(forecast)
plt.title("Forecast of TOT_PAID with Lagged Regressors")
plt.xlabel("Date")
plt.ylabel("TOT_PAID")
plt.tight_layout()
plt.show()


In [None]:
# model fitting
import pandas as pd
from prophet import Prophet
import matplotlib.pyplot as plt

# Read the monthly payment history, parse the month column, restrict to the
# modelling columns with complete rows, and sort oldest-first.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = (
    pd.read_csv("C:/GitHub/Machine-Learning/data/monthly payment.csv")
    .assign(ACC_MONTH=lambda frame: pd.to_datetime(frame['ACC_MONTH']))
    .loc[:, ['ACC_MONTH', 'TOT_PAID', 'EST_COST', 'NBR_CASE']]
    .dropna()
    .sort_values('ACC_MONTH')
    .reset_index(drop=True)
)

# 12-row lags of the two drivers; the first 12 rows have no lag value and are dropped.
# NOTE(review): shift(12) is row-based, so it is a 12-month lag only when the
# frame holds one row per month without gaps — confirm.
df = (
    df.assign(
        EST_COST_LAG12=lambda frame: frame['EST_COST'].shift(12),
        NBR_CASE_LAG12=lambda frame: frame['NBR_CASE'].shift(12),
    )
    .dropna()
    .reset_index(drop=True)
)

# Prophet naming: timestamp -> 'ds', target -> 'y'; lagged columns kept as regressors.
df_prophet = (
    df[['ACC_MONTH', 'TOT_PAID', 'EST_COST_LAG12', 'NBR_CASE_LAG12']]
    .rename(columns={'ACC_MONTH': 'ds', 'TOT_PAID': 'y'})
)

# Function to forecast future values of a regressor
def forecast_regressor(data, regressor_name, periods=18):
    """Forecast a single regressor column with its own default Prophet model.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'ds' and the column named by ``regressor_name``.
    regressor_name : str
        Column to forecast; renamed to 'y' for Prophet.
    periods : int, default 18
        Number of month-start ('MS') steps passed to ``make_future_dataframe``.

    Returns
    -------
    pandas.DataFrame
        Columns 'ds' and '<regressor_name>_forecast' holding Prophet's ``yhat``
        for every date in the frame returned by ``make_future_dataframe``.
    """
    training_frame = data[['ds', regressor_name]].rename(
        columns={regressor_name: 'y'}
    )

    prophet_model = Prophet()
    prophet_model.fit(training_frame)

    prediction_dates = prophet_model.make_future_dataframe(periods=periods, freq='MS')
    prediction = prophet_model.predict(prediction_dates)

    renamed = {'yhat': f'{regressor_name}_forecast'}
    return prediction[['ds', 'yhat']].rename(columns=renamed)

# Each lagged regressor gets its own forecast so the main model has inputs
# for the months after the observed history.
forecast_est = forecast_regressor(df_prophet, 'EST_COST_LAG12')
forecast_case = forecast_regressor(df_prophet, 'NBR_CASE_LAG12')

# The 18 month-start dates following the last observed month.
last_date = df_prophet['ds'].max()
future_dates = pd.date_range(
    start=last_date + pd.offsets.MonthBegin(1),
    periods=18,
    freq='MS',
)

# Join the forecasted regressors onto those dates under the training column names.
# NOTE(review): the left merges leave NaN for any date without a matching
# regressor forecast — confirm the 'ds' values align before predicting.
future_df = (
    pd.DataFrame({'ds': future_dates})
    .merge(forecast_est, on='ds', how='left')
    .merge(forecast_case, on='ds', how='left')
    .rename(columns={
        'EST_COST_LAG12_forecast': 'EST_COST_LAG12',
        'NBR_CASE_LAG12_forecast': 'NBR_CASE_LAG12',
    })
)

# Main model for TOT_PAID with the two lagged regressors.
model = Prophet()
model.add_regressor('EST_COST_LAG12')
model.add_regressor('NBR_CASE_LAG12')
model.fit(df_prophet)

# History rows (actual regressors) stacked on top of the future rows
# (forecasted regressors).
full_future = pd.concat(
    [df_prophet.loc[:, ['ds', 'EST_COST_LAG12', 'NBR_CASE_LAG12']], future_df],
    ignore_index=True,
)

# Forecast TOT_PAID over history + future
forecast = model.predict(full_future)

# Show only the 18 future rows appended at the end of the prediction frame.
forecast_out = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].iloc[-18:]
print(forecast_out)

# Optional: plot the fitted history together with the forecast
model.plot(forecast)
plt.title("Forecast of TOT_PAID with Lagged Regressors")
plt.xlabel("Date")
plt.ylabel("TOT_PAID")
plt.show()


In [None]:
#LAG analysis:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from statsmodels.tsa.stattools import grangercausalitytests

# Keep only the columns used by the lag analysis and drop incomplete rows.
# NOTE(review): this reuses whatever `df` is already in the kernel. The loading
# cells earlier in this notebook narrow `df` to ACC_MONTH / TOT_PAID / EST_COST /
# NBR_CASE (plus lag columns), so NBR_KEY_CASE and NBR_OTHER would be missing
# after running them — confirm which dataset this cell expects and load it
# explicitly here so the notebook survives Restart & Run All.
df = df.loc[
    :,
    ['ACC_MONTH', 'TOT_PAID', 'EST_COST', 'NBR_KEY_CASE', 'NBR_CASE', 'NBR_OTHER'],
].dropna()

# Function to test lag correlation analysis
def test_lag_correlation(target, regressor, max_lag=24):
    """Find the lag at which `regressor` correlates most strongly with `target`.

    Reads the notebook-level ``df``. For every lag from 0 to ``max_lag`` the
    regressor column is shifted down by that many rows and correlated with the
    target column (pairs containing NaN are ignored). The lag with the largest
    absolute correlation is printed, the full correlation profile is plotted,
    and the winning row is returned.

    Parameters
    ----------
    target : str
        Name of the target column in ``df``.
    regressor : str
        Name of the candidate driver column in ``df``.
    max_lag : int, default 24
        Largest lag (in rows) to evaluate, inclusive.

    Returns
    -------
    pandas.Series
        Row of the correlation table with entries 'Lag (Months)' and
        'Correlation' for the best lag.

    Raises
    ------
    ValueError
        If no lag in the range yields a defined correlation.
    """
    target_series = df[target]
    regressor_series = df[regressor]
    lags = list(range(max_lag + 1))

    # Correlate against shifted copies instead of writing a
    # '<regressor>_LAG_<n>' column into the shared frame for every lag: the
    # function no longer mutates `df`, which also removes the DataFrame
    # fragmentation that previously had to be silenced.
    correlation_df = pd.DataFrame({
        'Lag (Months)': lags,
        'Correlation': [target_series.corr(regressor_series.shift(lag)) for lag in lags],
    })

    abs_correlation = correlation_df['Correlation'].abs()
    if abs_correlation.isna().all():
        raise ValueError(
            f"No valid correlation between {target} and {regressor} for lags 0..{max_lag}"
        )
    # idxmax returns an index label, so select the row by label.
    best_lag = correlation_df.loc[abs_correlation.idxmax()]

    # The row is a float Series (int and float columns are upcast together), so
    # cast the lag back to int to avoid printing e.g. "12.0 months".
    print(f"\nBest Lag for {regressor}: {int(best_lag['Lag (Months)'])} months (Correlation = {best_lag['Correlation']:.2f})")

    plt.figure(figsize=(10, 5))
    plt.plot(correlation_df['Lag (Months)'], correlation_df['Correlation'], marker='o', linestyle='dashed', label=regressor)
    plt.axvline(x=12, color='red', linestyle='dotted', label="12-Month Lag")
    plt.axhline(y=0, color='black', linestyle='dashed')
    plt.xlabel("Lag (Months)")
    # Label with the actual target rather than a hard-coded column name.
    plt.ylabel(f"Correlation with {target}")
    plt.title(f"Correlation of {regressor} at Different Lags with {target}")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    return best_lag

# Run lag correlation analysis
best_est_cost_lag = test_lag_correlation('TOT_PAID', 'EST_COST')
best_case_lag = test_lag_correlation('TOT_PAID', 'NBR_CASE')
best_other_lag = test_lag_correlation('TOT_PAID','NBR_OTHER')
best_key_case_lag = test_lag_correlation('TOT_PAID', 'NBR_KEY_CASE')

# Run Granger Causality Tests
# Each test runs up to the best correlation lag found above. That lag can be 0
# (the contemporaneous correlation is often the strongest), but
# grangercausalitytests requires a positive maxlag, so clamp it to at least 1.
print("\nRunning Granger Causality Test (Does EST_COST Granger-cause TOT_PAID?)")
granger_budget = grangercausalitytests(
    df[['TOT_PAID', 'EST_COST']].dropna(),
    maxlag=max(1, int(best_est_cost_lag['Lag (Months)'])),
)

print("\nRunning Granger Causality Test (Does NBR_CASE Granger-cause TOT_PAID?)")
granger_case = grangercausalitytests(
    df[['TOT_PAID', 'NBR_CASE']].dropna(),
    maxlag=max(1, int(best_case_lag['Lag (Months)'])),
)

print("\nRunning Granger Causality Test (Does NBR_OTHER Granger-cause TOT_PAID?)")
granger_other = grangercausalitytests(
    df[['TOT_PAID', 'NBR_OTHER']].dropna(),
    maxlag=max(1, int(best_other_lag['Lag (Months)'])),
)

print("\nRunning Granger Causality Test (Does NBR_KEY_CASE Granger-cause TOT_PAID?)")
granger_key = grangercausalitytests(
    df[['TOT_PAID', 'NBR_KEY_CASE']].dropna(),
    maxlag=max(1, int(best_key_case_lag['Lag (Months)'])),
)


In [None]:
# LAG Analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import grangercausalitytests

# Load Dataset
# Raw string literal: the previous non-raw literal contained "\G", "\M", "\d"
# and "\A", which are invalid escape sequences (SyntaxWarning on Python 3.12+).
# The resulting path value is unchanged.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data = pd.read_csv(r"C:\GitHub\Machine-Learning\data\Adjusted_DATA_2.csv")

# Convert ACC_MONTH to datetime and sort chronologically so row shifts act as time lags
data['ACC_MONTH'] = pd.to_datetime(data['ACC_MONTH'])
data = data.sort_values(by='ACC_MONTH')

# Select relevant columns and drop rows with any missing value
df = data[['ACC_MONTH', 'TOT_PAID', 'BUDGET_AMT', 'NBR_MURDER', 'NBR_CASE']].dropna()

# Function to test lag correlation analysis
def test_lag_correlation(target, regressor, max_lag=24):
    """Find the lag at which `regressor` correlates most strongly with `target`.

    Reads the notebook-level ``df``. For every lag from 0 to ``max_lag`` the
    regressor column is shifted down by that many rows and correlated with the
    target column (pairs containing NaN are ignored). The lag with the largest
    absolute correlation is printed, the full correlation profile is plotted,
    and the winning row is returned.

    Parameters
    ----------
    target : str
        Name of the target column in ``df``.
    regressor : str
        Name of the candidate driver column in ``df``.
    max_lag : int, default 24
        Largest lag (in rows) to evaluate, inclusive.

    Returns
    -------
    pandas.Series
        Row of the correlation table with entries 'Lag (Months)' and
        'Correlation' for the best lag.

    Raises
    ------
    ValueError
        If no lag in the range yields a defined correlation.
    """
    target_series = df[target]
    regressor_series = df[regressor]
    lags = list(range(max_lag + 1))

    # Correlate against shifted copies instead of writing a
    # '<regressor>_LAG_<n>' column into the shared frame for every lag, so the
    # function no longer mutates (and fragments) `df` as a side effect.
    correlation_df = pd.DataFrame({
        'Lag (Months)': lags,
        'Correlation': [target_series.corr(regressor_series.shift(lag)) for lag in lags],
    })

    abs_correlation = correlation_df['Correlation'].abs()
    if abs_correlation.isna().all():
        raise ValueError(
            f"No valid correlation between {target} and {regressor} for lags 0..{max_lag}"
        )
    # idxmax returns an index label, so select the row by label.
    best_lag = correlation_df.loc[abs_correlation.idxmax()]

    # The row is a float Series (int and float columns are upcast together), so
    # cast the lag back to int to avoid printing e.g. "12.0 months".
    print(f"Best Lag for {regressor}: {int(best_lag['Lag (Months)'])} months (Correlation = {best_lag['Correlation']:.2f})")

    plt.figure(figsize=(10, 5))
    plt.plot(correlation_df['Lag (Months)'], correlation_df['Correlation'], marker='o', linestyle='dashed', label=regressor)
    plt.axvline(x=12, color='red', linestyle='dotted', label="12-Month Lag")
    plt.axhline(y=0, color='black', linestyle='dashed')
    plt.xlabel("Lag (Months)")
    # Label with the actual target rather than a hard-coded column name.
    plt.ylabel(f"Correlation with {target}")
    plt.title(f"Correlation of {regressor} at Different Lags with {target}")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    return best_lag

# Test lag correlation for BUDGET_AMT, NBR_MURDER, and NBR_CASE
best_budget_lag = test_lag_correlation('TOT_PAID', 'BUDGET_AMT')
best_murder_lag = test_lag_correlation('TOT_PAID', 'NBR_MURDER')
best_case_lag = test_lag_correlation('TOT_PAID', 'NBR_CASE')

# Run Granger Causality Test for each regressor, testing lags 1 through 12.
# The `verbose` argument is deprecated in statsmodels 0.14 (passing it emits a
# FutureWarning); the default still prints the per-lag results, so it is
# simply omitted.
print("\nRunning Granger Causality Test (Does BUDGET_AMT Granger-cause TOT_PAID?)")
granger_budget = grangercausalitytests(df[['TOT_PAID', 'BUDGET_AMT']].dropna(), maxlag=12)

print("\nRunning Granger Causality Test (Does NBR_MURDER Granger-cause TOT_PAID?)")
granger_murder = grangercausalitytests(df[['TOT_PAID', 'NBR_MURDER']].dropna(), maxlag=12)

print("\nRunning Granger Causality Test (Does NBR_CASE Granger-cause TOT_PAID?)")
granger_case = grangercausalitytests(df[['TOT_PAID', 'NBR_CASE']].dropna(), maxlag=12)


In [None]:
#Correlation Matrix

import seaborn as sns
import matplotlib.pyplot as plt

# Work on a copy restricted to numeric columns (this drops ACC_MONTH and any
# other non-numeric column) and keep only complete rows.
# NOTE(review): BCM_DATA_GA is not defined in any cell visible here — confirm
# where it is loaded so this cell runs on a fresh kernel.
df_corr = (
    BCM_DATA_GA.copy()
    .select_dtypes(include='number')
    .dropna()
)

# Pairwise correlation between every pair of numeric columns
corr_matrix = df_corr.corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    square=True,
)
plt.title("Correlation Matrix - BCM_DATA_GA", fontsize=14)
plt.tight_layout()
plt.show()
