# Packages

In [1]:
import pandas as pd 
import numpy as np 
from functools import reduce
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt 
import seaborn as sns

#Regression
from sklearn.neighbors import KNeighborsRegressor
from statsmodels.api import OLS, add_constant
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson
from scipy.stats import shapiro
from sklearn.inspection import permutation_importance
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

#Classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering

from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

#Decomposition
from sklearn.decomposition import PCA

## Reading the data

In [2]:
xlsx = pd.read_excel('/Users/azmanizakary/Downloads/Integrated_Data_BAM.xlsx', sheet_name=None, header=0)
for sheet in xlsx.keys(): xlsx[sheet].to_excel(sheet+'.xlsx', header=True, index=True)

## Data Preprocessing

In [None]:
for key in xlsx.keys():
    print(key, ':\n', xlsx[key].isnull().sum())

In [None]:
for key in xlsx.keys():
    xlsx[key].replace('--', np.nan, inplace=True)
    print(key, ':\n', xlsx[key])

In [5]:
for key in xlsx.keys() - {'uk_home_retail'}:
    columns_to_fill = xlsx[key].columns[1:]
    
    # Calculate the median for the selected columns
    median_values = xlsx[key][columns_to_fill].median()
    
    # Fill NaN values with the median
    xlsx[key][columns_to_fill] = xlsx[key][columns_to_fill].fillna(median_values)

In [6]:
xlsx['uk_home_retail'] = xlsx['uk_home_retail'].drop('UK Home Sales', axis=1)
xlsx['uk_home_retail'] = xlsx['uk_home_retail'].dropna(axis=1)
xlsx['construction_cost_prices_sales'] = xlsx['construction_cost_prices_sales'].dropna()

## Time features 

In [7]:
import re

keys_q_only = [
    key for key in xlsx.keys()
    if not xlsx[key].empty  # Vérifie que le DataFrame n'est pas vide
    and xlsx[key].shape[1] > 0  # Vérifie qu'il a au moins une colonne
    and xlsx[key][xlsx[key].columns[0]].astype(str)  # Convertit en str
        .str.match(r'^Q', na=False)  # Vérifie que toutes les valeurs commencent par "Q"
        .all()  # S'assure que c'est vrai pour toute la colonne
]

In [8]:
for key in xlsx.keys() - set(keys_q_only):
    xlsx[key][xlsx[key].columns[0]] = pd.to_datetime(xlsx[key][xlsx[key].columns[0]], errors='coerce')
    
for key in keys_q_only:
    # Extraction du Quarter et de l'Année
    xlsx[key][xlsx[key].columns[0]] = pd.to_datetime(xlsx[key][xlsx[key].columns[0]].str[-4:] + xlsx[key][xlsx[key].columns[0]].str[:2])

In [9]:
def to_snake_case(col_name):
    col_name = re.sub(r'([a-z])([A-Z])', r'\1_\2', col_name)  # Convert camelCase/PascalCase to snake_case
    col_name = re.sub(r'\W+', '_', col_name)  # Replace non-word characters (spaces, special chars) with "_"
    col_name = re.sub(r'__+', '_', col_name)  # Replace multiple "_" with a single "_"
    return col_name.lower().strip('_')  # Convert to lowercase and remove leading/trailing "_"

In [10]:
for key in xlsx.keys():
    xlsx[key].columns = [to_snake_case(xlsx[key].columns[0])] + [f"{key}.{to_snake_case(col)}" for col in xlsx[key].columns[1:]]
    xlsx[key] = xlsx[key].rename(columns={'yymm': 'date', 'revenues': 'date'})

In [11]:
#drop duplicated time samples
xlsx['uk_home_retail'] = xlsx['uk_home_retail'].sort_values('date', ascending=False).drop_duplicates('date').sort_index()

In [12]:
for key in xlsx.keys():
    xlsx[key] = xlsx[key].sort_values('date', ascending=False)

## Merging the data into one dataset

In [13]:
def reformat_trimestrielle(df):
    df_expanded = pd.DataFrame()
    for _, row in df.iterrows():
        # Créer 3 mois à partir de la date du trimestre
        months = pd.date_range(start=row['date'], periods=3, freq='ME')
        expanded = pd.DataFrame([row.to_dict()] * 3)
        expanded['date'] = months
        df_expanded = pd.concat([df_expanded, expanded], ignore_index=True)
    return df_expanded

def reformat_annuelle(df):
    df_expanded = pd.DataFrame()
    for _, row in df.iterrows():
        # Créer 12 mois à partir de la date annuelle
        months = pd.date_range(start=row['date'], periods=12, freq='ME')
        expanded = pd.DataFrame([row.to_dict()] * 12)
        expanded['date'] = months
        df_expanded = pd.concat([df_expanded, expanded], ignore_index=True)
    return df_expanded

In [14]:
def process_data(xlsx):
    result = pd.DataFrame()
    data_frames = []
    for sheet_name, df in xlsx.items():

        # Identifier la granularité des données
        freq = pd.infer_freq(df['date'].sort_values())

        if 'Y' in freq:  # Données annuelles
            df_processed = reformat_annuelle(df)
        elif 'Q' in freq:  # Données trimestrielles
            df_processed = reformat_trimestrielle(df)
        elif 'MS' in freq:  # Données mensuelles (aucune duplication)
            df_processed = df.copy()
        data_frames.append(df)

    result = reduce(lambda  left,right: pd.merge(left,right,on=['date'],
                                            how='outer'), data_frames).bfill()
    result = result.ffill()
    
    result['year'] = result['date'].dt.year
    result['month'] = result['date'].dt.month
    result['day'] = result['date'].dt.day
    
    return result

In [None]:
# Traitement des données
df_final = process_data(xlsx)
df_final.head()

## Regression

### unemployment rate 

In [None]:
X = df_final[['year', 'month', 'unemployment_rate.population']]
y = df_final['unemployment_rate.unenployment_rate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()

model.fit(X_train, y_train)

print("Intercept: ", model.intercept_)
print("Coefficients:")
list(zip(X.columns, model.coef_))

In [None]:
y_pred = model.predict(X_test)

mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred})
mlr_diff.head()

In [None]:
# Résidus
residuals = y_test - y_pred

# Graphique des résidus
plt.figure(figsize=(8, 5))
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel("Valeurs prédites")
plt.ylabel("Résidus")
plt.title("Résidus vs Valeurs prédites")
plt.show()

In [None]:
import statsmodels.api as sm
X_test_sm = sm.add_constant(X_test)
bp_test = het_breuschpagan(residuals, X_test_sm)

# Résultats
labels = ['Statistique LM', 'p-value', 'F-statistique', 'p-value F']
print(dict(zip(labels, bp_test)))

In [None]:
plt.figure(figsize=(8, 5))
sns.histplot(residuals, kde=True)
plt.title("Distribution des résidus")
plt.show()

In [None]:
X_vif = sm.add_constant(X)  # Ajout de la constante
vif_data = pd.DataFrame()
vif_data["Variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i+1) for i in range(len(X.columns))]

print(vif_data)

In [None]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("R²:", r2)

In [23]:
def linear_regression(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LinearRegression()

    model.fit(X_train, y_train)

    print("Intercept: ", model.intercept_)
    print("Coefficients:")
    print(list(zip(X.columns, model.coef_)))

    y_pred = model.predict(X_test)

    meanAbErr = metrics.mean_absolute_error(y_test, y_pred)
    meanSqErr = metrics.mean_squared_error(y_test, y_pred)
    rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

    print('R^2: {:.2f}'.format(r2_score(y_test, y_pred)))

    #Mean Absolute Error is the absolute difference between the true values and the predicted values.
    #The lower the value, the better is the model’s performance.
    print('Mean Absolute Error:', meanAbErr)

    #Mean Square Error is calculated by taking the average of the square of the difference between the original and predicted values of the data.
    #The lower the value, the better is the model’s performance.
    print('Mean Square Error:', meanSqErr)

    #Root Mean Square Error is the standard deviation of the errors which occur when a prediction is made on a dataset. 
    #The root of the value is considered while determining the accuracy of the model.
    print('Root Mean Square Error:', rootMeanSqErr)
    
    return {'model': model, 'y_test': y_test, 'y_pred': y_pred}

In [None]:
linear_regression(X, y)

In [None]:
for name in xlsx['company_revenue'].columns[1:]:
    y = df_final[name]

    L = list(xlsx['gov_total_expense_revenue'].columns)
    L.append('year')
    L.append('month')
    L.append('day')


    X = df_final[L]
    X = X.drop('date', axis=1)
    X = X.drop('gov_total_expense_revenue.surplus_or_deficit', axis=1)

    test = linear_regression(X, y)

    # Résidus
    residuals = test['y_pred'] - test['y_test']


    # Graphique des résidus

    plt.figure(figsize=(8, 5))
    sns.scatterplot(x=test['y_pred'], y=residuals)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel("Valeurs prédites")
    plt.ylabel("Résidus")
    plt.title("Résidus vs Valeurs prédites")
    plt.show()

In [26]:
import statsmodels.formula.api as smf
def run_regression(df, dependent_var, independent_vars, independent_file):
    # Suppression du préfixe pour la variable dépendante
    dep_var_clean = dependent_var.split('.')[-1]


    X = df[[f"{independent_file}.{var}" for var in independent_vars[1:]]]

    # Ajout de la colonne 'year' qui n'a pas de préfixe
    X.loc[:, 'year'] = df[independent_vars[0]]

    new_col_names = [col.split('.')[-1] for col in independent_vars]
    X.columns = new_col_names

    # Extraction de la variable dépendante
    y = df[dependent_var]

    # Fusion des données
    df_model = X.join(y)
    df_model.columns = new_col_names + [dep_var_clean]

    # Construction de la formule de régression
    formula = f"{dep_var_clean} ~ {' + '.join(new_col_names)}"

    # Modélisation
    model = smf.ols(formula=formula, data=df_model).fit()

    return model.summary()

In [None]:
result = run_regression(
    df=df_final,
    dependent_var='company_revenue.speedy_hire_plc_sdy_l',
    independent_vars=['year', 'gbp_to_usd_exchange_rate', 'economic_growth_rate'],
    independent_file='economic_growth'
)

result

In [94]:
def regression_analysis(df, dependent_var, independent_vars, show_feature_importance=False):
    results = []
    feature_importance_results = {}

    # Préparation des données
    X = df[independent_vars].copy()
    y = df[dependent_var].copy()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Test de normalité
    p_values = [shapiro(X_train[col])[1] for col in X_train.columns]
    p_value_y = shapiro(y_train)[1]
    normality_verified = all(p > 0.05 for p in p_values) and p_value_y > 0.05

    # Application du scaler
    scaler_X = StandardScaler() if normality_verified else MinMaxScaler()
    scaler_y = StandardScaler() if normality_verified else MinMaxScaler()

    print("Data is standardized" if normality_verified else "Data is normalized")

    X_train_scaled = scaler_X.fit_transform(X_train)
    X_test_scaled = scaler_X.transform(X_test)

    y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).flatten()
    y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1)).flatten()

    # Modèles à évaluer
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42),
        'Support Vector Regressor': SVR(),
        'MLP Regressor': MLPRegressor(random_state=42, max_iter=1000)
    }

    # KNN avec k variant de 1 à 10
    for k in range(1, 11):
        models[f'KNN (k={k})'] = KNeighborsRegressor(n_neighbors=k)

    for model_name, model in models.items():
        model.fit(X_train_scaled, y_train_scaled)

        # Prédictions (scalées)
        y_pred_train_scaled = model.predict(X_train_scaled)
        y_pred_test_scaled = model.predict(X_test_scaled)

        # Inversion du scaling pour les prédictions
        y_pred_train = scaler_y.inverse_transform(y_pred_train_scaled.reshape(-1, 1)).flatten()
        y_pred_test = scaler_y.inverse_transform(y_pred_test_scaled.reshape(-1, 1)).flatten()

        # Calcul des métriques
        mae_train = mean_absolute_error(y_train, y_pred_train)
        mae_test = mean_absolute_error(y_test, y_pred_test)

        # Importance des features
        if hasattr(model, "coef_"):  # Modèles linéaires
            importance = model.coef_.flatten()
            feature_names = independent_vars

        elif hasattr(model, "feature_importances_"):  # Arbres de décision
            importance = model.feature_importances_
            feature_names = independent_vars

        else:  # Autres modèles (SVR, KNN, MLP)
            perm_importance = permutation_importance(model, X_test_scaled, y_test_scaled, n_repeats=10, random_state=42)
            importance = perm_importance.importances_mean
            feature_names = X.columns

        # Vérification des dimensions
        if len(importance) == len(feature_names):
            feature_importance_results[model_name] = pd.Series(importance, index=feature_names).sort_values(ascending=False)
        else:
            print(f"⚠️ Dimension mismatch for {model_name}: {len(importance)} importances vs {len(feature_names)} features")
            feature_importance_results[model_name] = pd.Series(importance[:len(feature_names)], index=feature_names).sort_values(ascending=False)

        # Tests statistiques pour la régression linéaire uniquement
        if model_name == "Linear Regression":
            X_const = add_constant(X_train_scaled)
            ols_model = OLS(y_train_scaled, X_const).fit()

            r2 = ols_model.rsquared_adj
            shapiro_test = shapiro(ols_model.resid)
            bp_test = het_breuschpagan(ols_model.resid, X_const)
            dw_test = durbin_watson(ols_model.resid)

        else:
            r2 = r2_score(y_test, y_pred_test)
            shapiro_test = (np.nan, np.nan)
            bp_test = (np.nan, np.nan, np.nan, np.nan)
            dw_test = np.nan

        results.append({
            "Model": model_name,
            'MAE Train': mae_train,
            'MAE Test': mae_test,
            'R² Test': r2,
            "Shapiro-W (p-value)": shapiro_test[1],
            "Breusch-Pagan (p-value)": bp_test[3],
            "Durbin-Watson": dw_test
        })

    # Conversion en DataFrame pour analyse
    results_df = pd.DataFrame(results)
    
    if show_feature_importance:
        for model, importance in feature_importance_results.items():
            print(f"\n🔍 Top 10 Features for {model}:")
            print(importance.head(10))

    return results_df


In [29]:
def regression_pipeline(df, dependent_var, independent_vars):
    # Préparation des données
    X = df[independent_vars]
    y = df[dependent_var]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Test de normalité sur X_train
    _, p_value_train = shapiro(X_train)
    
    # Application du scaler en fonction du test de normalité
    if p_value_train > 0.05:  # Normalité vérifiée
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        print("Data is standardised")
        
    else:  # Normalité non vérifiée
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        print("Data is normalized")

    models = {
        "Linear Regression": LinearRegression(),
        "Ridge Regression": Ridge(),
        "Lasso Regression": Lasso(),
        "KNN Regression": KNeighborsRegressor(),
        "Random Forest": RandomForestRegressor(random_state=42),
        "Gradient Boosting": GradientBoostingRegressor(random_state=42)
    }

    results = []

    for name, model in models.items():
                
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)

        # Tests statistiques pour la régression linéaire uniquement
        if name == "Linear Regression":
            X_const = add_constant(X_train_scaled)
            ols_model = OLS(y_train, X_const).fit()

            # Adjusted R-squared
            r2 = ols_model.rsquared_adj

            # Test de normalité des résidus
            shapiro_test = shapiro(ols_model.resid)

            # Test de Breusch-Pagan pour l'homoscédasticité
            bp_test = het_breuschpagan(ols_model.resid, X_const)

            # Test de Durbin-Watson pour l'autocorrélation
            dw_test = durbin_watson(ols_model.resid)

        else:
            r2 = r2_score(y_test, y_pred)
            shapiro_test = (np.nan, np.nan)
            bp_test = (np.nan, np.nan, np.nan, np.nan)
            dw_test = np.nan

        results.append({
            "Model": name,
            "R²": r2,
            "RMSE": rmse,
            "MAE": mae,
            "Shapiro-W (p-value)": shapiro_test[1],
            "Breusch-Pagan (p-value)": bp_test[3],
            "Durbin-Watson": dw_test
        })

    return pd.DataFrame(results)

In [40]:
dependent_vars = list(df_final[xlsx['company_revenue'].columns[1:]])
independent_vars = list(xlsx['bonds'].columns[1:])
independent_vars.append('year')
df = df_final[dependent_vars +independent_vars].copy()

In [85]:
X = list(xlsx['unemployment_rate'].columns[1:]) 
X += list(xlsx['extra_unemployment_rate'].columns[3:5])
X.append(xlsx['gov_total_expense_revenue'].columns[3])
X += list(xlsx['money_supply'].columns[1:])
X += list(xlsx['bonds'].columns[1:])
X += list(xlsx['cpi'].columns[1:])
X += list(xlsx['economic_growth'].columns[1:])
X += list(xlsx['import_export'].columns[1:])
X += list(xlsx['seasonal_retail_sales'].columns[1:])
#X += list(xlsx['uk_home_retail'].columns[1:]) too many columns
X += list(xlsx['uk_building'].columns[1:])
#X += list(xlsx['uk_affordability_index'].columns[1:]) too many columns
X += list(xlsx['uk_retail_price'].columns[1:])
X += list(xlsx['mortage'].columns[1:])
X += list(xlsx['mortage_interest_rate'].columns[1:])
X.append(xlsx['construction_cost_prices_sales'].columns[1])
X += list(xlsx['construction_cost_prices_sales'].columns[3:5])
X += list(xlsx['construction_cost_prices_sales'].columns[6:])
X.append('year')

In [86]:
df = df_final[dependent_vars + X].copy()

In [100]:
def regression_analysis(df, dependent_var, independent_vars, show_feature_importance=False):
    results = []
    feature_importance_results = {}
    best_models = {}  # Stockage des meilleurs modèles

    # Préparation des données
    X = df[independent_vars].copy()
    y = df[dependent_var].copy()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Test de normalité
    p_values = [shapiro(X_train[col])[1] for col in X_train.columns]
    normality_verified = all(p > 0.05 for p in p_values)

    # Application du scaler
    scaler = StandardScaler() if normality_verified else MinMaxScaler()
    print("Data is standardized" if normality_verified else "Data is normalized")

    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Modèles à évaluer
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42),
        'Support Vector Regressor': SVR(),
        #'MLP Regressor': MLPRegressor(random_state=42, max_iter=1000)
    }

    # KNN avec k variant de 1 à 10
    for k in range(3, 11):
        models[f'KNN (k={k})'] = KNeighborsRegressor(n_neighbors=k)

    for model_name, model in models.items():
        model.fit(X_train_scaled, y_train)

        # Prédictions
        y_pred_train = model.predict(X_train_scaled)
        y_pred_test = model.predict(X_test_scaled)

        # Calcul des métriques
        mae_train = mean_absolute_error(y_train, y_pred_train)
        mae_test = mean_absolute_error(y_test, y_pred_test)

        # Importance des features
        if hasattr(model, "coef_"):  # Modèles linéaires
            importance = model.coef_.flatten()
            feature_names = independent_vars
        elif hasattr(model, "feature_importances_"):  # Arbres de décision
            importance = model.feature_importances_
            feature_names = independent_vars
        else:  # Autres modèles (SVR, KNN, MLP)
            perm_importance = permutation_importance(model, X_test_scaled, y_test, n_repeats=10, random_state=42)
            importance = perm_importance.importances_mean
            feature_names = X.columns

        # Vérification des dimensions
        if len(importance) == len(feature_names):
            feature_importance_results[model_name] = pd.Series(importance, index=feature_names).sort_values(ascending=False)
        else:
            print(f"⚠️ Dimension mismatch for {model_name}: {len(importance)} importances vs {len(feature_names)} features")
            feature_importance_results[model_name] = pd.Series(importance[:len(feature_names)], index=feature_names).sort_values(ascending=False)

        # Tests statistiques pour la régression linéaire uniquement
        if model_name == "Linear Regression":
            X_const = add_constant(X_train_scaled)
            ols_model = OLS(y_train, X_const).fit()

            r2 = ols_model.rsquared_adj
            shapiro_test = shapiro(ols_model.resid)
            bp_test = het_breuschpagan(ols_model.resid, X_const)
            dw_test = durbin_watson(ols_model.resid)
        else:
            r2 = r2_score(y_test, y_pred_test)
            shapiro_test = (np.nan, np.nan)
            bp_test = (np.nan, np.nan, np.nan, np.nan)
            dw_test = np.nan

        results.append({
            "Model": model_name,
            'MAE Train': mae_train,
            'MAE Test': mae_test,
            'R² Test': r2,
            "Shapiro-W (p-value)": shapiro_test[1],
            "Breusch-Pagan (p-value)": bp_test[3],
            "Durbin-Watson": dw_test
        })

    # Sélection du meilleur modèle (basé sur MAE Test et R²)
    results_df = pd.DataFrame(results)
    best_model_row = results_df.sort_values(by=['MAE Test', 'R² Test'], ascending=[True, False]).iloc[0]
    best_model_name = best_model_row['Model']
    best_models[dependent_var] = {
        "Model": best_model_name,
        "MAE Train": best_model_row['MAE Train'],
        "MAE Test": best_model_row['MAE Test'],
        "R² Test": best_model_row['R² Test'],
        "Top Features": feature_importance_results[best_model_name].head(10)
    }

    if show_feature_importance:
        for model, importance in feature_importance_results.items():
            print(f"\n🔍 Top 10 Features for {model}:")
            print(importance.head(10))

    return results_df, best_models

## Revenues prediction

In [101]:
for i in dependent_vars:
    resultats = regression_analysis(df, i, X)
    print(f'y: {i}\n ', resultats)

y: company_revenue.speedy_hire_plc_sdy_l
  (                       Model  MAE Train   MAE Test   R² Test  \
0          Linear Regression   3.877175  11.164484  0.885778   
1              Random Forest   2.452950   9.253837  0.741999   
2          Gradient Boosting   1.492737   8.008315  0.757074   
3   Support Vector Regressor  12.125114  21.001732  0.261103   
4                  KNN (k=3)   2.710307   9.314530  0.679642   
5                  KNN (k=4)   3.107895   9.784615  0.658962   
6                  KNN (k=5)   3.659474  10.349744  0.644053   
7                  KNN (k=6)   4.114254  11.217094  0.641233   
8                  KNN (k=7)   4.320113  10.970696  0.649528   
9                  KNN (k=8)   4.720806  11.518590  0.637609   
10                 KNN (k=9)   5.291667  11.769801  0.631151   
11                KNN (k=10)   5.721053  11.755385  0.662547   

    Shapiro-W (p-value)  Breusch-Pagan (p-value)  Durbin-Watson  
0          3.260848e-10             2.978892e-11       2.

## Cost prediction

In [102]:
dependent_vars = list(df_final[xlsx['operating_expenses'].columns[1:]])
df = df_final[dependent_vars + X].copy()

In [103]:
for i in dependent_vars:
    resultats = regression_analysis(df, i, X)
    print(f'y: {i}\n ', resultats)

Data is normalized
⚠️ Dimension mismatch for Linear Regression: 83 importances vs 77 features
⚠️ Dimension mismatch for Random Forest: 83 importances vs 77 features
⚠️ Dimension mismatch for Gradient Boosting: 83 importances vs 77 features
y: operating_expenses.ashtead_group_plc_aht_l
  (                       Model    MAE Train     MAE Test   R² Test  \
0          Linear Regression    63.429908   201.211474  0.994555   
1              Random Forest    35.763853   102.898435  0.989061   
2          Gradient Boosting    17.299989    99.205038  0.986845   
3   Support Vector Regressor  1516.470244  1605.577114 -0.076997   
4                  KNN (k=3)    44.954605    93.638462  0.990340   
5                  KNN (k=4)    53.506579   107.426282  0.988924   
6                  KNN (k=5)    63.995395   113.209231  0.988389   
7                  KNN (k=6)    75.749123   136.854274  0.987199   
8                  KNN (k=7)    81.258177   134.480952  0.987475   
9                  KNN (k=8)   