In [1]:

# It's interesting to compare feature importance to causal inference!

#Feature Importance:
#  Feature importance is a concept within machine learning that quantifies the contribution of each feature (independent variable) to the predictive performance of a model.
#  It helps identify which features are most influential in making predictions, providing insights into the relative importance of different variables.
#  Feature importance is generally derived from the model's internal mechanisms, such as coefficients in linear models or impurity reduction in tree-based models.

#Causal Inference:
#  Causal inference aims to understand cause-and-effect relationships between variables. It goes beyond predictive modeling and seeks to identify the causal impact of one variable on another.
#  Causal inference involves determining the extent to which changes in one variable cause changes in another while considering and addressing potential confounding factors.
#  Techniques for causal inference often involve experimental design, observational studies, and statistical methods that account for potential biases and confounding.

# let's run an experiment and see how things play out.


In [3]:


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from causalnex.structure import StructureModel
from causalnex.discretiser import Discretiser
from causalnex.network import BayesianNetwork
from causalnex.inference import InferenceEngine
import matplotlib.pyplot as plt
from sklearn.preprocessing import KBinsDiscretizer


# Generate synthetic data for crime prediction
np.random.seed(42)

# Features (independent variables)
population_density = np.random.uniform(100, 1000, 1000)
unemployment_rate = np.random.uniform(2, 15, 1000)
poverty_rate = np.random.uniform(5, 30, 1000)

# Continuous target variable (Crime rate)
crime_rate = 50 + 2 * population_density + 3 * unemployment_rate + 5 * poverty_rate + np.random.normal(0, 10, 1000)

# Create a DataFrame
crime_data = pd.DataFrame({
    'PopulationDensity': population_density,
    'UnemploymentRate': unemployment_rate,
    'PovertyRate': poverty_rate,
    'CrimeRate': crime_rate
})
    

In [4]:

# Discretize the features using KMeansDiscretizer
discretised_data = crime_data.copy()
for feature in ['PopulationDensity', 'UnemploymentRate', 'PovertyRate']:
    discretiser = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='kmeans')
    discretised_data[feature] = discretiser.fit_transform(discretised_data[[feature]])
    
    
# Split the data into training and testing sets
train_data, test_data = train_test_split(discretised_data, test_size=0.2, random_state=42)


# Define the structure of the Bayesian Network
sm = StructureModel()
sm.add_edges_from([
    ('PopulationDensity', 'CrimeRate'),
    ('UnemploymentRate', 'CrimeRate'),
    ('PovertyRate', 'CrimeRate'),
])




In [9]:

# Discretize the target variable
# The double square brackets [['CrimeRate']] create a DataFrame with a single column, 
# ensuring that the input is a 2D array.
train_data['CrimeRate'] = discretiser.fit_transform(train_data[['CrimeRate']])

# Create and fit the Bayesian Network
bn = BayesianNetwork(sm)
bn.fit_node_states(train_data)
bn.fit_cpds(train_data)


# Use the Bayesian Network for causal inference
ie = InferenceEngine(bn)
causal_effects = ie.query()




In [10]:

print(f'Causal Effects on CrimeRate:')
for variable, effect in causal_effects.items():
    print(f'{variable}: {effect}')
    

# Print the interpretation of the results
print(f'Causal Effects on CrimeRate:')
for variable, effect in causal_effects.items():
    print(f'{variable}:')
    print(f'  Effect Information: {effect}')
    print()
    

Causal Effects on CrimeRate:
PopulationDensity: {0.0: 0.22374999999999998, 1.0: 0.19875000000000004, 2.0: 0.18625, 3.0: 0.18000000000000005, 4.0: 0.21125}
CrimeRate: {0.0: 0.22631938810239235, 1.0: 0.19308480017885765, 2.0: 0.19631787329799114, 3.0: 0.17803114766555056, 4.0: 0.2062467907552083}
UnemploymentRate: {0.0: 0.1925, 1.0: 0.16374999999999995, 2.0: 0.195, 3.0: 0.2275, 4.0: 0.22125000000000003}
PovertyRate: {0.0: 0.185, 1.0: 0.20750000000000002, 2.0: 0.19875000000000004, 3.0: 0.2, 4.0: 0.20875}
Causal Effects on CrimeRate:
PopulationDensity:
  Effect Information: {0.0: 0.22374999999999998, 1.0: 0.19875000000000004, 2.0: 0.18625, 3.0: 0.18000000000000005, 4.0: 0.21125}

CrimeRate:
  Effect Information: {0.0: 0.22631938810239235, 1.0: 0.19308480017885765, 2.0: 0.19631787329799114, 3.0: 0.17803114766555056, 4.0: 0.2062467907552083}

UnemploymentRate:
  Effect Information: {0.0: 0.1925, 1.0: 0.16374999999999995, 2.0: 0.195, 3.0: 0.2275, 4.0: 0.22125000000000003}

PovertyRate:
  Effe

In [None]:

# How can we interpret the final results?

# The estimated effects provided for each level (bin) of UnemploymentRate and PovertyRate represent the causal 
# effects of these variables on CrimeRate as predicted by the Bayesian Network model.

# In causal inference, a causal effect is an estimate of the change in the outcome variable (in this case, 
# CrimeRate) that is directly attributable to a change in the predictor variable (such as UnemploymentRate or 
# PovertyRate). 


# UnemploymentRate:

#  For each bin of UnemploymentRate (0.0, 1.0, 2.0, 3.0, 4.0), the estimated effect on CrimeRate is given.
#  For example, when UnemploymentRate is in the bin 0.0, the estimated effect on CrimeRate is approximately 
#  0.1925. Similarly, for other bins.


# PovertyRate:

#  Similar interpretation applies to PovertyRate. Each bin of PovertyRate (0.0, 1.0, 2.0, 3.0, 4.0) has an 
#  estimated effect on CrimeRate.
#  For example, when PovertyRate is in the bin 0.0, the estimated effect on CrimeRate is approximately 0.1850. 
#  Similarly, for other bins.


# These values represent the estimated change in CrimeRate associated with a one-unit increase in the respective 
# bin of UnemploymentRate or PovertyRate. The positive values suggest a positive association, indicating that 
# higher levels of UnemploymentRate or PovertyRate are associated with higher CrimeRate.


In [None]:

# You can learn about Feature Importance here:
# https://github.com/ash-wicus-ml/Notebooks/blob/master/XG%20Boost%20-%20Feature%20Importance.ipynb
