# Candidate City Selection for Causal Inference

This notebook iterates over all cities as potential targets and identifies control cities that match the multi-layer robustness criteria from the causal impact analysis.

In [23]:
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
from causalimpact import CausalImpact
import seaborn as sns
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import mutual_info_regression
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

# Set visual style
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

# Set up periods
pre_beg, pre_end = '2023-01-01', '2023-05-31'
post_beg, post_end = '2023-06-01', '2023-06-30'
pre_period = [pre_beg, pre_end]
post_period = [post_beg, post_end]

In [24]:
def select_sparse_portfolio(df_pre, target):
    """TRICK 1: LASSO selection for a sparse 'optimal' control group."""
    from sklearn.linear_model import LassoCV
    X = df_pre.drop(columns=[target]).fillna(0)
    y = df_pre[target].fillna(0)
    lasso = LassoCV(cv=5, random_state=42).fit(X, y)
    import pandas as pd
    weights = pd.Series(lasso.coef_, index=X.columns)
    return weights[weights != 0].sort_values(ascending=False)

def calculate_dtw_distance(s1, s2):
    """TRICK 3: Dynamic Time Warping for shape-based matching."""
    from fastdtw import fastdtw
    distance, path = fastdtw(s1.fillna(0).values, s2.fillna(0).values, dist=lambda a, b: abs(a - b))
    return distance

def get_mutual_info(s1, s2):
    """TRICK 4: Non-linear dependency detection."""
    from sklearn.feature_selection import mutual_info_regression
    mi = mutual_info_regression(s1.values.reshape(-1, 1), s2)
    return mi[0]

def get_stationary_transform(series, seasonal_period=7):
    """Returns (step_name, transform_func) or (None, None)."""
    from statsmodels.tsa.stattools import adfuller
    def is_stationary(s):
        try: return adfuller(s.dropna())[1] < 0.05
        except: return False
    if is_stationary(series): return "Raw", lambda s: s
    try:
        s_log = np.log(series)
        if is_stationary(s_log): return "Log", lambda s: np.log(s)
        s_diff = s_log.diff()
        if is_stationary(s_diff): return "Log+Diff", lambda s: np.log(s).diff()
        s_seasonal = s_diff.diff(seasonal_period)
        if is_stationary(s_seasonal): return "Log+Diff+Seasonal", lambda s: np.log(s).diff().diff(seasonal_period)
    except: pass
    return None, None

def select_best_controls(df_full, target, pre_beg, pre_end, t1_thresh=0.8, t2_thresh=0.6):
    """SENIOR SELECTION: Multi-metric ranking using Correlation, LASSO, DTW, Mutual Info, and Granger Causality."""
    import pandas as pd
    import numpy as np
    from statsmodels.tsa.stattools import grangercausalitytests
    df_pre = df_full.loc[pre_beg:pre_end]
    potential_controls = [c for c in df_full.columns if c != target]
    
    print(f"--- Senior Selection for {target} ---")
    
    lasso_weights = select_sparse_portfolio(df_pre, target)
    
    results = []
    for city in potential_controls:
        corr_raw = df_pre[target].corr(df_pre[city])
        step_name, transform_func = get_stationary_transform(df_pre[city])
        
        corr_trans = 0
        var_ratio = 0
        granger_p = 1.0
        
        if step_name:
            s_city_trans = transform_func(df_pre[city]).dropna()
            s_target_trans = transform_func(df_pre[target]).dropna()
            joined = pd.concat([s_city_trans, s_target_trans], axis=1).dropna()
            joined.columns = [city, target]
            corr_trans = joined.iloc[:, 0].corr(joined.iloc[:, 1])
            var_ratio = s_city_trans.std() / s_target_trans.std() if s_target_trans.std() != 0 else 0
            
            # Granger Causality Test
            try:
                # Test if city causes target
                granger_result = grangercausalitytests(joined[[target, city]], maxlag=2, verbose=False)
                # p-value of the F-test for lag 1
                granger_p = granger_result[1][0]['params_ftest'][1]
            except: pass
        
        dtw_dist = calculate_dtw_distance(df_pre[target], df_pre[city])
        mi_score = get_mutual_info(df_pre[city], df_pre[target])
        lasso_w = lasso_weights.get(city, 0.0)
        
        # Tiering Logic refined with Granger and Variance
        tier = "None"
        if corr_trans > t1_thresh and 0.5 < var_ratio < 2.0:
            # Elite if predictive (Granger) OR strong ML evidence (LASSO + MI)
            if granger_p < 0.05 or (lasso_w > 0 and mi_score > 0.6):
                tier = "Tier 1 (Elite)"
            else:
                tier = "Tier 2 (Robust)"
        elif corr_raw > t2_thresh:
            tier = "Tier 3 (Baseline)"
            
        results.append({
            'City': city,
            'Corr_Transformed': corr_trans,
            'Variance_Ratio': var_ratio,
            'Granger_p': granger_p,
            'DTW_Distance': dtw_dist,
            'Mutual_Info': mi_score,
            'LASSO_Weight': lasso_w,
            'Selection_Tier': tier
        })
        
    matrix = pd.DataFrame(results).sort_values(['Selection_Tier', 'DTW_Distance'])
    selected = matrix[matrix['Selection_Tier'].isin(["Tier 1 (Elite)", "Tier 2 (Robust)"])]['City'].tolist()
    
    if not selected:
        selected = matrix[matrix['Selection_Tier'] == "Tier 3 (Baseline)"]['City'].tolist()
        status = "Falling back to Baseline candidates."
    else:
        status = f"Successfully selected {len(selected)} high-purity candidates."
        
    return matrix, selected, status

In [25]:
df_long = pd.read_csv('sales_data.csv')
df_long['Date'] = pd.to_datetime(df_long['Date'])

# Pivot the long format data back to wide format for analysis
df = df_long.pivot(index='Date', columns='City', values='Value')

print(f"Dataset loaded in LONG format and pivoted to WIDE. Total columns: {len(df.columns)}")
df.head()

df.index.freq = 'D'

Dataset loaded in LONG format and pivoted to WIDE. Total columns: 23


In [26]:
# List all cities
all_cities = df.columns.tolist()

# To store candidate pairs
candidate_pairs = []

for target in all_cities:
    selection_matrix, selected_cities, selection_status = select_best_controls(df, target, pre_beg, pre_end)
    # Add pairs
    for control in selected_cities:
        candidate_pairs.append((target, control))

# Convert to DataFrame for easy viewing
pairs_df = pd.DataFrame(candidate_pairs, columns=['Target', 'Control'])


# Save to CSV
pairs_df.to_csv('candidate_pairs.csv', index=False)
print("Candidate pairs saved to 'candidate_pairs.csv'")

--- Senior Selection for Barcelona ---


  model = cd_fast.enet_coordinate_descent(


--- Senior Selection for City_1 ---
--- Senior Selection for City_10 ---


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


--- Senior Selection for City_11 ---
--- Senior Selection for City_12 ---
--- Senior Selection for City_13 ---


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


--- Senior Selection for City_14 ---
--- Senior Selection for City_15 ---


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

--- Senior Selection for City_16 ---
--- Senior Selection for City_17 ---
--- Senior Selection for City_18 ---
--- Senior Selection for City_19 ---
--- Senior Selection for City_2 ---
--- Senior Selection for City_20 ---
--- Senior Selection for City_3 ---


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

--- Senior Selection for City_4 ---
--- Senior Selection for City_5 ---
--- Senior Selection for City_6 ---


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


--- Senior Selection for City_7 ---


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

--- Senior Selection for City_8 ---
--- Senior Selection for City_9 ---
--- Senior Selection for City_High_Variance ---


  model = cd_fast.enet_coordinate_descent_gram(


--- Senior Selection for City_Spurious ---
Candidate pairs saved to 'candidate_pairs.csv'


## experiment Design: Power & Volume Analysis

## experiment Design: Power & Volume Analysis

## experiment Design: Power & Volume Analysis

In [27]:
def run_power_simulation(df_full, target, controls, lift_percent, pre_period, post_period, num_sims=10):
    """Simulates a synthetic lift and returns detection power."""
    data_base = df_full[[target] + controls].fillna(method='ffill').fillna(0)
    hits = 0
    for _ in range(num_sims):
        sim_data = data_base.copy()
        pre_mean = data_base.loc[pre_period[0]:pre_period[1], target].mean()
        lift_amount = pre_mean * (lift_percent / 100)
        sim_data.loc[post_period[0]:post_period[1], target] += lift_amount
        try:
            ci = CausalImpact(sim_data, pre_period, post_period)
            if ci.p_value < 0.05: hits += 1
        except: pass
    return hits / num_sims

def get_volume_requirements(df_full, target, controls, pre_period, post_period, lift_range=[2, 5, 10, 15, 20]):
    results = []
    for lift in lift_range:
        power = run_power_simulation(df_full, target, controls, lift, pre_period, post_period)
        results.append({'Lift_%': lift, 'Power': power})
    return pd.DataFrame(results)

In [28]:
target_city = 'City_1'
matrix, best_selection, status = select_best_controls(df, target_city, pre_beg, pre_end)
print(status)
display(matrix[matrix['Selection_Tier'] != 'None'].head(10))
if best_selection:
    power_df = get_volume_requirements(df, target_city, best_selection[:3], pre_period, post_period)
    print(power_df)


--- Senior Selection for City_1 ---
Successfully selected 1 high-purity candidates.


Unnamed: 0,City,Corr_Transformed,Variance_Ratio,Granger_p,DTW_Distance,Mutual_Info,LASSO_Weight,Selection_Tier
9,City_18,0.846793,0.751994,6.377158e-12,1136.873046,0.711615,0.111308,Tier 1 (Elite)
15,City_5,0.002337,1.031829,0.3804227,573.341353,0.542119,0.07039,Tier 3 (Baseline)
12,City_20,0.017038,0.929923,0.05564714,573.391844,0.516016,0.187086,Tier 3 (Baseline)
2,City_11,-0.0504,0.996616,0.1913188,594.681151,0.515066,0.110556,Tier 3 (Baseline)
7,City_16,0.218296,1.091756,0.0176818,602.262323,0.642986,0.167968,Tier 3 (Baseline)
11,City_2,-0.075044,0.225558,0.133563,1013.378407,0.623704,0.161631,Tier 3 (Baseline)
18,City_8,0.0,0.0,1.0,1043.829508,0.679896,0.139162,Tier 3 (Baseline)
19,City_9,-0.156884,0.21951,0.3271496,1054.582421,0.661001,0.0,Tier 3 (Baseline)
5,City_14,0.0,0.0,1.0,1074.296751,0.623685,0.0,Tier 3 (Baseline)
10,City_19,0.049834,0.243736,0.214722,1083.853897,0.673812,0.001575,Tier 3 (Baseline)


   Lift_%  Power
0       2    1.0
1       5    1.0
2      10    1.0
3      15    0.0
4      20    0.0
