# Candidate City Selection for Causal Inference

This notebook iterates over all cities as potential targets and identifies control cities that match the multi-layer robustness criteria from the causal impact analysis.

In [24]:
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
from causalimpact import CausalImpact
import seaborn as sns
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import mutual_info_regression
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

# Set visual style
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

# Set up periods
pre_beg, pre_end = '2023-01-01', '2023-05-31'
post_beg, post_end = '2023-06-01', '2023-06-30'
pre_period = [pre_beg, pre_end]
post_period = [post_beg, post_end]

In [25]:
def get_stationary_transform(series, seasonal_period=7):
    """Returns (step_name, transform_func) or (None, None)."""
    def is_stationary(s):
        try:
            return adfuller(s.dropna())[1] < 0.05
        except:
            return False

    if is_stationary(series):
        return "Raw", lambda s: s
    
    try:
        s_log = np.log(series)
        if is_stationary(s_log): return "Log", lambda s: np.log(s)
        
        s_diff = s_log.diff()
        if is_stationary(s_diff): return "Log+Diff", lambda s: np.log(s).diff()
        
        s_seasonal = s_diff.diff(seasonal_period)
        if is_stationary(s_seasonal): 
            return "Log+Diff+Seasonal", lambda s: np.log(s).diff().diff(seasonal_period)
    except:
        pass
        
    return None, None

def select_best_controls(df_full, target, pre_beg, pre_end, t1_thresh=0.8, t2_thresh=0.6):
    df_pre = df_full.loc[pre_beg:pre_end]
    print(f"Selection tests applied to range: {df_pre.index.min()} to {df_pre.index.max()}")
    potential_controls = [c for c in df_full.columns if c != target]
    
    results = []
    for city in potential_controls:
        corr_raw = df_pre[target].corr(df_pre[city])
        step_name, transform_func = get_stationary_transform(df_pre[city])
        
        corr_trans = None
        granger_p = None
        var_ratio = None
        
        if step_name:
            s_city_trans = transform_func(df_pre[city]).dropna()
            s_target_trans = transform_func(df_pre[target]).dropna()
            
            # Behavioral Correlation
            joined = pd.concat([s_city_trans, s_target_trans], axis=1).dropna()
            corr_trans = joined.iloc[:, 0].corr(joined.iloc[:, 1])
            
            # Granger Causality
            try:
                granger_result = grangercausalitytests(joined[[target, city]], maxlag=2)
                granger_p = granger_result[1][0]['params_ftest'][1]
            except: granger_p = 1.0
            
            # Variance Ratio (Volatility Matching)
            var_ratio = s_city_trans.std() / s_target_trans.std()
        
        # Initial Assignment with Variance and Correlation Filter
        tier = "None"
        if corr_trans and corr_trans > t1_thresh:
            if 0.5 < var_ratio < 2.0:
                tier = "Tier 1 (Strict)"
            else:
                tier = "Rejected (High Variance)"
        elif corr_trans and corr_trans > t2_thresh and corr_raw > t2_thresh:
            tier = "Tier 2 (Fallback)"
        
        results.append({
            'City': city,
            'Correlation_Raw': corr_raw,
            'Correlation_Transformed': corr_trans,
            'Granger_p_value': granger_p,
            'Variance_Ratio': var_ratio,
            'Selection_Tier': tier
        })
    
    matrix = pd.DataFrame(results)
    
    # Final Selection Logic
    t1_cities = matrix[matrix['Selection_Tier'] == "Tier 1 (Strict)"]['City'].tolist()
    if t1_cities:
        selected = t1_cities
        status = f"Using Tier 1 (Strict) selection."
    else:
        t2_cities = matrix[matrix['Selection_Tier'] == "Tier 2 (Fallback)"]['City'].tolist()
        selected = t2_cities
        status = "No Tier 1 cities found. Falling back to Tier 2." if t2_cities else "No eligible cities found."
        
    return matrix, selected, status

In [None]:
def select_sparse_portfolio(df_pre, target):
    """TRICK 1: LASSO selection for a sparse 'optimal' control group."""
    X = df_pre.drop(columns=[target]).fillna(0)
    y = df_pre[target].fillna(0)
    
    lasso = LassoCV(cv=5, random_state=42).fit(X, y)
    weights = pd.Series(lasso.coef_, index=X.columns)
    
    return weights[weights != 0].sort_values(ascending=False)

def check_rolling_stability(df_pre, target, city, window=30):
    """TRICK 2: Filter out volatile/drifting correlations."""
    rolling_corr = df_pre[target].rolling(window).corr(df_pre[city])
    return rolling_corr.mean(), rolling_corr.std()

def calculate_dtw_distance(s1, s2):
    """TRICK 3: Dynamic Time Warping for shape-based matching."""
    distance, path = fastdtw(s1.fillna(0).values, s2.fillna(0).values, dist=lambda a, b: abs(a - b))
    return distance

def get_mutual_info(s1, s2):
    """TRICK 4: Non-linear dependency detection."""
    mi = mutual_info_regression(s1.values.reshape(-1, 1), s2)
    return mi[0]

In [26]:
df_long = pd.read_csv('sales_data.csv')
df_long['Date'] = pd.to_datetime(df_long['Date'])

# Pivot the long format data back to wide format for analysis
df = df_long.pivot(index='Date', columns='City', values='Value')

print(f"Dataset loaded in LONG format and pivoted to WIDE. Total columns: {len(df.columns)}")
df.head()

df.index.freq = 'D'

Dataset loaded in LONG format and pivoted to WIDE. Total columns: 23


In [34]:
# List all cities
all_cities = df.columns.tolist()

# To store candidate pairs
candidate_pairs = []

for target in all_cities:
    selection_matrix, selected_cities, selection_status = select_best_controls(df, target, pre_beg, pre_end)
    # Add pairs
    for control in selected_cities:
        candidate_pairs.append((target, control))

# Convert to DataFrame for easy viewing
pairs_df = pd.DataFrame(candidate_pairs, columns=['Target', 'Control'])


# Save to CSV
pairs_df.to_csv('candidate_pairs.csv', index=False)
print("Candidate pairs saved to 'candidate_pairs.csv'")

Selection tests applied to range: 2023-01-01 00:00:00 to 2023-05-31 00:00:00

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.4839  , p=0.4878  , df_denom=146, df_num=1
ssr based chi2 test:   chi2=0.4938  , p=0.4822  , df=1
likelihood ratio test: chi2=0.4930  , p=0.4826  , df=1
parameter F test:         F=0.4839  , p=0.4878  , df_denom=146, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.1538  , p=0.8576  , df_denom=143, df_num=2
ssr based chi2 test:   chi2=0.3184  , p=0.8528  , df=2
likelihood ratio test: chi2=0.3180  , p=0.8530  , df=2
parameter F test:         F=0.1538  , p=0.8576  , df_denom=143, df_num=2

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.5947  , p=0.4418  , df_denom=147, df_num=1
ssr based chi2 test:   chi2=0.6069  , p=0.4360  , df=1
likelihood ratio test: chi2=0.6056  , p=0.4364  , df=1
parameter F test:         F=0.5947  , p=0.4418  , df_denom=147, df_num=1

Granger Causalit

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


ssr based F test:         F=2.1604  , p=0.1439  , df_denom=139, df_num=1
ssr based chi2 test:   chi2=2.2071  , p=0.1374  , df=1
likelihood ratio test: chi2=2.1901  , p=0.1389  , df=1
parameter F test:         F=2.1604  , p=0.1439  , df_denom=139, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=1.1993  , p=0.3046  , df_denom=136, df_num=2
ssr based chi2 test:   chi2=2.4867  , p=0.2884  , df=2
likelihood ratio test: chi2=2.4651  , p=0.2916  , df=2
parameter F test:         F=1.1993  , p=0.3046  , df_denom=136, df_num=2

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.9559  , p=0.1641  , df_denom=147, df_num=1
ssr based chi2 test:   chi2=1.9959  , p=0.1577  , df=1
likelihood ratio test: chi2=1.9827  , p=0.1591  , df=1
parameter F test:         F=1.9559  , p=0.1641  , df_denom=147, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=1.2606  , p=0.2866  , df_denom=144, df_num=2
ssr based chi2 test: 

## Outside the Box: Advanced Selection Demo

In [None]:
target_city = 'City_1' # Example
df_pre = df.loc[pre_beg:pre_end]

print(f'--- Advanced Selection for {target_city} ---')

# 1. LASSO Portfolio
lasso_weights = select_sparse_portfolio(df_pre, target_city)
print('\n1. LASSO Weights (The Sparse Portfolio):')
print(lasso_weights)

# 2. Advanced Metrics Loop
# We use a limited set of cities for the demo to save time
potential_controls = [c for c in df.columns if c != target_city][:10] # Top 10 for speed
adv_results = []

for city in potential_controls:
    stab_mean, stab_std = check_rolling_stability(df_pre, target_city, city)
    dtw_dist = calculate_dtw_distance(df_pre[target_city], df_pre[city])
    mi_score = get_mutual_info(df_pre[city], df_pre[target_city])
    
    adv_results.append({
        'City': city,
        'Rolling_Corr_Mean': stab_mean,
        'Rolling_Corr_Std': stab_std,
        'DTW_Distance': dtw_dist,
        'Mutual_Info': mi_score
    })

df_adv = pd.DataFrame(adv_results).sort_values('DTW_Distance')
print('\n2. Advanced Metrics (Demo Subset):')
display(df_adv.head(5))

# 3. Selection Trick: Stability vs Correlation
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_adv, x='Rolling_Corr_Mean', y='Rolling_Corr_Std', size='Mutual_Info', hue='DTW_Distance')
plt.title('Selection Trick: Stability vs Correlation (Size=MI)')
plt.grid(True)
plt.show()