# Automated City Pairing for Causal Analysis

This notebook treats every available city in turn as the target, scores all other cities as control candidates (raw and transformed correlation, Granger causality, and variance ratio), and exports the results to `city_pairings.csv`. This automates control selection for any configuration of target and controls.

In [7]:
import pandas as pd
import numpy as np
import warnings
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
from tqdm import tqdm

# Silence noisy library warnings for the batch run.
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', UserWarning)

# Constants from causal_impact.ipynb
# Pre-intervention window; used as label bounds when slicing the date index.
pre_beg = '2023-01-01'
pre_end = '2023-05-31'
# Correlation cut-offs for the Tier 1 and Tier 2 assignments.
t1_thresh = 0.8
t2_thresh = 0.6

## 1. Load and Prepare Data

In [8]:
# Read the long-format sales table and parse its date column.
df_long = pd.read_csv('sales_data.csv')
df_long = df_long.assign(Date=pd.to_datetime(df_long['Date']))

# Reshape to wide format: one row per date, one column per city.
df = df_long.pivot(index='Date', columns='City', values='Value')
df.index.freq = 'D'  # declare the date index as daily

cities = sorted(df.columns)
print(f"Processing {len(cities)} cities...")

Processing 23 cities...


## 2. Refined Selection Step (Sequential Stationarity)

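For each target, we evaluate every other city as a control candidate. A candidate is selected if it is highly correlated with the target AND the candidate series can be made stationary through our standard pipeline (raw → log → log + diff → log + diff + seasonal diff).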

In [9]:
def get_stationary_transform(series, seasonal_period=7, alpha=0.05):
    """Find the first transform in the pipeline that makes ``series`` stationary.

    Candidates are tried in order: raw, log, log + first difference,
    log + first difference + seasonal difference. A candidate is accepted
    when the Augmented Dickey-Fuller p-value is below ``alpha``.

    Parameters
    ----------
    series : pandas.Series
        The series to test.
    seasonal_period : int, default 7
        Lag used for the seasonal differencing step.
    alpha : float, default 0.05
        Significance level for the ADF test.

    Returns
    -------
    tuple
        ``(step_name, transform_func)`` where ``step_name`` is one of
        ``"Raw"``, ``"Log"``, ``"Log+Diff"``, ``"Log+Diff+Seasonal"`` and
        ``transform_func`` maps a Series to its transformed version, or
        ``(None, None)`` when no step yields a stationary series.
    """
    def is_stationary(s):
        # A failed test (too few points, constant series, ...) counts as
        # "not stationary". Catch Exception rather than using a bare except so
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            return adfuller(s.dropna())[1] < alpha
        except Exception:
            return False

    def to_log(s):
        return np.log(s)

    def to_log_diff(s):
        return np.log(s).diff()

    def to_log_diff_seasonal(s):
        return np.log(s).diff().diff(seasonal_period)

    if is_stationary(series):
        return "Raw", lambda s: s

    log_pipeline = (
        ("Log", to_log),
        ("Log+Diff", to_log_diff),
        ("Log+Diff+Seasonal", to_log_diff_seasonal),
    )

    try:
        # The log is undefined for values <= 0. Without this guard np.log
        # yields NaN/-inf (with a RuntimeWarning), the NaNs are dropped before
        # the ADF test, and the series could be declared stationary on gapped
        # data. Such series are rejected instead.
        if (series <= 0).any():
            return None, None

        for step_name, transform_func in log_pipeline:
            if is_stationary(transform_func(series)):
                return step_name, transform_func
    except Exception:
        # Best effort: a series that cannot be transformed at all
        # (e.g. non-numeric data) has no stationary form.
        return None, None

    return None, None

## 3. Batch Evaluation (Adapting select_best_controls loop)

In [10]:
results = []
df_pre = df.loc[pre_beg:pre_end]

# Accepted range for std(candidate) / std(target) in a Tier 1 match.
var_ratio_min, var_ratio_max = 0.5, 2.0

# The stationarity transform depends only on the candidate city, never on the
# target, so resolve it once per city instead of once per (target, city) pair.
city_transforms = {}
for city in cities:
    step_name, transform_func = get_stationary_transform(df_pre[city])
    s_city_trans = (
        transform_func(df_pre[city]).dropna() if step_name is not None else None
    )
    city_transforms[city] = (step_name, transform_func, s_city_trans)

for target in tqdm(cities, desc="Targets"):
    for city in cities:
        if target == city:
            continue

        corr_raw = df_pre[target].corr(df_pre[city])
        step_name, transform_func, s_city_trans = city_transforms[city]

        # These stay None when the candidate cannot be made stationary.
        corr_trans = None
        granger_p = None
        var_ratio = None

        if step_name is not None:
            # Apply the candidate's transform to the target as well so both
            # series are compared on the same scale.
            s_target_trans = transform_func(df_pre[target]).dropna()

            # Behavioral Correlation
            joined = pd.concat([s_city_trans, s_target_trans], axis=1).dropna()
            corr_trans = joined.iloc[:, 0].corr(joined.iloc[:, 1])

            # Granger Causality: with columns ordered [target, city] the test
            # asks whether lags of `city` help predict `target`. Only the
            # lag-1 F-test p-value is reported.
            try:
                granger_result = grangercausalitytests(joined[[target, city]], maxlag=2, verbose=False)
                granger_p = granger_result[1][0]['params_ftest'][1]
            except Exception:
                # Record a failed test as missing rather than a fabricated
                # p-value of 1.0, which would look like a real result.
                granger_p = None

            # Variance Ratio (Volatility Matching)
            var_ratio = s_city_trans.std() / s_target_trans.std()

        # Initial Assignment with Variance and Correlation Filter
        tier = "None"
        if corr_trans is not None:
            if corr_trans > t1_thresh:
                if var_ratio_min < var_ratio < var_ratio_max:
                    tier = "Tier 1 (Strict)"
                else:
                    # Also covers ratios below the lower bound and NaN/inf.
                    tier = "Rejected (High Variance)"
            elif corr_trans > t2_thresh and corr_raw > t2_thresh:
                tier = "Tier 2 (Fallback)"

        results.append({
            'Target': target,
            'City': city,
            'Correlation_Raw': corr_raw,
            'Correlation_Transformed': corr_trans,
            'Granger_p_value': granger_p,
            'Variance_Ratio': var_ratio,
            'Selection_Tier': tier
        })

pairings_df = pd.DataFrame(results)
pairings_df.to_csv('city_pairings.csv', index=False)
print(f"\nSaved {len(pairings_df)} pairs to city_pairings.csv")

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
Targets: 100%|██████████| 23/23 [00:03<00:00,  6.92it/s]


Saved 506 pairs to city_pairings.csv





## 4. Summary of Top Pairs

In [11]:
# Keep only the strict Tier 1 matches, strongest transformed correlation first.
is_tier1 = pairings_df['Selection_Tier'].eq('Tier 1 (Strict)')
top_pairs = pairings_df.loc[is_tier1].sort_values(by='Correlation_Transformed', ascending=False)
top_pairs.head(10)

Unnamed: 0,Target,City,Correlation_Raw,Correlation_Transformed,Granger_p_value,Variance_Ratio,Selection_Tier
207,City_17,City_18,0.990597,0.990597,0.1660581,1.022615,Tier 1 (Strict)
97,City_12,City_18,0.989543,0.989543,0.03781585,1.000213,Tier 1 (Strict)
428,City_8,City_18,0.989489,0.989489,0.001224329,0.991518,Tier 1 (Strict)
450,City_9,City_18,0.989385,0.989385,0.0160614,0.996888,Tier 1 (Strict)
141,City_14,City_18,0.988692,0.988692,0.0005985649,1.003936,Tier 1 (Strict)
274,City_2,City_18,0.987244,0.987244,0.007442489,0.994353,Tier 1 (Strict)
252,City_19,City_18,0.987059,0.987059,0.01136304,0.984416,Tier 1 (Strict)
9,Barcelona,City_18,0.964407,0.964407,0.0001704631,1.124694,Tier 1 (Strict)
296,City_20,City_18,0.878066,0.878066,2.395668e-11,0.734666,Tier 1 (Strict)
494,City_Spurious,City_18,0.867317,0.867317,4.357006e-20,0.803994,Tier 1 (Strict)
