Causal Impact
===

Reference
---
1. [src](https://github.com/raz1470/causal_ai/tree/main/notebooks)
2. [Article](https://towardsdatascience.com/validating-the-causal-impact-of-the-synthetic-control-method-2f3bf185f266)

In [None]:
!pip install -q pytrends

In [None]:
import time
import pandas as pd
from pytrends.request import TrendReq
from pytrends.exceptions import TooManyRequestsError
from requests.exceptions import RequestException

import numpy as np
from scipy.optimize import minimize

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Function to fetch interest by region
def fetch_interest_by_region(keyword, geo='europe', timeframe='2021-06-01 2024-06-01', retries=5):  # Specific date range
    """Fetch Google Trends interest for ``keyword`` broken down by country.

    Parameters
    ----------
    keyword : str
        Search term sent to Google Trends.
    geo : str, default 'europe'
        Geography passed straight to ``build_payload``.
        NOTE(review): confirm that 'europe' is a geo value Google Trends accepts.
    timeframe : str, default '2021-06-01 2024-06-01'
        Date window passed straight to ``build_payload``.
    retries : int, default 5
        Maximum number of attempts before giving up.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``pytrends.interest_by_region``, or an empty
        DataFrame when every attempt raised a ``RequestException``.

    Notes
    -----
    Only ``RequestException`` is retried; any other exception propagates.
    """
    pytrends = TrendReq(hl='en-GB', tz=0)
    for attempt in range(retries):
        try:
            # build_payload lives inside the try so a RequestException it raises is retried too
            pytrends.build_payload([keyword], geo=geo, timeframe=timeframe)
            return pytrends.interest_by_region(resolution='COUNTRY', inc_low_vol=True, inc_geo_code=True)
        except RequestException as e:
            if attempt < retries - 1:
                print(f"Request failed (attempt {attempt + 1}/{retries}): {e}. Waiting for a while before retrying...")
                time.sleep(60)  # Sleep for 60 seconds before retrying
            else:
                print(f"Request failed (attempt {attempt + 1}/{retries}): {e}.")
    # A bounded loop replaces the previous unbounded recursion, which could never give up
    print(f"Failed to fetch interest by region for {keyword!r} after {retries} attempts.")
    return pd.DataFrame()  # Return an empty DataFrame if all retries fail


In [None]:
# Function to fetch weekly interest
def fetch_weekly_interest(keyword, geo, timeframe='2021-06-01 2024-06-01', retries=5):  # Specific date range
    """Fetch the Google Trends interest-over-time series for one geography.

    Parameters
    ----------
    keyword : str
        Search term sent to Google Trends.
    geo : str
        Geography code passed to ``build_payload`` (the notebook passes
        two-letter country codes).
    timeframe : str, default '2021-06-01 2024-06-01'
        Date window passed to ``build_payload``.
    retries : int, default 5
        Maximum number of attempts.

    Returns
    -------
    pandas.DataFrame
        Result of ``pytrends.interest_over_time()`` without the ``isPartial``
        column, or an empty DataFrame if every attempt raised a
        ``RequestException``.

    Notes
    -----
    Only ``RequestException`` is handled here; anything else (for example the
    ``TooManyRequestsError`` that the calling loop catches) propagates.
    """
    pytrends = TrendReq(hl='en-GB', tz=0)
    for attempt in range(retries):
        try:
            pytrends.build_payload([keyword], geo=geo, timeframe=timeframe)
            interest_over_time = pytrends.interest_over_time()
            # errors='ignore' makes the drop a no-op when the column is absent
            return interest_over_time.drop(columns=['isPartial'], errors='ignore')
        except RequestException as e:
            if attempt < retries - 1:
                print(f"Request failed (attempt {attempt + 1}/{retries}): {e}. Retrying after a delay...")
                time.sleep(60)  # Sleep for 60 seconds before retrying
            else:
                # Last attempt: no retry follows, so do not announce one or waste a minute sleeping
                print(f"Request failed (attempt {attempt + 1}/{retries}): {e}.")
    print(f"Failed to fetch data for {geo} after {retries} attempts.")
    return pd.DataFrame()  # Return an empty DataFrame if all retries fail


In [None]:
# Function to fetch the list of European countries
def fetch_european_countries():
    """Return the hard-coded list of two-letter country codes used as the study population.

    A new list is built on every call, so callers may mutate the result freely.

    NOTE(review): a few codes (e.g. 'TW', 'KZ', 'GL') are not conventionally
    European; confirm they are intentionally included as extra donor units.
    """
    codes = (
        "AL AD AM AT AZ BY BE BA BG "
        "HR CY CZ DK EE FO FI FR GE "
        "DE GI GR GL HU IS IE IT KZ "
        "LV LI LT LU MK MT MD MC ME "
        "NL NO PL PT RO RU RS SK SI "
        "ES SE CH TR UA GB VA TW"
    )
    return codes.split()

In [None]:
# Set the keyword and timeframe
# Both values are reused by the per-country fetch loop further down.
keyword = "iPhone"
timeframe = "2021-06-01 2024-06-01"  # Specific date range

In [None]:
# Fetch the list of European countries
# (a hard-coded list of two-letter codes; no network call is made here)
european_countries = fetch_european_countries()


In [None]:
# Dictionary to store data for each country
# Keys are country codes; values are the DataFrames returned by fetch_weekly_interest.
country_data_dict = {}

In [None]:
# Fetch interest data for each country in Europe.
# A rate-limit error pauses for a minute and then retries the same country;
# any other outcome (data or an empty frame) moves on to the next one.
for country_code in european_countries:
    while True:
        try:
            country_data = fetch_weekly_interest(keyword, geo=country_code, timeframe=timeframe)
        except TooManyRequestsError:
            print("Too many requests. Retrying after a delay...")
            time.sleep(60)  # Sleep for 60 seconds before retrying
        else:
            if country_data.empty:
                print(f"No data for {country_code}")
            else:
                country_data_dict[country_code] = country_data
                print(f"Successfully fetched data for {country_code}")
            break

In [None]:
# Convert country_data_dict into a single wide DataFrame with one column per country
combined_data = (
    pd.concat(country_data_dict.values(), axis=1, keys=country_data_dict.keys())
    .droplevel(1, axis=1)  # keep only the country-code level of the column MultiIndex
    .reset_index()         # turn the index into an ordinary leading column
)

# Display the resulting DataFrame
combined_data

In [None]:
combined_data.describe()

In [None]:
combined_data.info()


In [None]:
# save file
import os

output_dir = 'data'
# exist_ok=True makes this cell safe to re-run and removes the separate
# "does it exist?" check that could race with the directory creation.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Build the path with os.path.join rather than manual string concatenation
combined_data.to_csv(os.path.join(output_dir, 'combined_data.csv'), index=False)

In [None]:
# Load data
# Alternative: read the CSV written by the previous cell instead of using the in-memory frame
# df = pd.read_csv(output_dir+'/combined_data.csv')
# df['date'] = pd.to_datetime(df['date'])

# assign() returns a new frame, so df is independent of combined_data;
# the 'date' column is replaced in place by its datetime-parsed version.
df = combined_data.assign(date=pd.to_datetime(combined_data['date']))

# The original name is released here, so re-running this cell requires
# re-running the cell that builds combined_data first.
del combined_data

In [None]:
df.info()

In [None]:
df.describe()

# Simulate a placebo intervention


In [None]:
# Fix the global NumPy seed so the simulated uplift is reproducible
np.random.seed(1234)

# Create intervention flag: 1 for rows dated 2024-04-14 through 2024-06-02 (both inclusive), else 0
mask = df['date'].between("2024-04-14", "2024-06-02")
df['intervention'] = mask.astype(int)

row_count = len(df)

# Create intervention uplift: a uniform random share in [0.10, 0.20) of GB, rounded
df['uplift_perc'] = np.random.uniform(0.10, 0.20, size=row_count)
df['uplift_abs'] = (df['uplift_perc'] * df['GB']).round()

# y equals GB everywhere, plus the uplift on the flagged rows only
df['y'] = df['GB']
df.loc[mask, 'y'] = df.loc[mask, 'GB'] + df.loc[mask, 'uplift_abs']

In [None]:
def synth_plot(df, counterfactual, intervention_date='2024-04-07'):
    """Plot the observed series against a counterfactual series.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date' column, a 'y' column (drawn as "Actual") and
        the column named by ``counterfactual``.
    counterfactual : str
        Name of the column drawn as the dashed "Counterfactual" line.
    intervention_date : str or datetime-like, default '2024-04-07'
        Where the vertical "Intervention" marker is drawn. The default keeps
        the date that was previously hard-coded in the function body.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    sns.set_style("white")
    fig, ax = plt.subplots(figsize=(14, 8))

    # Create plot
    sns.lineplot(data=df, x='date', y='y', label='Actual', color='b', linewidth=2.5, ax=ax)
    sns.lineplot(data=df, x='date', y=counterfactual, label='Counterfactual', color='r',
                 linestyle='--', linewidth=2.5, ax=ax)
    ax.set_title('Synthetic Control Method: Actual vs. Counterfactual', fontsize=24)
    ax.set_xlabel('Date', fontsize=20)
    ax.set_ylabel('Metric Value', fontsize=20)
    ax.legend(fontsize=16)
    ax.xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%Y-%m-%d'))
    ax.tick_params(axis='x', labelrotation=90)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Highlight the intervention point
    marker = pd.to_datetime(intervention_date)
    ax.axvline(marker, color='k', linestyle='--', linewidth=1)
    # Place the label just below the current top of the y-axis
    ax.text(marker, ax.get_ylim()[1] * 0.95, 'Intervention', color='k', fontsize=18, ha='right')

    fig.tight_layout()
    plt.show()

synth_plot(df, 'GB')


In [None]:
df[['y','GB']].plot()

In [None]:
# Pre-processing

# Delete the original target column so we don't use it as a feature by accident
# (guarded so that re-running the cell does not raise a KeyError)
if 'GB' in df.columns:
    del df['GB']

# set feature & targets
# Pick every donor-country column by excluding the bookkeeping columns by name.
# A fixed positional slice (columns[1:50]) silently drops donors when more than
# 49 are present and leaks 'intervention' / 'uplift_*' / 'y' into the features
# when fewer countries were fetched successfully.
non_feature_cols = ['date', 'intervention', 'uplift_perc', 'uplift_abs', 'y']
X = df.columns[~df.columns.isin(non_feature_cols)]
y = 'y'

In [None]:
# Regression Model Training

def train_reg(df, start_index, reg_class):
    """Fit a regression counterfactual on pre-intervention rows and score the estimated lift.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns named by the module-level ``X``,
        the target column named by the module-level ``y``, plus
        'intervention' and 'uplift_abs'.
    start_index : int
        Rows before this position are discarded (``df.iloc[start_index:]``).
    reg_class : estimator
        Despite the name, an already constructed estimator instance;
        ``fit`` and ``predict`` are called on it directly.

    Returns
    -------
    (pandas.DataFrame, float)
        The sliced frame with added 'pred' and 'delta' columns, and the
        absolute error of the predicted lift relative to the actual lift.
    """
    df_temp = df.iloc[start_index:].copy().reset_index()

    # Only rows without the intervention are used for fitting and evaluation
    pre_period = df_temp[df_temp['intervention'] == 0]
    X_train, X_test, y_train, y_test = train_test_split(
        pre_period[X], pre_period[y], test_size=0.10, random_state=42
    )

    model = reg_class
    model.fit(X_train, y_train)

    yhat_train = model.predict(X_train)
    yhat_test = model.predict(X_test)

    mse_train, mse_test = mean_squared_error(y_train, yhat_train), mean_squared_error(y_test, yhat_test)
    print(f"Mean Squared Error train: {round(mse_train, 2)}")
    print(f"Mean Squared Error test: {round(mse_test, 2)}")

    r2_train, r2_test = r2_score(y_train, yhat_train), r2_score(y_test, yhat_test)
    print(f"R2 train: {round(r2_train, 2)}")
    print(f"R2 test: {round(r2_test, 2)}")

    # Counterfactual for every row, and the gap between observed and predicted
    df_temp['pred'] = model.predict(df_temp[X])
    df_temp['delta'] = df_temp['y'] - df_temp['pred']

    # Lift is summed over the intervention rows only
    post_period = df_temp[df_temp['intervention'] == 1]
    pred_lift = post_period['delta'].sum()
    actual_lift = post_period['uplift_abs'].sum()
    abs_error_perc = abs(pred_lift - actual_lift) / actual_lift
    print(f"Predicted lift: {round(pred_lift, 2)}")
    print(f"Actual lift: {round(actual_lift, 2)}")
    print(f"Absolute error percentage: {round(abs_error_perc, 2)}")

    return df_temp, abs_error_perc

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
df_lin_reg_00, pred_lift_lin_reg_00 = train_reg(df, 0, LinearRegression())


In [None]:
df_lin_reg_100, pred_lift_lin_reg_100 = train_reg(df, 100, LinearRegression())


In [None]:
synth_plot(df_lin_reg_100, 'pred')


In [None]:
df_ridge_00, pred_lift_ridge_00 = train_reg(df, 0, RidgeCV())


In [None]:
df_ridge_100, pred_lift_ridge_100 = train_reg(df, 100, RidgeCV())


In [None]:
synth_plot(df_ridge_100, 'pred')


In [None]:
df_lasso_00, pred_lift_lasso_00 = train_reg(df, 0, LassoCV())


In [None]:
df_lasso_100, pred_lift_lasso_100 = train_reg(df, 100, LassoCV())


In [None]:
synth_plot(df_lasso_100, 'pred')


Synthetic control method
---

In [None]:
def synthetic_control(weights, control_units, treated_unit):
    """Objective for the weight search: distance between the treated series and the weighted controls.

    ``control_units`` must expose ``.values`` (e.g. a DataFrame with one row
    per observation and one column per control); ``weights`` has one entry
    per control column. Returns the square root of the summed squared
    differences between ``treated_unit`` and the weighted combination.
    """
    weighted_controls = control_units.values @ weights
    residual = treated_unit - weighted_controls
    return np.sqrt(np.sum(residual ** 2))

In [None]:
def train_synth(df, start_index):
    """Fit synthetic-control weights on pre-intervention rows and score the estimated lift.

    The weights are found with SLSQP by minimising ``synthetic_control``
    subject to each weight lying in [0, 1] and all weights summing to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the control columns named by the module-level ``X``,
        the target column named by the module-level ``y``, plus
        'intervention' and 'uplift_abs'.
    start_index : int
        Rows before this position are discarded (``df.iloc[start_index:]``).

    Returns
    -------
    (pandas.DataFrame, float)
        The sliced frame with added 'pred' and 'delta' columns, and the
        absolute error of the predicted lift relative to the actual lift.

    Warns
    -----
    UserWarning
        If the optimiser reports that it did not terminate successfully; the
        weights are still used, but the results should be treated with care.
    """
    import warnings  # local import keeps this cell self-contained

    df_temp = df.iloc[start_index:].copy().reset_index()

    # Only rows without the intervention are used for fitting and evaluation
    pre_period = df_temp[df_temp['intervention'] == 0]
    X_train, X_test, y_train, y_test = train_test_split(
        pre_period[X], pre_period[y], test_size=0.10, random_state=42
    )

    n_controls = len(X)
    initial_weights = np.ones(n_controls) / n_controls  # start from equal weights

    constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}  # weights sum to 1
    bounds = [(0, 1)] * n_controls  # each weight in [0, 1]

    result = minimize(synthetic_control,
                      initial_weights,
                      args=(X_train, y_train),
                      method='SLSQP',
                      bounds=bounds,
                      constraints=constraints,
                      options={'disp': False, 'maxiter': 1000, 'ftol': 1e-9},
    )

    # Surface optimiser failures instead of silently reporting metrics on unconverged weights
    if not result.success:
        warnings.warn(f"SLSQP did not terminate successfully: {result.message}")

    optimal_weights = result.x

    yhat_train = np.dot(X_train.values, optimal_weights)
    yhat_test = np.dot(X_test.values, optimal_weights)

    mse_train = mean_squared_error(y_train, yhat_train)
    mse_test = mean_squared_error(y_test, yhat_test)
    print(f"Mean Squared Error train: {round(mse_train, 2)}")
    print(f"Mean Squared Error test: {round(mse_test, 2)}")

    r2_train = r2_score(y_train, yhat_train)
    r2_test = r2_score(y_test, yhat_test)
    print(f"R2 train: {round(r2_train, 2)}")
    print(f"R2 test: {round(r2_test, 2)}")

    # Counterfactual for every row, and the gap between observed and predicted
    df_temp['pred'] = np.dot(df_temp.loc[:, X].values, optimal_weights)
    df_temp['delta'] = df_temp['y'] - df_temp['pred']

    # Lift is summed over the intervention rows only
    post_period = df_temp[df_temp['intervention'] == 1]
    pred_lift = post_period['delta'].sum()
    actual_lift = post_period['uplift_abs'].sum()
    abs_error_perc = abs(pred_lift - actual_lift) / actual_lift
    print(f"Predicted lift: {round(pred_lift, 2)}")
    print(f"Actual lift: {round(actual_lift, 2)}")
    print(f"Absolute error percentage: {round(abs_error_perc, 2)}")

    return df_temp, abs_error_perc

In [None]:
df_synth_00, pred_lift_synth_00 = train_synth(df, 0)


In [None]:
df_synth_100, pred_lift_synth_100 = train_synth(df, 100)


In [None]:
synth_plot(df_synth_100, 'pred')


Collate results
---

In [None]:
# run regression experiments
# Each call returns (frame with 'pred'/'delta' columns, absolute error percentage).
# Note: despite their names, the pred_lift_* variables hold that absolute error
# percentage, not the predicted lift itself.
# The *_00 runs use every row (start_index=0); the *_100 runs skip the first 100 rows.
df_lin_reg_00, pred_lift_lin_reg_00 = train_reg(df, 0, LinearRegression())
df_lin_reg_100, pred_lift_lin_reg_100 = train_reg(df, 100, LinearRegression())
df_ridge_00, pred_lift_ridge_00 = train_reg(df, 0, RidgeCV())
df_ridge_100, pred_lift_ridge_100 = train_reg(df, 100, RidgeCV())
df_lasso_00, pred_lift_lasso_00 = train_reg(df, 0, LassoCV())
df_lasso_100, pred_lift_lasso_100 = train_reg(df, 100, LassoCV())

# run synthetic control experiments
# Same return convention and same two start indices as the regression runs.
df_synth_00, pred_lift_synth_00 = train_synth(df, 0)
df_synth_100, pred_lift_synth_100 = train_synth(df, 100)

In [None]:
# Long-format results table: one row per (method, data size) pair.
# "Value" holds the absolute error percentage returned by each experiment.
methods = ["Linear", "Ridge", "Lasso", "Synthetic Control"]
sizes = ["Large", "Small"]

experiment_data = {
    "Method": [method for method in methods for _ in sizes],
    "Data Size": sizes * len(methods),
    "Value": [
        pred_lift_lin_reg_00, pred_lift_lin_reg_100,
        pred_lift_ridge_00, pred_lift_ridge_100,
        pred_lift_lasso_00, pred_lift_lasso_100,
        pred_lift_synth_00, pred_lift_synth_100,
    ],
}

df_experiments = pd.DataFrame(experiment_data)


In [None]:
# df_experiments is built in the previous cell; it is not rebuilt here to avoid
# two copies of the same table definition drifting apart.

# Set the style.
# This must be a function call: assigning to sns.set_style would replace the
# seaborn function with a string, apply no style, and break later callers.
sns.set_style("whitegrid")

# Create the bar plot
fig, ax = plt.subplots(figsize=(10, 6))
bar_plot = sns.barplot(x="Method", y="Value", hue="Data Size", data=df_experiments, palette="muted", ax=ax)

# Add labels and title
ax.set_xlabel("Method")
ax.set_ylabel("Absolute error percentage")
ax.set_title("Synthetic Controls - Comparison of Methods Across Different Data Sizes")
ax.legend(title="Data Size")

# Show the plot
plt.show()