In [42]:
import pandas as pd
import numpy as np
import plotly.express as px
import klib as kl

# impute missing values with iterative imputer 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [43]:
products_data = pd.read_csv('data/regression_clean_data.csv')



In [44]:
data = pd.read_csv('data/site_changes_cleaned.csv')



# create a date column from month and year and add day as 1
data['date'] = pd.to_datetime(data[['year', 'month']].assign(day=1))
data = kl.clean_column_names(data)

data.head()

Unnamed: 0,month,year,incentive,apr,notes,bank,notes_1,tag,site_changes,date
0,1,2021,0,0.0,,abanca,,,,2021-01-01
1,6,2021,150,0.0,800-1200 deposit,abanca,,increase incentive,,2021-06-01
2,8,2021,150,0.0,800-1200 deposit,abanca,,incentive decrease,,2021-08-01
3,4,2022,300,0.0,if over 1200 deposit,abanca,,"incentive increase, apr decrease",,2022-04-01
4,12,2022,300,0.0,mejor cambia de el sitio,abanca,,,major site change,2022-12-01


In [45]:
mapping_dict = {
    'abanca': 'abanca',
    'banc sabadell': 'banc sabadell',
    'bancosabadell': 'banc sabadell',
    'bancosantander': 'santander',
    'bancsabadell': 'banc sabadell',
    'sabadell': 'banc sabadell',
    'bank': 'bank',
    'bankinter': 'bankinter',
    'bbva': 'bbva',
    'caixabank': 'caixabank',
    'caixa_bank': 'caixabank',
    'date': 'date',
    'evobanco': 'evobanco',
    'evo_banco': 'evobanco',
    'imagin': 'imagin',
    'ing': 'ing',
    'ing-bank': 'ing',
    'ing_bank': 'ing',
    'kutxabank': 'kutxabank',
    'liberbank': 'liberbank',
    'myinvestor': 'myinvestor',
    'n26': 'n26',
    'n26.com': 'n26',
    None: None,  # handling nan (not a number)
    'openbank': 'openbank',
    'orangebank': 'orangebank',
    'pibank': 'pibank',
    'renaultbank': 'renaultbank',
    'revolut': 'revolut',
    'santander': 'santander',
    'value': 'value'
}

# map santander to bancosantander in data 
data.bank = data.bank.map(mapping_dict)
products_data.bank = products_data.bank.map(mapping_dict)



In [46]:
# cast date to datetime
products_data['date'] = pd.to_datetime(products_data['date'])
products_data = products_data[['date', 'bank', 'web_traffic']]
products_data['web_traffic'] = np.log(products_data['web_traffic'].astype(float))

data['date'] = pd.to_datetime(data['date']) 

In [47]:
data.head()

Unnamed: 0,month,year,incentive,apr,notes,bank,notes_1,tag,site_changes,date
0,1,2021,0,0.0,,abanca,,,,2021-01-01
1,6,2021,150,0.0,800-1200 deposit,abanca,,increase incentive,,2021-06-01
2,8,2021,150,0.0,800-1200 deposit,abanca,,incentive decrease,,2021-08-01
3,4,2022,300,0.0,if over 1200 deposit,abanca,,"incentive increase, apr decrease",,2022-04-01
4,12,2022,300,0.0,mejor cambia de el sitio,abanca,,,major site change,2022-12-01


In [48]:

# attach product_data to data
data = pd.merge(products_data, data, on=['bank', 'date'], how='left')

# drop the 124 row 
data = data.drop(124)

data['date'] = pd.to_datetime(data[['year', 'month']].assign(day=1))


# for some reason banks are not having the dates attaching properly to the bank data so we have to re add all of the dates to the banks

for bank in data.bank.unique():
    data.loc[data.bank == bank, 'date'] = list(products_data.date.unique())


In [49]:
data = data[['date','bank','web_traffic','site_changes']]
data['web_traffic'] = np.log(data['web_traffic'].astype(float))

In [50]:
control_df_avg = pd.DataFrame(data.groupby('date')['web_traffic'].mean()).reset_index()
control_df_avg['apr'] = 0
control_df_avg['incentive'] = 0
control_df_avg['site_changes'] = 0

separate the data into increase apr and increase incentive lists 


In [51]:
banks = data.bank.unique()

here we create a dictionary for each bank as a key and each value is a dataframe of the bank's data.

In [52]:
# for each bank we take a subsection of the data that has no incentives 

bank_data_dict = {}

for bank in banks:
    # for each bank attach the relevant apr and incentive data
    
    bank_data = data[data.bank == bank].fillna(0)
    
    # if the site_change column is not 0 it should be 1 
    bank_data['site_changes'] = bank_data.site_changes.apply(lambda x: 1 if x != 0 else 0)

    print(bank_data.site_changes.value_counts())
    print(bank_data.groupby('bank')['date'].count())

    bank_data_dict[bank] = bank_data


bank_data_dict
    

    

site_changes
0    24
1     1
Name: count, dtype: int64
bank
abanca    25
Name: date, dtype: int64
site_changes
0    24
1     1
Name: count, dtype: int64
bank
n26    25
Name: date, dtype: int64
site_changes
0    24
1     1
Name: count, dtype: int64
bank
banc sabadell    25
Name: date, dtype: int64
site_changes
0    25
Name: count, dtype: int64
bank
ing    25
Name: date, dtype: int64
site_changes
0    25
Name: count, dtype: int64
bank
bbva    25
Name: date, dtype: int64
site_changes
0    22
1     3
Name: count, dtype: int64
bank
revolut    25
Name: date, dtype: int64
site_changes
0    25
Name: count, dtype: int64
bank
openbank    25
Name: date, dtype: int64
site_changes
0    25
Name: count, dtype: int64
bank
myinvestor    25
Name: date, dtype: int64
site_changes
0    25
Name: count, dtype: int64
bank
bankinter    25
Name: date, dtype: int64
site_changes
0    25
Name: count, dtype: int64
bank
evobanco    25
Name: date, dtype: int64
site_changes
0    24
1     1
Name: count, dtype: int64
ba

  bank_data = data[data.bank == bank].fillna(0)
  bank_data = data[data.bank == bank].fillna(0)
  bank_data = data[data.bank == bank].fillna(0)
  bank_data = data[data.bank == bank].fillna(0)
  bank_data = data[data.bank == bank].fillna(0)
  bank_data = data[data.bank == bank].fillna(0)


{'abanca':          date    bank  web_traffic  site_changes
 0  2021-12-01  abanca     2.264805             0
 1  2022-01-01  abanca     2.365541             0
 2  2022-02-01  abanca     2.346474             0
 3  2022-03-01  abanca     2.346607             0
 4  2022-04-01  abanca     2.361727             0
 5  2022-05-01  abanca     2.332335             0
 6  2022-06-01  abanca     2.348094             0
 7  2022-07-01  abanca     2.401017             0
 8  2022-08-01  abanca     2.388360             0
 9  2022-09-01  abanca     2.383188             0
 10 2022-10-01  abanca     2.382418             0
 11 2022-11-01  abanca     2.388770             0
 12 2022-12-01  abanca     2.409641             1
 13 2023-01-01  abanca     2.414981             0
 14 2023-02-01  abanca     2.380399             0
 15 2023-03-01  abanca     2.394737             0
 16 2023-04-01  abanca     2.386650             0
 17 2023-05-01  abanca     2.394878             0
 18 2023-06-01  abanca     2.404818     

filling na values


In [53]:
# drop bank_x and bank_y columns
for bank in banks:
    bank_data = bank_data_dict[bank]
    #bank_data = bank_data.drop(columns=['bank_x', 'bank_y'])

    print(bank_data.groupby('bank')['site_changes'].sum())
    bank_data.fillna(0, inplace=True)
    bank_data_dict[bank] = bank_data

bank
abanca    1
Name: site_changes, dtype: int64
bank
n26    1
Name: site_changes, dtype: int64
bank
banc sabadell    1
Name: site_changes, dtype: int64
bank
ing    0
Name: site_changes, dtype: int64
bank
bbva    0
Name: site_changes, dtype: int64
bank
revolut    3
Name: site_changes, dtype: int64
bank
openbank    0
Name: site_changes, dtype: int64
bank
myinvestor    0
Name: site_changes, dtype: int64
bank
bankinter    0
Name: site_changes, dtype: int64
bank
evobanco    0
Name: site_changes, dtype: int64
bank
santander    1
Name: site_changes, dtype: int64


here we create two functions to determine if the event is valid and another to find consecutive dates surrounding the event date 

In [54]:
def check_if_valid_event(bank_data, date, buffer, column):
    date_index = bank_data[bank_data.date == date].index[0]
    for i in range(1, buffer + 1):
        prev_date = bank_data.iloc[date_index - i].date
        next_date = bank_data.iloc[date_index + i].date
        prev_apr = bank_data.iloc[date_index - i][column]
        next_apr = bank_data.iloc[date_index + i][column]

        print(f"Checking date: {date}, Buffer-{i}: {prev_date} (APR: {prev_apr}), Buffer+{i}: {next_date} (APR: {next_apr})")
        
        if prev_apr != 0 or next_apr != 0:
            return False

    return True

def find_consecutive_dates(dates, buffer):
    # Calculate the required length of the sequence
    required_length = (buffer * 2) + 1
    
    # Convert dates to a sorted list of pandas Timestamps
    sorted_dates = sorted(pd.to_datetime(dates))
    
    # Iterate through the sorted dates list
    for i in range(len(sorted_dates) - required_length + 1):
        # Extract the sublist of the required length
        sublist = sorted_dates[i:i + required_length]
        
        
        # Check if the sublist is consecutive months
        if all((sublist[j + 1].year * 12 + sublist[j + 1].month) - (sublist[j].year * 12 + sublist[j].month) == 1 for j in range(len(sublist) - 1)):
            return sublist
    
    # If no such sequence is found
    return None

here we run a loop for each bank then find a control date in the 'ing' series which does not alter or change the website



In [55]:


def check_if_valid_event(bank_data, date, buffer, column):
    try:
        date_index = bank_data[bank_data.date == date].index[0]
    except IndexError:
        print(f"Date {date} not found in bank data.")
        return False
    
    for i in range(1, buffer + 1):
        if date_index - i < 0 or date_index + i >= len(bank_data):
            return False
        
        prev_date = bank_data.iloc[date_index - i].date
        next_date = bank_data.iloc[date_index + i].date
        prev_apr = bank_data.iloc[date_index - i][column]
        next_apr = bank_data.iloc[date_index + i][column]

        print(f"Checking date: {date}, Buffer-{i}: {prev_date} (APR: {prev_apr}), Buffer+{i}: {next_date} (APR: {next_apr})")
        
        if prev_apr != 0 or next_apr != 0:
            return False

    return True

def find_consecutive_dates(dates, buffer):
    required_length = (buffer * 2) + 1
    sorted_dates = sorted(pd.to_datetime(dates))

    for i in range(len(sorted_dates) - required_length + 1):
        sublist = sorted_dates[i:i + required_length]
        if all((sublist[j + 1].year * 12 + sublist[j + 1].month) - (sublist[j].year * 12 + sublist[j].month) == 1 for j in range(len(sublist) - 1)):
            return sublist
    
    return None

In [56]:


buffer = 2

all_regressions_site_change = {'regressions': []}

for bank in banks:
    try:
        bank_data = bank_data_dict[bank].copy()
        bank_data['date'] = pd.to_datetime(bank_data['date'])
        bank_data = bank_data.sort_values(by='date').reset_index(drop=True)
        
        site_change_dates = bank_data[bank_data.site_changes > 0]
        
        for date in site_change_dates.date:
            is_valid = check_if_valid_event(bank_data, date, buffer, 'site_changes')
            if is_valid:
                bank_control = 'avg'
                bank_data_control = control_df_avg.copy()
                bank_data_control['date'] = pd.to_datetime(bank_data_control['date'])
                bank_data_control = bank_data_control.sort_values(by='date').reset_index(drop=True)
                
                control_date = bank_data_control[bank_data_control.date == date]
                if not control_date.empty and control_date.site_changes.iloc[0] == 0 and check_if_valid_event(bank_data_control, date, buffer, 'site_changes'):
                    viable_control_dates = bank_data_control[(bank_data_control.site_changes == 0) & (bank_data_control.site_changes == 0)].date
                    viable_control_dates = viable_control_dates[viable_control_dates != date]
                    for i in range(1, buffer + 1):
                        viable_control_dates = viable_control_dates[viable_control_dates != date + pd.DateOffset(months=i)]
                        viable_control_dates = viable_control_dates[viable_control_dates != date - pd.DateOffset(months=i)]
                    
                    viable_bank_dates = bank_data[(bank_data.site_changes == 0) & (bank_data.site_changes == 0)].date
                    viable_bank_dates = viable_bank_dates[viable_bank_dates != date]
                    for i in range(1, buffer + 1):
                        viable_bank_dates = viable_bank_dates[viable_bank_dates != date + pd.DateOffset(months=i)]
                        viable_bank_dates = viable_bank_dates[viable_bank_dates != date - pd.DateOffset(months=i)]

                    viable_periods = viable_control_dates[viable_control_dates.isin(viable_bank_dates)]

                    control_period = find_consecutive_dates(viable_periods, buffer)
                    if control_period:
                        event_series = bank_data[(bank_data.date >= date - pd.DateOffset(months=buffer)) & (bank_data.date <= date + pd.DateOffset(months=buffer))]
                        event_control_series = bank_data_control[(bank_data_control.date >= date - pd.DateOffset(months=buffer)) & (bank_data_control.date <= date + pd.DateOffset(months=buffer))]

                        control_event_series = bank_data[(bank_data.date >= control_period[0]) & (bank_data.date <= control_period[-1])]
                        control_series = bank_data_control[(bank_data_control.date >= control_period[0]) & (bank_data_control.date <= control_period[-1])]

                        regression = {
                            'banks_for_regression': f'{bank} * {bank_control}', 
                            'bank_of_interest_event_series': event_series, 
                            'control_bank_event_series': event_control_series, 
                            'bank_of_interest_non_event_series': control_event_series, 
                            'control_bank_non_event_series': control_series
                        }
                        all_regressions_site_change['regressions'].append(regression)
                    else:
                        print(f"No valid control period found for bank {bank} and control {bank_control} on date {date}")
    except Exception as e:
        print(f"Error processing bank {bank}: {e}")

all_regressions_site_change


Checking date: 2022-12-01 00:00:00, Buffer-1: 2022-11-01 00:00:00 (APR: 0), Buffer+1: 2023-01-01 00:00:00 (APR: 0)
Checking date: 2022-12-01 00:00:00, Buffer-2: 2022-10-01 00:00:00 (APR: 0), Buffer+2: 2023-02-01 00:00:00 (APR: 0)
Checking date: 2022-12-01 00:00:00, Buffer-1: 2022-11-01 00:00:00 (APR: 0), Buffer+1: 2023-01-01 00:00:00 (APR: 0)
Checking date: 2022-12-01 00:00:00, Buffer-2: 2022-10-01 00:00:00 (APR: 0), Buffer+2: 2023-02-01 00:00:00 (APR: 0)
Checking date: 2022-11-01 00:00:00, Buffer-1: 2022-10-01 00:00:00 (APR: 0), Buffer+1: 2022-12-01 00:00:00 (APR: 0)
Checking date: 2022-11-01 00:00:00, Buffer-2: 2022-09-01 00:00:00 (APR: 0), Buffer+2: 2023-01-01 00:00:00 (APR: 0)
Checking date: 2022-11-01 00:00:00, Buffer-1: 2022-10-01 00:00:00 (APR: 0), Buffer+1: 2022-12-01 00:00:00 (APR: 0)
Checking date: 2022-11-01 00:00:00, Buffer-2: 2022-09-01 00:00:00 (APR: 0), Buffer+2: 2023-01-01 00:00:00 (APR: 0)
Checking date: 2023-05-01 00:00:00, Buffer-1: 2023-04-01 00:00:00 (APR: 0), Buff

{'regressions': [{'banks_for_regression': 'abanca * avg',
   'bank_of_interest_event_series':          date    bank  web_traffic  site_changes
   10 2022-10-01  abanca     2.382418             0
   11 2022-11-01  abanca     2.388770             0
   12 2022-12-01  abanca     2.409641             1
   13 2023-01-01  abanca     2.414981             0
   14 2023-02-01  abanca     2.380399             0,
   'control_bank_event_series':          date  web_traffic  apr  incentive  site_changes
   10 2022-10-01     2.261929    0          0             0
   11 2022-11-01     2.273863    0          0             0
   12 2022-12-01     2.268036    0          0             0
   13 2023-01-01     2.284364    0          0             0
   14 2023-02-01     2.279310    0          0             0,
   'bank_of_interest_non_event_series':         date    bank  web_traffic  site_changes
   0 2021-12-01  abanca     2.264805             0
   1 2022-01-01  abanca     2.365541             0
   2 2022-02-01 

define a function to get the relevant data from the statsmodel output into a dataframe

In [57]:
def extract_data_from_sm(model):
    model_dict = {}

    # Extracting coefficients
    coefficients = model.params
    model_dict['coefficients'] = coefficients

    # Extracting standard errors of coefficients
    std_errors = model.bse
    model_dict['standard_errors'] = std_errors

    # Extracting t-values
    t_values = model.tvalues
    model_dict['t_values'] = t_values

    # Extracting p-values
    p_values = model.pvalues
    model_dict['p_values'] = p_values

    # Extracting R-squared
    r_squared = model.rsquared
    model_dict['r_squared'] = r_squared

    return pd.DataFrame(model_dict)

define a function to get control and event periods for each bank out of the dictionary and into a dataframe that be regressed 

also creates the interation terms for the regression

In [58]:
def get_regression_summary(regression): 
    # Extract the DataFrames
    bank_of_interest_event_series = regression['bank_of_interest_event_series']
    control_bank_event_series = regression['control_bank_event_series']
    bank_of_interest_non_event_series = regression['bank_of_interest_non_event_series']
    control_bank_non_event_series = regression['control_bank_non_event_series']

    # Add a 'type' and 'period' columns to each DataFrame
    bank_of_interest_event_series['type'] = 'bank_of_interest_event'
    control_bank_event_series['type'] = 'control_bank_event'
    bank_of_interest_non_event_series['type'] = 'bank_of_interest_non_event'
    control_bank_non_event_series['type'] = 'control_bank_non_event'

    bank_of_interest_event_series['period'] = 'post'
    control_bank_event_series['period'] = 'post'
    bank_of_interest_non_event_series['period'] = 'pre'
    control_bank_non_event_series['period'] = 'pre'

    # Combine the series into a single DataFrame
    combined_df = pd.concat([
        bank_of_interest_event_series,
        control_bank_event_series,
        bank_of_interest_non_event_series,
        control_bank_non_event_series
    ])

    # Reset index to avoid duplicate indices
    combined_df.reset_index(drop=True, inplace=True)

    # Create dummy variables
    combined_df['treatment'] = combined_df['type'].apply(lambda x: 1 if x in ['bank_of_interest_event', 'bank_of_interest_non_event'] else 0)
    combined_df['post'] = combined_df['period'].apply(lambda x: 1 if x == 'post' else 0)

    # Interaction term: treatment * post
    combined_df['treatment_post'] = combined_df['treatment'] * combined_df['post']

    # Dependent variable
    y = combined_df['web_traffic']

    # Independent variables
    X = combined_df[['treatment', 'post', 'treatment_post']]
    X = sm.add_constant(X)

    # Fit the model
    model = sm.OLS(y, X).fit()

    # Get the summary of the regression
    summary = extract_data_from_sm(model)
    summary['output_title'] = regression['banks_for_regression']
    summary['date_of_event'] = bank_of_interest_event_series.date.median()

    return summary 

### get regrssion outputs for each regression for site changes

In [59]:
all_regression_outputs = []

for regression in all_regressions_site_change['regressions']:
    if len(regression.keys()) > 0:
        summary = get_regression_summary(regression)
        all_regression_outputs.append(summary)
    else: 
        print('No valid regressions found')
        print(regression)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bank_of_interest_event_series['type'] = 'bank_of_interest_event'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_bank_event_series['type'] = 'control_bank_event'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bank_of_interest_non_event_series['type'] = 'bank_of_interest_non_event'
A value is

visualize the regression outputs for each bank

In [60]:
all_incentive_diff_n_diff = pd.concat(all_regression_outputs)

# create a column from the index 
all_incentive_diff_n_diff['variable_name'] = all_incentive_diff_n_diff.index

all_incentive_diff_n_diff = all_incentive_diff_n_diff.reset_index(drop=True)

all_incentive_diff_n_diff

Unnamed: 0,coefficients,standard_errors,t_values,p_values,r_squared,output_title,date_of_event,variable_name
0,2.238794,0.010152,220.530288,2.688359e-29,0.897216,abanca * avg,2022-12-01,const
1,0.098236,0.014357,6.842452,3.955778e-06,0.897216,abanca * avg,2022-12-01,treatment
2,0.034706,0.014357,2.417383,0.02793497,0.897216,abanca * avg,2022-12-01,post
3,0.023505,0.020304,1.157656,0.2639944,0.897216,abanca * avg,2022-12-01,treatment_post
4,2.238794,0.003268,684.99295,3.588963e-37,0.962888,n26 * avg,2022-11-01,const
5,-0.041377,0.004622,-8.951833,1.252621e-07,0.962888,n26 * avg,2022-11-01,treatment
6,0.034083,0.004622,7.373885,1.570011e-06,0.962888,n26 * avg,2022-11-01,post
7,-0.041014,0.006537,-6.274457,1.108245e-05,0.962888,n26 * avg,2022-11-01,treatment_post
8,2.238794,0.006617,338.330831,2.858477e-32,0.889676,banc sabadell * avg,2023-05-01,const
9,0.054176,0.009358,5.789225,2.766917e-05,0.889676,banc sabadell * avg,2023-05-01,treatment


get average statistics

In [61]:
print(all_incentive_diff_n_diff.coefficients.mean())
print(all_incentive_diff_n_diff.p_values.mean())


0.5732350813286867
0.04955799629446449


view pertinent statistics

In [62]:
all_incentive_diff_n_diff.loc[all_incentive_diff_n_diff.variable_name == 'treatment_post', ['coefficients','p_values', 'output_title']]


Unnamed: 0,coefficients,p_values,output_title
3,0.023505,0.263994,abanca * avg
7,-0.041014,1.1e-05,n26 * avg
11,0.010224,0.451041,banc sabadell * avg
15,0.164841,2.7e-05,revolut * avg
19,0.027121,0.055785,santander * avg


In [63]:
all_incentive_diff_n_diff.loc[all_incentive_diff_n_diff.variable_name == 'treatment_post', ['coefficients','p_values', 'output_title']].to_csv('data/site_changes_diff_n_diff.csv', index=False)

In [66]:
data.site_changes.value_counts()

site_changes
major site change     5
major site change     2
Name: count, dtype: int64