In [113]:
import pandas as pd
import numpy as np
import plotly.express as px

# impute missing values with iterative imputer 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [114]:
products_data = pd.read_csv('data/regression_clean_data.csv')
products_data 

Unnamed: 0,bank,date,web_traffic,cross_visitation,search_interest,rank,incentive,apr,mentions
0,abanca,2021-12-01,15202.983835,0.077994,7.75,4.000000,150.0,0.0,1.0
1,abanca,2022-01-01,42183.913207,0.062500,7.80,4.000000,150.0,0.0,1.0
2,abanca,2022-02-01,34498.153115,0.052288,7.25,4.000000,150.0,0.0,2.0
3,abanca,2022-03-01,34546.319021,0.061503,6.75,4.000000,150.0,0.0,2.0
4,abanca,2022-04-01,40508.288420,0.050222,6.75,4.000000,300.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...
270,santander,2023-08-01,34290.160425,0.118914,50.50,3.500000,150.0,0.0,1.0
271,santander,2023-09-01,40388.737821,0.125496,56.25,3.500000,400.0,0.0,3.0
272,santander,2023-10-01,36636.706633,0.126347,58.60,3.500000,400.0,0.0,4.0
273,santander,2023-11-01,35387.630139,0.121212,59.75,1.500000,400.0,0.0,3.0


In [115]:
data = pd.read_csv('data/site_changes.csv')
data.head()

Unnamed: 0,month,year,incentive,apr,notes,bank,notes.1,tag,site changes
0,1,2021,0.0,0.0,,abanca,,,
1,6,2021,150.0,0.0,800-1200 deposit,abanca,,incentive increase,
2,8,2021,150.0,0.0,800-1200 deposit,abanca,,incentive decrease,
3,4,2022,300.0,0.0,if over 1200 deposit,abanca,,"incentive decrease, apr decrease",
4,12,2022,300.0,0.0,mejor cambia de el sitio,abanca,,,major site change


In [139]:
data.groupby('bank')['incentive'].sum()

bank
abanca           1200.0
banc sabadell    1050.0
bankinter           0.0
bbva                0.0
evobanco            0.0
ing                 0.0
myinvestor          0.0
n26                 0.0
openbank            0.0
revolut             0.0
santander         850.0
Name: incentive, dtype: float64

In [140]:
data.groupby('bank')['apr'].sum()

bank
abanca            4.00
banc sabadell    14.00
bankinter        10.00
bbva              4.00
evobanco          7.41
ing               0.00
myinvestor        5.00
n26               2.26
openbank          4.01
revolut           0.00
santander         0.00
Name: apr, dtype: float64

In [116]:
data.tag.value_counts()

tag
increase incentive                  12
increase apr                        12
incentive increase                   3
apr increase                         3
decrease apr                         3
incentive decrease, apr decrease     2
incentive decrease, apr increase     2
incentive decrease                   1
incentive increase, apr increase     1
decrease incentive                   1
decrease incentive                   1
temporary incentive                  1
Name: count, dtype: int64

In [117]:
# rename tags

data.tag = data.tag.replace({'apr increase':'increase apr', 'incentive increase':'increase incentive', 'decrease incentive':'incentive decrease',  'decrease incentive ':'incentive decrease'})

       
data.tag.value_counts()

tag
increase incentive                  15
increase apr                        15
incentive decrease                   3
decrease apr                         3
incentive decrease, apr decrease     2
incentive decrease, apr increase     2
incentive increase, apr increase     1
temporary incentive                  1
Name: count, dtype: int64

In [118]:
mapping_dict = {
    'abanca': 'abanca',
    'banc sabadell': 'banc sabadell',
    'bancosabadell': 'banc sabadell',
    'bancosantander': 'santander',
    'bancsabadell': 'banc sabadell',
    'sabadell': 'banc sabadell',
    'bank': 'bank',
    'bankinter': 'bankinter',
    'bbva': 'bbva',
    'caixabank': 'caixabank',
    'caixa_bank': 'caixabank',
    'date': 'date',
    'evobanco': 'evobanco',
    'evo_banco': 'evobanco',
    'imagin': 'imagin',
    'ing': 'ing',
    'ing-bank': 'ing',
    'ing_bank': 'ing',
    'kutxabank': 'kutxabank',
    'liberbank': 'liberbank',
    'myinvestor': 'myinvestor',
    'n26': 'n26',
    'n26.com': 'n26',
    None: None,  # handling nan (not a number)
    'openbank': 'openbank',
    'orangebank': 'orangebank',
    'pibank': 'pibank',
    'renaultbank': 'renaultbank',
    'revolut': 'revolut',
    'santander': 'santander',
    'value': 'value'
}

# map santander to bancosantander in data 
data.bank = data.bank.map(mapping_dict)
products_data.bank = products_data.bank.map(mapping_dict)



create two lists one for APR and Incentive increases. 

In [119]:
import klib as kl

In [120]:
# cast date to datetime
products_data['date'] = pd.to_datetime(products_data['date'])
products_data = products_data[['date', 'bank', 'web_traffic']]

In [121]:

# create a date column from month and year and add day as 1
data['date'] = pd.to_datetime(data[['year', 'month']].assign(day=1))
data = kl.clean_column_names(data)

# attach product_data to data
data = products_data.merge(data, on=['bank', 'date'], how='left')

# drop row 124
data = data.drop(124)

data['date'] = pd.to_datetime(data[['year', 'month']].assign(day=1))

In [122]:


apr_increase_dates = data[data.tag == 'increase apr']
incentive_increase_dates = data[data.tag == 'increase incentive']


all_dates = data.date.unique()




In [123]:
apr_increase_dates = apr_increase_dates[['date', 'bank', 'apr']]
incentive_increase_dates = incentive_increase_dates[['date', 'bank', 'incentive']]


In [124]:
# attach incentive and apr to the products data
products_data = products_data.reset_index()

products_data['date'] = products_data.date.dt.to_period('M').dt.to_timestamp()


In [125]:
banks = data.bank.unique()

In [126]:
banks[1] = 'bancsabadell'

In [127]:
# for each bank we take a subsection of the data that has no incentives 

bank_data_dict = {}

for bank in banks:
    # for each bank attach the relevant apr and incentive data
    
    apr_data = apr_increase_dates[apr_increase_dates.bank == bank].fillna(0)
    incentive_data = incentive_increase_dates[incentive_increase_dates.bank == bank].fillna(0)

    # merge the apr and incentives data with a copy of the products data specific to the bank
    temp_products_data = products_data[products_data.bank == bank].fillna(0)
    bank_data = temp_products_data.merge(apr_data, on=['date','bank'], how='left')
    bank_data = bank_data.merge(incentive_data, on=['date','bank'], how='left')

    print(bank_data.apr.value_counts())
    print(bank_data.groupby('bank')['date'].count())

    bank_data_dict[bank] = bank_data



bank_data_dict
    

    

apr
2.0    2
Name: count, dtype: int64
bank
abanca    25
Name: date, dtype: int64
Series([], Name: count, dtype: int64)
Series([], Name: date, dtype: int64)
apr
2.0    1
Name: count, dtype: int64
bank
banc sabadell    25
Name: date, dtype: int64
Series([], Name: count, dtype: int64)
bank
ing    25
Name: date, dtype: int64
Series([], Name: count, dtype: int64)
bank
bbva    25
Name: date, dtype: int64
Series([], Name: count, dtype: int64)
bank
revolut    25
Name: date, dtype: int64
apr
1.00    1
1.25    1
1.76    1
Name: count, dtype: int64
bank
openbank    25
Name: date, dtype: int64
apr
1.0    2
2.0    1
Name: count, dtype: int64
bank
myinvestor    25
Name: date, dtype: int64
Series([], Name: count, dtype: int64)
bank
bankinter    25
Name: date, dtype: int64
apr
0.06    1
2.00    1
2.50    1
2.85    1
Name: count, dtype: int64
bank
evobanco    25
Name: date, dtype: int64
Series([], Name: count, dtype: int64)
bank
santander    25
Name: date, dtype: int64


{'abanca':     index       date    bank   web_traffic  apr  incentive
 0       0 2021-12-01  abanca  15202.983835  NaN        NaN
 1       1 2022-01-01  abanca  42183.913207  NaN        NaN
 2       2 2022-02-01  abanca  34498.153115  NaN        NaN
 3       3 2022-03-01  abanca  34546.319021  NaN        NaN
 4       4 2022-04-01  abanca  40508.288420  NaN        NaN
 5       5 2022-05-01  abanca  29791.338180  NaN        NaN
 6       6 2022-06-01  abanca  35087.423687  NaN        NaN
 7       7 2022-07-01  abanca  61969.499304  NaN        NaN
 8       8 2022-08-01  abanca  53939.003961  NaN        NaN
 9       9 2022-09-01  abanca  50991.056414  NaN        NaN
 10     10 2022-10-01  abanca  50567.127100  NaN        NaN
 11     11 2022-11-01  abanca  54180.810911  NaN        NaN
 12     12 2022-12-01  abanca  68183.612577  NaN        NaN
 13     13 2023-01-01  abanca  72370.734829  NaN        NaN
 14     14 2023-02-01  abanca  49474.725810  NaN        NaN
 15     15 2023-03-01  abanca 

In [128]:
# drop bank_x and bank_y columns
for bank in banks:
    bank_data = bank_data_dict[bank]
    #bank_data = bank_data.drop(columns=['bank_x', 'bank_y'])
    bank_data.fillna(0, inplace=True)
    bank_data_dict[bank] = bank_data

In [129]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [130]:
def check_if_valid_event(bank_data, date, buffer):
    date_index = bank_data[bank_data.date == date].index[0]
    for i in range(1, buffer + 1):
        prev_date = bank_data.iloc[date_index - i].date
        next_date = bank_data.iloc[date_index + i].date
        prev_apr = bank_data.iloc[date_index - i].apr
        next_apr = bank_data.iloc[date_index + i].apr

        print(f"Checking date: {date}, Buffer-{i}: {prev_date} (APR: {prev_apr}), Buffer+{i}: {next_date} (APR: {next_apr})")
        
        if prev_apr != 0 or next_apr != 0:
            return False

    return True

def find_consecutive_dates(dates, buffer):
    # Calculate the required length of the sequence
    required_length = (buffer * 2) + 1
    
    # Convert dates to a sorted list of pandas Timestamps
    sorted_dates = sorted(pd.to_datetime(dates))
    
    # Iterate through the sorted dates list
    for i in range(len(sorted_dates) - required_length + 1):
        # Extract the sublist of the required length
        sublist = sorted_dates[i:i + required_length]
        
        
        # Check if the sublist is consecutive months
        if all((sublist[j + 1].year * 12 + sublist[j + 1].month) - (sublist[j].year * 12 + sublist[j].month) == 1 for j in range(len(sublist) - 1)):
            return sublist
    
    # If no such sequence is found
    return None

buffer = 1

# For each event date we verify that there is 1 day before and after that are not events
all_regressions_apr = {'regressions': []}

# For each bank we find an event date
for bank in banks:
    try:
        bank_data = bank_data_dict[bank]
        
        # Find dates where there is an APR increase
        apr_increase_dates = bank_data[bank_data.apr > 0]
        
        for date in apr_increase_dates.date:
            is_valid = check_if_valid_event(bank_data, date, buffer)
            # If the date is valid we check for a control date in another bank
            if is_valid:
                for bank_control in banks:
                    if bank_control != bank:
                        bank_data_control = bank_data_dict[bank_control]
                        control_date = bank_data_control[bank_data_control.date == date]
                        if not control_date.empty and control_date.apr.iloc[0] == 0 and check_if_valid_event(bank_data_control, date, buffer):
                            # Now we have a valid event and control date
                            # We now need to find a control period where neither bank has activity

                            # Create a list of dates that are not events for either bank or the control period
                            viable_control_dates = bank_data_control[(bank_data_control.apr == 0) & (bank_data_control.incentive == 0)].date
                            # Also remove the event dates + buffer
                            viable_control_dates = viable_control_dates[viable_control_dates != date]
                            for i in range(1, buffer + 1):
                                viable_control_dates = viable_control_dates[viable_control_dates != date + pd.DateOffset(i)]
                                viable_control_dates = viable_control_dates[viable_control_dates != date - pd.DateOffset(i)]
                            
                            # Do the same for the bank dates
                            viable_bank_dates = bank_data[(bank_data.apr == 0) & (bank_data.incentive == 0)].date
                            viable_bank_dates = viable_bank_dates[viable_bank_dates != date]
                            for i in range(1, buffer + 1):
                                viable_bank_dates = viable_bank_dates[viable_bank_dates != date + pd.DateOffset(i)]
                                viable_bank_dates = viable_bank_dates[viable_bank_dates != date - pd.DateOffset(i)]

                            # Now we have a list of viable control dates for both banks
                            # We need to find the intersection of these two lists of dates
                            viable_periods = viable_control_dates[viable_control_dates.isin(viable_bank_dates)]

                            # Now we need to find a sequence of consecutive dates that are in the viable periods and (buffer*2)+1 dates long
                            control_period = find_consecutive_dates(viable_periods, buffer)
                            print(control_period)
                            
                            # Now we create each series with the relevant data
                            event_series = bank_data[(bank_data.date >= date - pd.DateOffset(months=buffer)) & (bank_data.date <= date + pd.DateOffset(months=buffer))]
                            event_control_series = bank_data_control[(bank_data_control.date >= date - pd.DateOffset(months=buffer)) & (bank_data_control.date <= date + pd.DateOffset(months=buffer))]

                            control_event_series = bank_data[(bank_data.date >= control_period[0]) & (bank_data.date <= control_period[-1])]
                            control_series = bank_data_control[(bank_data_control.date >= control_period[0]) & (bank_data_control.date <= control_period[-1])]

                            # Now we add it to a dictionary
                            regression = {
                                'banks_for_regression': f'{bank} * {bank_control}', 
                                'event_series': event_series, 
                                'event_control_series': event_control_series, 
                                'control_event_series': control_event_series, 
                                'control_series': control_series
                            }
                            all_regressions['regressions'].append(regression)
    except Exception as e:
        print(f"Error processing bank {bank}: {e}")

all_regressions


Checking date: 2023-07-01 00:00:00, Buffer-1: 2023-06-01 00:00:00 (APR: 0.0), Buffer+1: 2023-08-01 00:00:00 (APR: 0.0)
Checking date: 2023-07-01 00:00:00, Buffer-1: 2023-06-01 00:00:00 (APR: 0.0), Buffer+1: 2023-08-01 00:00:00 (APR: 0.0)
[Timestamp('2021-12-01 00:00:00'), Timestamp('2022-01-01 00:00:00'), Timestamp('2022-02-01 00:00:00')]
Checking date: 2023-07-01 00:00:00, Buffer-1: 2023-06-01 00:00:00 (APR: 0.0), Buffer+1: 2023-08-01 00:00:00 (APR: 0.0)
[Timestamp('2021-12-01 00:00:00'), Timestamp('2022-01-01 00:00:00'), Timestamp('2022-02-01 00:00:00')]
Checking date: 2023-07-01 00:00:00, Buffer-1: 2023-06-01 00:00:00 (APR: 0.0), Buffer+1: 2023-08-01 00:00:00 (APR: 0.0)
[Timestamp('2021-12-01 00:00:00'), Timestamp('2022-01-01 00:00:00'), Timestamp('2022-02-01 00:00:00')]
Checking date: 2023-07-01 00:00:00, Buffer-1: 2023-06-01 00:00:00 (APR: 0.0), Buffer+1: 2023-08-01 00:00:00 (APR: 0.0)
[Timestamp('2021-12-01 00:00:00'), Timestamp('2022-01-01 00:00:00'), Timestamp('2022-02-01 00:00

{'regressions': [{'banks_for_regression': 'abanca * banc sabadell',
   'event_series':     index       date    bank   web_traffic  apr  incentive
   18     18 2023-06-01  abanca  64628.532316  0.0        0.0
   19     19 2023-07-01  abanca  77975.997321  2.0        0.0
   20     20 2023-08-01  abanca  90688.236048  0.0        0.0,
   'event_control_series':     index       date           bank   web_traffic  apr  incentive
   18     68 2023-06-01  banc sabadell  35514.734898  0.0        0.0
   19     69 2023-07-01  banc sabadell  21331.915784  0.0        0.0
   20     70 2023-08-01  banc sabadell  18200.440264  0.0        0.0,
   'control_event_series':    index       date    bank   web_traffic  apr  incentive
   0      0 2021-12-01  abanca  15202.983835  0.0        0.0
   1      1 2022-01-01  abanca  42183.913207  0.0        0.0
   2      2 2022-02-01  abanca  34498.153115  0.0        0.0,
   'control_series':    index       date           bank   web_traffic  apr  incentive
   0     50

In [131]:
def check_if_valid_event(bank_data, date, buffer):
    date_index = bank_data[bank_data.date == date].index[0]
    for i in range(1, buffer + 1):
        prev_date = bank_data.iloc[date_index - i].date
        next_date = bank_data.iloc[date_index + i].date
        prev_incentive = bank_data.iloc[date_index - i].incentive
        next_incentive = bank_data.iloc[date_index + i].incentive

        print(f"Checking date: {date}, Buffer-{i}: {prev_date} (incentive: {prev_incentive}), Buffer+{i}: {next_date} (incentive: {next_incentive})")
        
        if prev_incentive != 0 or next_incentive != 0:
            return False

    return True

def find_consecutive_dates(dates, buffer):
    # Calculate the required length of the sequence
    required_length = (buffer * 2) + 1
    
    # Convert dates to a sorted list of pandas Timestamps
    sorted_dates = sorted(pd.to_datetime(dates))
    
    # Iterate through the sorted dates list
    for i in range(len(sorted_dates) - required_length + 1):
        # Extract the sublist of the required length
        sublist = sorted_dates[i:i + required_length]
        
        
        # Check if the sublist is consecutive months
        if all((sublist[j + 1].year * 12 + sublist[j + 1].month) - (sublist[j].year * 12 + sublist[j].month) == 1 for j in range(len(sublist) - 1)):
            return sublist
    
    # If no such sequence is found
    return None

buffer = 1

# For each event date we verify that there is 1 day before and after that are not events
all_regressions_incentives = {'regressions': []}

# For each bank we find an event date
for bank in banks:
    try:
        bank_data = bank_data_dict[bank]
        
        # Find dates where there is an APR increase
        incentive_increase_dates = bank_data[bank_data.incentive > 0]
        
        for date in incentive_increase_dates.date:
            is_valid = check_if_valid_event(bank_data, date, buffer)
            # If the date is valid we check for a control date in another bank
            if is_valid:
                for bank_control in banks:
                    if bank_control != bank:
                        bank_data_control = bank_data_dict[bank_control]
                        control_date = bank_data_control[bank_data_control.date == date]
                        if not control_date.empty and control_date.incentive.iloc[0] == 0 and check_if_valid_event(bank_data_control, date, buffer):
                            # Now we have a valid event and control date
                            # We now need to find a control period where neither bank has activity

                            # Create a list of dates that are not events for either bank or the control period
                            viable_control_dates = bank_data_control[(bank_data_control.incentive == 0) & (bank_data_control.incentive == 0)].date
                            # Also remove the event dates + buffer
                            viable_control_dates = viable_control_dates[viable_control_dates != date]
                            for i in range(1, buffer + 1):
                                viable_control_dates = viable_control_dates[viable_control_dates != date + pd.DateOffset(i)]
                                viable_control_dates = viable_control_dates[viable_control_dates != date - pd.DateOffset(i)]
                            
                            # Do the same for the bank dates
                            viable_bank_dates = bank_data[(bank_data.incentive == 0) & (bank_data.incentive == 0)].date
                            viable_bank_dates = viable_bank_dates[viable_bank_dates != date]
                            for i in range(1, buffer + 1):
                                viable_bank_dates = viable_bank_dates[viable_bank_dates != date + pd.DateOffset(i)]
                                viable_bank_dates = viable_bank_dates[viable_bank_dates != date - pd.DateOffset(i)]

                            # Now we have a list of viable control dates for both banks
                            # We need to find the intersection of these two lists of dates
                            viable_periods = viable_control_dates[viable_control_dates.isin(viable_bank_dates)]

                            # Now we need to find a sequence of consecutive dates that are in the viable periods and (buffer*2)+1 dates long
                            control_period = find_consecutive_dates(viable_periods, buffer)
                            print(control_period)
                            
                            # Now we create each series with the relevant data
                            event_series = bank_data[(bank_data.date >= date - pd.DateOffset(months=buffer)) & (bank_data.date <= date + pd.DateOffset(months=buffer))]
                            event_control_series = bank_data_control[(bank_data_control.date >= date - pd.DateOffset(months=buffer)) & (bank_data_control.date <= date + pd.DateOffset(months=buffer))]

                            control_event_series = bank_data[(bank_data.date >= control_period[0]) & (bank_data.date <= control_period[-1])]
                            control_series = bank_data_control[(bank_data_control.date >= control_period[0]) & (bank_data_control.date <= control_period[-1])]

                            # Now we add it to a dictionary
                            regression = {
                                'banks_for_regression': f'{bank} * {bank_control}', 
                                'bank_of_interest_event_series': event_series, 
                                'control_bank_event_series': event_control_series, 
                                'bank_of_interest_non_event_series': control_event_series, 
                                'control_bank_non_event_series': control_series
                            }
                            all_regressions_incentives['regressions'].append(regression)
    except Exception as e:
        print(f"Error processing bank {bank}: {e}")

all_regressions_incentives


Checking date: 2023-04-01 00:00:00, Buffer-1: 2023-03-01 00:00:00 (incentive: 0.0), Buffer+1: 2023-05-01 00:00:00 (incentive: 0.0)
Checking date: 2023-04-01 00:00:00, Buffer-1: 2023-03-01 00:00:00 (incentive: 0.0), Buffer+1: 2023-05-01 00:00:00 (incentive: 0.0)
[Timestamp('2021-12-01 00:00:00'), Timestamp('2022-01-01 00:00:00'), Timestamp('2022-02-01 00:00:00')]
Checking date: 2023-04-01 00:00:00, Buffer-1: 2023-03-01 00:00:00 (incentive: 0.0), Buffer+1: 2023-05-01 00:00:00 (incentive: 0.0)
[Timestamp('2021-12-01 00:00:00'), Timestamp('2022-01-01 00:00:00'), Timestamp('2022-02-01 00:00:00')]
Checking date: 2023-04-01 00:00:00, Buffer-1: 2023-03-01 00:00:00 (incentive: 0.0), Buffer+1: 2023-05-01 00:00:00 (incentive: 0.0)
[Timestamp('2021-12-01 00:00:00'), Timestamp('2022-01-01 00:00:00'), Timestamp('2022-02-01 00:00:00')]
Checking date: 2023-04-01 00:00:00, Buffer-1: 2023-03-01 00:00:00 (incentive: 0.0), Buffer+1: 2023-05-01 00:00:00 (incentive: 0.0)
[Timestamp('2021-12-01 00:00:00'), T

{'regressions': [{'banks_for_regression': 'banc sabadell * abanca',
   'bank_of_interest_event_series':     index       date           bank   web_traffic  apr  incentive
   15     65 2023-03-01  banc sabadell  45176.843916  0.0        0.0
   16     66 2023-04-01  banc sabadell  31605.250346  0.0      250.0
   17     67 2023-05-01  banc sabadell  40352.969007  0.0        0.0,
   'control_bank_event_series':     index       date    bank   web_traffic  apr  incentive
   15     15 2023-03-01  abanca  57832.812519  0.0        0.0
   16     16 2023-04-01  abanca  52944.026350  0.0        0.0
   17     17 2023-05-01  abanca  57922.472646  0.0        0.0,
   'bank_of_interest_non_event_series':    index       date           bank   web_traffic  apr  incentive
   0     50 2021-12-01  banc sabadell  20016.553812  0.0        0.0
   1     51 2022-01-01  banc sabadell  20016.553812  0.0        0.0
   2     52 2022-02-01  banc sabadell  20016.553812  0.0        0.0,
   'control_bank_non_event_series'

In [132]:
def extract_data_from_sm(model):
    model_dict = {}

    # Extracting coefficients
    coefficients = model.params
    model_dict['coefficients'] = coefficients

    # Extracting standard errors of coefficients
    std_errors = model.bse
    model_dict['standard_errors'] = std_errors

    # Extracting t-values
    t_values = model.tvalues
    model_dict['t_values'] = t_values

    # Extracting p-values
    p_values = model.pvalues
    model_dict['p_values'] = p_values

    # Extracting R-squared
    r_squared = model.rsquared
    model_dict['r_squared'] = r_squared

    return pd.DataFrame(model_dict)

In [133]:
all_regression_outputs = []

for regression in all_regressions_incentives['regressions']:
    if len(regression.keys()) > 0:
       # Extract the DataFrames
        bank_of_interest_event_series = regression['bank_of_interest_event_series']
        control_bank_event_series = regression['control_bank_event_series']
        bank_of_interest_non_event_series = regression['bank_of_interest_non_event_series']
        control_bank_non_event_series = regression['control_bank_non_event_series']

        # Add a 'type' and 'period' columns to each DataFrame
        bank_of_interest_event_series['type'] = 'bank_of_interest_event'
        control_bank_event_series['type'] = 'control_bank_event'
        bank_of_interest_non_event_series['type'] = 'bank_of_interest_non_event'
        control_bank_non_event_series['type'] = 'control_bank_non_event'

        bank_of_interest_event_series['period'] = 'post'
        control_bank_event_series['period'] = 'post'
        bank_of_interest_non_event_series['period'] = 'pre'
        control_bank_non_event_series['period'] = 'pre'

        # Combine the series into a single DataFrame
        combined_df = pd.concat([
            bank_of_interest_event_series,
            control_bank_event_series,
            bank_of_interest_non_event_series,
            control_bank_non_event_series
        ])

        # Reset index to avoid duplicate indices
        combined_df.reset_index(drop=True, inplace=True)

        # Create dummy variables
        combined_df['treatment'] = combined_df['type'].apply(lambda x: 1 if x in ['bank_of_interest_event', 'bank_of_interest_non_event'] else 0)
        combined_df['post'] = combined_df['period'].apply(lambda x: 1 if x == 'post' else 0)

        # Interaction term: treatment * post
        combined_df['treatment_post'] = combined_df['treatment'] * combined_df['post']

        # Dependent variable
        y = combined_df['web_traffic']

        # Independent variables
        X = combined_df[['treatment', 'post', 'treatment_post']]
        X = sm.add_constant(X)

        # Fit the model
        model = sm.OLS(y, X).fit()

        # Get the summary of the regression
        summary = extract_data_from_sm(model)
        summary['output_title'] = regression['banks_for_regression']

        all_regression_outputs.append(summary)
    else: 
        print('No valid regressions found')
        print(regression)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bank_of_interest_event_series['type'] = 'bank_of_interest_event'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_bank_event_series['type'] = 'control_bank_event'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bank_of_interest_non_event_series['type'] = 'bank_of_interest_non_event'
A value is

In [134]:
all_incentive_diff_n_diff = pd.concat(all_regression_outputs)

# create a column from the index 
all_incentive_diff_n_diff['variable_name'] = all_incentive_diff_n_diff.index

all_incentive_diff_n_diff = all_incentive_diff_n_diff.reset_index(drop=True)

all_incentive_diff_n_diff

Unnamed: 0,coefficients,standard_errors,t_values,p_values,r_squared,output_title,variable_name
0,30628.350052,4552.195312,6.728259,1.483277e-04,0.808972,banc sabadell * abanca,const
1,-10611.796240,6437.776348,-1.648364,1.378902e-01,0.808972,banc sabadell * abanca,treatment
2,25604.753786,6437.776348,3.977267,4.077255e-03,0.808972,banc sabadell * abanca,post
3,-6576.286509,9104.390623,-0.722320,4.906702e-01,0.808972,banc sabadell * abanca,treatment_post
4,37696.032872,2453.016770,15.367214,3.194149e-07,0.879735,banc sabadell * ing,const
...,...,...,...,...,...,...,...
139,6237.144230,2733.213840,2.281982,5.191050e-02,0.966204,santander * bankinter,treatment_post
140,5214.670839,1338.376168,3.896267,4.568645e-03,0.978129,santander * evobanco,const
141,19603.979223,1892.749728,10.357407,6.526658e-06,0.978129,santander * evobanco,treatment
142,4335.503889,1892.749728,2.290585,5.121807e-02,0.978129,santander * evobanco,post


In [138]:
all_incentive_diff_n_diff.loc[all_incentive_diff_n_diff.variable_name == 'treatment_post', ['coefficients','p_values', 'output_title']]


Unnamed: 0,coefficients,p_values,output_title
3,-6576.286509,0.49067,banc sabadell * abanca
7,37466.329991,6.1e-05,banc sabadell * ing
11,30720.81295,0.000231,banc sabadell * bbva
15,14379.283793,0.007495,banc sabadell * revolut
19,16906.264599,0.004972,banc sabadell * openbank
23,12025.439836,0.038698,banc sabadell * myinvestor
27,11815.641983,0.074548,banc sabadell * bankinter
31,17328.189795,0.003497,banc sabadell * evobanco
35,-4747.534214,0.432417,banc sabadell * santander
39,3098.396067,0.755195,santander * abanca


In [43]:
# compare the list of all_dates with an array of dates between 2021-01-01 and 2024-01-01

all_dates_comp = pd.date_range(start='2021-01-01', end='2024-01-01', freq='MS')

# find the missing dates

missing_dates = all_dates[~all_dates.isin(all_dates_comp), '']

missing_dates

  missing_dates = all_dates[~all_dates.isin(all_dates_comp)]


<DatetimeArray>
[]
Length: 0, dtype: datetime64[ns]