In [3]:
import pandas as pd
import klib as kl 
import plotly.express as px
import plotly.graph_objects as go

# import iterative imputer 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# impute missing values with KNN

from sklearn.impute import KNNImputer

In [4]:
cross_visitation_data = pd.read_excel('data/Account Products.xlsx', sheet_name='cross_visitation', header=1)

bank_comps = pd.read_excel('data/Account Products.xlsx', sheet_name='bank_comparison_metrics', header=0)
bank_comps['has_stores'] = bank_comps['number of branches'] > 0

target_var_data = pd.read_excel('data/Account Products.xlsx', sheet_name='web_traffic_accounts', header=0)

## Data Cleanup

In [5]:
cross_visitation_data = kl.data_cleaning(cross_visitation_data)

brick_and_mortar_banks = list(bank_comps.loc[bank_comps.has_stores == True, 'website'])
neo_banks = [x for x in cross_visitation_data.columns if x not in brick_and_mortar_banks]
neo_banks.remove('date')

target_var_data = kl.data_cleaning(target_var_data)

Shape of cleaned data: (25, 16) - Remaining NAs: 25


Dropped rows: 0
     of which 0 duplicates. (Rows (first 150 shown): [])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.0 MB (-nan%)

Shape of cleaned data: (25, 14) - Remaining NAs: 47


Dropped rows: 0
     of which 0 duplicates. (Rows (first 150 shown): [])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.0 MB (-nan%)



  mem_perc = round(100 * mem_change / data_mem, 2)
  mem_perc = round(100 * mem_change / data_mem, 2)


In [6]:
# get numerica columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_data = cross_visitation_data.select_dtypes(include=numerics)
num_cols = numeric_data.columns

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_data = target_var_data.select_dtypes(include=numerics)
num_cols_target = numeric_data.columns

### Feature Engineering + Imputation

In [7]:
# impute target var data


# create an instance of the imputer
imp_mean = IterativeImputer(random_state=0)

# fit the imputer
imp_mean.fit(target_var_data[num_cols_target])

# impute the data
imputed_target_var_data = imp_mean.transform(target_var_data[num_cols_target])

# convert the imputed data to a dataframe
target_var_data[num_cols_target] = pd.DataFrame(imputed_target_var_data, columns=num_cols_target)



In [8]:


imputer = KNNImputer(n_neighbors=2)

data_imputed = imputer.fit_transform(cross_visitation_data[num_cols])

cross_visitation_data[num_cols] = pd.DataFrame(data_imputed, columns=num_cols)

cross_visitation_data.head()


Unnamed: 0,date,abanca_com,bancosantander_es,bancsabadell_com,bankinter_com,bbva_es,caixabank_es,evobanco_com,imagin_com,ing_es,kutxabank_es,myinvestor_es,n26_com,openbank_es,pibank_es,revolut_com
0,2021-12-01,0.077994,0.116923,0.070229,0.105769,0.196307,0.162256,0.079812,0.059727,0.120856,0.049426,0.068323,0.067204,0.12828,0.046249,0.071006
1,2022-01-01,0.0625,0.107472,0.084605,0.094186,0.170377,0.171842,0.055328,0.061053,0.12127,0.047898,0.067114,0.078723,0.127622,0.059259,0.052795
2,2022-02-01,0.052288,0.119243,0.059621,0.083083,0.192536,0.171842,0.067478,0.065734,0.098605,0.048193,0.049738,0.055477,0.117399,0.057143,0.04902
3,2022-03-01,0.061503,0.112708,0.067651,0.082576,0.174929,0.179195,0.075472,0.055402,0.120556,0.041714,0.05906,0.056548,0.107581,0.034884,0.051075
4,2022-04-01,0.050222,0.128592,0.064007,0.102992,0.18148,0.179195,0.072148,0.058762,0.11454,0.041714,0.050104,0.052254,0.104252,0.037618,0.045169


## Generic Data Visualizations

In [9]:
# create stacked time series line plot in plotly express 
fig = px.line(cross_visitation_data, x='date', y=num_cols, title='Stacked Time Series Line Plot')
fig.show()

In [10]:
# log transform the data and create a stacked time series line plot in plotly express

import numpy as np

data_log = cross_visitation_data.copy()
data_log[num_cols] = np.log1p(cross_visitation_data[num_cols])

fig = px.line(data_log, x='date', y=num_cols, title='Stacked Time Series Line Plot')
fig.show()

### Cross Visitation Metrics

Avg Visitation

In [11]:
# Calculate the average of the columns
data_avg = cross_visitation_data[num_cols].mean()

# Sort the values
data_avg = data_avg.sort_values(ascending=False)

# Create the bar chart
fig = px.bar(data_avg, 
             x=data_avg.index, 
             y=data_avg.values, 
             title='Average Cross Visitation by Bank', 
             color=data_avg.values,  # Use values for continuous color scale
             color_continuous_scale='ice')  # Use Mako continuous color scale

# Set the template to a dark background
fig.update_layout(template='plotly_white')

# Update axis labels
fig.update_xaxes(title_text="Banks")
fig.update_yaxes(title_text="Average Cross Visitation")

# Hide legend
fig.update_layout(showlegend=False)

# Show the plot
fig.show()

BBVA and the larger banks have higher cross visitation metrics. It would be good to see the breakdown of only neobanks. 

##### Avg Cross Visitation of Neo-Banks

In [12]:
# Calculate the average of the columns
data_avg = cross_visitation_data[neo_banks].mean()

# Sort the values
data_avg = data_avg.sort_values(ascending=False)

# Create the bar chart
fig = px.bar(data_avg, 
             x=data_avg.index, 
             y=data_avg.values, 
             title='Average Cross Visitation by Neo Bank', 
             color=data_avg.values,  # Use values for continuous color scale
             color_continuous_scale='ice')  # Use Mako continuous color scale

# Set the template to a dark background
fig.update_layout(template='plotly_white')

# Update axis labels
fig.update_xaxes(title_text="Banks")
fig.update_yaxes(title_text="Average Cross Visitation")

# Hide legend
fig.update_layout(showlegend=False)

# Show the plot
fig.show()

It is clear that openbank has the best cross visitation traffic out of the neo banks.

## Growth rate of Cross Visitation
Now we look into how the cross visitation changes over time

In [13]:
# regress each column against the date and create a dataframe of the slope 

from sklearn.linear_model import LinearRegression

slope_data = pd.DataFrame()

for col in num_cols:
    # Create the linear regression model
    model = LinearRegression()
    model.fit(cross_visitation_data[['date']], cross_visitation_data[col])
    slope = model.coef_[0]
    slope_data[col] = [slope]

slope_data.index = ['slope']

# plot the slope data in a sorted bar plot in plotly express

slope_data = slope_data.T

# plot sorted values
slope_data = slope_data.sort_values(by='slope', ascending=False)

# Create the bar chart
fig = px.bar(slope_data, 
             x=slope_data.index, 
             y='slope', 
             title='Growth Rate of Cross Visitation by Bank', 
             color=slope_data.slope, 
             color_continuous_scale='ice')

# Set the template to a dark background
fig.update_layout(template='plotly_white')

fig.update_xaxes(title_text="Banks")
fig.update_yaxes(title_text="Growth Rate of Cross Visitation")
fig.update_layout(showlegend=False)

fig.update

# Show the plot
fig.show()

Imagin.com was launched by Caixa Bank and may attribute to the negative growth rate for Caixa Bank. This would be interesting to look into. 

In [14]:
# plot the cross visitation as a time series including only imagin.com and caixa bank 

fig = px.line(cross_visitation_data[['caixabank_es','imagin_com','date']], x='date', y=['caixabank_es','imagin_com'], title='Stacked Time Series Line Plot')
fig.show()

Unfortunately there is no clear trend here, but it does look like some type of advertising June may have increased traffic in Caixa Bank as well. It would be interesting to see the correlation of these to series. 

### Cross Visitation Correlation 

In [15]:

correlation_matrix = cross_visitation_data[num_cols].corr(method='pearson')
heatmap = go.Heatmap(z=correlation_matrix.values,
                        x=correlation_matrix.columns,
                        y=correlation_matrix.columns,
                        colorscale='Viridis')

# Create a figure and add the heatmap to it
fig = go.Figure(data=heatmap)

# update fig size 
fig.update_layout(width=800, height=800)

# Set plot title and axis labels
fig.update_layout(title='Pearson Correlation Coefficient Web Traffic vs Cross Visitation',
                    xaxis_title='Time Series',
                    yaxis_title='Time Series')

# Show the plot
fig.show()

Caixa Bank is interestingly highly correlated across all banks. This may be indicative of some type of advertising strategy, or dependent on ranking or some other feature. 

### Cross Visitation impact on Target Variable

In [16]:
banks = list(cross_visitation_data.columns)
banks.remove('date')

In [17]:
from plotly.subplots import make_subplots


In [18]:
def display_two_axis_plot(date, y1, y2, bank_name):
    # Create figure with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Add traces
    fig.add_trace(
        go.Scatter(x=date, y=y1, name='Cross Visitation'),
        secondary_y=False,
    )

    fig.add_trace(
        go.Scatter(x=date, y=y2, name='Web Traffic'),
        secondary_y=True,
    )

    # Add figure title
    fig.update_layout(
        title_text=f"Web Traffic vs Cross Visitation - {bank_name}"
    )

    # Set x-axis title
    fig.update_xaxes(title_text="Date")

    # Set y-axes titles
    fig.update_yaxes(title_text="<b>Cross Visitation</b>", secondary_y=False)
    fig.update_yaxes(title_text="<b>Web Traffic</b>", secondary_y=True)

    fig.show()

In [19]:
# concat cross visitation with target var
cross_visitation_data_temp = cross_visitation_data.drop(columns='date')
cross_visitation_concat = pd.concat([cross_visitation_data_temp, target_var_data], axis=1)

for bank in banks: 
    bank = bank.split('_')[0]
    columns = [i for i in list(cross_visitation_concat.columns) if bank in i]

    if len(columns) > 1:
        bank_name = bank.title()
        display_two_axis_plot(cross_visitation_concat['date'], cross_visitation_concat[columns[0]], cross_visitation_concat[columns[1]], bank_name)
    

# Create subplots


The outliers are BBVA and BankInter otherwise most have high correlation between cross visitationa and web traffic. There is some possibility to see normalize the two time series then isolate any deviations from the cross visitation trend. This may isolate advertising pushes or other product releases. Additionally, Bank Sabadell seems to have a high web traffic despite lower cross visitation, this may suggest name brand retention indpendent of other banks. 

RMSE for time series accuracy

# Age of Bank EDA

In [20]:
# create an age of bank column by subtracting the year founded from 2024

bank_comps['age'] = 2024 - bank_comps['age of bank']

# log scale age 
bank_comps['age'] = np.log1p(bank_comps['age'])

# sort by age 
bank_comps = bank_comps.sort_values(by='age', ascending=False)

# plot the age of bank column in a histogram in plotly express sorted

fig = px.bar(bank_comps[['website','age']].drop_duplicates(), x='website', y='age', title='Age of Banks', color='age', color_continuous_scale='ice')

# Set the template to a dark background
fig.update_layout(template='plotly_white')

fig.update_xaxes(title_text="Bank Websites")
fig.update_yaxes(title_text="Age")
fig.update_layout(showlegend=False)

# make a blues color palette gradient
fig.update_traces(marker=dict(color=bank_comps['age'], colorscale='Blues', showscale=True))

# Show the plot
fig.show()

In [21]:
# how to create a continuous correlation map with a single column to another column

bank_comps.columns

Index(['Notes', 'website', 'number of employee', 'number of customers ',
       'assets under management', 'number of branches', 'age of bank',
       'references', 'Unnamed: 8', 'Unnamed: 9', 'has_stores', 'age'],
      dtype='object')

In [22]:
target_var_data_ = target_var_data[num_cols_target].mean().reindex()
index = target_var_data_.index 
index = [i.split('_')[0] for i in index]
target_var_data_.index = index
target_var_data_ = pd.DataFrame(target_var_data_)
target_var_data_['bank'] = target_var_data_.index
target_var_data_.columns = ['web_traffic', 'bank']

In [23]:
bank_comps.head()

Unnamed: 0,Notes,website,number of employee,number of customers,assets under management,number of branches,age of bank,references,Unnamed: 8,Unnamed: 9,has_stores,age
1,,bancosantander.es,212764,,1117000000000.0,8518,1857,https://www.santander.com/en/about-us/key-fact...,https://www.statista.com/statistics/417009/ban...,,True,5.123964
4,,bbva.es,121486,,775000000000.0,1800,1857,https://www.globaldata.com/company-profile/ban...,https://www.bbva.es/en/general/buscador-oficin...,,True,5.123964
2,,bancsabadell.com,19316,,253000000000.0,1594,1881,https://www.globaldata.com/company-profile/ban...,https://www.bancsabadell.com/bsnacional/en/bra...,,True,4.969813
5,,caixabank.es,44863,,607167000000.0,3922,1904,https://www.globaldata.com/company-profile/ban...,https://www.caixabank.com/es/sobre-nosotros/nu...,https://www.caixabank.com/en/about-us.html,True,4.795791
3,,bankinter.com,6138,,83300000000.0,523,1965,https://www.macrotrends.net/stocks/charts/BKNI...,https://www.bankinter.com/webcorporativa/amp/e...,https://www.bankinter.com/www/webcorp/swf/memo...,True,4.094345


In [24]:
bank_comps['bank'] = bank_comps.website.str.split('.').apply(lambda x: x[0])
bank_comps = bank_comps.merge(target_var_data_, left_on='bank', right_on='bank')

In [25]:
correlation_matrix = bank_comps[['web_traffic','age of bank']].corr(method='pearson')
heatmap = go.Heatmap(z=correlation_matrix.values,
                        x=correlation_matrix.columns,
                        y=correlation_matrix.columns,
                        colorscale='ice')

# Create a figure and add the heatmap to it
fig = go.Figure(data=heatmap)

# update fig size 
fig.update_layout(width=800, height=800)

# Set plot title and axis labels
fig.update_layout(title='Pearson Correlation Coefficient Matrix',
                    xaxis_title='Time Series',
                    yaxis_title='Time Series')

# Show the plot
fig.show()

In [26]:
# scatter plot of age and web traffic 

fig = px.scatter(bank_comps, x='age of bank', y='web_traffic', title='Age of Bank vs Web Traffic', color='web_traffic', color_continuous_scale='ice')

fig.update_xaxes(title_text="Bank Age")
fig.update_yaxes(title_text="Average Web Traffic")
fig.update_layout(showlegend=False)
fig.show()

In [27]:
# create a dummy for banks older than 2000 and newer than 2000 

bank_comps['old_bank'] = bank_comps['age of bank'] < 2000

# plot the web traffic over time for old and new banks

old_banks = bank_comps.loc[bank_comps.old_bank == True, 'website']
new_banks = bank_comps.loc[bank_comps.old_bank == False, 'website']

old_banks = list(old_banks)
new_banks = list(new_banks)

old_banks = [i.split('.')[0] for i in old_banks]
new_banks = [i.split('.')[0] for i in new_banks]


old_banks_cols = []
for col1 in old_banks: 
    for col2 in target_var_data.columns:
        if col1 in col2:
            old_banks_cols.append(col2)

old_bank_data = target_var_data[old_banks_cols]

new_banks_cols = []
for col1 in new_banks: 
    for col2 in target_var_data.columns:
        if col1 in col2:
            new_banks_cols.append(col2)

new_bank_data = target_var_data[new_banks_cols]

new_bank_data = pd.DataFrame(new_bank_data.mean(axis=1))
new_bank_data.columns = ['web_traffic'] 


old_bank_data = pd.DataFrame(old_bank_data.mean(axis=1))
old_bank_data.columns = ['web_traffic']


# merge dfs 
old_bank_data['bank'] = 'Founded Before 2000'
new_bank_data['bank'] = 'Founded After 2000'

web_traffic_data = pd.concat([old_bank_data, new_bank_data])
web_traffic_data['date'] = target_var_data['date']

fig = px.line(web_traffic_data, x='date', y='web_traffic', title='Web Traffic Over Time for Banks Before 2000 and Banks After 2000', color='bank')

fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="Average Web Traffic")

# update legend title
fig.update_layout(legend_title_text='Bank Type')


fig.show()

In [28]:
target_var_data

Unnamed: 0,date,abanca_com_es_cuentas_cuenta_online,n26_com_es_es_cuenta_sin_comisiones,bancsabadell_com_cuenta_online,ing_es_cuenta_nocuenta_ing,bbva_es_personas_productos_cuentas_cuenta_online_sin_comisiones_html,revolut_com_es_es_a_radically_better_account,openbank_es_cuenta_ahorro_bienvenida,myinvestor_es_cuentas_tarjetas_cuentas,n26_com_es_es_cuenta_ahorro,bankinter_com_banca_cuentas_tarjetas_cuentas_cuenta_nomina,ing_es_cuenta_nocuenta_ing_1,evobanco_com_cuenta_inteligente,bancosantander_es_particulares_cuentas_tarjetas_cuentas_corrientes_cuenta_online_sin_comisiones
0,2021-12-01,15202.983835,6664.369141,20592.345013,24381.005813,45449.486787,1343.418335,15250.182615,5319.608623,15237.550206,8733.560835,24380.762233,5055.54248,22325.885971
1,2022-01-01,42183.913207,5310.177246,28886.199059,24448.868576,38306.464983,1531.016846,17358.903158,6577.851993,11468.245023,11410.593443,24448.821142,4539.012207,28331.245044
2,2022-02-01,34498.153115,5547.588867,24913.158633,24416.499127,40978.325821,1304.591309,16941.459256,4031.71696,12927.670282,8891.661053,24416.357951,6049.458008,23798.81917
3,2022-03-01,34546.319021,4900.271484,23955.858539,24411.12898,43970.545627,1105.180542,16384.133789,4941.009311,12553.57725,7446.925144,24410.971192,9826.193359,24808.561729
4,2022-04-01,40508.28842,4909.581055,27143.496585,24437.33196,40782.451956,1020.968689,17413.331041,5774.734455,11886.826026,10285.861726,24437.253057,5028.236328,24739.758729
5,2022-05-01,29791.33818,6639.387695,12267.129453,24467.64236,44439.738816,1648.221313,12540.812395,4701.676152,14913.056309,5975.58225,24467.64236,6753.060547,37041.514189
6,2022-06-01,35087.423687,7382.783203,27765.978171,24467.64236,29359.709637,1261.18335,16505.499605,3743.697827,13022.085587,7762.845175,24467.64236,5874.299805,35471.865599
7,2022-07-01,61969.499304,9062.696289,33651.426645,22189.247828,49115.263085,2692.579346,18092.837291,8887.53198,10516.891509,6762.668091,22189.247828,2726.868408,35951.67483
8,2022-08-01,53939.003961,8751.572266,36382.542418,29944.644102,49005.118424,4602.381348,17800.667417,6295.460707,11442.034601,5397.615718,29944.644102,5641.461914,40746.600434
9,2022-09-01,50991.056414,7224.135254,40102.371944,50772.22105,36922.655663,9032.796875,18473.274492,6546.830235,13134.702101,8613.556577,50772.22105,4338.913086,41933.057915


## Brick and Mortar Trends

There are clear trends here that show more recent banks have less customers, while older banks have more customer. It would be interesting to see customer retention metrics. 

In [63]:
# plot a pie chart of the number of banks with stores and without stores

fig = px.pie(bank_comps, names='has_stores', title='Banks with Stores vs. Banks without Stores', color='has_stores')
# update legend title and labels 

fig.update_layout(legend_title_text='Has Stores')

fig.show()

In [64]:
# scatter plot of age and web traffic 

fig = px.scatter(bank_comps, x='age of bank', y='web_traffic', title='Age of Bank vs Web Traffic for Brick and Mortar and Neo Banks', color='has_stores', color_continuous_scale='ice')

fig.update_xaxes(title_text="Bank Age")
fig.update_yaxes(title_text="Average Web Traffic")
# add label for bank name 
fig.update_traces(text=bank_comps['website'])

fig.update_legends(title='Has Stores')

fig.show()

# create a bar chart of the top amounts given the differences in brick and mortar stores 

In [30]:
brick_and_mortar_banks = list(bank_comps.loc[bank_comps.has_stores == True, 'website'])
neo_banks = [x for x in cross_visitation_data.columns if x not in brick_and_mortar_banks]
neo_banks.remove('date')

target_var_data = kl.data_cleaning(target_var_data)

Shape of cleaned data: (25, 14) - Remaining NAs: 0


Dropped rows: 0
     of which 0 duplicates. (Rows (first 150 shown): [])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.0 MB (-nan%)




invalid value encountered in scalar divide



In [31]:
brick_and_mortar_banks

['bancosantander.es',
 'bbva.es',
 'bancsabadell.com',
 'bankinter.com',
 'ing.es',
 'ing.es',
 'abanca.com',
 'evobanco.com']

In [32]:
all_brick_and_mortar_target_var = ['abanca_com_es_cuentas_cuenta_online',
'bancsabadell_com_cuenta_online',
'ing_es_cuenta_nocuenta_ing',
'bbva_es_personas_productos_cuentas_cuenta_online_sin_comisiones_html',
'bankinter_com_banca_cuentas_tarjetas_cuentas_cuenta_nomina',
'ing_es_cuenta_nocuenta_ing_1', 
'bancosantander_es_particulares_cuentas_tarjetas_cuentas_corrientes_cuenta_online_sin_comisiones']

In [33]:

# Calculate the average of the columns
data_avg_bm = target_var_data[all_brick_and_mortar_target_var].mean()
target_var_cols = [i.split('_')[0] for i in list(all_brick_and_mortar_target_var) if i != 'date']

# Sort the values
data_avg_bm = data_avg_bm.sort_values(ascending=False)
data_avg_bm.index = ['abanca', 'bbva', 'bancosantander', 'bancsabadell', 'ing', 'ing_1', 'bankinter']

# Create the bar chart
fig = px.bar(data_avg_bm, 
             x=data_avg_bm.index, 
             y=data_avg_bm.values, 
             title='Average Web Traffic for Brick and Mortar', 
             color=data_avg_bm.values,  # Use values for continuous color scale
             color_continuous_scale='ice')  # Use Mako continuous color scale

# Set the template to a dark background
fig.update_layout(template='plotly_white')

# Update axis labels
fig.update_xaxes(title_text="Banks")
fig.update_yaxes(title_text="Average Web Traffic")

# Hide legend
fig.update_layout(showlegend=False)

# Show the plot
fig.show()

In [34]:

# Calculate the average of the columns
all_columns = [i for i in target_var_data.columns if i not in all_brick_and_mortar_target_var and i != 'date']
data_avg_neo = target_var_data[all_columns].mean()

# Sort the values
data_avg_neo = data_avg_neo.sort_values(ascending=False)
data_avg_neo.index = ['openbank', 'n26', 'myinvestor', 'n26_1', 'evobanco', 'revolut']

# Create the bar chart
fig = px.bar(data_avg_neo, 
             x=data_avg_neo.index, 
             y=data_avg_neo.values, 
             title='Average Web Traffic for Neo Bank', 
             color=data_avg_neo.values,  # Use values for continuous color scale
             color_continuous_scale='ice')  # Use Mako continuous color scale

# Set the template to a dark background
fig.update_layout(template='plotly_white')

# Update axis labels
fig.update_xaxes(title_text="Banks")
fig.update_yaxes(title_text="Average Cross Visitation")

# Hide legend
fig.update_layout(showlegend=False)

# Show the plot
fig.show()

In [35]:
# create a scatter plot of avg_data_neo and avg_data_bm with different colors for the two values 

data_avg_neo = pd.DataFrame(data_avg_neo)
data_avg_neo['bank'] = data_avg_neo.index
data_avg_neo['bank_type'] = 'neo'
data_avg_neo.columns = ['web_traffic', 'bank', 'bank_type']

data_avg_bm = pd.DataFrame(data_avg_bm)
data_avg_bm['bank'] = data_avg_bm.index
data_avg_bm['bank_type'] = 'brick and mortar'
data_avg_bm.columns = ['web_traffic', 'bank', 'bank_type']

data_avg = pd.concat([data_avg_neo, data_avg_bm])

fig = px.scatter(data_avg, x='bank', y='web_traffic', title='Average Web Traffic for Neo Bank vs Brick and Mortar', color='web_traffic', color_continuous_scale='ice')
fig.show()

In [36]:
# plot a bar graph of avg web traffic for neo banks and brick and mortar banks

data_grouped = data_avg.groupby('bank_type')['web_traffic'].mean()
fig = px.bar(data_grouped, x=['Brick and Mortar', 'Neo Bank'], y=data_grouped.values, title='Average Web Traffic for Neo Bank vs Brick and Mortar', color=data_grouped.values, color_continuous_scale='ice')

fig.update_xaxes(title_text="Bank Type")
fig.update_yaxes(title_text="Average Web Traffic")

fig.show()

In [37]:
data_avg

Unnamed: 0,web_traffic,bank,bank_type
openbank,16451.17368,openbank,neo
n26,10692.587891,n26,neo
myinvestor,9177.442746,myinvestor,neo
n26_1,6991.379395,n26_1,neo
evobanco,6785.137695,evobanco,neo
revolut,5878.37207,revolut,neo
abanca,54317.650012,abanca,brick and mortar
bbva,39683.31389,bbva,brick and mortar
bancosantander,39051.034753,bancosantander,brick and mortar
bancsabadell,30402.099404,bancsabadell,brick and mortar


In [38]:
time_series_comp = pd.DataFrame(target_var_data[all_brick_and_mortar_target_var].mean(axis=1))
time_series_comp['date'] = target_var_data['date']

time_series_comp.columns = ['Brick and Mortar', 'date']

time_series_comp['Neo Bank'] = target_var_data[[col for col in target_var_data.columns if col not in all_brick_and_mortar_target_var and col != 'date']].mean(axis=1)

#log scale the data
time_series_comp['Brick and Mortar'] = np.log1p(time_series_comp['Brick and Mortar'])
time_series_comp['Neo Bank'] = np.log1p(time_series_comp['Neo Bank'])

# plot time series of web traffic for brick and mortar and neo banks


fig = px.line(time_series_comp, x='date', y=['Brick and Mortar', 'Neo Bank'], title='Web Traffic for Brick and Mortar vs Neo Banks')

fig.update_xaxes(title_text="Bank Type")
fig.update_yaxes(title_text="Web Traffic")

#update legend title
fig.update_layout(legend_title_text='Bank Type')

fig.show()

# APR Causal Analysis

In [39]:
summaries_data = pd.read_csv('/Users/andrewbennett/Documents/bse/term3/deloitte_digital_banking/way_back_scraper/summarized_text_2.csv')

# drop all unnamed columns
summaries_data = summaries_data.loc[:, ~summaries_data.columns.str.contains('^Unnamed')]
summaries_data['bank_name'] = summaries_data['site'].astype(str).str.split('.').apply(lambda x: x[0])
summaries_data

Unnamed: 0,site,year,month,text,preprocessed_text,text_rates,NIR,APR,NIR_len,APR_len,APR_value,NIR_value,translated_text,summaries,APR.1,cleaned_summary_data,bank_name
0,myinvestor.es,2023,7,\n\n\n\n\n\n\n\nCuenta remunerada 2 % TAE dura...,remunerado 2% APR 12 mes 50.000€cuenta tarjet...,"{'30% APR', '0,3% APR', '2% APR', '2% NIR'}",[2.0],"[0.3, 0.3, 2.0]",1.0,3.0,,2.0,"paid 2% APR 12 month €50,000card accountpaid a...","{\n ""Interest Rate"": ""2% APR"",\n ""Conditions...",2.0,"{'APR': ' 2%', 'Term': ' 12 months', 'Amount':...",myinvestor
1,myinvestor.es,2022,6,\n\n\n\n\n\n\n\nCuenta remunerada 1 % TIN/TAE ...,remunerado 1% NIR / APR 15000€ | MyInvestor1/...,"{'1% APR', '1% NIR', '10% APR', '10% NIR'}","[1.0, 0.1]","[1.0, 0.1]",2.0,2.0,,,"paid 1% NIR / APR €15,000 | MyInvestor1/6This ...","{\n ""Interest Rate"": ""1% NIR / APR for the fi...",1.0,{},myinvestor
2,myinvestor.es,2022,10,\n\n\n\n\n\n\n\nCuenta remunerada 1 % TIN/TAE ...,remunerado 1% NIR / APR 12 mes 30000€ | myinv...,"{'1% APR', '30% APR', '1% NIR'}",[1.0],"[1.0, 0.3]",1.0,2.0,,1.0,"paid 1% NIR / APR 12 month €30,000 | myinvesto...","{\n ""Interest Rate"": ""1% APR"",\n ""Conditions...",1.0,{},myinvestor
3,myinvestor.es,2021,1,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCuenta remun...,remunerado 1% NIR / APR 15.000â¬ | MyInvesto...,{'1% NIR'},[1.0],[],1.0,0.0,,1.0,"remunerated 1% NIR / APR 15,000â¬ | MyInvestor...","{\n ""Interest Rate"": ""1% NIR / APR"",\n ""Cond...",1.0,{},myinvestor
4,myinvestor.es,2022,11,\n\n\n\n\n\n\n\nCuenta remunerada 1 % TIN/TAE ...,remunerado 1% NIR / APR 12 mes 30000€ | myinv...,"{'1% APR', '30% APR', '1% NIR'}",[1.0],"[1.0, 0.3]",1.0,2.0,,1.0,"paid 1% NIR / APR 12 month €30,000 | myinvesto...","{\n ""Interest Rate"": ""1% APR"",\n ""Conditions...",1.0,"{'APR': ' 1% ', 'Conditions': ' ', 'Addition...",myinvestor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,abanca.com,2022,9,\n\n\n\n\n\n\n\n Cuent...,Online Comisiones | ...,"{'0% APR', '0% NIR'}",[],[],0.0,0.0,,,Online Commissions | €300 payroll | ABANCA ABA...,"{\n ""Interest Rate"": ""0% NIR, 0% APR"",\n ""Co...",0.0,"{'APR and Conditions Summary': '', 'Product': ...",abanca
389,abanca.com,2023,12,\n\n\n\n\n\n\n\n Cuenta On...,Online Comisiones | 300€...,"{'2% I', '0% APR', '2% APR'}",[],[2.0],0.0,1.0,2.00,,Online Commissions | €300 payroll + 2% APR | a...,"{\n ""Interest Rate"": ""2% APR"",\n ""Conditions...",2.0,{},abanca
390,abanca.com,2023,4,\n\n\n\n\n\n\n\n Cue...,Online Comisiones ...,"{'0% APR', '0% NIR'}",[],[],0.0,0.0,,,Online Commissions | €300 payroll | ABANCA ABA...,"{\n ""Interest Rate"": ""0% NIR, 0% APR"",\n ""Co...",0.0,{},abanca
391,abanca.com,2022,5,\n\n\n\n\n\n\n\n Cuent...,Online Comisiones | ...,"{'0% APR', '0% NIR'}",[],[],0.0,0.0,,,Online Commissions | €300 payroll | ABANCA ABA...,"{\n ""Interest Rate"": ""0% NIR, 0% APR"",\n ...",0.0,"{'Summary': '', 'Product': ' Online Commission...",abanca


In [40]:
print(summaries_data.shape)
summaries_data = summaries_data.loc[~(summaries_data.bank_name == ' página visitado ) ')]
print(summaries_data.shape)

(393, 17)
(393, 17)


In [41]:
target_var_data['month'] = target_var_data.date.dt.month
target_var_data['year'] = target_var_data.date.dt.year
summaries_data['month'] = summaries_data['month'].astype(int)
summaries_data['year'] = summaries_data['year'].astype(int)

target_var_data['month'] = target_var_data['month'].astype(int)
target_var_data['year'] = target_var_data['year'].astype(int)
target_var_data

Unnamed: 0,date,abanca_com_es_cuentas_cuenta_online,n26_com_es_es_cuenta_sin_comisiones,bancsabadell_com_cuenta_online,ing_es_cuenta_nocuenta_ing,bbva_es_personas_productos_cuentas_cuenta_online_sin_comisiones_html,revolut_com_es_es_a_radically_better_account,openbank_es_cuenta_ahorro_bienvenida,myinvestor_es_cuentas_tarjetas_cuentas,n26_com_es_es_cuenta_ahorro,bankinter_com_banca_cuentas_tarjetas_cuentas_cuenta_nomina,ing_es_cuenta_nocuenta_ing_1,evobanco_com_cuenta_inteligente,bancosantander_es_particulares_cuentas_tarjetas_cuentas_corrientes_cuenta_online_sin_comisiones,month,year
0,2021-12-01,15202.983835,6664.369141,20592.345013,24381.005813,45449.486787,1343.418335,15250.182615,5319.608623,15237.549805,8733.560835,24380.762233,5055.54248,22325.885971,12,2021
1,2022-01-01,42183.913207,5310.177246,28886.199059,24448.868576,38306.464983,1531.016846,17358.903158,6577.851993,11468.245117,11410.593443,24448.821142,4539.012207,28331.245044,1,2022
2,2022-02-01,34498.153115,5547.588867,24913.158633,24416.499127,40978.325821,1304.591309,16941.459256,4031.71696,12927.669922,8891.661053,24416.357951,6049.458008,23798.81917,2,2022
3,2022-03-01,34546.319021,4900.271484,23955.858539,24411.12898,43970.545627,1105.180542,16384.133789,4941.009311,12553.577148,7446.925144,24410.971192,9826.193359,24808.561729,3,2022
4,2022-04-01,40508.28842,4909.581055,27143.496585,24437.33196,40782.451956,1020.968689,17413.331041,5774.734455,11886.826172,10285.861726,24437.253057,5028.236328,24739.758729,4,2022
5,2022-05-01,29791.33818,6639.387695,12267.129453,24467.64236,44439.738816,1648.221313,12540.812395,4701.676152,14913.056641,5975.58225,24467.64236,6753.060547,37041.514189,5,2022
6,2022-06-01,35087.423687,7382.783203,27765.978171,24467.64236,29359.709637,1261.18335,16505.499605,3743.697827,13022.085938,7762.845175,24467.64236,5874.299805,35471.865599,6,2022
7,2022-07-01,61969.499304,9062.696289,33651.426645,22189.247828,49115.263085,2692.579346,18092.837291,8887.53198,10516.891602,6762.668091,22189.247828,2726.868408,35951.67483,7,2022
8,2022-08-01,53939.003961,8751.572266,36382.542418,29944.644102,49005.118424,4602.381348,17800.667417,6295.460707,11442.03418,5397.615718,29944.644102,5641.461914,40746.600434,8,2022
9,2022-09-01,50991.056414,7224.135254,40102.371944,50772.22105,36922.655663,9032.796875,18473.274492,6546.830235,13134.702148,8613.556577,50772.22105,4338.913086,41933.057915,9,2022


In [42]:
all_banks = list(set(summaries_data.bank_name))
target_data = target_var_data.copy()

for bank in all_banks: 
    bank_data = summaries_data.loc[summaries_data.bank_name == bank]
    target_data = target_data.join((bank_data.set_index(['month', 'year']))['APR.1'], on=['month', 'year'], rsuffix=f'_{bank}')

target_data

Unnamed: 0,date,abanca_com_es_cuentas_cuenta_online,n26_com_es_es_cuenta_sin_comisiones,bancsabadell_com_cuenta_online,ing_es_cuenta_nocuenta_ing,bbva_es_personas_productos_cuentas_cuenta_online_sin_comisiones_html,revolut_com_es_es_a_radically_better_account,openbank_es_cuenta_ahorro_bienvenida,myinvestor_es_cuentas_tarjetas_cuentas,n26_com_es_es_cuenta_ahorro,...,APR.1_ing,APR.1_n26,APR.1_bankinter,APR.1_myinvestor,APR.1_bbva,APR.1_evobanco,APR.1_bancsabadell,APR.1_abanca,APR.1_revolut,APR.1_openbank
0,2021-12-01,15202.983835,6664.369141,20592.345013,24381.005813,45449.486787,1343.418335,15250.182615,5319.608623,15237.549805,...,0.0,0.0,2.0,1.0,0.0,0.01,0.0,0.0,0.0,0.53
1,2022-01-01,42183.913207,5310.177246,28886.199059,24448.868576,38306.464983,1531.016846,17358.903158,6577.851993,11468.245117,...,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.53
2,2022-02-01,34498.153115,5547.588867,24913.158633,24416.499127,40978.325821,1304.591309,16941.459256,4031.71696,12927.669922,...,0.0,0.0,2.0,1.0,0.0,0.01,0.0,0.0,0.0,0.53
3,2022-03-01,34546.319021,4900.271484,23955.858539,24411.12898,43970.545627,1105.180542,16384.133789,4941.009311,12553.577148,...,0.0,0.0,2.0,1.0,0.0,0.01,0.0,0.0,0.0,0.53
4,2022-04-01,40508.28842,4909.581055,27143.496585,24437.33196,40782.451956,1020.968689,17413.331041,5774.734455,11886.826172,...,0.0,0.0,2.0,1.0,0.0,0.01,0.0,0.0,0.0,1.0
5,2022-05-01,29791.33818,6639.387695,12267.129453,24467.64236,44439.738816,1648.221313,12540.812395,4701.676152,14913.056641,...,0.0,0.0,2.0,1.0,0.0,0.01,0.0,0.0,0.0,1.0
6,2022-06-01,35087.423687,7382.783203,27765.978171,24467.64236,29359.709637,1261.18335,16505.499605,3743.697827,13022.085938,...,0.0,0.0,2.0,1.0,0.0,0.01,0.0,0.0,0.0,1.0
7,2022-07-01,61969.499304,9062.696289,33651.426645,22189.247828,49115.263085,2692.579346,18092.837291,8887.53198,10516.891602,...,0.0,0.0,2.0,1.0,0.0,0.01,0.0,0.0,0.0,1.0
8,2022-08-01,53939.003961,8751.572266,36382.542418,29944.644102,49005.118424,4602.381348,17800.667417,6295.460707,11442.03418,...,0.0,0.0,2.0,1.0,0.0,0.01,1.0,0.0,0.0,1.0
9,2022-09-01,50991.056414,7224.135254,40102.371944,50772.22105,36922.655663,9032.796875,18473.274492,6546.830235,13134.702148,...,0.0,0.0,2.0,1.0,0.0,0.01,1.0,0.0,0.0,1.0


In [43]:
def display_two_axis_plot(date, y1, y2, bank_name):
    # Create figure with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Add traces
    fig.add_trace(
        go.Scatter(x=date, y=y1, name='Cross Visitation'),
        secondary_y=False,
    )

    fig.add_trace(
        go.Scatter(x=date, y=y2, name='Web Traffic'),
        secondary_y=True,
    )

    # Add figure title
    fig.update_layout(
        title_text=f"Web Traffic vs Cross Visitation - {bank_name}"
    )

    # Set x-axis title
    fig.update_xaxes(title_text="Date")

    # Set y-axes titles
    fig.update_yaxes(title_text="<b>Cross Visitation</b>", secondary_y=False)
    fig.update_yaxes(title_text="<b>Web Traffic</b>", secondary_y=True)

    fig.show()

In [44]:
for bank in all_banks: 
    # plot all columns against the date in a line plot with separate axis for each bank
    bank_cols = [i for i in target_data.columns if bank in i]
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    if len(bank_cols) > 1:
        for bank_col in bank_cols:
            if 'APR.1' in bank_col:
                fig.add_trace(
                    go.Scatter(x=target_data.date, y=target_data[bank_col], name='APR'),
                    secondary_y=False,
                )
            else:
                fig.add_trace(
                    go.Scatter(x=target_data.date, y=target_data[bank_col], name='Site Traffic'),
                    secondary_y=True,
                )
        bank_name = bank_cols[0].split('_')[0]
        fig.update_layout(
            title_text=f"Web Traffic vs Cross Visitation - {bank_name}"
        )

        # Set x-axis title
        fig.update_xaxes(title_text="Date")

        # Set y-axes titles
        fig.update_yaxes(title_text="<b>APR</b>", secondary_y=False)
        fig.update_yaxes(title_text="<b>Web Traffic</b>", secondary_y=True)

        fig.show()
    else:
        print('something went wrong with the bank')
        print(bank_cols)
    
    

something went wrong with the bank
['bancosantander_es_particulares_cuentas_tarjetas_cuentas_corrientes_cuenta_online_sin_comisiones']


In general, most banks have steady trendlines for APR; however, when there are changes to APR, especially in the positive direction, there is typically a spike in web traffic. This is promising for a diff in diff analysis on the impact of APR changes or other incentives on web traffic.

In [51]:
apr_cols = [i for i in target_data.columns if 'APR.1' in i]
web_traffic_cols = [i for i in target_data.columns if 'APR.1' not in i and i != 'date']

In [55]:
# Plot average APR and Average web traffic in a line chart 

apr_avg = target_data[apr_cols].mean(axis=1)
apr_avg = pd.DataFrame(apr_avg)

apr_avg['web_traffic'] = target_data[web_traffic_cols].mean(axis=1)
apr_avg['date'] = target_data['date']

apr_avg.columns = ['APR', 'Web Traffic', 'Date']



In [57]:
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Scatter(x=apr_avg.Date, y=apr_avg['APR'], name='APR'),
    secondary_y=False,
)
fig.add_trace(
    go.Scatter(x=apr_avg.Date, y=apr_avg['Web Traffic'], name='Site Traffic'),
    secondary_y=True,
)

fig.update_layout(
    title_text=f"Web Traffic vs APR"
)

# Set x-axis title
fig.update_xaxes(title_text="Date")

# Set y-axes titles
fig.update_yaxes(title_text="<b>APR</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>Web Traffic</b>", secondary_y=True)

fig.show()