In [75]:
import pandas as pd
import klib as kl 
import plotly.express as px
import plotly.graph_objects as go

# import iterative imputer 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# impute missing values with KNN

from sklearn.impute import KNNImputer

In [69]:
# All three inputs are sheets of the same Excel workbook.
workbook_path = 'data/Account Products.xlsx'

# Cross-visitation sheet: its column names sit on the second row (header=1).
cross_visitation_data = pd.read_excel(workbook_path, sheet_name='cross_visitation', header=1)

# Per-bank comparison metrics, plus a flag for banks reporting at least one branch.
bank_comps = pd.read_excel(workbook_path, sheet_name='bank_comparison_metrics', header=0)
bank_comps['has_stores'] = bank_comps['number of branches'].gt(0)

# 'web_traffic_accounts' sheet, used later in the notebook as the target variable.
target_var_data = pd.read_excel(workbook_path, sheet_name='web_traffic_accounts', header=0)

## Data Cleanup

In [72]:
# Clean the cross-visitation frame with klib (this also normalises column names,
# e.g. the bank columns come out as 'abanca_com').
cross_visitation_data = kl.data_cleaning(cross_visitation_data)

# Websites of banks flagged as having at least one branch (raw form, e.g. 'abanca.com').
brick_and_mortar_banks = list(bank_comps.loc[bank_comps.has_stores == True, 'website'])

# The cleaned cross-visitation columns use underscores ('abanca_com') while
# bank_comps stores raw websites ('abanca.com'). Convert the websites to the
# column form before comparing; otherwise nothing is ever excluded and every
# bank ends up classified as a neo bank.
brick_and_mortar_cols = {str(site).replace('.', '_') for site in brick_and_mortar_banks}
neo_banks = [
    col for col in cross_visitation_data.columns
    if col != 'date' and col not in brick_and_mortar_cols
]

target_var_data = kl.data_cleaning(target_var_data)

Shape of cleaned data: (25, 16) - Remaining NAs: 25


Dropped rows: 0
     of which 0 duplicates. (Rows (first 150 shown): [])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.0 MB (-nan%)

Shape of cleaned data: (25, 14) - Remaining NAs: 47


Dropped rows: 0
     of which 0 duplicates. (Rows (first 150 shown): [])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.0 MB (-nan%)




invalid value encountered in scalar divide


invalid value encountered in scalar divide



In [77]:
# Numeric dtypes that count as "numeric" when picking columns from either frame.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Numeric columns of the cross-visitation data.
num_cols = cross_visitation_data.select_dtypes(include=numerics).columns

# Numeric columns of the target-variable (web traffic) data.
numeric_data = target_var_data.select_dtypes(include=numerics)
num_cols_target = numeric_data.columns

Index(['abanca_com_es_cuentas_cuenta_online',
       'n26_com_es_es_cuenta_sin_comisiones', 'bancsabadell_com_cuenta_online',
       'ing_es_cuenta_nocuenta_ing',
       'bbva_es_personas_productos_cuentas_cuenta_online_sin_comisiones_html',
       'revolut_com_es_es_a_radically_better_account',
       'openbank_es_cuenta_ahorro_bienvenida',
       'myinvestor_es_cuentas_tarjetas_cuentas', 'n26_com_es_es_cuenta_ahorro',
       'bankinter_com_banca_cuentas_tarjetas_cuentas_cuenta_nomina',
       'ing_es_cuenta_nocuenta_ing_1', 'evobanco_com_cuenta_inteligente',
       'bancosantander_es_particulares_cuentas_tarjetas_cuentas_corrientes_cuenta_online_sin_comisiones'],
      dtype='object')

### Feature Engineering + Imputation

In [79]:
# Impute missing values in the numeric target-variable columns.

# Create the imputer; random_state is fixed so the result is reproducible.
imp_mean = IterativeImputer(random_state=0)

# Fit on the numeric target columns, then impute those same columns.
imp_mean.fit(target_var_data[num_cols_target])
imputed_target_var_data = imp_mean.transform(target_var_data[num_cols_target])

# Write the imputed values back. Assigning a DataFrame aligns on index labels,
# so the new frame must carry the original row index: with a fresh 0..n-1
# index, any row whose label is not in that range would silently become NaN.
target_var_data[num_cols_target] = pd.DataFrame(
    imputed_target_var_data,
    columns=num_cols_target,
    index=target_var_data.index,
)



In [82]:


# Fill gaps in the numeric cross-visitation columns from each row's 2 nearest neighbours.
imputer = KNNImputer(n_neighbors=2)

data_imputed = imputer.fit_transform(cross_visitation_data[num_cols])

# Keep the original row index on the imputed frame: DataFrame assignment aligns
# on index labels, so a fresh 0..n-1 index would write NaN into any row whose
# label falls outside that range.
cross_visitation_data[num_cols] = pd.DataFrame(
    data_imputed,
    columns=num_cols,
    index=cross_visitation_data.index,
)

cross_visitation_data.head()


Unnamed: 0,date,abanca_com,bancosantander_es,bancsabadell_com,bankinter_com,bbva_es,caixabank_es,evobanco_com,imagin_com,ing_es,kutxabank_es,myinvestor_es,n26_com,openbank_es,pibank_es,revolut_com
0,2021-12-01,0.077994,0.116923,0.070229,0.105769,0.196307,0.162256,0.079812,0.059727,0.120856,0.049426,0.068323,0.067204,0.12828,0.046249,0.071006
1,2022-01-01,0.0625,0.107472,0.084605,0.094186,0.170377,0.171842,0.055328,0.061053,0.12127,0.047898,0.067114,0.078723,0.127622,0.059259,0.052795
2,2022-02-01,0.052288,0.119243,0.059621,0.083083,0.192536,0.171842,0.067478,0.065734,0.098605,0.048193,0.049738,0.055477,0.117399,0.057143,0.04902
3,2022-03-01,0.061503,0.112708,0.067651,0.082576,0.174929,0.179195,0.075472,0.055402,0.120556,0.041714,0.05906,0.056548,0.107581,0.034884,0.051075
4,2022-04-01,0.050222,0.128592,0.064007,0.102992,0.18148,0.179195,0.072148,0.058762,0.11454,0.041714,0.050104,0.052254,0.104252,0.037618,0.045169


## Generic Data Visualizations

In [83]:
# Draw every numeric cross-visitation series against 'date' on one plotly express line chart.
fig = px.line(
    cross_visitation_data,
    x='date',
    y=num_cols,
    title='Stacked Time Series Line Plot',
)
fig.show()

In [84]:
# Same line chart as above, but on log1p-transformed values.

import numpy as np

# Transform a copy so the untransformed frame stays available to later cells.
data_log = cross_visitation_data.copy()
data_log[num_cols] = np.log1p(data_log[num_cols])

fig = px.line(
    data_log,
    x='date',
    y=num_cols,
    title='Stacked Time Series Line Plot',
)
fig.show()

### Cross Visitation Metrics

Avg Visitation

In [85]:
# Mean of each numeric cross-visitation column, largest first.
data_avg = cross_visitation_data[num_cols].mean().sort_values(ascending=False)

# Bar per bank, coloured by its mean on the continuous 'ice' scale.
fig = px.bar(
    data_avg,
    x=data_avg.index,
    y=data_avg.values,
    title='Average Cross Visitation by Bank',
    color=data_avg.values,
    color_continuous_scale='ice',
)

# White template, no legend.
fig.update_layout(template='plotly_white', showlegend=False)

# Axis labels.
fig.update_xaxes(title_text="Banks")
fig.update_yaxes(title_text="Average Cross Visitation")

fig.show()

BBVA and the larger banks have higher cross visitation metrics. It would be good to see the breakdown of only neobanks. 

##### Avg Cross Visitation of Neo-Banks

In [86]:
# Mean of each column listed in neo_banks, largest first.
data_avg = cross_visitation_data[neo_banks].mean().sort_values(ascending=False)

# Bar per bank, coloured by its mean on the continuous 'ice' scale.
fig = px.bar(
    data_avg,
    x=data_avg.index,
    y=data_avg.values,
    title='Average Cross Visitation by Neo Bank',
    color=data_avg.values,
    color_continuous_scale='ice',
)

# White template, no legend.
fig.update_layout(template='plotly_white', showlegend=False)

# Axis labels.
fig.update_xaxes(title_text="Banks")
fig.update_yaxes(title_text="Average Cross Visitation")

fig.show()

It is clear that openbank has the best cross visitation traffic out of the neo banks.

## Growth rate of Cross Visitation
Now we look into how the cross visitation changes over time

In [87]:
# Regress each numeric cross-visitation column against the date and chart the slopes.

from sklearn.linear_model import LinearRegression

# One simple linear regression per column; keep only the fitted slope.
# NOTE(review): the slope is expressed per unit of whatever numeric value
# scikit-learn derives from the 'date' column - confirm the units before
# interpreting magnitudes (the ranking across banks is unaffected).
slopes = {}
for col in num_cols:
    model = LinearRegression()
    model.fit(cross_visitation_data[['date']], cross_visitation_data[col])
    slopes[col] = model.coef_[0]

# Build the result in one step (index = bank column, single 'slope' column)
# instead of growing a frame column by column and transposing it; this also
# works when num_cols is empty.
slope_data = pd.DataFrame({'slope': pd.Series(slopes, dtype='float64')})

# Largest growth first.
slope_data = slope_data.sort_values(by='slope', ascending=False)

# Bar per bank, coloured by slope on the continuous 'ice' scale.
fig = px.bar(slope_data,
             x=slope_data.index,
             y='slope',
             title='Growth Rate of Cross Visitation by Bank',
             color=slope_data['slope'],
             color_continuous_scale='ice')

# White template, no legend.
fig.update_layout(template='plotly_white', showlegend=False)

fig.update_xaxes(title_text="Banks")
fig.update_yaxes(title_text="Growth Rate of Cross Visitation")

# Show the plot
fig.show()

Imagin.com was launched by Caixa Bank and may contribute to the negative growth rate for Caixa Bank. This would be interesting to look into. 

In [63]:
# Time series of cross visitation restricted to caixabank_es and imagin_com.
caixa_pair = ['caixabank_es', 'imagin_com']

fig = px.line(
    cross_visitation_data[caixa_pair + ['date']],
    x='date',
    y=caixa_pair,
    title='Stacked Time Series Line Plot',
)
fig.show()

Unfortunately there is no clear trend here, but it does look like some type of advertising in June may have increased traffic for Caixa Bank as well. It would be interesting to see the correlation of these two series. 

### Cross Visitation Correlation 

In [67]:

# Pairwise Pearson correlation between the numeric cross-visitation columns.
correlation_matrix = cross_visitation_data[num_cols].corr(method='pearson')

# The same column labels are used on both heatmap axes.
series_labels = correlation_matrix.columns
heatmap = go.Heatmap(
    z=correlation_matrix.values,
    x=series_labels,
    y=series_labels,
    colorscale='Viridis',
)

fig = go.Figure(data=heatmap)

# Square 800x800 canvas with title and axis labels.
fig.update_layout(
    width=800,
    height=800,
    title='Pearson Correlation Coefficient Matrix',
    xaxis_title='Time Series',
    yaxis_title='Time Series',
)

fig.show()

Caixa Bank is interestingly highly correlated across all banks. This may be indicative of some type of advertising strategy, or dependent on ranking or some other feature. 

### Cross Visitation impact on Target Variable

In [91]:
# Every cross-visitation column except the 'date' column.
banks = cross_visitation_data.columns.tolist()
banks.remove('date')

In [106]:
# Build the combined frame here so this cell also runs on a fresh kernel: the
# original line read `cross_visitation_concat` before any cell had created it.
cross_visitation_concat = pd.concat(
    [cross_visitation_data.drop(columns='date'), target_var_data],
    axis=1,
)
list(cross_visitation_concat.columns)

['abanca_com',
 'bancosantander_es',
 'bancsabadell_com',
 'bankinter_com',
 'bbva_es',
 'caixabank_es',
 'evobanco_com',
 'imagin_com',
 'ing_es',
 'kutxabank_es',
 'myinvestor_es',
 'n26_com',
 'openbank_es',
 'pibank_es',
 'revolut_com',
 'date',
 'abanca_com_es_cuentas_cuenta_online',
 'n26_com_es_es_cuenta_sin_comisiones',
 'bancsabadell_com_cuenta_online',
 'ing_es_cuenta_nocuenta_ing',
 'bbva_es_personas_productos_cuentas_cuenta_online_sin_comisiones_html',
 'revolut_com_es_es_a_radically_better_account',
 'openbank_es_cuenta_ahorro_bienvenida',
 'myinvestor_es_cuentas_tarjetas_cuentas',
 'n26_com_es_es_cuenta_ahorro',
 'bankinter_com_banca_cuentas_tarjetas_cuentas_cuenta_nomina',
 'ing_es_cuenta_nocuenta_ing_1',
 'evobanco_com_cuenta_inteligente',
 'bancosantander_es_particulares_cuentas_tarjetas_cuentas_corrientes_cuenta_online_sin_comisiones']

In [103]:
from plotly.subplots import make_subplots


In [123]:
def display_two_axis_plot(date, y1, y2, bank_name):
    """Show two series against a shared x-axis, each on its own y-axis.

    Parameters
    ----------
    date : x values used for both traces.
    y1 : values drawn on the primary y-axis, labelled "Cross Visitation".
    y2 : values drawn on the secondary y-axis, labelled "Web Traffic".
    bank_name : text appended to the figure title.

    Returns
    -------
    None. The figure is displayed with ``fig.show()``.
    """
    # (values, label, drawn on the secondary axis?)
    series_spec = (
        (y1, 'Cross Visitation', False),
        (y2, 'Web Traffic', True),
    )

    # Single subplot with a secondary y-axis enabled.
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    for values, label, on_secondary in series_spec:
        fig.add_trace(
            go.Scatter(x=date, y=values, name=label),
            secondary_y=on_secondary,
        )

    fig.update_layout(title_text=f"Web Traffic vs Cross Visitation - {bank_name}")
    fig.update_xaxes(title_text="Date")

    # Bold axis titles matching the trace names.
    for _, label, on_secondary in series_spec:
        fig.update_yaxes(title_text=f"<b>{label}</b>", secondary_y=on_secondary)

    fig.show()

In [124]:
# Combine the cross-visitation series with the target-variable series column-wise.
# 'date' is dropped from the first frame so the result has a single 'date' column.
cross_visitation_data_temp = cross_visitation_data.drop(columns='date')
cross_visitation_concat = pd.concat([cross_visitation_data_temp, target_var_data], axis=1)

# Candidate web-traffic columns (everything in the target frame except 'date').
target_series_cols = [col for col in target_var_data.columns if col != 'date']

for bank_col in banks:
    # Bank key = text before the first underscore, e.g. 'abanca_com' -> 'abanca'.
    bank_key = bank_col.split('_')[0]

    # Match on the exact key rather than a substring, and pick the two series
    # explicitly instead of relying on the position of the matches: a substring
    # test can pick up an unrelated column whose name merely contains the key.
    matching_targets = [col for col in target_series_cols if col.split('_')[0] == bank_key]

    if matching_targets:
        # When a bank has several web-traffic columns only the first is plotted.
        display_two_axis_plot(
            cross_visitation_concat['date'],
            cross_visitation_concat[bank_col],
            cross_visitation_concat[matching_targets[0]],
            bank_key.title(),
        )
    

# Create subplots


The outliers are BBVA and BankInter; otherwise most banks show a high correlation between cross visitation and web traffic. One possibility is to normalize the two time series and then isolate any deviations from the cross visitation trend. This may isolate advertising pushes or other product releases. Additionally, Bank Sabadell seems to have high web traffic despite lower cross visitation; this may suggest brand-name retention independent of other banks. 

# Age of Bank EDA

In [160]:
# 'age of bank' holds the founding year (e.g. 1857), so age = 2024 - founding year.
bank_comps['age'] = 2024 - bank_comps['age of bank']

# Log-scale the age (log1p) so very old banks do not dwarf the recent ones.
bank_comps['age'] = np.log1p(bank_comps['age'])

# Oldest first.
bank_comps = bank_comps.sort_values(by='age', ascending=False)

# Use one de-duplicated frame for both the bars and their colours. Previously
# the bars came from the de-duplicated frame while the marker colours came from
# the full frame, so the colours could be misaligned with the bars.
bank_age_unique = bank_comps.drop_duplicates()

fig = px.bar(bank_age_unique, x='website', y='age', title='Age of Banks', color='age', color_continuous_scale='ice')

# White template, no legend.
fig.update_layout(template='plotly_white', showlegend=False)

fig.update_xaxes(title_text="Bank Websites")
# The plotted value is log1p(age in years), not the raw age.
fig.update_yaxes(title_text="Age (log1p of years)")

# Colour the bars on a 'Blues' gradient and show the colour scale.
fig.update_traces(marker=dict(color=bank_age_unique['age'], colorscale='Blues', showscale=True))

# Show the plot
fig.show()

In [125]:
# Open question: how to create a continuous correlation map from a single column to another column.
# Display the column names of bank_comps.

bank_comps.columns

Index(['Notes', 'website', 'number of employee', 'number of customers ',
       'assets under management', 'number of branches', 'age of bank',
       'references', 'Unnamed: 8', 'Unnamed: 9', 'has_stores'],
      dtype='object')

In [146]:
# Mean of each numeric target-variable column.
web_traffic_mean = target_var_data[num_cols_target].mean()

# Bank key = text before the first underscore of the column name.
bank_keys = [name.split('_')[0] for name in web_traffic_mean.index]

# Frame indexed by bank key with the mean ('web_traffic') and the key itself ('bank').
# Banks with more than one target column appear once per column.
target_var_data_ = pd.DataFrame(
    {'web_traffic': web_traffic_mean.to_numpy(), 'bank': bank_keys},
    index=bank_keys,
)

In [147]:
# Preview the first rows of bank_comps.
# NOTE(review): the saved output shows a 'bank' column that is only added in a
# later cell, so this output appears to come from an out-of-order run.
bank_comps.head()

Unnamed: 0,Notes,website,number of employee,number of customers,assets under management,number of branches,age of bank,references,Unnamed: 8,Unnamed: 9,has_stores,bank
0,,abanca.com,5946,,72148000000.0,690,2011,https://www.abancacorporacionbancaria.com/file...,,,True,abanca
1,,bancosantander.es,212764,,1117000000000.0,8518,1857,https://www.santander.com/en/about-us/key-fact...,https://www.statista.com/statistics/417009/ban...,,True,bancosantander
2,,bancsabadell.com,19316,,253000000000.0,1594,1881,https://www.globaldata.com/company-profile/ban...,https://www.bancsabadell.com/bsnacional/en/bra...,,True,bancsabadell
3,,bankinter.com,6138,,83300000000.0,523,1965,https://www.macrotrends.net/stocks/charts/BKNI...,https://www.bankinter.com/webcorporativa/amp/e...,https://www.bankinter.com/www/webcorp/swf/memo...,True,bankinter
4,,bbva.es,121486,,775000000000.0,1800,1857,https://www.globaldata.com/company-profile/ban...,https://www.bbva.es/en/general/buscador-oficin...,,True,bbva


In [153]:
# Bank key = website text before the first dot, e.g. 'abanca.com' -> 'abanca'.
bank_comps['bank'] = [parts[0] for parts in bank_comps.website.str.split('.')]

# Attach the mean web traffic per bank key (default inner join on 'bank').
bank_comps = bank_comps.merge(
    target_var_data_,
    left_on='bank',
    right_on='bank',
)

In [155]:
# Pearson correlation between mean web traffic and the 'age of bank' column.
correlation_matrix = bank_comps[['web_traffic','age of bank']].corr(method='pearson')
heatmap = go.Heatmap(z=correlation_matrix.values,
                        x=correlation_matrix.columns,
                        y=correlation_matrix.columns,
                        colorscale='ice')

# Create a figure and add the heatmap to it
fig = go.Figure(data=heatmap)

# update fig size 
fig.update_layout(width=800, height=800)

# Set plot title and axis labels. The two axes list per-bank variables
# (web_traffic, age of bank), not time series, so label them accordingly.
fig.update_layout(title='Pearson Correlation Coefficient Matrix',
                    xaxis_title='Variable',
                    yaxis_title='Variable')

# Show the plot
fig.show()

In [161]:
# Scatter of the 'age of bank' column against mean web traffic, coloured by web traffic.
fig = px.scatter(
    bank_comps,
    x='age of bank',
    y='web_traffic',
    title='Age of Bank vs Web Traffic',
    color='web_traffic',
    color_continuous_scale='ice',
)
fig.show()

There are clear trends here that show more recent banks have fewer customers, while older banks have more customers. It would be interesting to see customer retention metrics. 

In [76]:
# Pie chart of how many bank_comps rows have has_stores True versus False.
fig = px.pie(
    bank_comps,
    names='has_stores',
    title='Banks with Stores vs. Banks without Stores',
    color='has_stores',
)
fig.show()

# create a bar chart of the top amounts given the differences in brick and mortar stores 

In [None]:
# Websites of banks flagged as having at least one branch (raw form, e.g. 'abanca.com').
brick_and_mortar_banks = list(bank_comps.loc[bank_comps.has_stores == True, 'website'])

# The cross-visitation columns use underscores ('abanca_com') while the websites
# use dots ('abanca.com'). Convert the websites to the column form before
# comparing; otherwise nothing is excluded and every bank is listed as a neo bank.
brick_and_mortar_cols = {str(site).replace('.', '_') for site in brick_and_mortar_banks}
neo_banks = [
    col for col in cross_visitation_data.columns
    if col != 'date' and col not in brick_and_mortar_cols
]

target_var_data = kl.data_cleaning(target_var_data)

In [176]:
# Display the websites currently classified as brick-and-mortar banks.
brick_and_mortar_banks

['abanca.com',
 'bancosantander.es',
 'bancsabadell.com',
 'bankinter.com',
 'bbva.es',
 'caixabank.es',
 'evobanco.com',
 'ing.es',
 'kutxabank.es']

In [177]:
# Web-traffic columns belonging to banks listed in brick_and_mortar_banks.
# 'evobanco_com_cuenta_inteligente' is included because 'evobanco.com' is in
# brick_and_mortar_banks; leaving it out would count EVO as a neo bank in the
# neo-bank web-traffic chart.
# NOTE(review): caixabank and kutxabank have no web-traffic column, so they do
# not appear here.
all_brick_and_mortar_target_var = ['abanca_com_es_cuentas_cuenta_online',
'bancsabadell_com_cuenta_online',
'ing_es_cuenta_nocuenta_ing',
'bbva_es_personas_productos_cuentas_cuenta_online_sin_comisiones_html',
'bankinter_com_banca_cuentas_tarjetas_cuentas_cuenta_nomina',
'ing_es_cuenta_nocuenta_ing_1', 
'evobanco_com_cuenta_inteligente',
'bancosantander_es_particulares_cuentas_tarjetas_cuentas_corrientes_cuenta_online_sin_comisiones']

In [178]:

# Mean web traffic of each brick-and-mortar target column.
data_avg = target_var_data[all_brick_and_mortar_target_var].mean()
# Bank key (text before the first underscore) for each of those columns.
target_var_cols = [i.split('_')[0] for i in list(all_brick_and_mortar_target_var) if i != 'date']

# Largest first.
data_avg = data_avg.sort_values(ascending=False)

# Bar per column, coloured by its mean on the continuous 'ice' scale.
fig = px.bar(data_avg, 
             x=data_avg.index, 
             y=data_avg.values, 
             title='Average Web Traffic for Brick and Mortar', 
             color=data_avg.values,
             color_continuous_scale='ice')

# White template.
fig.update_layout(template='plotly_white')

# Axis labels. The values are web-traffic means, so label the y-axis to match
# the chart title (it previously read "Average Cross Visitation").
fig.update_xaxes(title_text="Banks")
fig.update_yaxes(title_text="Average Web Traffic")

# Hide legend
fig.update_layout(showlegend=False)

# Show the plot
fig.show()

In [181]:

# Target columns not listed as brick-and-mortar (and not 'date'), i.e. the neo banks.
all_columns = [i for i in target_var_data.columns if i not in all_brick_and_mortar_target_var and i != 'date']
data_avg = target_var_data[all_columns].mean()

# Largest first.
data_avg = data_avg.sort_values(ascending=False)

# Bar per column, coloured by its mean on the continuous 'ice' scale.
fig = px.bar(data_avg, 
             x=data_avg.index, 
             y=data_avg.values, 
             title='Average Web Traffic for Neo Bank', 
             color=data_avg.values,
             color_continuous_scale='ice')

# White template.
fig.update_layout(template='plotly_white')

# Axis labels. The values are web-traffic means, so label the y-axis to match
# the chart title (it previously read "Average Cross Visitation").
fig.update_xaxes(title_text="Banks")
fig.update_yaxes(title_text="Average Web Traffic")

# Hide legend
fig.update_layout(showlegend=False)

# Show the plot
fig.show()

In [162]:
# Preview the first rows of the target-variable frame (one column per tracked page, plus 'date').

target_var_data.head()

Unnamed: 0,date,abanca_com_es_cuentas_cuenta_online,n26_com_es_es_cuenta_sin_comisiones,bancsabadell_com_cuenta_online,ing_es_cuenta_nocuenta_ing,bbva_es_personas_productos_cuentas_cuenta_online_sin_comisiones_html,revolut_com_es_es_a_radically_better_account,openbank_es_cuenta_ahorro_bienvenida,myinvestor_es_cuentas_tarjetas_cuentas,n26_com_es_es_cuenta_ahorro,bankinter_com_banca_cuentas_tarjetas_cuentas_cuenta_nomina,ing_es_cuenta_nocuenta_ing_1,evobanco_com_cuenta_inteligente,bancosantander_es_particulares_cuentas_tarjetas_cuentas_corrientes_cuenta_online_sin_comisiones
0,2021-12-01,15202.983835,6664.369141,20592.345013,24381.005813,45449.486787,1343.418335,15250.182615,5319.608623,15237.550206,8733.560835,24380.762233,5055.54248,22325.885971
1,2022-01-01,42183.913207,5310.177246,28886.199059,24448.868576,38306.464983,1531.016846,17358.903158,6577.851993,11468.245023,11410.593443,24448.821142,4539.012207,28331.245044
2,2022-02-01,34498.153115,5547.588867,24913.158633,24416.499127,40978.325821,1304.591309,16941.459256,4031.71696,12927.670282,8891.661053,24416.357951,6049.458008,23798.81917
3,2022-03-01,34546.319021,4900.271484,23955.858539,24411.12898,43970.545627,1105.180542,16384.133789,4941.009311,12553.57725,7446.925144,24410.971192,9826.193359,24808.561729
4,2022-04-01,40508.28842,4909.581055,27143.496585,24437.33196,40782.451956,1020.968689,17413.331041,5774.734455,11886.826026,10285.861726,24437.253057,5028.236328,24739.758729


In [171]:
# Full target column names, without 'date'.
td_list = target_var_data.columns.tolist()
td_list.remove('date')

# Bank key (text before the first underscore) for every non-'date' column.
target_var_cols = [name.split('_')[0] for name in target_var_data.columns if name != 'date']

# Map each full column name to its bank key.
target_var_lookups = {full_name: bank_key for full_name, bank_key in zip(td_list, target_var_cols)}
target_var_lookups

{'abanca_com_es_cuentas_cuenta_online': 'abanca',
 'n26_com_es_es_cuenta_sin_comisiones': 'n26',
 'bancsabadell_com_cuenta_online': 'bancsabadell',
 'ing_es_cuenta_nocuenta_ing': 'ing',
 'bbva_es_personas_productos_cuentas_cuenta_online_sin_comisiones_html': 'bbva',
 'revolut_com_es_es_a_radically_better_account': 'revolut',
 'openbank_es_cuenta_ahorro_bienvenida': 'openbank',
 'myinvestor_es_cuentas_tarjetas_cuentas': 'myinvestor',
 'n26_com_es_es_cuenta_ahorro': 'n26',
 'bankinter_com_banca_cuentas_tarjetas_cuentas_cuenta_nomina': 'bankinter',
 'ing_es_cuenta_nocuenta_ing_1': 'ing',
 'evobanco_com_cuenta_inteligente': 'evobanco',
 'bancosantander_es_particulares_cuentas_tarjetas_cuentas_corrientes_cuenta_online_sin_comisiones': 'bancosantander'}