In [100]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm

In [136]:
# Load the cleaned regression dataset (relative path; no date parsing is requested,
# so the `date` column is read as plain strings).
data = pd.read_csv('data/regression_clean_data.csv')
# Preview the first rows to check the columns that were read.
data.head()

Unnamed: 0,bank,date,web_traffic,number_of_employee,assets_under_management,number_of_branches,age,has_stores,founded_before_2000,cross_visitation,search_interest,rank,category_best_salary_accounts_helmycash,category_best_savings_accounts_helpmycash,incentive,apr,mentions
0,abanca,2021-12-01,15202.983835,5946,72148000000,690,13,1,0,0.077994,17.652686,4.0,0,0,150.0,0.0,1.0
1,abanca,2021-12-01,15202.983835,5946,72148000000,690,13,1,0,0.077994,17.652686,4.0,1,0,150.0,0.0,1.0
2,abanca,2021-12-01,15202.983835,5946,72148000000,690,13,1,0,0.077994,17.652686,4.0,0,1,150.0,0.0,1.0
3,abanca,2022-01-01,42183.913207,5946,72148000000,690,13,1,0,0.0625,17.652686,4.0,0,0,150.0,0.0,1.0
4,abanca,2022-01-01,42183.913207,5946,72148000000,690,13,1,0,0.0625,17.652686,4.0,1,0,150.0,0.0,1.0


In [142]:
# Line chart (one line per bank, not stacked) of the incentive column over time.
# A bank is included when at least one of its rows has a non-zero incentive;
# all rows of such a bank are plotted, including the zero ones.
incentive_banks = data.loc[data['incentive'].ne(0), 'bank'].unique()

plot_data = data.loc[data['bank'].isin(incentive_banks)]

fig = px.line(
    plot_data,
    x='date',
    y='incentive',
    color='bank',
    title='Initial Deposit Incentives Over Time',
)
fig.show()


In [143]:
# Line chart (one line per bank, not stacked) of the apr column over time.
# A bank is included when at least one of its rows has a non-zero apr.
# NOTE: the variable is still called `incentive_banks` (name carried over from
# the incentive plot); in this cell it holds the banks with a non-zero apr.
incentive_banks = data.loc[data['apr'].ne(0), 'bank'].unique()

plot_data = data.loc[data['bank'].isin(incentive_banks)]

fig = px.line(
    plot_data,
    x='date',
    y='apr',
    color='bank',
    title='APR Changes Over Time',
)
fig.show()


In [146]:
import pandas as pd
import plotly.express as px
from scipy.signal import savgol_filter


In [154]:
# Number of rows per bank (count() ignores rows whose date is missing).
# Useful for checking that every bank has at least as many rows as the smoothing window used below.
data.groupby('bank')['date'].count()

bank
abanca            75
banc sabadell    100
bankinter         75
bbva              75
evobanco         100
ing              200
myinvestor        25
n26              150
openbank          75
revolut           25
santander         50
Name: date, dtype: int64

In [155]:

# Work on the `data` frame loaded above.
# Keep every row of any bank that has at least one non-zero rank value.
rank_banks = data.loc[data['rank'].ne(0), 'bank'].unique()
plot_data = data.loc[data['bank'].isin(rank_banks)]

# Define a function to apply Savitzky-Golay filtering for smoothing
def apply_savgol_filter(group, window=25, polyorder=2):
    """Return a date-sorted copy of ``group`` with an added ``rank_smooth`` column.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for one series; must contain the columns ``date`` and ``rank``.
    window : int, default 25
        ``window_length`` passed to ``scipy.signal.savgol_filter``.
    polyorder : int, default 2
        ``polyorder`` passed to ``scipy.signal.savgol_filter``.

    Returns
    -------
    pandas.DataFrame
        ``group`` sorted by ``date``. ``rank_smooth`` holds the filtered rank
        when the group has at least ``window`` rows, otherwise the raw rank.
    """
    ordered = group.sort_values('date')

    if len(ordered) < window:
        # Fewer rows than the filter window: pass the rank through unsmoothed.
        return ordered.assign(rank_smooth=ordered['rank'])

    smoothed = savgol_filter(ordered['rank'], window_length=window, polyorder=polyorder)
    return ordered.assign(rank_smooth=smoothed)

# Apply the Savitzky-Golay filter to each bank's rows separately.
# The groups are iterated and concatenated explicitly instead of calling
# `groupby('bank').apply(...)`: pandas has deprecated passing the grouping
# column to the applied function, and once 'bank' is excluded from the group
# frame it would be lost after `reset_index(drop=True)`, breaking the
# `color='bank'` argument below. Iterating keeps 'bank' in every group.
# Groups come out in sorted key order, the same order `apply` produced.
smoothed_data = pd.concat(
    [apply_savgol_filter(bank_rows) for _, bank_rows in plot_data.groupby('bank')],
    ignore_index=True,
)

# Plot the smoothed rank, one line per bank
fig = px.line(
    smoothed_data,
    x='date',
    y='rank_smooth',
    color='bank',
    title='Rank Changes Over Time (Smoothed)',
)
fig.show()






In [113]:
# Independent variables used by the numeric coercion, scaling, per-bank OLS
# and plotting cells in this notebook.
# The strings must match the CSV column names exactly, including the
# 'helmycash' spelling of the salary-accounts column.
ind_vars = ['cross_visitation', 'rank',
       'category_best_salary_accounts_helmycash',
       'category_best_savings_accounts_helpmycash', 'incentive', 'apr',
       'mentions', 'search_interest']

In [114]:

# Force the response and every regressor to a numeric dtype.
# Values that cannot be parsed become NaN, so the count below exposes them.
data['web_traffic'] = pd.to_numeric(data['web_traffic'], errors='coerce')
data[ind_vars] = data[ind_vars].apply(lambda column: pd.to_numeric(column, errors='coerce'))

data.isna().sum()

bank                                         0
date                                         0
web_traffic                                  0
number_of_employee                           0
assets_under_management                      0
number_of_branches                           0
age                                          0
has_stores                                   0
founded_before_2000                          0
cross_visitation                             0
search_interest                              0
rank                                         0
category_best_salary_accounts_helmycash      0
category_best_savings_accounts_helpmycash    0
incentive                                    0
apr                                          0
mentions                                     0
dtype: int64

In [115]:
# Earliest date in the data set. `date` holds 'YYYY-MM-DD' strings (the CSV is
# read without date parsing), so the string minimum is also the earliest date.
min_date = data['date'].min()

# On that first date, replace a missing apr / incentive with 0 so the
# forward-fill in the next step has a starting value.
# The replacement is done per cell with a boolean mask. The earlier per-bank
# loop inspected only the first min-date row of each bank (`.values[0]`), which
#   * raised IndexError for a bank with no row on `min_date`,
#   * left later missing rows untouched when the first row was present, and
#   * overwrote non-missing rows when the first row was missing.
on_min_date = data['date'] == min_date
for col in ('apr', 'incentive'):
    data.loc[on_min_date & data[col].isna(), col] = 0


In [116]:
# Forward-fill, then back-fill, the missing apr and incentive values
# *within each bank*. Filling over the whole frame would copy the last value of
# one bank into the leading gaps of the next bank in row order.
# Row order inside a bank is taken as-is (assumed chronological - TODO confirm).
# A bank with no observed value at all stays NaN and will show up in the
# `data.isna().sum()` check instead of silently inheriting another bank's value.
for col in ('apr', 'incentive'):
    data[col] = data.groupby('bank')[col].transform(lambda values: values.ffill().bfill())

In [117]:
# Write the current frame to CSV without the row index.
# This runs before the scaling step, so the file holds the unscaled values.
data.to_csv('data/final_regression_data.csv', index=False)

In [118]:
# scale all the independent variables with standard scaler
# NOTE: the scaler is fitted on all rows at once (all banks pooled), and every
# column in ind_vars is scaled, including the 0/1 category indicator columns.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Overwrites the raw regressor values in `data` in place; `scaler` keeps the
# fitted statistics.
data[ind_vars] = scaler.fit_transform(data[ind_vars])



In [119]:
# Count missing values per column (the regression below needs none in the model columns).
data.isna().sum()

bank                                         0
date                                         0
web_traffic                                  0
number_of_employee                           0
assets_under_management                      0
number_of_branches                           0
age                                          0
has_stores                                   0
founded_before_2000                          0
cross_visitation                             0
search_interest                              0
rank                                         0
category_best_salary_accounts_helmycash      0
category_best_savings_accounts_helpmycash    0
incentive                                    0
apr                                          0
mentions                                     0
dtype: int64

### Linear Model according to each bank entity
A separate linear model is fitted for each bank, where $y$ is the bank's web traffic:

$$
y_{\text{bank\_entity}} = \beta_0 + \beta_1 x_{\text{cross\_visitation}} + \beta_2 x_{\text{rank}} + \beta_3 x_{\text{category\_best\_salary\_accounts\_helmycash}} + \beta_4 x_{\text{category\_best\_savings\_accounts\_helpmycash}} + \beta_5 x_{\text{incentive}} + \beta_6 x_{\text{apr}} + \beta_7 x_{\text{mentions}} + \beta_8 x_{\text{search\_interest}} + \varepsilon
$$


In [120]:
# Fit one OLS model per bank: web_traffic ~ const + ind_vars.
# The fitted results are kept in `results_by_bank` so they can be inspected
# later without re-fitting.
# NOTE(review): the frame holds several rows per bank and date (see the preview
# of `data`), so observations are not independent - confirm whether the
# reported standard errors need adjusting.
results_by_bank = {}
for bank in data['bank'].unique():
    bank_data = data[data['bank'] == bank]
    X = bank_data[ind_vars]
    y = bank_data['web_traffic']

    # A regressor that never varies within this bank cannot be separated from
    # the intercept. Left in place, it also makes `sm.add_constant` (default
    # has_constant='skip') silently omit the intercept column.
    constant_cols = [col for col in ind_vars if X[col].nunique(dropna=False) <= 1]
    if constant_cols:
        X = X.drop(columns=constant_cols)

    # Always add an explicit intercept column named 'const'.
    X = sm.add_constant(X, has_constant='add')
    model = sm.OLS(y, X)
    results = model.fit()
    results_by_bank[bank] = results

    print(f"Results for bank: {bank}")
    if constant_cols:
        print(f"Dropped constant regressors: {constant_cols}")
    print(results.summary2())

Results for bank: abanca
                                  Results: Ordinary least squares
Model:                         OLS                         Adj. R-squared:                0.593     
Dependent Variable:            web_traffic                 AIC:                           1613.4354 
Date:                          2024-06-16 19:03            BIC:                           1634.2928 
No. Observations:              75                          Log-Likelihood:                -797.72   
Df Model:                      8                           F-statistic:                   14.47     
Df Residuals:                  66                          Prob (F-statistic):            5.86e-12  
R-squared:                     0.637                       Scale:                         1.1523e+08
----------------------------------------------------------------------------------------------------
                                            Coef.     Std.Err.    t    P>|t|     [0.025     0.975]  


#### Summary

Results for bank: abanca: significant: 
rank                                   5186.9470    5.7074 0.0000    3373.9167   6999.9772
mentions                                   -5382.1622   2062.6982 -2.6093 0.0111   -9497.1326  -1267.1918
R-squared:                        0.367

Results for bank: n26: significant: 
const                                      9134.7559  1829.6434  4.9926 0.0000  5518.3284 12751.1835
cross_visitation                          33108.1521 18322.7143  1.8069 0.0729 -3108.0681 69324.3722
mentions                                   -292.3357   178.6778 -1.6361 0.1040  -645.5058    60.8343
R-squared:                     0.060 

Results for bank: banc sabadell: significant:
cross_visitation                          343840.3510 80413.5802  4.2759 0.0000 184177.3026 503503.3994
incentive                                    -53.2545    11.8460 -4.4956 0.0000    -76.7750    -29.7340
R-squared:                      0.221 

Results for bank: ing: significant: 
cross_visitation                          -47390.1347 22526.1680 -2.1038 0.0367 -91817.7646 -2962.5049
rank                                       -1096.7980   147.0817 -7.4571 0.0000  -1386.8824  -806.7136
incentive                                    144.4547    12.5542 11.5064 0.0000    119.6944   169.2150
R-squared:                       0.240   

Results for bank: revolut: significant: 
const                                     -7759.5954  3070.3320 -2.5273 0.0196 -14144.7004  -1374.4905
rank                                       1057.2469   312.5336  3.3828 0.0028    407.2977   1707.1961
cross_visitation                          84822.0029 41966.6178  2.0212 0.0562  -2452.3565 172096.3623
mentions                                   1461.7336   767.5966  1.9043 0.0707   -134.5709   3058.0381
R-squared:                       0.566

Results for bank: openbank: significant: 
rank                                        766.9014   380.6155  2.0149 0.0478     7.5943   1526.2085
mentions                                    -98.3868   314.7380 -0.3126 0.7555  -726.2719    529.4984
R-squared:                      0.073

Results for bank: myinvestor
rank                                        966.3996   287.2364  3.3645 0.0029    369.0589   1563.7403
mentions                                    583.4922   247.7155  2.3555 0.0283     68.3396   1098.6448
R-squared:                       0.574  

Results for bank: bankinter
const                                       26074.8737  4863.5510  5.3613 0.0000   16369.8089  35779.9386
cross_visitation                          -157096.1322 45686.5443 -3.4386 0.0010 -248262.2118 -65930.0526
rank                                          927.6395   285.2200  3.2524 0.0018     358.4918   1496.7872
incentive                                      -7.2608     4.1869 -1.7341 0.0874     -15.6157      1.0942
mentions                                     -749.9725   345.9620 -2.1678 0.0337   -1440.3289    -59.6161
R-squared:                        0.308

Results for bank: evobanco
cross_visitation                          97350.1382 27569.6923  3.5311 0.0006 42609.8678 152090.4085
rank                                        313.5837   122.4134  2.5617 0.0120    70.5291    556.6384
mentions                                    616.0292   139.0300  4.4309 0.0000   339.9819    892.0764
R-squared:                      0.310  

Results for bank: santander
const                                      50188.6673  17834.1419  2.8142 0.0072   14268.8617  86108.4728
rank                                       -1580.4667    453.3324 -3.4863 0.0011   -2493.5250   -667.4083
R-squared:                        0.245

For almost two thirds of the banks, the `rank` variable is statistically significant at the 5% level, and most of those coefficients are also significant at the 1% level. This strongly suggests that rank is a determinant of web traffic for most banks.

The second most important feature is `mentions`, a word count taken from news articles (tucapital). For almost every bank, `mentions` is statistically significant at the 10% level, and about 80% of those significant coefficients are also significant at the 1% level. This indicates that the `mentions` coefficient is very likely non-zero, possibly because each news article that mentions a bank carries information that drives a change in its web traffic. For most banks the effect of `mentions` is negative; only evobanco shows a positive coefficient. A possible explanation is that tucapital publishes more articles about evobanco, so its news coverage has a more positive influence on evobanco's web traffic.

The third most important feature is `cross_visitation`, which captures competition between banks. It comes from the helpmycash website and is the percentage of user clicks each bank receives after users browse the ranking published for that month. For about half of the banks it is statistically significant at levels between 10% and 1%, with both positive and negative coefficients. `cross_visitation` also has the largest coefficients in magnitude, roughly 20,000 to 100,000.

The fourth most important feature is `incentive`: for about one third of the banks it is statistically significant at the 1% level. Note that the incentive data are incomplete; for every bank that does have incentive data, the coefficient is highly statistically significant. This is reasonable, because an incentive is directly related to clients' investment. Among these banks, two have a positive `incentive` coefficient and one has a negative coefficient.


In [112]:
import plotly.graph_objects as go

# One dual-axis figure per bank: web traffic on the left axis and every
# independent variable in `ind_vars` on a shared right axis.
for bank in data['bank'].unique():
    bank_data = data[data['bank'] == bank]
    fig = go.Figure()

    # Web traffic goes on the primary (left) y-axis
    fig.add_trace(
        go.Scatter(
            x=bank_data['date'],
            y=bank_data['web_traffic'],
            mode='lines',
            name='web_traffic',
            yaxis='y1',
        )
    )

    # All independent variables share the secondary (right) y-axis
    for ind_var in ind_vars:
        fig.add_trace(
            go.Scatter(
                x=bank_data['date'],
                y=bank_data[ind_var],
                mode='lines',
                name=ind_var,
                yaxis='y2',
            )
        )

    # Axis titles use the nested `title=dict(text=..., font=...)` form.
    # The flat `titlefont` axis property is deprecated and is rejected by
    # recent Plotly releases.
    fig.update_layout(
        title=f"{bank} Web Traffic and Independent Variables",
        xaxis_title="Date",
        yaxis=dict(
            title=dict(text="Web Traffic", font=dict(color="#000000")),
            tickfont=dict(color="#000000"),
        ),
        yaxis2=dict(
            title=dict(text="Independent Variables", font=dict(color="#000000")),
            tickfont=dict(color="#000000"),
            overlaying='y',
            side='right',
        ),
        # move legend to the right so there is no overlap with the secondary y-axis
        legend=dict(x=1.1, y=1),
    )

    fig.show()


In [125]:
# Inspect the current state of `data` (pandas truncates the middle rows in the rendered output).
data

Unnamed: 0,bank,date,web_traffic,number_of_employee,assets_under_management,number_of_branches,age,has_stores,founded_before_2000,cross_visitation,search_interest,rank,category_best_salary_accounts_helmycash,category_best_savings_accounts_helpmycash,incentive,apr,mentions
0,abanca,2021-12-01,15202.983835,5946,72148000000,690,13,1,0,-0.517925,0.002140,-0.365932,-0.679366,-0.679366,0.152826,-1.105008,-0.551783
1,abanca,2021-12-01,15202.983835,5946,72148000000,690,13,1,0,-0.517925,0.002140,-0.365932,1.471960,-0.679366,0.152826,-1.105008,-0.551783
2,abanca,2021-12-01,15202.983835,5946,72148000000,690,13,1,0,-0.517925,0.002140,-0.365932,-0.679366,1.471960,0.152826,-1.105008,-0.551783
3,abanca,2022-01-01,42183.913207,5946,72148000000,690,13,1,0,-0.914170,0.002140,-0.365932,-0.679366,-0.679366,0.152826,-1.105008,-0.551783
4,abanca,2022-01-01,42183.913207,5946,72148000000,690,13,1,0,-0.914170,0.002140,-0.365932,1.471960,-0.679366,0.152826,-1.105008,-0.551783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
945,santander,2023-10-01,36636.706633,212764,1117000000000,8518,167,1,1,0.718608,8.208999,-1.089809,1.471960,-0.679366,2.125450,-1.105008,-0.551783
946,santander,2023-11-01,35387.630139,212764,1117000000000,8518,167,1,1,0.587299,0.000723,-1.451748,-0.679366,-0.679366,2.125450,-1.105008,-0.186491
947,santander,2023-11-01,35387.630139,212764,1117000000000,8518,167,1,1,0.587299,0.000723,-1.089809,1.471960,-0.679366,2.125450,-1.105008,-0.186491
948,santander,2023-12-01,29101.697517,212764,1117000000000,8518,167,1,1,1.558683,0.002140,-1.229914,-0.679366,-0.679366,2.125450,-1.105008,-0.551783


array(['abanca', 'n26', 'banc sabadell', 'ing', 'bbva', 'revolut',
       'openbank', 'myinvestor', 'bankinter', 'evobanco', 'santander'],
      dtype=object)