In [33]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
import pandas as pd
import plotly.express as px
from scipy.signal import savgol_filter


In [34]:
data = pd.read_csv('data/regression_clean_data.csv')
data.head()

Unnamed: 0,bank,date,web_traffic,cross_visitation,search_interest,rank,incentive,apr,mentions
0,abanca,2021-12-01,15202.983835,0.077994,7.75,4.0,150.0,0.0,1.0
1,abanca,2022-01-01,42183.913207,0.0625,7.8,4.0,150.0,0.0,1.0
2,abanca,2022-02-01,34498.153115,0.052288,7.25,4.0,150.0,0.0,2.0
3,abanca,2022-03-01,34546.319021,0.061503,6.75,4.0,150.0,0.0,2.0
4,abanca,2022-04-01,40508.28842,0.050222,6.75,4.0,300.0,0.0,2.0


In [44]:
# Adding lag variable
data['web_traffic_lag'] = data['web_traffic'].shift(1)

# Dropping NaN values that result from lag
df_lag = data.dropna()

# Regression analysis with lag
X_lag = df_lag[['web_traffic_lag', 'cross_visitation', 'search_interest', 'rank', 'incentive', 'apr', 'mentions']]
y_lag = df_lag['web_traffic']
X_lag = sm.add_constant(X_lag)
model_lag = sm.OLS(y_lag, X_lag).fit()
lag_regression_summary = model_lag.summary()
print(lag_regression_summary)

                            OLS Regression Results                            
Dep. Variable:            web_traffic   R-squared:                       0.727
Model:                            OLS   Adj. R-squared:                  0.720
Method:                 Least Squares   F-statistic:                     101.2
Date:                Wed, 19 Jun 2024   Prob (F-statistic):           2.81e-71
Time:                        10:32:53   Log-Likelihood:                -2900.2
No. Observations:                 274   AIC:                             5816.
Df Residuals:                     266   BIC:                             5845.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const             5339.6388   2865.460  

In [41]:
pooled_data = pd.get_dummies(data, columns=['bank'], drop_first=True)


# cast the dummy columns to int 
pooled_data[[ 'bank_banc sabadell', 'bank_bankinter',
       'bank_bbva', 'bank_evobanco', 'bank_ing', 'bank_myinvestor', 'bank_n26',
       'bank_openbank', 'bank_revolut', 'bank_santander']] = pooled_data[[ 'bank_banc sabadell', 'bank_bankinter',
       'bank_bbva', 'bank_evobanco', 'bank_ing', 'bank_myinvestor', 'bank_n26',
       'bank_openbank', 'bank_revolut', 'bank_santander']].astype(int)



In [43]:
X = pooled_data[['cross_visitation', 'search_interest', 'rank',
       'incentive', 'apr', 'mentions', 'bank_banc sabadell', 'bank_bankinter',
       'bank_bbva', 'bank_evobanco', 'bank_ing', 'bank_myinvestor', 'bank_n26',
       'bank_openbank', 'bank_revolut', 'bank_santander']]
y = pooled_data['web_traffic']
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
pooled_regression_summary = model.summary()
print(pooled_regression_summary)

                            OLS Regression Results                            
Dep. Variable:            web_traffic   R-squared:                       0.763
Model:                            OLS   Adj. R-squared:                  0.748
Method:                 Least Squares   F-statistic:                     51.97
Date:                Wed, 19 Jun 2024   Prob (F-statistic):           4.46e-71
Time:                        10:30:52   Log-Likelihood:                -2890.9
No. Observations:                 275   AIC:                             5816.
Df Residuals:                     258   BIC:                             5877.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const               4.213e+04   4989

In [47]:
from linearmodels.panel import PanelOLS


In [51]:
fixed_effects = data.copy()

fixed_effects.date = pd.to_datetime(fixed_effects.date)

fixed_effects['web_traffic_lag'] = fixed_effects['web_traffic'].shift(1)

# Convert the DataFrame to a panel DataFrame
fixed_effects.set_index(['bank', 'date'], inplace=True)

# Define the dependent and independent variables
y = fixed_effects['web_traffic']
X = fixed_effects[['cross_visitation', 'search_interest', 'rank', 'incentive', 'apr', 'mentions', 'web_traffic_lag']]
X = sm.add_constant(X)

# Create and fit the fixed effects model
model = PanelOLS(y, X, entity_effects=True)
fe_results = model.fit()

# Display the summary
print(fe_results)

                          PanelOLS Estimation Summary                           
Dep. Variable:            web_traffic   R-squared:                        0.1922
Estimator:                   PanelOLS   R-squared (Between):              0.4918
No. Observations:                 274   R-squared (Within):               0.1922
Date:                Wed, Jun 19 2024   R-squared (Overall):              0.4206
Time:                        11:09:39   Log-likelihood                   -2853.4
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      8.7037
Entities:                          11   P-value                           0.0000
Avg Obs:                       24.909   Distribution:                   F(7,256)
Min Obs:                       24.000                                           
Max Obs:                       25.000   F-statistic (robust):             8.7037
                            



Inputs contain missing values. Dropping rows with missing observations.



In [52]:
from statsmodels.stats.outliers_influence import variance_inflation_factor


### Multicollinearity check
VIF = 1: No multicollinearity.
1 < VIF < 5: Moderate multicollinearity.
VIF >= 5: High multicollinearity. Variables with VIFs above 5 require further investigation and potentially should be removed or combined with other variables.

In [58]:
X.fillna(0, inplace=True)

vif = pd.DataFrame()
vif["Variable"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif)

           Variable        VIF
0             const  23.765031
1  cross_visitation   1.977825
2   search_interest   1.816889
3              rank   1.340604
4         incentive   1.457948
5               apr   1.298543
6          mentions   1.157477
7   web_traffic_lag   1.939567


In [59]:
fixed_effects = data.copy()

fixed_effects.date = pd.to_datetime(fixed_effects.date)

fixed_effects['web_traffic_lag'] = fixed_effects['web_traffic'].shift(1)

# Convert the DataFrame to a panel DataFrame
fixed_effects.set_index(['bank', 'date'], inplace=True)

# Define the dependent and independent variables
y = fixed_effects['web_traffic']
X = fixed_effects[['cross_visitation', 'search_interest', 'rank', 'incentive', 'apr', 'mentions']]
X = sm.add_constant(X)

# Create and fit the fixed effects model
model = PanelOLS(y, X, entity_effects=True)
fe_results = model.fit()

# Display the summary
print(fe_results)

                          PanelOLS Estimation Summary                           
Dep. Variable:            web_traffic   R-squared:                        0.0798
Estimator:                   PanelOLS   R-squared (Between):             -0.0958
No. Observations:                 275   R-squared (Within):               0.0798
Date:                Wed, Jun 19 2024   R-squared (Overall):             -0.0506
Time:                        11:22:33   Log-likelihood                   -2890.9
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      3.7270
Entities:                          11   P-value                           0.0014
Avg Obs:                       25.000   Distribution:                   F(6,258)
Min Obs:                       25.000                                           
Max Obs:                       25.000   F-statistic (robust):             3.7270
                            