In [30]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
import pandas as pd
import plotly.express as px
from scipy.signal import savgol_filter
from sklearn.preprocessing import StandardScaler


In [31]:
data = pd.read_csv('data/regression_clean_data_final_1.csv')
data.head()

Unnamed: 0,bank,date,web_traffic,cross_visitation,search_interest,rank,incentive,apr,mentions,app_installs,ppc_spend
0,abanca,2021-12-01,15202.983835,0.077994,7.75,4.0,150.0,0.0,1.0,43605.699637,1506.110026
1,abanca,2022-01-01,42183.913207,0.0625,7.8,4.0,150.0,0.0,1.0,43605.699637,1506.110026
2,abanca,2022-02-01,34498.153115,0.052288,7.25,4.0,150.0,0.0,2.0,48647.434439,32750.639361
3,abanca,2022-03-01,34546.319021,0.061503,6.75,4.0,150.0,0.0,2.0,26687.227413,15754.216964
4,abanca,2022-04-01,40508.28842,0.050222,6.75,4.0,300.0,0.0,2.0,4254.184995,6542.970638


In [32]:
# log incentive, ppc, app install 
epsilon = 1e-6
data['incentive'] = data['incentive'] + epsilon
data['log_incentive'] = np.log(data['incentive'])
data['log_ppc'] = np.log(data['ppc_spend'])
data['log_app_installs'] = np.log(data['app_installs'])
data['log_web_traffic'] = np.log(data['web_traffic'])

In [33]:
data['date'] = pd.to_datetime(data['date'])

# create a dummy column for january 
data['month_1'] = np.where(data['date'].dt.month == 1, 1, 0)

data['log_web_traffic_lag'] = data['log_web_traffic'].shift(1)


In [34]:
# scale all of the variables with standard scaler

scaler = StandardScaler()
data[['month_1','cross_visitation', 'search_interest', 'apr', 'mentions', 'log_web_traffic_lag', 'log_web_traffic','log_incentive', 'log_ppc']] = scaler.fit_transform(data[['month_1','cross_visitation', 'search_interest', 'apr', 'mentions', 'log_web_traffic_lag', 'log_web_traffic', 'log_incentive', 'log_ppc' ]])


In [35]:
data.log_web_traffic_lag = data.log_web_traffic_lag.bfill()

In [42]:

all_banks = []

for bank in data['bank'].unique():

    data_bank = data[data['bank'] == bank]

    y = data_bank['log_web_traffic']

    ind_vars = ['log_web_traffic_lag', 'cross_visitation', 'search_interest', 'apr', 'mentions', 'log_incentive', 'log_ppc', 'month_1']

    X = data_bank[ind_vars]

    X = sm.add_constant(X)

    # Create and fit the fixed effects model
    model = sm.OLS(y, X).fit()
    pooled_regression_summary = model.summary()
    print(pooled_regression_summary)

    # for each bank, get all of the coefficients and the variable names 
    coefficients = model.params
    coefficient_names = coefficients.index
    p_values = model.pvalues

    # create a dictionary with the coefficients and the variable names
    coefficient_dict = dict(zip(coefficient_names, coefficients))

    coefficient_dict['bank_name'] = bank
    

    all_banks.append(coefficient_dict)

# create a df from the dictionary
    
df = pd.DataFrame(all_banks)

df

                            OLS Regression Results                            
Dep. Variable:        log_web_traffic   R-squared:                       0.756
Model:                            OLS   Adj. R-squared:                  0.634
Method:                 Least Squares   F-statistic:                     6.193
Date:                Sun, 30 Jun 2024   Prob (F-statistic):            0.00100
Time:                        18:39:05   Log-Likelihood:                 4.2663
No. Observations:                  25   AIC:                             9.467
Df Residuals:                      16   BIC:                             20.44
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  -5.6512    

Unnamed: 0,const,log_web_traffic_lag,cross_visitation,search_interest,apr,mentions,log_incentive,log_ppc,month_1,bank_name
0,-5.651167,0.305758,-0.068958,-1.001011,0.262801,0.012985,4.744462,0.130225,0.163449,abanca
1,,0.039918,0.254753,1.184493,0.105996,-0.098819,0.509005,0.004647,-0.037117,n26
2,,0.342462,0.212654,1.487692,-0.337362,0.224902,0.231866,0.25722,0.03018,banc sabadell
3,,0.503491,-0.164822,0.033586,0.168958,0.057741,0.391846,-0.472107,-0.027522,ing
4,,-0.144553,0.233489,-0.002974,-0.293252,0.062135,-0.532714,-0.01965,-0.003009,bbva
5,,0.41346,1.073306,0.44252,-0.414061,1.351691,-0.425058,-0.502437,0.246849,revolut
6,,-0.17424,0.116085,0.041682,-0.045788,-0.070645,-0.047004,-0.072015,0.087435,openbank
7,,0.407146,0.681275,0.072803,0.023966,0.149477,0.024602,-0.113964,0.07141,myinvestor
8,,0.525848,-0.607411,1.834646,0.331976,-0.343933,-0.146054,-0.054657,-0.00825,bankinter
9,,0.203701,0.768419,-5.921268,-2.579818,0.185493,1.135001,0.067348,0.005958,evobanco


In [37]:
df.dtypes

const                  float64
log_web_traffic_lag    float64
cross_visitation       float64
search_interest        float64
apr                    float64
mentions               float64
log_incentive          float64
log_ppc                float64
month_1                float64
bank_name               object
dtype: object

In [38]:
# drop const col 
df = df.drop(columns=['const'])

# get all columns not bank_name
cols = df.columns.tolist()
cols.remove('bank_name')

# normalize the coefficients to 0-1
df[cols] = df[cols].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [40]:


# plot each bank as a bar with coefficients stacked on top of each other
fig = go.Figure()
for col in df.columns:
    if col not in ['bank_name','month_1', 'log_web_traffic_lag']:
        fig.add_trace(go.Bar(x=df['bank_name'], y=df[col], name=col))
# add a title
fig.update_layout(title='Coefficients Weights for each Bank')
fig.update_layout(barmode='stack')
fig.show()