In [299]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np

import klib as kl


In [300]:
bank_names_mapping = mapping_dict = {
    'abanca': 'abanca',
    'banc sabadell': 'banc sabadell',
    'bancosabadell': 'banc sabadell',
    'bancosantander': 'santander',
    'bancsabadell': 'banc sabadell',
    'bank': 'bank',
    'bankinter': 'bankinter',
    'bbva': 'bbva',
    'caixabank': 'caixabank',
    'date': 'date',
    'evobanco': 'evobanco',
    'imagin': 'imagin',
    'ing': 'ing',
    'ing-bank': 'ing',
    'kutxabank': 'kutxabank',
    'liberbank': 'liberbank',
    'myinvestor': 'myinvestor',
    'n26': 'n26',
    'n26.com': 'n26',
    None: None,  # handling nan (not a number)
    'openbank': 'openbank',
    'orangebank': 'orangebank',
    'pibank': 'pibank',
    'renaultbank': 'renaultbank',
    'revolut': 'revolut',
    'santander': 'santander',
    'value': 'value'
}

### Web Traffic Cleanup

In [301]:

products_data = pd.read_excel('./data/Account Products.xlsx', sheet_name='web_traffic_accounts')
products_data.Date = pd.to_datetime(products_data.Date)
products_data.set_index('Date', inplace=True)

imputer = IterativeImputer(max_iter=10, random_state=0)
imputed_data = imputer.fit_transform(products_data)
imputed_data = pd.DataFrame(imputed_data, columns=products_data.columns, index=products_data.index)
products_data = imputed_data.copy()

# get bank name from columns 
products_data.columns = products_data.columns.str.split('.').str[0]

final_cols = products_data.columns

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_data = products_data.select_dtypes(include=numerics)
num_cols_target = numeric_data.columns

# melt the data so bank names are in a single column
products_data = products_data.reset_index().melt(id_vars='Date', var_name='Bank', value_name='Web Traffic')

products_data = kl.clean_column_names(products_data)

products_data.head()

Unnamed: 0,date,bank,web_traffic
0,2021-12-01,abanca,15202.983835
1,2022-01-01,abanca,42183.913207
2,2022-02-01,abanca,34498.153115
3,2022-03-01,abanca,34546.319021
4,2022-04-01,abanca,40508.28842


### Bank Information Cleanup

In [302]:
bank_comps = pd.read_excel('data/Account Products.xlsx', sheet_name='bank_comparison_metrics', header=0)
bank_comps['age'] = 2024 - bank_comps['age of bank']
bank_comps['has_stores'] = bank_comps['number of branches'] > 0
bank_comps['has_stores'] = bank_comps['has_stores'].astype(int)
bank_comps['founded_before_2000'] = bank_comps['age of bank'] < 2000
bank_comps['founded_before_2000'] = bank_comps['founded_before_2000'].astype(int)
bank_comps['bank'] = bank_comps.website.str.split('.').apply(lambda x: x[0])

bank_comps = bank_comps[['bank', 'number of employee', 'assets under management', 'number of branches', 'age', 'has_stores', 'founded_before_2000']]

bank_comps = kl.clean_column_names(bank_comps)

bank_comps.head()

Unnamed: 0,bank,number_of_employee,assets_under_management,number_of_branches,age,has_stores,founded_before_2000
0,abanca,5946,72148000000,690,13,1,0
1,bancosantander,212764,1117000000000,8518,167,1,1
2,bancsabadell,19316,253000000000,1594,143,1,1
3,bankinter,6138,83300000000,523,59,1,1
4,bbva,121486,775000000000,1800,167,1,1


### Cross Visitation Classification

In [303]:
cross_visitation_data = pd.read_excel('data/Account Products.xlsx', sheet_name='cross_visitation', header=1)

cross_visitation_data = kl.data_cleaning(cross_visitation_data)

# get numerica columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_data = cross_visitation_data.select_dtypes(include=numerics)
num_cols = numeric_data.columns

# impute missing values
imputer = IterativeImputer(max_iter=10, random_state=0)
imputed_data = imputer.fit_transform(cross_visitation_data[num_cols])
imputed_data = pd.DataFrame(imputed_data, columns=num_cols, index=cross_visitation_data.index)
cross_visitation_data[num_cols] = imputed_data.copy()
cross_visitation_data.head()

# reformat column names
all_cols = cross_visitation_data.columns
cross_visitation_data.columns = cross_visitation_data.columns.str.split('_').str[0]

# melt the data so bank names are in a single column
cross_visitation_data = cross_visitation_data.reset_index().melt(id_vars='date', var_name='Bank', value_name='Cross Visitation')

cross_visitation_data = kl.clean_column_names(cross_visitation_data)

# drop where Bank is == inde
cross_visitation_data = cross_visitation_data.loc[~cross_visitation_data.bank.isin(['index'])]

cross_visitation_data.head()

Shape of cleaned data: (25, 16) - Remaining NAs: 25


Dropped rows: 0
     of which 0 duplicates. (Rows (first 150 shown): [])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.0 MB (-nan%)



  mem_perc = round(100 * mem_change / data_mem, 2)


Unnamed: 0,date,bank,cross_visitation
25,2021-12-01,abanca,0.077994
26,2022-01-01,abanca,0.0625
27,2022-02-01,abanca,0.052288
28,2022-03-01,abanca,0.061503
29,2022-04-01,abanca,0.050222


### Account Ranking Preprocessing

In [304]:

file_path = './data/Account Products.xlsx'
df = pd.read_excel(file_path, sheet_name='ranking_position', skiprows=7)


In [305]:
df['Unnamed: 0'] = df['Unnamed: 0'].fillna(method='ffill')

# Rename the columns for clarity:
df.columns = ['Category', 'Bank Name'] + list(df.columns[2:])

transformed_df = pd.melt(df, id_vars=['Category', 'Bank Name'], var_name='Date', value_name='Rank')
transformed_df['Date'] = pd.to_datetime(transformed_df['Date'])

  df['Unnamed: 0'] = df['Unnamed: 0'].fillna(method='ffill')


In [306]:
rank_df = transformed_df

rank_df = pd.get_dummies(rank_df, columns=['Category'], drop_first=True)

rank_df = kl.clean_column_names(rank_df)

rank_df.bank_name = rank_df.bank_name.str.lower()

# rename the bank name column to bank
rank_df = rank_df.rename(columns={'bank_name': 'bank'})

rank_df.head()

Unnamed: 0,bank,date,rank,category_best_salary_accounts_helmycash,category_best_savings_accounts_helpmycash
0,bbva,2021-12-01,1.0,False,False
1,ing,2021-12-01,2.0,False,False
2,bankinter,2021-12-01,3.0,False,False
3,abanca,2021-12-01,4.0,False,False
4,liberbank,2021-12-01,5.0,False,False


### Bank Incentives Clean Up

In [307]:
data = pd.read_csv('data/site_changes.csv')
# rename tags

data.tag = data.tag.replace({'apr increase':'increase apr', 'incentive increase':'increase incentive', 'decrease incentive':'incentive decrease',  'decrease incentive ':'incentive decrease'})

# create a date column from month and year and add day as 1
data['date'] = pd.to_datetime(data[['year', 'month']].assign(day=1))

data = kl.clean_column_names(data)

apr_increase_dates = data[data.tag == 'increase apr']
incentive_increase_dates = data[data.tag == 'increase incentive']

incentive_increase_dates.head()

Unnamed: 0,month,year,incentive,apr,notes,bank,notes_1,tag,site_changes,date
1,6,2021,150.0,0.0,800-1200 deposit,abanca,,increase incentive,,2021-06-01
9,9,2022,0.0,1.0,por el primer año,banc sabadell,,increase incentive,,2022-09-01
13,4,2023,250.0,2.0,por el primer año,banc sabadell,250â¬ por domiciliar tu nÃ³mina,increase incentive,,2023-04-01
18,7,2023,0.0,5.0,añadir un balance maximo a 10000 por un total ...,bankinter,,increase incentive,,2023-07-01
21,7,2022,0.0,0.0,recibes 350 cuando recmoiendes un amigo a BBVA...,bbva,,increase incentive,,2022-07-01


In [308]:
banks = incentive_increase_dates.bank.unique()

# create a ddataframe of dates 01-01-2021 to 12-1-2023
dates = pd.date_range(start='2021-01-01', end='2023-12-01', freq='MS')

# create a dataframe with all banks and dates
banks_dates = pd.DataFrame([(bank, date) for bank in banks for date in dates], columns=['bank', 'date'])

banks_dates.dropna(inplace=True)

# issue occurring on merge of dfs

In [309]:
# make the incentives increases continuous such that for the given time frame each bank has an incentive for each month and year
incentive_increase_dates = pd.merge(banks_dates, incentive_increase_dates, on=['bank', 'date'], how='left')

apr_increase_dates = pd.merge(banks_dates, apr_increase_dates, on=['bank', 'date'], how='left')

# rename columns 
new_cols = list(incentive_increase_dates.columns)
new_cols[1] = 'final_date'

incentive_increase_dates.columns = new_cols
apr_increase_dates.columns = new_cols

# only select the columns that are needed
incentive_increase_dates = incentive_increase_dates[['final_date', 'bank', 'incentive']]
apr_increase_dates = apr_increase_dates[['final_date', 'bank', 'apr']]

# rename the columns
incentive_increase_dates.columns = ['date', 'bank', 'incentive']
apr_increase_dates.columns = ['date', 'bank', 'apr']



In [310]:
# strip all non numeric characters from the apr and incentive column
apr_increase_dates.apr = apr_increase_dates.apr.str.replace('%', '')
incentive_increase_dates.incentive = incentive_increase_dates.incentive.str.replace('€', '')


AttributeError: Can only use .str accessor with string values!

In [None]:

# convert the apr and incentive columns to numeric
apr_increase_dates.apr = pd.to_numeric(apr_increase_dates.apr)
incentive_increase_dates.incentive = pd.to_numeric(incentive_increase_dates.incentive)

ValueError: Unable to parse string "0%" at position 4

In [None]:
# ffill then bfill the missing  values
incentive_increases = incentive_increase_dates.ffill().bfill()
apr_increases = apr_increase_dates.ffill().bfill()

### News Release Clean Up

In [None]:
news_data = pd.read_csv('text_files/tucapital-news_with_text-translated.csv')
news_data

Unnamed: 0.1,Unnamed: 0,title,url,date,text
0,0,"Tomorrow, last day of the account at 2.89% APR...",https://www.tucapital.es/depositos/manana-ulti...,23/04/2024,"Tomorrow, April 24, 2024, the marketing period..."
1,1,ING gives you up to €120 if you direct your pa...,https://www.tucapital.es/cuentas/ing-te-da-has...,22/04/2024,"ING now asks, in addition to direct debiting a..."
2,2,Banco BIG deposits: slight increase in normal ...,https://www.tucapital.es/depositos/depositos-d...,22/04/2024,Banco BIG remodels its deposit offer with slig...
3,3,The future of Bitcoin: predictions for the nex...,https://www.tucapital.es/blogs/guias/el-futuro...,22/04/2024,"Since its launch, Bitcoin has seen positive an..."
4,4,Eco: Cetelem only allows maximum daily transfe...,https://www.tucapital.es/blogs/eco/eco-cetelem...,19/04/2024,"Due to the disappearance of Orange Bank, and t..."
...,...,...,...,...,...
1020,1020,Pibank and Pichincha maintain your deposit at ...,https://www.tucapital.es/cuentas/pibank-y-pich...,04/01/2021,"Pibank, the online office of Banco Pichincha, ..."
1021,1021,"New year, new taxes. This is how the Treasury ...",https://www.tucapital.es/blogs/noticias/ano-nu...,04/01/2021,I wish that with this first post of the year w...
1022,1022,"Big Bank, RenaultBank...and OrangeBank?, new a...",https://www.tucapital.es/cuentas/banco-big-ren...,31/12/2020,"As we told you yesterday (see here), 2020 has ..."
1023,1023,"Goodbye 2020, black year of banking fees. News...",https://www.tucapital.es/blogs/noticias/adios-...,30/12/2020,2020 has been a black year. It has been the ye...


In [None]:
bank_names = list(products_data.bank.unique())
bank_names.remove('ing')
bank_names.append('ing-bank')
bank_names

['abanca',
 'n26',
 'bancsabadell',
 'bbva',
 'revolut',
 'openbank',
 'myinvestor',
 'bankinter',
 'evobanco',
 'bancosantander',
 'ing-bank']

In [None]:
# search for each of the bank names in the news data

news_data['text_nospace'] = news_data['text'].str.replace('ING', 'ing-bank')
news_data['text_nospace'] = news_data['text_nospace'].str.replace(' ', '').str.lower()


for bank in bank_names:
    news_data[bank] = 0

for index, row in news_data.iterrows():
    for bank in bank_names:
        if bank in row['text_nospace']:
            news_data.at[index, bank] = news_data.at[index, bank] + 1

news_data.head()


Unnamed: 0.1,Unnamed: 0,title,url,date,text,text_nospace,abanca,n26,bancsabadell,bbva,revolut,openbank,myinvestor,bankinter,evobanco,bancosantander,ing-bank
0,0,"Tomorrow, last day of the account at 2.89% APR...",https://www.tucapital.es/depositos/manana-ulti...,23/04/2024,"Tomorrow, April 24, 2024, the marketing period...","tomorrow,april24,2024,themarketingperiodforthe...",0,0,0,0,0,0,0,0,0,0,0
1,1,ING gives you up to €120 if you direct your pa...,https://www.tucapital.es/cuentas/ing-te-da-has...,22/04/2024,"ING now asks, in addition to direct debiting a...","ing-banknowasks,inadditiontodirectdebitingapay...",0,0,0,0,0,0,0,0,0,0,1
2,2,Banco BIG deposits: slight increase in normal ...,https://www.tucapital.es/depositos/depositos-d...,22/04/2024,Banco BIG remodels its deposit offer with slig...,bancobigremodelsitsdepositofferwithslightincre...,0,0,0,0,0,0,1,0,0,0,1
3,3,The future of Bitcoin: predictions for the nex...,https://www.tucapital.es/blogs/guias/el-futuro...,22/04/2024,"Since its launch, Bitcoin has seen positive an...","sinceitslaunch,bitcoinhasseenpositiveandnegati...",0,0,0,0,1,0,0,0,0,0,0
4,4,Eco: Cetelem only allows maximum daily transfe...,https://www.tucapital.es/blogs/eco/eco-cetelem...,19/04/2024,"Due to the disappearance of Orange Bank, and t...","duetothedisappearanceoforangebank,andthetransf...",0,1,0,0,0,0,0,0,0,0,0


In [None]:
news_data['month_date'] = pd.to_datetime(news_data['date'])
# set month date to the first day of the month
news_data['month_date'] = news_data['month_date'].dt.to_period('M').dt.to_timestamp()
news_data = news_data.groupby('month_date')[bank_names].sum()

# melt the df to have the bank names as a column
news_data = news_data.reset_index().melt(id_vars='month_date', value_vars=bank_names, var_name='bank', value_name='mentions')

news_data.columns = ['date', 'bank', 'mentions']
news_data.head()

  news_data['month_date'] = pd.to_datetime(news_data['date'])


Unnamed: 0,date,bank,mentions
0,2020-12-01,abanca,0
1,2021-01-01,abanca,3
2,2021-02-01,abanca,4
3,2021-03-01,abanca,0
4,2021-04-01,abanca,4


### Data Consolidation by Bank

In [None]:
dfs = [products_data, bank_comps, cross_visitation_data, rank_df, incentive_increases, apr_increases, news_data]

# map the bank names to a single name

for df in dfs:
    df['bank'] = df['bank'].map(bank_names_mapping)

In [None]:
all_data = pd.merge(banks_dates, products_data, on=['bank', 'date'], how='left')
all_data

Unnamed: 0,bank,date,web_traffic
0,abanca,2021-12-01,15202.983835
1,abanca,2022-01-01,42183.913207
2,abanca,2022-02-01,34498.153115
3,abanca,2022-03-01,34546.319021
4,abanca,2022-04-01,40508.288420
...,...,...,...
145,santander,2023-08-01,34290.160425
146,santander,2023-09-01,40388.737821
147,santander,2023-10-01,36636.706633
148,santander,2023-11-01,35387.630139


In [None]:
# list all dfs
bank_names = products_data.bank.unique()
dates = pd.date_range(start='2021-12-01', end='2023-12-01', freq='MS')

banks_dates = pd.DataFrame([(bank, date) for bank in bank_names for date in dates], columns=['bank', 'date'])

all_data = pd.merge(banks_dates, products_data, on=['bank', 'date'], how='left')
all_data = pd.merge(all_data, bank_comps, on=['bank'], how='left')
all_data = pd.merge(all_data, cross_visitation_data, on=['bank', 'date'], how='left')
all_data = pd.merge(all_data, rank_df, on=['bank', 'date'], how='left')
all_data = pd.merge(all_data, incentive_increases, on=['bank', 'date'], how='left')
all_data = pd.merge(all_data, apr_increases, on=['bank', 'date'], how='left')
all_data = pd.merge(all_data, news_data, on=['bank', 'date'], how='left')

all_data.head()


Unnamed: 0,bank,date,web_traffic,number_of_employee,assets_under_management,number_of_branches,age,has_stores,founded_before_2000,cross_visitation,rank,category_best_salary_accounts_helmycash,category_best_savings_accounts_helpmycash,incentive,apr,mentions
0,abanca,2021-12-01,15202.983835,5946,72148000000,690,13,1,0,0.077994,4.0,False,False,300€,0%,1
1,abanca,2021-12-01,15202.983835,5946,72148000000,690,13,1,0,0.077994,4.0,True,False,300€,0%,1
2,abanca,2021-12-01,15202.983835,5946,72148000000,690,13,1,0,0.077994,,False,True,300€,0%,1
3,abanca,2022-01-01,42183.913207,5946,72148000000,690,13,1,0,0.0625,4.0,False,False,300€,0%,1
4,abanca,2022-01-01,42183.913207,5946,72148000000,690,13,1,0,0.0625,4.0,True,False,300€,0%,1


In [None]:
# ffill and bfill these columns 'category', 'rank'
all_data['rank'] = all_data['rank'].ffill().bfill()
all_data['apr'] = all_data['apr'].ffill().bfill()
all_data['incentive'] = all_data['incentive'].ffill().bfill()

#fill category with mode
all_data['category_best_salary_accounts_helmycash'] = all_data['category_best_salary_accounts_helmycash'].fillna(all_data['category_best_salary_accounts_helmycash'].mode()[0])
all_data['category_best_savings_accounts_helpmycash'] = all_data['category_best_savings_accounts_helpmycash'].fillna(all_data['category_best_savings_accounts_helpmycash'].mode()[0])

all_data['category_best_salary_accounts_helmycash'] = all_data['category_best_salary_accounts_helmycash'].astype(int)
all_data['category_best_savings_accounts_helpmycash'] = all_data['category_best_savings_accounts_helpmycash'].astype(int)

all_data.head()

Unnamed: 0,bank,date,web_traffic,number_of_employee,assets_under_management,number_of_branches,age,has_stores,founded_before_2000,cross_visitation,rank,category_best_salary_accounts_helmycash,category_best_savings_accounts_helpmycash,incentive,apr,mentions
0,abanca,2021-12-01,15202.983835,5946,72148000000,690,13,1,0,0.077994,4.0,0,0,300€,0%,1
1,abanca,2021-12-01,15202.983835,5946,72148000000,690,13,1,0,0.077994,4.0,1,0,300€,0%,1
2,abanca,2021-12-01,15202.983835,5946,72148000000,690,13,1,0,0.077994,4.0,0,1,300€,0%,1
3,abanca,2022-01-01,42183.913207,5946,72148000000,690,13,1,0,0.0625,4.0,0,0,300€,0%,1
4,abanca,2022-01-01,42183.913207,5946,72148000000,690,13,1,0,0.0625,4.0,1,0,300€,0%,1


In [None]:
all_data.cross_visitation.isna().sum()

0

In [None]:
all_data.web_traffic.isna().sum()

0

In [None]:
# save the data to a csv file

all_data.to_csv('data/regression_clean_data.csv', index=False)