In [None]:
# ATM numbers whose columns are excluded from the analysis (order preserved).
numbers_to_remove = [
    79, 80, 81,
    # columns for which there are NaNs in SARIMA
    1, 4, 15, 16, 17, 18, 26, 28, 29, 40, 42, 47, 54, 59, 67, 71, 77,
]

In [None]:
import pandas as pd

# Load the ATM dataset, then expose its "Values" column under the name "date"
# (later cells compare and parse that column as dates).
data_raw = pd.read_csv("D:/AGH/bankomaty_2022/data/modelling_data/atm_data.csv")
data_raw = data_raw.rename(columns={"Values": "date"})


In [None]:
from utils import ends_with_number

# Drop every column whose name starts with 'ATM_N_'.
cols_to_drop = data_raw.columns[data_raw.columns.str.startswith('ATM_N_')]
df = data_raw.loc[:, ~data_raw.columns.isin(cols_to_drop)]

# Two-digit suffixes ('01', '04', ...) of the ATMs listed in numbers_to_remove.
# Built once instead of once per column.
suffixes_to_remove = ['{:02d}'.format(num) for num in numbers_to_remove]
columns_to_keep = [
    column for column in df.columns
    if not ends_with_number(column, suffixes_to_remove)
]

# Remove the columns from the data frame.
# The explicit copy makes `df` independent of `data_raw`: later cells rename its
# columns and add new ones, which on a plain slice can raise SettingWithCopyWarning.
df = df[columns_to_keep].copy()

In [None]:
# Preview the filtered frame; the first rows are enough for a visual sanity check.
df.head()

In [None]:
import pandas as pd

def _format_statistic_label(label):
    """Turn a row label into a display label.

    Underscores become spaces and all-lowercase words are capitalised. Words that
    already contain an uppercase letter (e.g. 'ATM', 'ATMs') are left untouched,
    because plain title-casing would turn 'ATM 20' into 'Atm 20'.
    """
    words = str(label).replace('_', ' ').split(' ')
    return ' '.join(
        word if any(ch.isupper() for ch in word) else word.capitalize()
        for word in words
    )


# Replace 'ATM_W_XX' with 'ATM XX' in column names
new_column_names = [col.replace('ATM_W_', 'ATM ') for col in df.columns]
df.columns = new_column_names

# Every column except the date and the derived total is treated as an ATM series.
# Selecting by name (rather than by position) keeps this cell re-runnable: on a
# second run 'total_withdrawals' already exists and must not be summed into itself.
atm_columns = [col for col in df.columns if col not in ('date', 'total_withdrawals')]

# Calculate summary statistics for each ATM column
summary_statistics = df[atm_columns].describe().transpose()

# Calculate aggregated statistics for all ATMs
df['total_withdrawals'] = df[atm_columns].sum(axis=1)
aggregated_statistics = df['total_withdrawals'].describe().to_frame().transpose()

# Rename the index of aggregated_statistics
aggregated_statistics.index = ['All ATMs']

# Combine summary and aggregated statistics
combined_statistics = pd.concat([summary_statistics, aggregated_statistics], axis=0)

# Rename the summary statistics columns (explicit mapping from describe()'s names)
combined_statistics = combined_statistics.rename(columns={
    'count': 'Count',
    'mean': 'Mean',
    'std': 'Standard Deviation',
    'min': 'Minimum',
    '25%': '25th Percentile',
    '50%': 'Median',
    '75%': '75th Percentile',
    'max': 'Maximum',
})

# Format the index (first column of the CSV) without mangling the 'ATM' acronym
combined_statistics.index = combined_statistics.index.map(_format_statistic_label)

# Round the numeric values and remove decimals
combined_statistics = combined_statistics.round(0).astype(int)

# Export the combined statistics as a CSV file
combined_statistics.to_csv('D:/AGH/bankomaty_2022/data/EDA_results/atm_summary_statistics.csv')

print("Summary and aggregated statistics exported to 'atm_summary_statistics.csv'")


In [None]:
import pandas as pd

from utils import summary_statistics_summary

# `df` holds the ATM withdrawal data prepared in the cells above.
summary_stats_across_atms = summary_statistics_summary(df)

# Trailing expression: the notebook renders it with its rich display instead of
# the plain-text dump that print() would produce.
summary_stats_across_atms


In [None]:
import pandas as pd

# `df` holds the ATM withdrawal data prepared in the cells above.

# Split dataset into pre-COVID and during-COVID periods.
# The 'date' column may still hold strings at this point, so the comparison is
# done on parsed datetimes rather than relying on lexicographic string order.
# `df` itself is left untouched, so the helper receives the same columns/dtypes.
covid_start = pd.Timestamp('2020-03-01')
parsed_dates = pd.to_datetime(df['date'])
pre_covid_data = df[parsed_dates < covid_start]
during_covid_data = df[parsed_dates >= covid_start]

# Apply the summary_statistics_summary function
pre_covid_summary = summary_statistics_summary(pre_covid_data)
during_covid_summary = summary_statistics_summary(during_covid_data)

# Save the results to CSV files
pre_covid_summary.to_csv('D:/AGH/bankomaty_2022/data/EDA_results/pre_covid_summary.csv', index=True)
during_covid_summary.to_csv('D:/AGH/bankomaty_2022/data/EDA_results/during_covid_summary.csv', index=True)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Expects `df` to hold 'date', the 'ATM XX' columns and 'total_withdrawals'.
# Make sure the dates are real datetimes before comparing them.
df['date'] = pd.to_datetime(df['date'])

# First day counted as COVID; everything earlier is pre-COVID
# (i.e. the pre-COVID period is assumed to end on February 29th, 2020).
pre_covid_date = pd.to_datetime("2020-03-01")
pre_covid_data = df.loc[df['date'].lt(pre_covid_date)]
covid_data = df.loc[df['date'].ge(pre_covid_date)]

# ATMs picked for the per-ATM trend plots
selected_atms = ['ATM 20', 'ATM 36', 'ATM 46', 'ATM 75']

def plot_atm_withdrawals(pre_covid_data, covid_data, atm_id):
    """Show one figure with the withdrawals of a single ATM, split by period.

    Args:
        pre_covid_data: table with a 'date' column and an `atm_id` column,
            drawn in blue and labelled 'Pre-COVID'.
        covid_data: table with the same columns, drawn in red and labelled 'COVID'.
        atm_id: name of the column to plot; also used in the figure title.
    """
    plt.figure(figsize=(12, 6))
    periods = (
        (pre_covid_data, 'Pre-COVID', 'blue'),
        (covid_data, 'COVID', 'red'),
    )
    for frame, period_label, line_color in periods:
        plt.plot(frame['date'], frame[atm_id], label=period_label, color=line_color)
    plt.xlabel('Date')
    plt.ylabel('Withdrawal Amount')
    plt.title(f'Cash Withdrawal Trends for {atm_id}')
    plt.legend()
    plt.show()

# Plot the selected ATMs: one call, and therefore one separate figure, per ATM.
for atm_id in selected_atms:
    plot_atm_withdrawals(pre_covid_data, covid_data, atm_id)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Expects `df` to hold 'date', the 'ATM XX' columns and 'total_withdrawals'.
# The date column is converted to datetime so it can be compared with the cutoff.
df['date'] = pd.to_datetime(df['date'])

# Cutoff between the two periods: the pre-COVID period is assumed to end on
# February 29th, 2020, so this date is the first one counted as COVID.
pre_covid_date = pd.to_datetime("2020-03-01")
pre_covid_data = df.loc[df['date'].lt(pre_covid_date)]
covid_data = df.loc[df['date'].ge(pre_covid_date)]

# ATMs shown in the combined figure
selected_atms = ['ATM 20', 'ATM 36', 'ATM 46', 'ATM 75']

def plot_atm_withdrawals(ax, pre_covid_data, covid_data, atm_id):
    """Draw the withdrawals of one ATM on `ax`, split by period.

    Each period is drawn as a thin line plus hollow markers on the data points:
    blue for `pre_covid_data` (legend label 'Pre-COVID') and red for
    `covid_data` (legend label 'COVID').

    Args:
        ax: axes-like object to draw on.
        pre_covid_data: table with a 'date' column and an `atm_id` column.
        covid_data: table with the same columns for the COVID period.
        atm_id: name of the column to plot; also used in the subplot title.
    """
    periods = (
        (pre_covid_data, 'Pre-COVID', 'blue'),
        (covid_data, 'COVID', 'red'),
    )
    # Both lines are added first, then both marker layers.
    for frame, period_label, colour in periods:
        ax.plot(frame['date'], frame[atm_id], label=period_label, color=colour, linewidth=0.6)
    for frame, _, colour in periods:
        ax.scatter(frame['date'], frame[atm_id], color=colour, s=10, facecolors='none')
    ax.set_title(f'Cash Withdrawal Trends for {atm_id}')
    ax.legend()

# Plot the selected ATMs on a 2x2 grid with shared x and y axes
fig, axes_grid = plt.subplots(2, 2, figsize=(16, 12), sharex=True, sharey=True)
axes = axes_grid.flatten()

for i, atm_id in enumerate(selected_atms):
    plot_atm_withdrawals(axes[i], pre_covid_data, covid_data, atm_id)

# Set common y-axis limit.
# DataFrame.max().max() / .min().min() skip NaN, whereas Python's max()/min()
# over a Series gives an order-dependent result when a NaN is present.
selected_values = df[selected_atms]
max_withdrawal = selected_values.max().max()
min_withdrawal = selected_values.min().min()
for ax in axes:
    ax.set_ylim(min_withdrawal, max_withdrawal)

# Label the outer edge of the grid: x labels on the bottom row, y labels on the
# left column. plt.xlabel()/plt.ylabel() act on the current axes only, which
# here is the last subplot created, so the other panels would stay unlabelled.
for ax in axes_grid[-1, :]:
    ax.set_xlabel('Date')
for ax in axes_grid[:, 0]:
    ax.set_ylabel('Withdrawal Amount')

plt.tight_layout()
plt.savefig("D:/AGH/bankomaty_2022/pics/combined_atm_withdrawal_trends_same_scale_thinner_lines_datapoints.png")
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Remove the 'total_withdrawals' column so only per-ATM series are melted.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['total_withdrawals'], errors='ignore')

# Convert the 'date' column to datetime
df['date'] = pd.to_datetime(df['date'])

# Reshape data into long format: one row per (date, ATM) pair
df_long = df.melt(id_vars='date', var_name='ATM', value_name='withdrawals')

# Label each row as pre-COVID or during COVID.
# 2020-03-01 is assumed to be the start of the COVID period.
cutoff_date = pd.to_datetime('2020-03-01')
# Vectorised comparison instead of a per-row lambda; the labels are identical.
is_pre_covid = df_long['date'] < cutoff_date
df_long['period'] = is_pre_covid.map({True: 'pre-COVID', False: 'during COVID'})

# Create the boxplot
plt.figure(figsize=(20, 10))
sns.boxplot(x='ATM', y='withdrawals', hue='period', data=df_long)
plt.xticks(rotation=90)
plt.title('Cash Withdrawal Trends for ATMs: Pre-COVID vs During COVID')

# Save the plot to the specified folder
output_path = 'D:/AGH/bankomaty_2022/pics/boxplot_cash_withdrawals.png'
plt.savefig(output_path, dpi=300, bbox_inches='tight')

# Show the plot
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing the data
df['date'] = pd.to_datetime(df['date'])

# Sum the ATM columns selected by name, not by position. The positional slice
# df.iloc[:, 1:-1] dropped the last ATM whenever 'total_withdrawals' was not the
# last column (it is removed by the boxplot cell), and on a re-run it would also
# have summed the derived calendar columns created below.
non_atm_columns = {
    'date', 'total_withdrawals', 'is_covid_period',
    'year', 'month', 'week', 'day_of_month', 'day_of_week',
}
atm_columns = [col for col in df.columns if col not in non_atm_columns]
df['total_withdrawals'] = df[atm_columns].sum(axis=1)

# Add a column to indicate if the date is within the COVID period (1) or not (0).
# Assuming the COVID period starts from 2020-03-01.
df['is_covid_period'] = (df['date'] >= pd.Timestamp('2020-03-01')).astype('int64')

# Extracting seasonal components
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['week'] = df['date'].dt.isocalendar().week
df['day_of_month'] = df['date'].dt.day
df['day_of_week'] = df['date'].dt.dayofweek

# Function to update the legend labels
def update_legend_labels(ax):
    """Replace the 0/1 hue labels in the legend of `ax` with period names.

    Legend entries reading '0' become 'Pre-COVID' and entries reading '1' become
    'During COVID', so a legend containing only one of the two periods is still
    labelled correctly. If the entries carry other texts, the names are assigned
    by position (first entry 'Pre-COVID', second 'During COVID'). The legend
    title is set to 'Period'. Nothing happens when `ax` has no legend.
    """
    legend = ax.get_legend()
    if legend is None:
        return

    label_by_flag = {'0': 'Pre-COVID', '1': 'During COVID'}
    texts = list(legend.texts)
    if all(text.get_text() in label_by_flag for text in texts):
        for text in texts:
            text.set_text(label_by_flag[text.get_text()])
    else:
        # Unknown labels: fall back to positional naming.
        for text, label in zip(texts, ['Pre-COVID', 'During COVID']):
            text.set_text(label)
    legend.set_title('Period')

def _plot_seasonality(data, column, title, xlabel, output_path,
                      figsize=(10, 6), order=None, tick_labels=None):
    """Draw, save and show one boxplot of total withdrawals grouped by `column`.

    Boxes are split by 'is_covid_period' and the legend is relabelled through
    update_legend_labels.

    Args:
        data: frame with `column`, 'total_withdrawals' and 'is_covid_period'.
        column: name of the column placed on the x axis.
        title: axes title.
        xlabel: x-axis label.
        output_path: file the figure is saved to.
        figsize: figure size passed to plt.subplots.
        order: explicit category order for the x axis (None = seaborn default).
        tick_labels: replacement x tick labels; should be given together with
            `order` so the number and position of the labels are guaranteed.
    """
    fig, ax = plt.subplots(figsize=figsize)
    sns.boxplot(x=column, y='total_withdrawals', hue='is_covid_period',
                data=data, order=order, ax=ax)
    ax.set_title(title)
    ax.set_ylabel('Total Withdrawals')
    ax.set_xlabel(xlabel)
    if tick_labels is not None:
        ax.set_xticklabels(tick_labels)
    update_legend_labels(ax)
    plt.tight_layout()
    plt.savefig(output_path)
    plt.show()


# Yearly seasonality
_plot_seasonality(
    df, 'year',
    title='Yearly Seasonality in ATM Cash Withdrawals',
    xlabel='Year',
    output_path='D:\\AGH\\bankomaty_2022\\pics\\yearly_seasonality.png',
)

# Monthly seasonality.
# The x axis is pinned to months 1..12 so the month names always line up with
# the boxes, even if some month is missing from the data.
_plot_seasonality(
    df, 'month',
    title='Monthly Seasonality in ATM Cash Withdrawals',
    xlabel='Month',
    output_path='D:\\AGH\\bankomaty_2022\\pics\\monthly_seasonality.png',
    order=list(range(1, 13)),
    tick_labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
)

# Day of the month seasonality
_plot_seasonality(
    df, 'day_of_month',
    title='Day of the Month Seasonality in ATM Cash Withdrawals',
    xlabel='Day of the Month',
    output_path='D:\\AGH\\bankomaty_2022\\pics\\day_of_month_seasonality.png',
    figsize=(12, 6),
)

# Weekly seasonality.
# dt.dayofweek numbers the days 0 (Monday) .. 6 (Sunday); pinning that order
# keeps the abbreviated weekday names aligned with the boxes.
weekday_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
_plot_seasonality(
    df, 'day_of_week',
    title='Weekly Seasonality in ATM Cash Withdrawals',
    xlabel='Day of Week',
    output_path='D:\\AGH\\bankomaty_2022\\pics\\weekly_seasonality.png',
    order=list(range(7)),
    tick_labels=weekday_labels,
)


In [None]:
# Mean total withdrawals per (year, period), in long form.
yearly_seasonality = (
    df.groupby(['year', 'is_covid_period'])['total_withdrawals']
    .mean()
    .reset_index()
)
# Replace the 0/1 flag with readable period names.
yearly_seasonality['is_covid_period'] = yearly_seasonality['is_covid_period'].map(
    {0: 'Pre-COVID', 1: 'During COVID'}
)
# Displayed result: one row per year, one column per period.
yearly_seasonality.pivot_table(
    index='year',
    columns='is_covid_period',
    values='total_withdrawals',
)


In [None]:
# Mean total withdrawals per month, one column per period.
monthly_seasonality = df.pivot_table(values='total_withdrawals', index='month', columns='is_covid_period', aggfunc='mean')
# Rename by flag value rather than by position, so a frame that contains only one
# period is labelled correctly instead of failing on a length mismatch.
monthly_seasonality = (
    monthly_seasonality
    .rename(columns={0: 'Pre-COVID', 1: 'During COVID'})
    .rename_axis(columns=None)
)
monthly_seasonality


In [None]:
# Mean total withdrawals per day of the month, one column per period.
day_of_month_seasonality = df.groupby(['is_covid_period', 'day_of_month'])['total_withdrawals'].mean().unstack(level=0)
# Rename by flag value rather than by position, so a frame that contains only one
# period is labelled correctly instead of failing on a length mismatch.
day_of_month_seasonality = (
    day_of_month_seasonality
    .rename(columns={0: 'Pre-COVID', 1: 'During COVID'})
    .rename_axis(columns=None)
)
day_of_month_seasonality


In [None]:
# Mean total withdrawals per day of the week, one column per period.
# Stored under its own name: the original reused `day_of_month_seasonality`,
# which silently overwrote the day-of-month table.
day_of_week_seasonality = df.groupby(['is_covid_period', 'day_of_week'])['total_withdrawals'].mean().unstack(level=0)
# Rename by flag value rather than by position, so a frame that contains only one
# period is labelled correctly instead of failing on a length mismatch.
day_of_week_seasonality = (
    day_of_week_seasonality
    .rename(columns={0: 'Pre-COVID', 1: 'During COVID'})
    .rename_axis(columns=None)
)
day_of_week_seasonality

In [None]:
# Inspect the column index of `df` as left by the cells above.
df.columns
