In [None]:
# import the needed libraries
import duckdb # for data processing and querying (SQL based)
import pandas as pd # for data manipulation
import numpy as np # for numerical operations
from matplotlib import pyplot as plt # for data visualization
import seaborn as sns # for enhanced data visualization

### Step 1. Quick overview of the Data
* Examining the data schema (features)
* Display some values in the data

In [None]:
# Overview of the accounts data: load the raw CSV into pandas and preview the first rows
accounts_data = pd.read_csv("../data/raw/DS_Case_Study1_data/accounts.csv")
accounts_data.head()



Unnamed: 0,account_id,customer_id,account_type,status,open_date,close_date
0,1,1,Checking,Active,2021-06-15,
1,2,2,Savings,Active,2023-05-26,
2,3,3,Checking,Active,2023-01-05,
3,4,4,Checking,Active,2021-06-01,
4,5,5,Savings,Active,2022-03-20,


In [None]:
# List the column names of the accounts data to verify feature names and data schema
accounts_data.columns

Index(['account_id', 'customer_id', 'account_type', 'status', 'open_date',
       'close_date'],
      dtype='object')

In [None]:
# Overview of the customers data: load the raw CSV into pandas and preview the first rows
# NOTE(review): no date parsing is requested here, and the previewed cust_creation_date
# values look day-first (DD/MM/YYYY) - confirm the format before doing date arithmetic in pandas
customers_data = pd.read_csv("../data/raw/DS_Case_Study1_data/customers.csv")
customers_data.head()

Unnamed: 0,customer_id,age,income_bracket,region,cust_creation_date,is_active
0,1,52,Medium,East,07/10/2021,True
1,2,42,Low,North,12/01/2021,True
2,3,68,Low,West,28/02/2023,True
3,4,56,High,North,26/11/2022,True
4,5,38,Medium,East,20/09/2022,True


In [9]:
# List the column names of the customers data to verify feature names and data schema
customers_data.columns

Index(['customer_id', 'age', 'income_bracket', 'region', 'cust_creation_date',
       'is_active'],
      dtype='object')

In [6]:
# Overview of the interactions data: load the raw CSV into pandas and preview the first rows
interactions_data = pd.read_csv("../data/raw/DS_Case_Study1_data/interactions.csv")
interactions_data.head()

Unnamed: 0,interaction_id,customer_id,interaction_date,interaction_type
0,1,1038,2025-10-15,Complaint
1,2,1636,2025-10-25,Technical Support
2,3,883,2025-09-01,Product Inquiry
3,4,1121,2025-08-22,Service Inquiry
4,5,317,2025-09-29,Product Inquiry


In [11]:
# List the column names of the interactions data to verify feature names and data schema
interactions_data.columns

Index(['interaction_id', 'customer_id', 'interaction_date',
       'interaction_type'],
      dtype='object')

In [7]:
# Overview of the transactions data: load the raw CSV into pandas and preview the first rows
# Note: transactions are keyed by account_id; the file has no customer_id column
transactions_data = pd.read_csv("../data/raw/DS_Case_Study1_data/transactions.csv")
transactions_data.head()

Unnamed: 0,transaction_id,account_id,transaction_date,transaction_type,amount
0,1,2895,2025-06-15,Debit,52.08
1,2,572,2024-12-29,Credit,353.66
2,3,2853,2024-10-24,Credit,74.89
3,4,827,2025-05-26,Debit,1.94
4,5,3344,2025-07-15,Transfer,76.28


In [12]:
# List the column names of the transactions data to verify feature names and data schema
transactions_data.columns

Index(['transaction_id', 'account_id', 'transaction_date', 'transaction_type',
       'amount'],
      dtype='object')

### Step 2. Converting the data into DuckDB tables for efficient querying and processing



In [15]:
# Create or open a DuckDB database file; `con` is the connection used by every SQL cell below
# NOTE(review): the database file is written under data/raw next to the raw CSV inputs -
# confirm that location is intended
con = duckdb.connect("../data/raw/bank_churn_data.db")

In [16]:
# Load the CSV data into SQL base tables.
# Table name -> source CSV. One entry per table, so adding or renaming a source
# is a one-line change instead of another copy-pasted statement.
csv_tables = {
    "customer": "../data/raw/DS_Case_Study1_data/customers.csv",
    "account": "../data/raw/DS_Case_Study1_data/accounts.csv",
    "transactions": "../data/raw/DS_Case_Study1_data/transactions.csv",
    "interaction": "../data/raw/DS_Case_Study1_data/interactions.csv",
}

for table_name, csv_path in csv_tables.items():
    # CREATE OR REPLACE keeps this cell safe to re-run.
    # The names and paths interpolated here are the fixed literals defined above,
    # not user input, so building the statement with an f-string is safe.
    con.sql(f"""
        CREATE OR REPLACE TABLE {table_name} AS
        SELECT * FROM read_csv_auto('{csv_path}');
    """)


In [19]:
# Show all the tables in the database - verify that the four base tables were created successfully
con.sql("SHOW TABLES")

┌──────────────┐
│     name     │
│   varchar    │
├──────────────┤
│ account      │
│ customer     │
│ interaction  │
│ transactions │
└──────────────┘

The tables have been created successfully. We can now begin to query and analyze the data using SQL.

In [None]:
# Checking the structure of the customer table - schema and data types
con.sql("DESCRIBE customer")


┌────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│    column_name     │ column_type │  null   │   key   │ default │  extra  │
│      varchar       │   varchar   │ varchar │ varchar │ varchar │ varchar │
├────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ customer_id        │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ age                │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ income_bracket     │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ region             │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ cust_creation_date │ DATE        │ YES     │ NULL    │ NULL    │ NULL    │
│ is_active          │ BOOLEAN     │ YES     │ NULL    │ NULL    │ NULL    │
└────────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘

In [22]:
# Preview the first 5 rows of the customer table
con.sql("SELECT * FROM customer LIMIT 5")


┌─────────────┬───────┬────────────────┬─────────┬────────────────────┬───────────┐
│ customer_id │  age  │ income_bracket │ region  │ cust_creation_date │ is_active │
│    int64    │ int64 │    varchar     │ varchar │        date        │  boolean  │
├─────────────┼───────┼────────────────┼─────────┼────────────────────┼───────────┤
│           1 │    52 │ Medium         │ East    │ 2021-10-07         │ true      │
│           2 │    42 │ Low            │ North   │ 2021-01-12         │ true      │
│           3 │    68 │ Low            │ West    │ 2023-02-28         │ true      │
│           4 │    56 │ High           │ North   │ 2022-11-26         │ true      │
│           5 │    38 │ Medium         │ East    │ 2022-09-20         │ true      │
└─────────────┴───────┴────────────────┴─────────┴────────────────────┴───────────┘

In [23]:
# Checking the structure of the account table - schema and data types
con.sql("DESCRIBE account")

┌──────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name  │ column_type │  null   │   key   │ default │  extra  │
│   varchar    │   varchar   │ varchar │ varchar │ varchar │ varchar │
├──────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ account_id   │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ customer_id  │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ account_type │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ status       │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ open_date    │ DATE        │ YES     │ NULL    │ NULL    │ NULL    │
│ close_date   │ DATE        │ YES     │ NULL    │ NULL    │ NULL    │
└──────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘

In [24]:
# Preview the first 5 rows of the account table
con.sql("SELECT * FROM account LIMIT 5")

┌────────────┬─────────────┬──────────────┬─────────┬────────────┬────────────┐
│ account_id │ customer_id │ account_type │ status  │ open_date  │ close_date │
│   int64    │    int64    │   varchar    │ varchar │    date    │    date    │
├────────────┼─────────────┼──────────────┼─────────┼────────────┼────────────┤
│          1 │           1 │ Checking     │ Active  │ 2021-06-15 │ NULL       │
│          2 │           2 │ Savings      │ Active  │ 2023-05-26 │ NULL       │
│          3 │           3 │ Checking     │ Active  │ 2023-01-05 │ NULL       │
│          4 │           4 │ Checking     │ Active  │ 2021-06-01 │ NULL       │
│          5 │           5 │ Savings      │ Active  │ 2022-03-20 │ NULL       │
└────────────┴─────────────┴──────────────┴─────────┴────────────┴────────────┘

In [25]:
# Checking the structure of the transactions table - schema and data types
con.sql("DESCRIBE transactions")

┌──────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│   column_name    │ column_type │  null   │   key   │ default │  extra  │
│     varchar      │   varchar   │ varchar │ varchar │ varchar │ varchar │
├──────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ transaction_id   │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ account_id       │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ transaction_date │ DATE        │ YES     │ NULL    │ NULL    │ NULL    │
│ transaction_type │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ amount           │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
└──────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘

In [None]:
# Preview the first 5 rows of the transactions table
con.sql("SELECT * FROM transactions LIMIT 5")

┌────────────────┬────────────┬──────────────────┬──────────────────┬────────┐
│ transaction_id │ account_id │ transaction_date │ transaction_type │ amount │
│     int64      │   int64    │       date       │     varchar      │ double │
├────────────────┼────────────┼──────────────────┼──────────────────┼────────┤
│              1 │       2895 │ 2025-06-15       │ Debit            │  52.08 │
│              2 │        572 │ 2024-12-29       │ Credit           │ 353.66 │
│              3 │       2853 │ 2024-10-24       │ Credit           │  74.89 │
│              4 │        827 │ 2025-05-26       │ Debit            │   1.94 │
│              5 │       3344 │ 2025-07-15       │ Transfer         │  76.28 │
└────────────────┴────────────┴──────────────────┴──────────────────┴────────┘

In [27]:
# Checking the structure of the interaction table - schema and data types
con.sql("DESCRIBE interaction")

┌──────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│   column_name    │ column_type │  null   │   key   │ default │  extra  │
│     varchar      │   varchar   │ varchar │ varchar │ varchar │ varchar │
├──────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ interaction_id   │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ customer_id      │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ interaction_date │ DATE        │ YES     │ NULL    │ NULL    │ NULL    │
│ interaction_type │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
└──────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘

In [28]:
# Preview the first 5 rows of the interaction table
con.sql("SELECT * FROM interaction LIMIT 5")

┌────────────────┬─────────────┬──────────────────┬───────────────────┐
│ interaction_id │ customer_id │ interaction_date │ interaction_type  │
│     int64      │    int64    │       date       │      varchar      │
├────────────────┼─────────────┼──────────────────┼───────────────────┤
│              1 │        1038 │ 2025-10-15       │ Complaint         │
│              2 │        1636 │ 2025-10-25       │ Technical Support │
│              3 │         883 │ 2025-09-01       │ Product Inquiry   │
│              4 │        1121 │ 2025-08-22       │ Service Inquiry   │
│              5 │         317 │ 2025-09-29       │ Product Inquiry   │
└────────────────┴─────────────┴──────────────────┴───────────────────┘

In [None]:
# Count the rows in each table: one scalar subquery per table, returned as a
# single-row pandas DataFrame via .df()
con.sql("""
SELECT
    (SELECT COUNT(*) FROM customer) AS customer_rows,
    (SELECT COUNT(*) FROM account) AS account_rows,
    (SELECT COUNT(*) FROM transactions) AS transactions_rows,
    (SELECT COUNT(*) FROM interaction) AS interaction_rows
""").df()


Unnamed: 0,customer_rows,account_rows,transactions_rows,interaction_rows
0,2000,4500,100000,8000


### Joining all 4 tables 
(customer, account, transactions, interaction) into a single dataset ready for analysis and feature engineering.

In [None]:
# Merge all the data into a single table for analysis.
#
# - The transactions table has no customer_id column (only account_id), so it
#   is joined through the account table rather than directly to customer.
# - Join keys already present from an earlier table are excluded from the later
#   tables so every column name in the result is unique.
# - LEFT JOINs keep customers that have no account, transaction or interaction.
# - CAUTION: transactions and interactions are independent one-to-many children
#   of a customer, so each customer contributes one row per
#   (transaction, interaction) combination. Aggregate per customer (or
#   de-duplicate on transaction_id / interaction_id) before computing counts
#   or sums from this frame.
merged_data = con.sql("""
    SELECT
        c.*,
        a.* EXCLUDE (customer_id),
        t.* EXCLUDE (account_id),
        i.* EXCLUDE (customer_id)
    FROM customer c
    LEFT JOIN account a ON c.customer_id = a.customer_id
    LEFT JOIN transactions t ON a.account_id = t.account_id
    LEFT JOIN interaction i ON c.customer_id = i.customer_id
""").df()

### We are done with the data schema - let's proceed to the task
#### Part A:
* Define the target variable
* feature engineering

In [35]:
# Summary statistics for the numeric columns of the customers data
customers_data.describe()

Unnamed: 0,customer_id,age
count,2000.0,2000.0
mean,1000.5,45.0845
std,577.494589,14.203811
min,1.0,18.0
25%,500.75,35.0
50%,1000.5,45.0
75%,1500.25,55.0
max,2000.0,80.0


In [36]:
# Column dtypes, non-null counts and memory usage of the customers data
customers_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         2000 non-null   int64 
 1   age                 2000 non-null   int64 
 2   income_bracket      2000 non-null   object
 3   region              2000 non-null   object
 4   cust_creation_date  2000 non-null   object
 5   is_active           2000 non-null   bool  
dtypes: bool(1), int64(2), object(3)
memory usage: 80.2+ KB


In [37]:
# Count missing values per column in the customers data
customers_data.isnull().sum()

customer_id           0
age                   0
income_bracket        0
region                0
cust_creation_date    0
is_active             0
dtype: int64

Customers_data has no missing values

In [38]:
# Column dtypes, non-null counts and memory usage of the accounts data
accounts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   account_id    4500 non-null   int64 
 1   customer_id   4500 non-null   int64 
 2   account_type  4500 non-null   object
 3   status        4500 non-null   object
 4   open_date     4500 non-null   object
 5   close_date    242 non-null    object
dtypes: int64(2), object(4)
memory usage: 211.1+ KB


In [39]:
# Count missing values per column in the accounts data
accounts_data.isnull().sum()

account_id         0
customer_id        0
account_type       0
status             0
open_date          0
close_date      4258
dtype: int64

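* The accounts data has 4258 missing values in the `close_date` column out of 4500 rows.
* This means that about 94.6% of the values in this column are missing (4258 / 4500). A missing `close_date` most likely means the account has simply not been closed yet (the previewed accounts with no `close_date` all have status `Active`), so this is probably not an omission. We can consider dropping this column, or meet with the team to confirm whether the values can be obtained and that this is not an omission.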

In [40]:
# Count missing values per column in the interactions data
interactions_data.isnull().sum()

interaction_id      0
customer_id         0
interaction_date    0
interaction_type    0
dtype: int64

In [41]:
# Column dtypes, non-null counts and memory usage of the interactions data
interactions_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   interaction_id    8000 non-null   int64 
 1   customer_id       8000 non-null   int64 
 2   interaction_date  8000 non-null   object
 3   interaction_type  8000 non-null   object
dtypes: int64(2), object(2)
memory usage: 250.1+ KB


* interactions data has no missing values

In [42]:
# Count missing values per column in the transactions data
transactions_data.isnull().sum()

transaction_id      0
account_id          0
transaction_date    0
transaction_type    0
amount              0
dtype: int64

In [44]:
# Column dtypes, non-null counts and memory usage of the transactions data
transactions_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   transaction_id    100000 non-null  int64  
 1   account_id        100000 non-null  int64  
 2   transaction_date  100000 non-null  object 
 3   transaction_type  100000 non-null  object 
 4   amount            100000 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 3.8+ MB


* transactions data has no missing values