In [18]:
# Install Great Expectations (if not already installed)
# %pip install great_expectations

# Import required libraries
import great_expectations as gx
import pandas as pd
import warnings

# Suppress the Great Expectations warning about `result_format` being
# configured at the Validator level.
# NOTE: `message` is interpreted by the warnings module as a regular
# expression matched against the start of the warning text, so the trailing
# `*` is a regex quantifier on the final "l", not a glob wildcard.
warnings.filterwarnings("ignore", message="`result_format` configured at the Validator-level*")

# ------------------------------
# STEP 1: Load datasets
# ------------------------------

# Input locations, relative to the current working directory.
transactions_csv = "./data/transactions.csv"
customers_csv = "./data/sebank_customers_with_accounts.csv"

# Read the transactions and the customer/account data into DataFrames.
df_transactions = pd.read_csv(transactions_csv)
df_customers = pd.read_csv(customers_csv)

# ------------------------------
# STEP 2: Clean the data
# ------------------------------

# Remove exact duplicate rows from both frames.
df_transactions.drop_duplicates(inplace=True)
df_customers.drop_duplicates(inplace=True)

# Convert amount to numeric BEFORE dropping nulls: errors="coerce" turns
# unparseable amounts into NaN, so the conversion has to happen first for the
# dropna below to actually remove those rows (doing it afterwards would
# re-introduce nulls into an already "cleaned" column).
df_transactions["amount"] = pd.to_numeric(df_transactions["amount"], errors="coerce")

# Drop rows that are missing any critical field.
df_transactions.dropna(
    subset=["transaction_id", "timestamp", "amount", "sender_account", "receiver_account"],
    inplace=True,
)
df_customers.dropna(subset=["BankAccount"], inplace=True)

# Keep a string copy of the raw timestamp column; the strftime-format
# expectation is run against this column rather than against "timestamp".
df_transactions["timestamp_str"] = df_transactions["timestamp"].astype(str)

# ------------------------------
# STEP 3: Create Great Expectations context
# ------------------------------

context = gx.get_context()

# Register a pandas data source and a DataFrame asset on it.
data_source = context.data_sources.add_pandas(name="pandas")
data_asset = data_source.add_dataframe_asset(name="transactions_data")

# Build a batch from the cleaned transactions frame.
batch_definition = data_asset.add_batch_definition_whole_dataframe(name="batch_def")
batch_parameters = {"dataframe": df_transactions}
batch = batch_definition.get_batch(batch_parameters=batch_parameters)

# Empty suite that the validator below is created with.
suite = gx.core.expectation_suite.ExpectationSuite(name="transactions_suite")
validator = context.get_validator(batch=batch, expectation_suite=suite)

# ------------------------------
# STEP 4: Add expectations
# ------------------------------

# Columns that must be present in the batch.
required_columns = (
    "transaction_id",
    "timestamp",
    "amount",
    "currency",
    "sender_account",
    "receiver_account",
    "transaction_type",
)
for column_name in required_columns:
    validator.expect_column_to_exist(column_name)

# Columns that must not contain nulls.
non_nullable_columns = (
    "transaction_id",
    "timestamp",
    "amount",
    "sender_account",
    "receiver_account",
)
for column_name in non_nullable_columns:
    validator.expect_column_values_to_not_be_null(column_name)

# Transaction IDs must be unique.
validator.expect_column_values_to_be_unique("transaction_id")

# Amounts must lie within [0.01, 100000].
validator.expect_column_values_to_be_between("amount", min_value=0.01, max_value=100000)

# The timestamp format is checked on the string copy of the column, not on
# "timestamp" itself.
validator.expect_column_values_to_match_strftime_format("timestamp_str", "%Y-%m-%d %H:%M:%S")

# Convert the original timestamp column to datetime; unparseable values are
# coerced to NaT.
# NOTE(review): this mutates the same DataFrame the batch was built from,
# while expectations are still being declared. If the batch holds a reference
# to this frame (rather than a copy), the later validate() call will see the
# converted values in "timestamp" - confirm this is intended.
df_transactions["timestamp"] = pd.to_datetime(df_transactions["timestamp"], errors="coerce")

# Only SEK is accepted as currency.
valid_currencies = ["SEK"]
validator.expect_column_values_to_be_in_set("currency", valid_currencies)

# Transaction type must be one of the known directions.
valid_types = ["outgoing", "incoming"]
validator.expect_column_values_to_be_in_set("transaction_type", valid_types)

# ------------------------------
# STEP 5: Cross-validate accounts / Kontrollera konton mot kundfil (disabled: all code in this step is commented out)
# ------------------------------

# valid_accounts = set(df_customers["BankAccount"].astype(str))

# # Check sender and receiver accounts against known customer accounts
# invalid_sender = df_transactions[~df_transactions["sender_account"].astype(str).isin(valid_accounts)]
# invalid_receiver = df_transactions[~df_transactions["receiver_account"].astype(str).isin(valid_accounts)]

# if not invalid_sender.empty:
#     print("⚠️ Warning: sender_account(s) not found in customer data!")
#     print("⚠️ Varning: Avsändarkonton finns inte i kunddata!")
#     print(invalid_sender[["transaction_id", "sender_account"]])

# if not invalid_receiver.empty:
#     print("⚠️ Warning: receiver_account(s) not found in customer data!")
#     print("⚠️ Varning: Mottagarkonton finns inte i kunddata!")
#     print(invalid_receiver[["transaction_id", "receiver_account"]])

# ------------------------------
# STEP 6: Run validation
# ------------------------------

# Run the validator and keep the returned result object in `results`.
results = validator.validate()
# Print the complete result object (overall success flag plus per-expectation
# results, as seen in the cell output).
print(results)

# ------------------------------
# Optional: Save suite / Spara regeluppsättning
# ------------------------------
# context.save_expectation_suite(expectation_suite=suite)


Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 69.58it/s]
Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 137.23it/s]
Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 137.75it/s]
Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 151.37it/s]
Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 116.58it/s]
Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 137.99it/s]
Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 131.83it/s]
Calculating Metrics: 100%|██████████| 6/6 [00:00<00:00, 179.58it/s]
Calculating Metrics: 100%|██████████| 6/6 [00:00<00:00, 179.27it/s]
Calculating Metrics: 100%|██████████| 6/6 [00:00<00:00, 193.88it/s]
Calculating Metrics: 100%|██████████| 6/6 [00:00<00:00, 182.38it/s]
Calculating Metrics: 100%|██████████| 6/6 [00:00<00:00, 283.17it/s]
Calculating Metrics: 100%|██████████| 8/8 [00:00<00:00, 66.73it/s]
Calculating Metrics: 100%|██████████| 8/8 [00:00<00:00, 83.26it/s]
Calculating Metrics: 100%|██████████| 8/8 [00:01<00

{
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_to_exist",
        "kwargs": {
          "batch_id": "pandas-transactions_data",
          "column": "transaction_type"
        },
        "meta": {}
      },
      "result": {},
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_be_in_set",
        "kwargs": {
          "batch_id": "pandas-transactions_data",
          "column": "transaction_type",
          "value_set": [
            "outgoing",
            "incoming"
          ]
        },
        "meta": {}
      },
      "result": {
        "element_count": 100000,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "partial_unexpected_list": [],
        "missing_count": 0,



