# MongoDB Data Transformations and Enrichments

This notebook demonstrates various data transformation and enrichment techniques in MongoDB.

In [1]:
import sys
!{sys.executable} -m pip install pandas pymongo --quiet

import pandas as pd
from pymongo import MongoClient
from datetime import datetime
import json
import time
import warnings
warnings.filterwarnings('ignore')

def print_mongo(obj):
    """Write a MongoDB result to stdout as 2-space-indented JSON.

    Values the json module cannot encode on its own are rendered
    through str() via the ``default`` hook.
    """
    rendered = json.dumps(obj, default=str, indent=2)
    print(rendered)

# Connection setup.
# The URI can be supplied through the MONGODB_URI environment variable so that
# real credentials do not have to live in the notebook; the literal below is
# only the fallback default.
import os

MONGODB_URI = os.environ.get(
    "MONGODB_URI",
    'mongodb://admin:admin@router1:27017/businessdb?authSource=admin',
)
client = MongoClient(MONGODB_URI)

# MongoClient connects lazily, so constructing it proves nothing. Issue a
# cheap ping so this cell fails here (pymongo raises) if the server is
# unreachable or authentication is rejected, instead of in a later cell.
client.admin.command('ping')

db = client.businessdb
print("Successfully connected to MongoDB")

## 1. Field Derivation
Add computed fields to documents.

In [2]:
# Add full name and subscription age for customers.
#
# Notes:
# - $concat evaluates to null when any operand is null or missing, so
#   customers lacking firstName or lastName get fullName = null.
# - $$NOW is only available on MongoDB 4.2 and later.
# - NOTE(review): $subtract assumes subscriptionDate is stored as a BSON date
#   -- confirm against the collection.

# Milliseconds in a 365-day year (leap days are ignored, so the age is approximate).
MS_PER_YEAR = 365 * 24 * 60 * 60 * 1000

pipeline = [
    {
        "$addFields": {
            "fullName": {
                "$concat": ["$firstName", " ", "$lastName"]
            },
            "subscriptionAgeInYears": {
                "$floor": {
                    "$divide": [
                        # date - date yields the difference in milliseconds
                        {"$subtract": ["$$NOW", "$subscriptionDate"]},
                        MS_PER_YEAR
                    ]
                }
            }
        }
    }
]

# Only two documents are shown, so let the server stop after two instead of
# shipping the whole collection to the notebook. `pipeline` itself is left as
# the pure transformation.
results = list(db.customers.aggregate(pipeline + [{"$limit": 2}]))
print("Sample transformed customer records:")
for doc in results:
    print_mongo(doc)

## 2. Data Normalization
Standardize field formats

In [3]:
# Normalize organization data: upper/lower-case the text fields and bucket
# organizations by head count.

# BSON type names that count as "a number" for the size bucketing below.
NUMERIC_BSON_TYPES = ["int", "long", "double", "decimal"]

# True when numberOfEmployees is present and numeric. $type returns "missing"
# for absent fields and "null" for explicit nulls.
employees_is_numeric = {
    "$in": [{"$type": "$numberOfEmployees"}, NUMERIC_BSON_TYPES]
}

pipeline = [
    {
        "$project": {
            "_id": 1,
            "organizationId": 1,
            "name": {"$toUpper": "$name"},
            "website": {"$toLower": "$website"},
            "country": {"$toUpper": "$country"},
            "employeeCategory": {
                "$switch": {
                    "branches": [
                        # Guard first: aggregation comparisons follow BSON
                        # type order, where null/missing sorts below every
                        # number and strings sort above. Without this branch a
                        # missing head count is classified "SMALL" and a
                        # string one "LARGE", and the default never fires.
                        {"case": {"$not": [employees_is_numeric]}, "then": "UNKNOWN"},
                        {"case": {"$lt": ["$numberOfEmployees", 100]}, "then": "SMALL"},
                        {"case": {"$lt": ["$numberOfEmployees", 1000]}, "then": "MEDIUM"},
                        {"case": {"$gte": ["$numberOfEmployees", 1000]}, "then": "LARGE"}
                    ],
                    # Still required: $switch errors if no branch matches and
                    # no default is given (e.g. for NaN).
                    "default": "UNKNOWN"
                }
            }
        }
    }
]

# Only two documents are shown, so limit on the server rather than fetching
# every organization. `pipeline` itself is left as the pure transformation.
results = list(db.organizations.aggregate(pipeline + [{"$limit": 2}]))
print("Normalized organization records:")
for doc in results:
    print_mongo(doc)

## 3. Data Enrichment
Combine data from multiple collections

In [4]:
# Enrich customer data with company information by joining customers.company
# against organizations.name.
pipeline = [
    {
        "$lookup": {
            "from": "organizations",
            "localField": "company",
            "foreignField": "name",
            "as": "companyInfo"
        }
    },
    {
        # Flatten the joined array. preserveNullAndEmptyArrays keeps customers
        # whose company has no matching organization (left-outer-join
        # behaviour); a customer matching several organizations yields one
        # output document per match.
        "$unwind": {
            "path": "$companyInfo",
            "preserveNullAndEmptyArrays": True
        }
    },
    {
        "$project": {
            # $concat evaluates to null if either name part is null/missing
            "fullName": {"$concat": ["$firstName", " ", "$lastName"]},
            "email": 1,
            "companyName": "$company",
            # These two are omitted from the output for unmatched customers
            "companyIndustry": "$companyInfo.industry",
            "companySize": "$companyInfo.numberOfEmployees"
        }
    }
]

# Only two documents are shown. With a trailing $limit the server can stop the
# join after two results instead of looking up every customer and sending
# the whole joined set to the notebook. `pipeline` itself is left unchanged.
results = list(db.customers.aggregate(pipeline + [{"$limit": 2}]))
print("Enriched customer records:")
for doc in results:
    print_mongo(doc)

## 4. Historical Snapshots
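Create a point-in-time snapshot of a collection. Note that `$out` replaces the target collection on every run, so write to a differently named collection if earlier snapshots must be kept.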

In [5]:
# Create a snapshot of organizations.
from datetime import timezone

# Use an explicit UTC timestamp. PyMongo stores naive datetimes as if they
# were already UTC, so a naive local datetime.now() would be persisted shifted
# by the machine's UTC offset.
snapshot_time = datetime.now(timezone.utc)

pipeline = [
    {
        # Stamp every copied document with when/how the snapshot was taken
        "$addFields": {
            "snapshotMetadata": {
                "timestamp": snapshot_time,
                "version": "1.0"
            }
        }
    },
    # $out replaces organizations_snapshot wholesale on every run, so only the
    # most recent snapshot is retained.
    {"$out": "organizations_snapshot"}
]

# The pipeline ends in $out, so the returned cursor yields no documents
db.organizations.aggregate(pipeline)

# Verify snapshot
snapshot_count = db.organizations_snapshot.count_documents({})
print(f"Created snapshot with {snapshot_count} records at {snapshot_time}")

# Show sample
sample = db.organizations_snapshot.find_one()
print("\nSample snapshot record:")
print_mongo(sample)