# MongoDB Data Transformations and Enrichments

This notebook demonstrates various data transformation and enrichment techniques in MongoDB.

In [1]:
import sys
!{sys.executable} -m pip install pandas pymongo --quiet

import json
import os
import time
import warnings
from datetime import datetime, timezone

import pandas as pd
from pymongo import MongoClient
warnings.filterwarnings('ignore')

def print_mongo(obj):
    """Write a MongoDB result to stdout as 2-space-indented JSON.

    Any value the JSON encoder cannot serialise on its own (for example the
    ObjectId and datetime values found in result documents) is converted
    with ``str`` first.
    """
    rendered = json.dumps(obj, default=str, indent=2)
    print(rendered)

# Allow the connection string to be supplied through the MONGO_URI environment
# variable so real credentials need not be committed with the notebook; the
# fallback is the URI of the local demo cluster.
MONGO_URI = os.environ.get(
    "MONGO_URI",
    "mongodb://admin:admin@router1:27017/businessdb?authSource=admin",
)

client = MongoClient(MONGO_URI)
db = client.businessdb

# MongoClient does not contact the server when it is constructed, so issue a
# cheap round-trip here; an unreachable server then fails in this cell instead
# of in the first query further down, and the message below is truthful.
client.admin.command("ping")
print("Successfully connected to MongoDB")

Successfully connected to MongoDB


## 1. Field Derivation
Add a 'fullName' field by concatenating 'firstName' and 'lastName', and calculate 'subscriptionAgeInYears' as the number of whole years since each customer's subscription date.

In [3]:
# True when this year's subscription anniversary has not been reached yet.
# Comparing zero-padded "MMDD" strings orders month/day pairs correctly.
anniversary_pending = {
    "$lt": [
        {"$dateToString": {"date": "$$NOW", "format": "%m%d"}},
        {"$dateToString": {"date": "$subscriptionDate", "format": "%m%d"}},
    ]
}

pipeline = [
    {
        "$addFields": {
            "fullName": {
                "$concat": ["$firstName", " ", "$lastName"]
            },
            # Whole calendar years since the subscription date: the difference
            # in calendar years, minus one while the anniversary is pending.
            # Dividing elapsed milliseconds by a fixed 365-day year ignores
            # leap days and rolls over to the next year slightly before the
            # real anniversary. The value is an integer; it is null when
            # subscriptionDate is null or missing.
            "subscriptionAgeInYears": {
                "$subtract": [
                    {
                        "$subtract": [
                            {"$year": "$$NOW"},
                            {"$year": "$subscriptionDate"},
                        ]
                    },
                    {"$cond": [anniversary_pending, 1, 0]},
                ]
            },
        }
    }
]

results = list(db.customers.aggregate(pipeline))
print("Sample transformed customer records:")
for doc in results[:2]:
    print_mongo(doc)

Sample transformed customer records:
{
  "_id": "678ae70138e354c7775126ef",
  "customerId": "1Ef7b82A4CAAD10",
  "firstName": "Preston",
  "lastName": "Lozano",
  "company": "Vega-Gentry",
  "city": "East Jimmychester",
  "country": "Djibouti",
  "phone1": "5153435776",
  "phone2": "686-620-1820x944",
  "email": "vmata@colon.com",
  "subscriptionDate": "2021-04-23 00:00:00",
  "website": "http://www.hobbs.com/",
  "fullName": "Preston Lozano",
  "subscriptionAgeInYears": 3.0
}
{
  "_id": "678ae70138e354c7775126f0",
  "customerId": "6F94879bDAfE5a6",
  "firstName": "Roy",
  "lastName": "Berry",
  "company": "Murillo-Perry",
  "city": "Isabelborough",
  "country": "Antigua and Barbuda",
  "phone1": "+1-539-402-0259",
  "phone2": "(496)978-3969x58947",
  "email": "beckycarr@hogan.com",
  "subscriptionDate": "2020-03-25 00:00:00",
  "website": "http://www.lawrence.com/",
  "fullName": "Roy Berry",
  "subscriptionAgeInYears": 4.0
}


## 2. Data Normalization
Normalize organization records by converting 'name' and 'country' fields to uppercase, 'website' to lowercase, and categorizing organizations into SMALL, MEDIUM, or LARGE based on the number of employees.

In [4]:
# BSON type names that count as a usable employee count.
numeric_bson_types = ["int", "long", "double", "decimal"]

pipeline = [
    {
        "$project": {
            "_id": 1,
            "organizationId": 1,
            "name": {"$toUpper": "$name"},
            "website": {"$toLower": "$website"},
            "country": {"$toUpper": "$country"},
            "employeeCategory": {
                "$switch": {
                    "branches": [
                        # Guard first: in BSON comparison order a missing or
                        # null value sorts below every number (and a string
                        # above), so without this check such records would be
                        # labelled SMALL (or LARGE) and "UNKNOWN" could never
                        # be produced.
                        {
                            "case": {
                                "$not": [
                                    {
                                        "$in": [
                                            {"$type": "$numberOfEmployees"},
                                            numeric_bson_types,
                                        ]
                                    }
                                ]
                            },
                            "then": "UNKNOWN",
                        },
                        {"case": {"$lt": ["$numberOfEmployees", 100]}, "then": "SMALL"},
                        {"case": {"$lt": ["$numberOfEmployees", 1000]}, "then": "MEDIUM"},
                        {"case": {"$gte": ["$numberOfEmployees", 1000]}, "then": "LARGE"},
                    ],
                    "default": "UNKNOWN"
                }
            }
        }
    }
]

results = list(db.organizations.aggregate(pipeline))
print("Normalized organization records:")
for doc in results[:2]:
    print_mongo(doc)

Normalized organization records:
{
  "_id": "678ae70038e354c777512628",
  "organizationId": "0bFED1ADAE4bcC1",
  "name": "HESTER LTD",
  "website": "http://sullivan-reed.com/",
  "country": "CHINA",
  "employeeCategory": "LARGE"
}
{
  "_id": "678ae70038e354c777512629",
  "organizationId": "2bFC1Be8a4ce42f",
  "name": "HOLDER-SELLERS",
  "website": "https://becker.com/",
  "country": "TURKMENISTAN",
  "employeeCategory": "MEDIUM"
}


## 3. Data Enrichment
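Join customer data with organization details from the 'organizations' collection by matching each customer's 'company' field to an organization's 'name', and add fields for full name, company industry, and company size. Customers with no matching organization are kept; their company industry and size fields are simply absent from the result, as in the sample output below.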

In [5]:
# Stage 1: attach every organization whose name equals the customer's company;
# the matches are collected in the "companyInfo" array.
join_organizations = {
    "$lookup": {
        "from": "organizations",
        "localField": "company",
        "foreignField": "name",
        "as": "companyInfo",
    }
}

# Stage 2: flatten "companyInfo" into a single sub-document while keeping
# customers for which the lookup found nothing.
flatten_company_info = {
    "$unwind": {
        "path": "$companyInfo",
        "preserveNullAndEmptyArrays": True,
    }
}

# Stage 3: reduce each record to the fields shown in the enriched view.
shape_enriched_record = {
    "$project": {
        "fullName": {"$concat": ["$firstName", " ", "$lastName"]},
        "email": 1,
        "companyName": "$company",
        "companyIndustry": "$companyInfo.industry",
        "companySize": "$companyInfo.numberOfEmployees",
    }
}

pipeline = [join_organizations, flatten_company_info, shape_enriched_record]

results = list(db.customers.aggregate(pipeline))
print("Enriched customer records:")
for doc in results[:2]:
    print_mongo(doc)

Enriched customer records:
{
  "_id": "678ae70138e354c7775126ef",
  "email": "vmata@colon.com",
  "fullName": "Preston Lozano",
  "companyName": "Vega-Gentry"
}
{
  "_id": "678ae70138e354c7775126f0",
  "email": "beckycarr@hogan.com",
  "fullName": "Roy Berry",
  "companyName": "Murillo-Perry"
}


## 4. Historical Snapshots
Create a snapshot of the 'organizations' collection by copying all documents into 'organizations_snapshot' and adding fields for timestamp and version metadata.

In [5]:
# Use a timezone-aware UTC timestamp. PyMongo stores a naive datetime as if it
# were already UTC, so a naive local time from datetime.now() would be saved
# shifted by the local UTC offset.
snapshot_time = datetime.now(timezone.utc)

pipeline = [
    {
        # Tag every copied document with when and under which version the
        # snapshot was taken.
        "$addFields": {
            "snapshotMetadata": {
                "timestamp": snapshot_time,
                "version": "1.0"
            }
        }
    },
    # Write the result to organizations_snapshot, replacing any earlier
    # contents of that collection.
    {"$out": "organizations_snapshot"}
]

db.organizations.aggregate(pipeline)

snapshot_count = db.organizations_snapshot.count_documents({})
print(f"Created snapshot with {snapshot_count} records at {snapshot_time}")

sample = db.organizations_snapshot.find_one()
print("\nSample snapshot record:")
print_mongo(sample)