In [6]:
import json
import os
from datetime import datetime
import re

def convert_timestamp_to_mongo_date(obj):
    """
    Return a copy of ``obj`` in which timestamp fields are wrapped as
    ``{"$date": ...}`` documents.

    Dicts and lists are walked recursively; every other value is returned
    as-is. A dict entry is rewritten when ``is_timestamp_field(key, value)``
    says so, using ``format_for_mongodb(value)`` for the date string.
    """
    # Lists: convert each element, preserving order.
    if isinstance(obj, list):
        return [convert_timestamp_to_mongo_date(element) for element in obj]

    # Scalars (and anything that is not a dict) pass through untouched.
    if not isinstance(obj, dict):
        return obj

    def _convert_entry(name, entry):
        # Timestamp entries become {"$date": ...}; anything else is
        # processed recursively so nested structures are covered too.
        if is_timestamp_field(name, entry):
            return {"$date": format_for_mongodb(entry)}
        return convert_timestamp_to_mongo_date(entry)

    return {name: _convert_entry(name, entry) for name, entry in obj.items()}

def _parse_iso_timestamp(timestamp_str):
    """Parse a 'T'-separated ISO 8601 string into a datetime, or return None."""
    # Older Python versions of datetime.fromisoformat() reject a trailing
    # 'Z', so spell it out as an explicit UTC offset first.
    candidate = timestamp_str
    if candidate.endswith('Z'):
        candidate = candidate[:-1] + '+00:00'
    try:
        return datetime.fromisoformat(candidate)
    except ValueError:
        pass

    # Fallbacks for shapes fromisoformat() may not accept on older Pythons
    # (e.g. 1-digit fractions). %f takes 1-6 digits; %z takes 'Z' or +HH:MM.
    for fmt in (
        "%Y-%m-%dT%H:%M:%S.%f%z",
        "%Y-%m-%dT%H:%M:%S%z",
        "%Y-%m-%dT%H:%M:%S.%f",
        "%Y-%m-%dT%H:%M:%S",
    ):
        try:
            return datetime.strptime(timestamp_str, fmt)
        except ValueError:
            continue
    return None


def format_for_mongodb(timestamp_str):
    """
    Convert various timestamp formats to a MongoDB-compatible string.

    The result is ISO 8601 in UTC with millisecond precision, e.g.
    "2019-03-20T20:27:46.000Z".

    Accepted inputs:
      * ISO 8601 with a 'T' separator, optionally with fractional seconds
        and a 'Z' suffix or a numeric UTC offset (-07:00, +05:30, ...).
      * "YYYY-MM-DD HH:MM:SS" and "YYYY/MM/DD HH:MM:SS".

    Timestamps carrying an offset are converted to UTC; timestamps without
    one are assumed to already be in UTC. Fractional seconds are truncated
    to milliseconds.

    Args:
        timestamp_str: The timestamp text to convert.

    Returns:
        The formatted UTC string, or ``timestamp_str`` unchanged (after
        printing a warning) when it cannot be parsed.
    """
    try:
        if 'T' in timestamp_str:
            dt = _parse_iso_timestamp(timestamp_str)
        else:
            dt = None
            for fmt in (
                "%Y-%m-%d %H:%M:%S",      # 2019-03-20 20:27:46
                "%Y/%m/%d %H:%M:%S",      # 2019/03/20 20:27:46
            ):
                try:
                    dt = datetime.strptime(timestamp_str, fmt)
                    break
                except ValueError:
                    continue

        if dt is None:
            print(f"Warning: Could not parse timestamp '{timestamp_str}'")
            return timestamp_str

        # Shift aware datetimes to UTC and drop the tzinfo; utcoffset() is
        # None for naive datetimes, which are taken to be UTC already.
        offset = dt.utcoffset()
        if offset is not None:
            dt = (dt - offset).replace(tzinfo=None)

        # Keep millisecond precision instead of hard-coding ".000".
        millis = dt.microsecond // 1000
        return dt.strftime("%Y-%m-%dT%H:%M:%S") + f".{millis:03d}Z"

    except (AttributeError, TypeError, ValueError, OverflowError) as e:
        # Non-string input or an out-of-range date: hand the value back
        # untouched rather than guessing at a repair.
        print(f"Warning: Could not parse timestamp '{timestamp_str}': {e}")
        return timestamp_str

def is_timestamp_field(key, value):
    """
    Determine if a field contains a timestamp that should be converted.

    A field qualifies only when its value is a string AND its key is exactly
    (case-sensitively) one of the known timestamp field names. Substring
    matching on the key is deliberately not done: it would also catch keys
    such as 'lastModifiedBy', which are not timestamps.

    Args:
        key: The field name.
        value: The field value.

    Returns:
        True if the field should be converted, otherwise False.
    """
    if not isinstance(value, str):
        return False

    # Common timestamp field names (exact matches only)
    timestamp_keywords = (
        'timestamp', 'time', 'date', 'creationDate', 'updated', 'modified',
        'last_modified', 'lastModified', 'createdAt', 'updatedAt', 'registrationDate',
    )

    return key in timestamp_keywords

def process_json_file(input_file, output_file):
    """
    Read JSON from ``input_file``, convert its timestamp fields to MongoDB
    date format, and write the result to ``output_file``.

    Progress and any failure are reported on stdout; errors are not
    re-raised, so the function always returns None.
    """
    print(f"Processing {input_file}...")

    try:
        with open(input_file, encoding='utf-8') as source:
            raw_data = json.load(source)

        # The destination is only opened once the conversion has succeeded.
        mongo_ready = convert_timestamp_to_mongo_date(raw_data)

        with open(output_file, 'w', encoding='utf-8') as sink:
            json.dump(mongo_ready, sink, indent=2, ensure_ascii=False)

        print(f"✅ Successfully converted {input_file} → {output_file}")

    except FileNotFoundError:
        print(f"❌ File not found: {input_file}")
    except json.JSONDecodeError as decode_error:
        print(f"❌ JSON decode error in {input_file}: {decode_error}")
    except Exception as error:
        # Catch-all so one bad file does not abort a batch run.
        print(f"❌ Error processing {input_file}: {error}")

# Input files; each is converted into a sibling file named "converted_<name>"
files_to_process = [
    "mongo_files.json",
    "mongo_projects.json",
    "mongo_users_with_redundancy.json",
    "mongo_commits.json",
]

# Run the conversion file by file
for filename in files_to_process:
    input_path, output_path = filename, "converted_" + filename
    process_json_file(input_path, output_path)

print("\n🎉 Date conversion completed!")
print("Converted files have 'converted_' prefix")

Processing mongo_files.json...
✅ Successfully converted mongo_files.json → converted_mongo_files.json
Processing mongo_projects.json...
✅ Successfully converted mongo_projects.json → converted_mongo_projects.json
Processing mongo_users_with_redundancy.json...
✅ Successfully converted mongo_users_with_redundancy.json → converted_mongo_users_with_redundancy.json
Processing mongo_commits.json...
✅ Successfully converted mongo_commits.json → converted_mongo_commits.json

🎉 Date conversion completed!
Converted files have 'converted_' prefix

In [None]:
# Scratch check: take the problematic date format and patch it up by hand
test_date = "2019-03-20T20:27:46"
print(f"Original: {test_date}")

# MongoDB requires milliseconds and Z timezone: add ".000" only when there
# is no fractional part yet, then the trailing "Z"
if 'T' in test_date and not test_date.endswith('Z'):
    test_date += "Z" if '.' in test_date else ".000Z"

print(f"Fixed: {test_date}")
print(f"MongoDB format: {{'$date': '{test_date}'}}")

# Round-trip the original format through datetime parsing/formatting
from datetime import datetime
try:
    dt = datetime.strptime("2019-03-20T20:27:46", "%Y-%m-%dT%H:%M:%S")
    mongo_formatted = dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")
    print(f"Parsed and formatted: {mongo_formatted}")
except Exception as e:
    print(f"Error: {e}")

In [None]:
# Test the timezone format that was causing issues
from datetime import datetime

def test_format_for_mongodb(timestamp_str):
    """
    Test function to verify the fix.

    Converts a 'T'-separated ISO 8601 timestamp (optionally with fractional
    seconds and a 'Z' suffix or numeric UTC offset) to a UTC string with
    millisecond precision, e.g. "2019-03-20T20:27:46.000Z". Timestamps
    without an offset are assumed to be UTC already.

    Args:
        timestamp_str: The timestamp text to convert.

    Returns:
        The formatted UTC string, or ``timestamp_str`` unchanged when it has
        no 'T' separator or cannot be parsed (an error line is printed in
        the latter case).
    """
    try:
        # Only ISO-style strings are handled here
        if 'T' not in timestamp_str:
            return timestamp_str

        # Older Python versions of datetime.fromisoformat() reject a
        # trailing 'Z', so spell it out as an explicit UTC offset first.
        candidate = timestamp_str
        if candidate.endswith('Z'):
            candidate = candidate[:-1] + '+00:00'

        dt = None
        try:
            dt = datetime.fromisoformat(candidate)
        except ValueError:
            # Fallbacks for shapes fromisoformat() may not accept on older
            # Pythons; %f takes 1-6 digits, %z takes 'Z' or +HH:MM.
            for fmt in (
                "%Y-%m-%dT%H:%M:%S.%f%z",
                "%Y-%m-%dT%H:%M:%S%z",
                "%Y-%m-%dT%H:%M:%S.%f",
                "%Y-%m-%dT%H:%M:%S",
            ):
                try:
                    dt = datetime.strptime(timestamp_str, fmt)
                    break
                except ValueError:
                    continue

        if dt is None:
            raise ValueError(f"unrecognized timestamp format: {timestamp_str!r}")

        # Shift aware datetimes to UTC and drop the tzinfo; utcoffset() is
        # None for naive datetimes, which are taken to be UTC already.
        offset = dt.utcoffset()
        if offset is not None:
            dt = (dt - offset).replace(tzinfo=None)

        # Keep millisecond precision instead of hard-coding ".000".
        millis = dt.microsecond // 1000
        return dt.strftime("%Y-%m-%dT%H:%M:%S") + f".{millis:03d}Z"

    except Exception as e:
        print(f"Error: {e}")
        return timestamp_str

# Inputs that previously caused trouble: negative/positive UTC offsets,
# a naive timestamp, and one already in the target format
test_cases = [
    "2015-05-12T18:51:59-07:00",
    "2019-03-20T20:27:46",
    "2015-05-12T18:51:59+05:30",
    "2019-03-20T20:27:46.000Z",
]

# Print input, converted output and the resulting $date document per case
for test_case in test_cases:
    result = test_format_for_mongodb(test_case)
    print(
        f"Input:  {test_case}",
        f"Output: {result}",
        f"MongoDB format: {{'$date': '{result}'}}",
        "---",
        sep="\n",
    )