In [44]:
import json
import os
import datetime
import math
from pymongo import MongoClient
from bson import json_util, ObjectId

# Connect to the local MongoDB server and select the `airbnb` database
client = MongoClient('mongodb://localhost:27017/')
db = client.airbnb

# Aggregation pipeline, in execution order:
#   1. join `listings_with_calendar` documents whose `_id` equals the listing `id` -> `cal_docs`
#   2. join `reviews_t` documents whose `listing_id` equals the listing `id`       -> `reviews`
#   3. unwind `cal_docs`, keeping listings that have no calendar match
#   4. copy four calendar fields up to the top level of the listing
#   5. drop the `cal_docs` sub-document
#   6. write the result to the `listings_with_reviews_and_cal` collection
pipeline = [
    {'$lookup': {
        'from': 'listings_with_calendar',
        'localField': 'id',
        'foreignField': '_id',
        'as': 'cal_docs',
    }},
    {'$lookup': {
        'from': 'reviews_t',
        'localField': 'id',
        'foreignField': 'listing_id',
        'as': 'reviews',
    }},
    {'$unwind': {'path': '$cal_docs', 'preserveNullAndEmptyArrays': True}},
    {'$addFields': {
        'average_price': '$cal_docs.average_price',
        'first_available_date': '$cal_docs.first_available_date',
        'last_available_date': '$cal_docs.last_available_date',
        'dates_list': '$cal_docs.dates_list',
    }},
    {'$project': {'cal_docs': 0}},
    {'$out': 'listings_with_reviews_and_cal'},
]

# Run the pipeline against the `listings_with_reviews_m` collection
db['listings_with_reviews_m'].aggregate(pipeline)



<pymongo.command_cursor.CommandCursor at 0x112cc3b50>

In [45]:
# Reuse the `db` handle created in the cell above instead of constructing a
# second MongoClient: every client owns its own connection pool, so rebinding
# `client` here would just abandon the first one without closing it.

# Fetch the processed documents whose `id` starts with "1000".
# NOTE(review): `$regex` only matches string values, so this assumes `id` is
# stored as a string -- confirm against the collection.
subset_docs = db.listings_with_reviews_and_cal.find({'id': {'$regex': '^1000.*'}})

# Function to replace NaN with None and format datetime
def clean_document(doc):
    """Recursively make a MongoDB document JSON-friendly.

    Conversions applied at any nesting depth:
      * float NaN          -> None
      * ObjectId           -> str(ObjectId)
      * datetime.datetime  -> 'YYYY-MM-DD' string (the time of day is dropped)

    Dicts and lists are updated in place and the same objects are returned,
    so existing callers that rely on the mutation keep working. Lists may
    hold scalars (strings, dates, numbers) as well as sub-documents; every
    other value is returned untouched.

    Args:
        doc: Usually a document (dict); any nested value is accepted too.

    Returns:
        The cleaned value -- the very same object for dicts and lists.
    """
    if isinstance(doc, dict):
        # Only values of existing keys are replaced, so the dict keeps its
        # size and iterating over it while assigning is safe.
        for key, value in doc.items():
            doc[key] = clean_document(value)
        return doc
    if isinstance(doc, list):
        # Assign by index so scalar elements (NaN, dates, ids) are converted
        # too, instead of assuming that every element is a dict.
        for index, item in enumerate(doc):
            doc[index] = clean_document(item)
        return doc
    if isinstance(doc, float) and math.isnan(doc):
        return None
    if isinstance(doc, ObjectId):
        return str(doc)
    if isinstance(doc, datetime.datetime):
        return doc.strftime('%Y-%m-%d')
    return doc

# Clean every fetched document; this consumes the `subset_docs` cursor
cleaned_documents = list(map(clean_document, subset_docs))



def write_dict_to_dir_json(data, dir, filename):
    """Serialize ``data`` as JSON into the file ``filename`` inside ``dir``.

    Args:
        data: JSON-serializable object; values the standard encoder cannot
            handle are passed to ``json_util.default``.
        dir: Target directory, created (with parents) when missing. An empty
            string means the current working directory.
        filename: Name of the file to write inside ``dir``; an existing file
            is overwritten.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            opened for writing.
    """
    if dir:
        # exist_ok=True removes the check-then-create race of testing
        # os.path.exists() first; os.makedirs('') would raise, hence the guard.
        os.makedirs(dir, exist_ok=True)
    full_path = os.path.join(dir, filename)
    with open(full_path, 'w') as fp:
        # json_util.default handles MongoDB-specific types if any remain
        json.dump(data, fp, default=json_util.default)

# Where the exported subset goes: directory name and file name.
# NOTE: the global `dir` shadows the builtin `dir()` from here on.
dir, filename = 'OUTPUTS', 'listings_with_reviews_and_cal_subset_1000.json'

# Dump the cleaned documents to <dir>/<filename>
write_dict_to_dir_json(data=cleaned_documents, dir=dir, filename=filename)