In [161]:
import pandas as pd
from pymongo import MongoClient
import math
import numpy as np

# Read the two CSV exports. keep_default_na=False stops pandas from turning
# blank cells into NaN, so they stay as empty strings and can be mapped to
# None explicitly further down.
listings_df = pd.read_csv(
    '/Users/benedictnursalim/Desktop/data/listings.csv',
    keep_default_na=False,
)
reviews_df = pd.read_csv(
    '/Users/benedictnursalim/Desktop/data/reviews.csv',
    keep_default_na=False,
    dtype={"date": str},  # keep 'date' as text instead of letting pandas infer a type
)

# Listings: store both identifier columns ('id', 'host_id') as strings.
listings_df = listings_df.astype({'id': str, 'host_id': str})

# Reviews: keep only the fields that get loaded (in this order) and store the
# three identifier columns as strings as well, so 'listing_id' has the same
# type as the listings 'id' it is matched against.
columns = ["date", "listing_id", "id", "reviewer_id", "reviewer_name", "comments"]
reviews_df = reviews_df[columns].astype(
    {'listing_id': str, 'id': str, 'reviewer_id': str}
)


def _blank_to_none(value):
    """Return None for an empty string, otherwise the value unchanged."""
    return None if value == '' else value


# Blank CSV cells are compared against '' here; map them to None so they are
# not stored as empty strings. (The original ran the 'last_review' line twice;
# a single pass is sufficient.)
listings_df['price'] = listings_df['price'].apply(_blank_to_none)
listings_df['last_review'] = listings_df['last_review'].apply(_blank_to_none)

# 'reviews_per_month' becomes a float where a value is present, None otherwise.
listings_df['reviews_per_month'] = listings_df['reviews_per_month'].apply(
    lambda x: float(x) if x != '' and pd.notna(x) else None
)

# pandas may hand the Nones above back as NaN once a column is numeric, so
# sweep the whole frame and turn every NaN into None. The dict form spells out
# the NaN -> None mapping explicitly instead of relying on how replace()
# interprets a bare `value=None` argument.
listings_df = listings_df.replace({np.nan: None})



# Fields kept for the listings collection, in the order they should appear.
columns_listings = [
    "id", "name", "host_id", "host_name",
    "neighbourhood_group", "neighbourhood",
    "latitude", "longitude",
    "room_type", "price", "minimum_nights",
    "number_of_reviews", "last_review", "reviews_per_month",
    "calculated_host_listings_count", "availability_365",
    "number_of_reviews_ltm", "license",
]

# Select exactly the columns named in 'columns_listings', in that order.
listings_df = listings_df.loc[:, columns_listings]


# Connect with the default MongoClient() settings and select the 'airbnb' database.
client = MongoClient()
db = client['airbnb']

# Empty both target collections first so re-running the cell does not
# insert the same documents twice.
db['listings_t'].drop()
db['reviews_t'].drop()

# NOTE: no datetime conversion happens here; 'date' in reviews_df is inserted
# exactly as it was read from the CSV (a string).

# Bulk-insert one document per DataFrame row.
db['listings_t'].insert_many(listings_df.to_dict(orient='records'))
db['reviews_t'].insert_many(reviews_df.to_dict(orient='records'))

# Index the field the reviews are matched on.
db['reviews_t'].create_index('listing_id')


'listing_id_1'

In [162]:
# Stage 1: attach to every listing an array 'reviews' holding the reviews_t
# documents whose 'listing_id' equals the listing's 'id'.
join_reviews_stage = {
    '$lookup': {
        'from': 'reviews_t',
        'localField': 'id',
        'foreignField': 'listing_id',
        'as': 'reviews',
    }
}

# Stage 2: write the joined documents to the 'listings_with_reviews_m' collection.
write_output_stage = {'$out': 'listings_with_reviews_m'}

pipeline = [join_reviews_stage, write_output_stage]

db.listings_t.aggregate(pipeline)

<pymongo.command_cursor.CommandCursor at 0x15c174cd0>

In [163]:
# Show the dtype pandas holds for each column of reviews_df.
print(reviews_df.dtypes)

date             object
listing_id       object
id               object
reviewer_id      object
reviewer_name    object
comments         object
dtype: object


In [166]:
import json
import datetime
from pymongo import MongoClient
from bson import json_util, ObjectId

# Open a connection with the default MongoClient() settings and bind the
# 'airbnb' database used by the export below.
client = MongoClient()
db = client.airbnb

def simplify_mongo_data(document):
    """Recursively replace ObjectId and datetime values with plain strings.

    Conversion rules:
      * ``ObjectId``          -> ``str(value)``
      * ``datetime.datetime`` -> ``'%Y-%m-%dT%H:%M:%S.000Z'`` text (the
        fractional part is always written as ``.000`` and the ``Z`` suffix
        is literal; no timezone conversion is performed)
      * ``dict``              -> values converted in place, same dict returned
      * ``list``              -> a new list with every item converted
      * anything else         -> returned unchanged

    Unlike the previous version, items that sit directly inside a list
    (e.g. ``[ObjectId(...), datetime(...)]``) and lists nested inside lists
    are converted too; before, only dict values were handled, so such items
    passed through untouched.

    Args:
        document: A value taken from a MongoDB document (typically the
            document dict itself).

    Returns:
        The simplified value. For a dict input this is the same object,
        mutated in place.
    """
    if isinstance(document, ObjectId):
        return str(document)
    if isinstance(document, datetime.datetime):
        return document.strftime('%Y-%m-%dT%H:%M:%S.000Z')
    if isinstance(document, list):
        return [simplify_mongo_data(item) for item in document]
    if isinstance(document, dict):
        # Only existing keys are reassigned, so the dict's size never changes
        # while it is being iterated.
        for key, value in document.items():
            document[key] = simplify_mongo_data(value)
    return document

# Fetch the merged listing documents whose 'id' matches the regex '^1000.*'.
subset_docs = db.listings_with_reviews_m.find({'id': {'$regex': '^1000.*'}})

# Drain the cursor, passing every document through simplify_mongo_data.
subset_docs_list = list(map(simplify_mongo_data, subset_docs))

def write_dict_to_dir_json(data, dir, filename):
    """Write ``data`` as JSON to ``filename`` inside directory ``dir``.

    The directory (including missing parents) is created when needed. An
    empty or None ``dir`` means "the current working directory"; previously
    that case failed in ``os.makedirs('')`` and the hand-built path would
    have pointed at ``/filename``.

    Args:
        data: JSON-serialisable object; values the json module cannot encode
            are handed to ``json_util.default``.
        dir: Target directory path.
        filename: Name of the file to create or overwrite inside ``dir``.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            opened for writing.
    """
    import os  # kept function-local, as in the original cell

    target_dir = dir or ''
    if target_dir:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(target_dir, exist_ok=True)
    full_path = os.path.join(target_dir, filename)
    with open(full_path, 'w', encoding='utf-8') as fp:
        json.dump(data, fp, default=json_util.default)

# Where the exported subset goes
dir = 'OUTPUTS'
filename = 'listings_with_reviews_m_subset_1000.json'

# Serialise the simplified documents to <dir>/<filename>
write_dict_to_dir_json(data=subset_docs_list, dir=dir, filename=filename)


In [165]:
# Read the exported JSON back in and report how many documents it holds.
with open('OUTPUTS/listings_with_reviews_m_subset_1000.json', 'r') as json_file:
    data = json.load(json_file)

number_of_documents = len(data)
print(f"The file contains {number_of_documents} documents.")

The file contains 43 documents.
