In [146]:
import pandas as pd
from pymongo import MongoClient
import math
import numpy as np

# --- Configuration ----------------------------------------------------------
# Folder that holds listings.csv and reviews.csv. Defined once so the notebook
# can be pointed at a different copy of the data by editing a single line.
DATA_DIR = '/Users/benedictnursalim/Desktop/data'


def _blank_to_none(value):
    """Return None for an empty-string cell, otherwise the value unchanged."""
    return None if value == '' else value


def _to_float_or_none(value):
    """Return None for an empty or missing cell, otherwise the cell as a float."""
    if value == '' or not pd.notna(value):
        return None
    return float(value)


# --- Load the CSV files -----------------------------------------------------
# keep_default_na=False stops pandas from turning blank cells into NaN; the
# cleaning steps below test for '' explicitly instead.
listings_df = pd.read_csv(f'{DATA_DIR}/listings.csv', keep_default_na=False)
reviews_df = pd.read_csv(
    f'{DATA_DIR}/reviews.csv',
    dtype={"date": str},  # read 'date' as plain strings (no date parsing)
    keep_default_na=False,
)

# --- Identifier columns become strings --------------------------------------
# listings 'id' and reviews 'listing_id' are the two sides of the later
# $lookup join, so both are cast to the same type here.
listings_df = listings_df.astype({'id': str, 'host_id': str})
reviews_df = reviews_df.astype({'listing_id': str, 'id': str, 'reviewer_id': str})

# Keep only the review columns that are stored, in this order.
columns = ["date", "listing_id", "id", "reviewer_id", "reviewer_name", "comments"]
reviews_df = reviews_df[columns]

# --- Clean the listings -----------------------------------------------------
# Blank cells become None; non-blank values are left exactly as read.
listings_df['price'] = listings_df['price'].apply(_blank_to_none)
listings_df['last_review'] = listings_df['last_review'].apply(_blank_to_none)
# reviews_per_month is additionally converted to float when present.
listings_df['reviews_per_month'] = listings_df['reviews_per_month'].apply(_to_float_or_none)
# Any NaN still present in the frame is swapped for None before export.
listings_df = listings_df.replace(np.nan, None)

# Column order used for the stored listing documents.
columns_listings = ["id", "name", "host_id", "host_name", "neighbourhood_group",
                    "neighbourhood", "latitude", "longitude", "room_type", "price",
                    "minimum_nights", "number_of_reviews", "last_review", "reviews_per_month",
                    "calculated_host_listings_count", "availability_365", "number_of_reviews_ltm",
                    "license"]
listings_df = listings_df[columns_listings]

# --- MongoDB ----------------------------------------------------------------
# MongoClient() with no arguments uses the driver's default connection settings.
client = MongoClient()
db = client.airbnb

# Drop existing collections so re-running the cell does not duplicate documents.
db.listings_t.drop()
db.reviews_t.drop()

# NOTE: review dates are inserted as the strings read from the CSV; this cell
# performs no datetime conversion.
db.listings_t.insert_many(listings_df.to_dict('records'))
db.reviews_t.insert_many(reviews_df.to_dict('records'))
# Index the field the $lookup in the next cell matches on.
db.reviews_t.create_index('listing_id')


[0.86, 0.14, 0.95, 0.05, 0.25, 1.99, 2.14, 0.84, 1.95, 0.05, 0.54, 0.53, 1.2, 0.63, 0.65, 4.35, 0.14, 0.32, 0.28, 1.42, 0.28, 2.74, 1.56, 0.4, 2.86, 3.81, 0.08, 0.16, 4.23, 1.61, 0.49, 5.0, 3.0, 1.41, 2.12, 0.2, 2.06, 0.27, 0.4, 0.19, 2.04, 1.84, 0.11, 0.28, 0.13, 0.1, 0.07, 1.56, 0.63, 2.02, 0.09, 0.35, 2.44, 0.11, 1.64, 0.21, 1.54, 2.75, 0.06, 0.1, 1.32, 1.25, 1.41, 0.02, 1.46, 0.71, 1.03, 0.21, 1.26, 1.35, 0.26, 0.02, 0.07, 4.06, 0.18, 0.64, 0.34, 0.19, 0.03, 1.72, 0.97, 2.88, 0.21, 0.01, 0.02, 2.37, 0.02, 6.62, 0.81, 0.33, 0.23, 0.77, 0.06, 0.05, 0.23, 1.66, 0.31, 0.16, 0.86, 2.31, 1.2, 1.92, 0.34, 3.32, 3.38, 1.0, 0.12, 0.07, 0.5, 0.52, 0.6, 1.76, 5.17, 0.23, 0.46, 0.13, 0.07, 0.78, 0.47, 1.46, 0.91, 0.09, 0.5, 0.05, 1.12, 0.17, 0.06, 1.06, 0.55, 1.06, 1.48, 1.49, 0.51, 2.05, 0.9, 0.1, 0.38, 0.35, 0.8, 0.25, 0.39, 1.48, 0.24, 1.15, 2.76, 0.39, 0.31, 0.3, 0.12, 0.23, 1.68, 0.43, 0.29, 0.41, 0.03, 1.5, 1.05, 4.33, 2.79, 0.45, 1.85, 0.35, 0.78, 0.36, 0.37, 0.92, 0.34, 1.37, 0.01, 0.4

'listing_id_1'

In [147]:
# Stage 1: for every listing, collect the reviews_t documents whose
# 'listing_id' equals the listing's 'id' into an array field named 'reviews'.
reviews_join_stage = {
    '$lookup': {
        'from': 'reviews_t',
        'localField': 'id',
        'foreignField': 'listing_id',
        'as': 'reviews',
    }
}

# Stage 2: write the joined documents to the 'listings_with_reviews_m' collection.
output_stage = {'$out': 'listings_with_reviews_m'}

pipeline = [reviews_join_stage, output_stage]

db.listings_t.aggregate(pipeline)

<pymongo.command_cursor.CommandCursor at 0x168b6cb90>

In [148]:
# Show the dtype of each reviews column as a sanity check on the loaded frame.
review_column_dtypes = reviews_df.dtypes
print(review_column_dtypes)

date             object
listing_id       object
id               object
reviewer_id      object
reviewer_name    object
comments         object
dtype: object


In [149]:
import json
import datetime
from pymongo import MongoClient
from bson import json_util, ObjectId

# Open the MongoDB connection used by the export below.
# Pass a connection URI to MongoClient(...) if the server is not on the driver defaults.
client = MongoClient()
# Database that holds the listing/review collections; change the name here if yours differs.
db = client.airbnb

def simplify_mongo_data(document):
    """Recursively replace BSON-specific values with plain strings.

    Conversion rules, applied at every nesting level:

    * ``ObjectId``  -> ``str(value)``
    * ``datetime.datetime`` -> ``'%Y-%m-%dT%H:%M:%S.000Z'`` formatted string
    * ``dict``      -> converted in place, value by value
    * ``list``      -> a new list with every item converted

    Values held directly in a list (including lists nested in lists) are
    converted too, not only values stored under a dict key.

    Args:
        document: A MongoDB document (dict) or any value nested inside one.

    Returns:
        The simplified value. A dict argument is mutated and returned as the
        same object; any other type not listed above is returned unchanged.
    """
    if isinstance(document, dict):
        # Only existing keys are reassigned, so the dict's size never changes
        # while it is being iterated.
        for key, value in document.items():
            document[key] = simplify_mongo_data(value)
        return document
    if isinstance(document, list):
        return [simplify_mongo_data(item) for item in document]
    if isinstance(document, datetime.datetime):
        # NOTE(review): the fractional part is always written as '.000' and the
        # 'Z' suffix is literal -- sub-second precision is dropped and no
        # timezone conversion is done. Confirm this is the intended format.
        return document.strftime('%Y-%m-%dT%H:%M:%S.000Z')
    if isinstance(document, ObjectId):
        return str(document)
    return document

# Select the documents whose string 'id' begins with "1000".
id_prefix_filter = {'id': {'$regex': '^1000.*'}}
subset_docs = db.listings_with_reviews_m.find(id_prefix_filter)

# Drain the cursor, converting BSON-specific values in each document to strings.
subset_docs_list = list(map(simplify_mongo_data, subset_docs))

def write_dict_to_dir_json(data, dir, filename):
    """Serialize ``data`` as JSON into ``dir/filename``.

    Args:
        data: JSON-serializable object (dict, list, ...). Values the ``json``
            module cannot encode are passed to ``json_util.default``.
        dir: Target directory. Created (including parents) when missing. An
            empty string means the current working directory.
        filename: Name of the file to create or overwrite inside ``dir``.

    Raises:
        OSError: If the directory cannot be created or the file cannot be opened.
        TypeError: If ``data`` holds a value neither ``json`` nor the fallback
            encoder can serialize.
    """
    import os

    def _bson_fallback(obj):
        # Called by json.dump only for values it cannot encode natively.
        return json_util.default(obj)

    # exist_ok=True avoids the check-then-create race of exists() + makedirs();
    # the guard skips makedirs('') which would raise for an empty directory.
    if dir:
        os.makedirs(dir, exist_ok=True)

    full_path = os.path.join(dir, filename)
    with open(full_path, 'w') as fp:
        json.dump(data, fp, default=_bson_fallback)

# Destination of the exported subset: folder first, then the file inside it.
dir = 'OUTPUTS'
filename = 'listings_with_reviews_m_subset_1000.json'

# Write the simplified subset documents to OUTPUTS/<filename> as JSON.
write_dict_to_dir_json(
    data=subset_docs_list,
    dir=dir,
    filename=filename,
)


In [150]:
# Read the exported subset back from disk and report how many documents it holds.
subset_json_path = 'OUTPUTS/listings_with_reviews_m_subset_1000.json'
with open(subset_json_path, 'r') as file:
    data = json.load(file)

number_of_documents = len(data)
print(f"The file contains {number_of_documents} documents.")

The file contains 28 documents.
