In [None]:
!pip install pymongo

In [None]:
import pandas as pd
import json
from bson import json_util

def read_mongodb_json(file_path):
    """Load a MongoDB JSON export into a flattened DataFrame.

    The file may contain any number of JSON documents written one after
    another; whitespace (including blank lines) between them is ignored.
    A top-level JSON array is also accepted and contributes one row per
    element. Every JSON object is passed through
    ``bson.json_util.object_hook`` while decoding.

    Args:
        file_path: Path of the export file to read.

    Returns:
        The ``pd.json_normalize`` result for all decoded documents, with
        ``$`` removed from column names and ``.`` replaced by ``_``.
        A file with no documents yields an empty DataFrame.

    Raises:
        json.JSONDecodeError: If the file contains text that is not valid
            JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    decoder = json.JSONDecoder(object_hook=json_util.object_hook)
    data = []
    position = 0
    length = len(content)
    while True:
        # raw_decode does not skip leading whitespace, so do it here.
        while position < length and content[position].isspace():
            position += 1
        if position >= length:
            break
        document, position = decoder.raw_decode(content, position)
        if isinstance(document, list):
            data.extend(document)
        else:
            data.append(document)

    # Use json_normalize to flatten nested structures
    df = pd.json_normalize(data)

    # Clean up column names with plain (non-regex) string replacement.
    # The pandas ``.str.replace`` default for ``regex`` differs between
    # versions; as regular expressions '$' and '.' would not mean the
    # literal characters.
    df.columns = [
        column.replace('$', '').replace('.', '_') for column in df.columns
    ]

    return df

# Locations of the three exported collections
brands_path = 'brands.json'
receipts_path = 'receipts.json'
users_path = 'users.json'

# Load every export, in the order brands -> receipts -> users,
# into its own flattened DataFrame
brands_df, receipts_df, users_df = (
    read_mongodb_json(export_path)
    for export_path in (brands_path, receipts_path, users_path)
)


In [None]:
# Brands data analysis
print("=== Brands Data Analysis ===")

# Show five randomly chosen brand rows
print(brands_df.sample(5))

# Share of rows whose 'name' / 'brandCode' contain the test markers
# (case-insensitive; missing values count as "no match")
name_is_test = brands_df['name'].str.contains('test brand', case=False, na=False)
code_is_test = brands_df['brandCode'].str.contains('TEST BRANDCODE', case=False, na=False)
name_percentage = (name_is_test.sum() / len(brands_df)) * 100
brandcode_percentage = (code_is_test.sum() / len(brands_df)) * 100

print(f"Percentage of 'name' containing 'test brand': {name_percentage:.2f}%")
print(f"Percentage of 'brandCode' containing 'TEST BRANDCODE': {brandcode_percentage:.2f}%")

# Dimensions of the brands table
print(f"Shape of brands data: {brands_df.shape}")

# Per-column share of missing values
print("Percentage of null values in brands data:")
null_counts = brands_df.isnull().sum()
print(null_counts / len(brands_df) * 100)

# How many rows are exact repeats of an earlier row
duplicate_flags = brands_df.duplicated()
print(f"Number of duplicated rows in brands data: {duplicate_flags.sum()}")

# Column dtypes as inferred by pandas
print("Data types of columns in brands data:")
print(brands_df.dtypes)

# brandCode values made up solely of 12 or 13 digits look like barcodes
barcode_like = brands_df['brandCode'].str.contains(r'^\d{12,13}$', na=False)
print(f"Number of brandCode values that look like barcodes: {barcode_like.sum()}")

In [None]:
# Receipts data analysis
print("\n=== Receipts Data Analysis ===")

# Display the first rows of receipts data
print(receipts_df.head())

# Shape of receipts data
print(f"Shape of receipts data: {receipts_df.shape}")

# Data types of columns in receipts data
print("Data types of columns in receipts data:")
print(receipts_df.dtypes)

# Percentage of null values in receipts data (excluding rewardsReceiptItemList)
print("Percentage of null values in receipts data (excluding rewardsReceiptItemList):")
print(receipts_df.drop('rewardsReceiptItemList', axis=1).isnull().sum() / len(receipts_df) * 100)

# Analysis of rewardsReceiptItemList: one row per receipt item, one column
# per item field, indexed by the originating receipt row.
# explode() emits a single NaN for an empty list; those placeholders are
# dropped so they do not show up as an all-null item row.
# NOTE(review): assumes every list element is a dict of item fields — confirm.
receipt_items = receipts_df['rewardsReceiptItemList'].dropna().explode().dropna()
# Build the frame in one call rather than constructing a Series per item.
exploded_items_df = pd.DataFrame(receipt_items.tolist(), index=receipt_items.index)

print("Number of null values in exploded rewardsReceiptItemList:")
print(exploded_items_df.isnull().sum())

print("Percentage of null values in exploded rewardsReceiptItemList:")
print(exploded_items_df.isnull().sum() / len(exploded_items_df) * 100)

In [None]:
# Users data analysis
print("\n=== Users Data Analysis ===")

# Display the first rows of users data
print(users_df.head())

# Count and percentage of duplicated users
duplicates_count = users_df.duplicated().sum()
duplicates_percentage = duplicates_count / len(users_df) * 100
print(f"Number of duplicated rows in users data: {duplicates_count}")
print(f"Percentage of duplicated rows in users data: {duplicates_percentage:.2f}%")

# Percentage of null values in users data
print("Percentage of null values in users data:")
print(users_df.isnull().sum() / len(users_df) * 100)

# Check for consistency between users and receipts data.
# Both ID columns are compared in string form so that the check does not
# depend on how each column was decoded, and null IDs are left out so a
# receipt without a user is not reported as an unknown user.
# NOTE(review): the loader decodes extended JSON through json_util's
# object hook, so users '_id' may hold ObjectId values while receipts
# 'userId' may hold plain strings — confirm against the data.
user_ids_in_receipts = set(receipts_df['userId'].dropna().astype(str))
user_ids_in_users = set(users_df['_id'].dropna().astype(str))
missing_users = user_ids_in_receipts - user_ids_in_users
print(f"Number of user IDs in receipts not found in users data: {len(missing_users)}")