In [1]:
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
db = client['your_database_name']
collection = db['your_collection_name']


In [5]:
import pprint
pprint.pprint(list(collection.index_information()))


[]


In [3]:
collection.drop_index('index_name')


In [4]:
class MongoDBManager:
    def __init__(self, uri="mongodb://localhost:27017/"):
        self.client = MongoClient(uri)
        self.db = self.client['article_db']
        self.article_collection = self.db['articles']

    def drop_all_indexes(self):
        """Drop all indexes on the article collection except for the default _id index."""
        self.article_collection.drop_indexes()

    def recreate_indexes(self):
        """Create necessary indexes according to the new schema."""
        self.article_collection.create_index("metadata.url", unique=True)
        self.article_collection.create_index("metadata.published_date")
        self.article_collection.create_index("metadata.keywords")


In [2]:
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client['article_db']
collection = db['articles']

# Option 1: Delete All Documents in the Collection
def delete_all_documents():
    """Deletes all documents in the 'articles' collection."""
    result = collection.delete_many({})
    print(f"Deleted {result.deleted_count} documents from the collection.")

# Option 2: Drop the Entire Collection
def drop_collection():
    """Drops the entire 'articles' collection."""
    collection.drop()
    print("Collection 'articles' has been dropped.")

# Run either function as needed
# delete_all_documents()  # Deletes all documents
drop_collection()     # Drops the collection entirely


Collection 'articles' has been dropped.


In [47]:
from pymongo import MongoClient
import pprint

client = MongoClient("mongodb://localhost:27017/")
db = client['article_db']
collection = db['articles']

def aggregate_nested_schema(collection_name, sub_document):
    collection = db[collection_name]
    pipeline = [
        {"$match": {sub_document: {"$exists": True}}},  # Ensure the sub-document exists
        {"$project": {sub_document: {"$objectToArray": f"${sub_document}"}}},
        {"$unwind": f"${sub_document}"},
        {"$group": {"_id": None, "allKeys": {"$addToSet": f"${sub_document}.k"}}}
    ]
    results = collection.aggregate(pipeline)
    for result in results:
        print(f"All keys in {sub_document} of collection {collection_name}:")
        pprint.pprint(result['allKeys'])

# Example usage for 'metadata' and 'summary' sub-documents
aggregate_nested_schema('articles', 'metadata')
aggregate_nested_schema('articles', 'summary')


All keys in metadata of collection articles:
['processing_date',
 'url',
 'published_date',
 'authors',
 'content',
 'detailed_processing_date',
 'keywords',
 'title']
All keys in summary of collection articles:
['text', 'model_used']


In [51]:
from pymongo import MongoClient
import pprint

client = MongoClient("mongodb://localhost:27017/")
db = client['article_db']
users_collection = db['users']

def aggregate_schema(collection):
    pipeline = [
        {"$project": {"arrayOfKeyValue": {"$objectToArray": "$$ROOT"}}},
        {"$unwind": "$arrayOfKeyValue"},
        {"$group": {"_id": None, "allKeys": {"$addToSet": "$arrayOfKeyValue.k"}}}
    ]
    results = collection.aggregate(pipeline)
    for result in results:
        print(f"Keys found in {collection.name}:")
        pprint.pprint(result['allKeys'])

def aggregate_nested_schema(collection_name, sub_document):
    collection = db[collection_name]
    pipeline = [
        {"$match": {sub_document: {"$exists": True}}},  # Ensure the sub-document exists
        {"$project": {sub_document: {"$objectToArray": f"${sub_document}"}}},
        {"$unwind": f"${sub_document}"},
        {"$group": {"_id": None, "allKeys": {"$addToSet": f"${sub_document}.k"}}}
    ]
    results = collection.aggregate(pipeline)
    for result in results:
        print(f"All keys in {sub_document} of collection {collection_name}:")
        pprint.pprint(result['allKeys'])

# Get overall schema of the users collection
aggregate_schema(users_collection)

# If you know there are nested documents, such as a nested 'articles' or 'profiles', analyze them too
aggregate_nested_schema('users', 'articles')
aggregate_nested_schema('users', 'profiles')  # Change 'profiles' to the actual sub-document name if different


Keys found in users:
['email', 'articles', '_id']
All keys in articles of collection users:
['29102024', '30102024', '01112024', '28102024']


In [49]:
from pymongo import MongoClient
import pprint

# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client['article_db']
collection = db['articles']  # Replace 'articles' with your collection name

# Check if the collection has any documents and print them
def check_collection():
    # Check if any documents exist in the collection
    if collection.count_documents({}) > 0:
        print("Documents found in the collection:")
        for document in collection.find():
            pprint.pprint(document)  # Pretty-print each document
    else:
        print("No documents found in the collection.")

# Run the check
check_collection()


Documents found in the collection:
{'_id': ObjectId('671f5763fc1603d19ae9352b'),
 'metadata': {'authors': ['Ben Tossell'],
              'content': 'Teachers today have more tools at their disposal '
                         'than ever before. Many AI apps also offer free '
                         'features that can make daily tasks easier. But how '
                         'can you navigate this noisy marketplace and find the '
                         'right tools for your teaching needs?\n'
                         '\n'
                         'This post covers the best free AI tools available to '
                         "teachers. We'll look at the key features of each "
                         'tool and how they can be useful in the classroom. '
                         'Side note: many of these tools have optional paid '
                         'plans, but you can get started with the free version '
                         'and pay to upgrade if/when you need.\n'
        

In [3]:
from pymongo import MongoClient
import pprint

# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client['article_db']
collection = db['users']  # Replace 'articles' with your collection name

# Check if the collection has any documents and print them
def check_collection():
    # Check if any documents exist in the collection
    if collection.count_documents({}) > 0:
        print("Documents found in the collection:")
        for document in collection.find():
            pprint.pprint(document)  # Pretty-print each document
    else:
        print("No documents found in the collection.")

# Run the check
check_collection()

Documents found in the collection:
{'_id': ObjectId('671f5763d75b84aa4186c4b0'),
 'articles': {'28102024': ['671f5763fc1603d19ae9352b',
                           '671f576dfc1603d19ae9352c',
                           '671f577afc1603d19ae9352d',
                           '671feba01820fa5d57dbcc97',
                           '671febaa1820fa5d57dbcc98',
                           '671febb81820fa5d57dbcc99',
                           '671febc61820fa5d57dbcc9a',
                           '671febd01820fa5d57dbcc9b',
                           '671febd71820fa5d57dbcc9c',
                           '671febe11820fa5d57dbcc9d',
                           '671febe81820fa5d57dbcc9e',
                           '671febf11820fa5d57dbcc9f',
                           '671febfc1820fa5d57dbcca0',
                           '671fec041820fa5d57dbcca1',
                           '671fec0e1820fa5d57dbcca2',
                           '671fec211820fa5d57dbcca3',
                           '671fec29182

In [50]:
# ... existing code ...

# Check if the collection has any documents and print them
def check_collection():
    # Check if any documents exist in the collection
    if collection.count_documents({}) > 0:
        total_articles = 0  # Counter for total articles
        print(f"{collection.count_documents({})} Documents found in the collection:")
        for document in collection.find():
            pprint.pprint(document)  # Pretty-print each document
            # Count articles in the current document
            if 'users' in document:
                total_articles += len(document['articles'])

        print(f"Total articles in the collection: {total_articles}")  # Print total articles
    else:
        print("No documents found in the collection.")

# Run the check
check_collection()

275 Documents found in the collection:
{'_id': ObjectId('671f5763fc1603d19ae9352b'),
 'metadata': {'authors': ['Ben Tossell'],
              'content': 'Teachers today have more tools at their disposal '
                         'than ever before. Many AI apps also offer free '
                         'features that can make daily tasks easier. But how '
                         'can you navigate this noisy marketplace and find the '
                         'right tools for your teaching needs?\n'
                         '\n'
                         'This post covers the best free AI tools available to '
                         "teachers. We'll look at the key features of each "
                         'tool and how they can be useful in the classroom. '
                         'Side note: many of these tools have optional paid '
                         'plans, but you can get started with the free version '
                         'and pay to upgrade if/when you need.\n'
    

In [61]:
"""
MongoDB Schema and Data Viewer
Analyzes collection schema and displays user data
"""
from pymongo import MongoClient
import pprint
from datetime import datetime

# Initialize MongoDB connection
client = MongoClient("mongodb://localhost:27017/")
db = client['article_db']
users_collection = db['users']
articles_collection = db['articles']

def aggregate_schema(collection):
    """Analyze schema structure of a collection."""
    pipeline = [
        {"$project": {"arrayOfKeyValue": {"$objectToArray": "$$ROOT"}}},
        {"$unwind": "$arrayOfKeyValue"},
        {"$group": {"_id": None, "allKeys": {"$addToSet": "$arrayOfKeyValue.k"}}}
    ]
    results = collection.aggregate(pipeline)
    for result in results:
        print(f"\nKeys found in {collection.name}:")
        pprint.pprint(result['allKeys'])

def aggregate_nested_schema(collection_name, sub_document):
    """Analyze nested document schema structure."""
    collection = db[collection_name]
    pipeline = [
        {"$match": {sub_document: {"$exists": True}}},
        {"$project": {sub_document: {"$objectToArray": f"${sub_document}"}}},
        {"$unwind": f"${sub_document}"},
        {"$group": {"_id": None, "allKeys": {"$addToSet": f"${sub_document}.k"}}}
    ]
    results = collection.aggregate(pipeline)
    for result in results:
        print(f"\nAll keys in {sub_document} of collection {collection_name}:")
        pprint.pprint(result['allKeys'])

def display_user_data():
    """Display all users and their associated data."""
    users = users_collection.find()

    print("\n=== User Data Analysis ===")
    for user in users:
        print("\nUser:", user['email'])

        # Display article dates for this user
        if 'articles' in user and user['articles']:
            print("  Article Dates:")
            for date, article_ids in user['articles'].items():
                print(f"    {date}: {len(article_ids)} articles")

                # Fetch and display article details
                print("    Articles:")
                for article_id in article_ids:
                    article = articles_collection.find_one({"_id": article_id})
                    if article:
                        print(f"      - Title: {article['metadata']['title']}")
                        print(f"        URL: {article['metadata']['url']}")
                        print(f"        Processing Date: {article['metadata']['processing_date']}")
        else:
            print("  No articles found")

def analyze_user_statistics():
    """Analyze and display user statistics."""
    total_users = users_collection.count_documents({})
    users_with_articles = users_collection.count_documents({"articles": {"$ne": {}}})

    print("\n=== User Statistics ===")
    print(f"Total Users: {total_users}")
    print(f"Users with Articles: {users_with_articles}")

    # Analyze article distribution
    pipeline = [
        {"$project": {
            "email": 1,
            "articleCount": {"$size": {"$objectToArray": "$articles"}}
        }},
        {"$group": {
            "_id": None,
            "avgArticlesPerUser": {"$avg": "$articleCount"},
            "maxArticlesPerUser": {"$max": "$articleCount"},
            "minArticlesPerUser": {"$min": "$articleCount"}
        }}
    ]

    stats = list(users_collection.aggregate(pipeline))
    if stats:
        stats = stats[0]
        print(f"Average Articles per User: {stats['avgArticlesPerUser']:.2f}")
        print(f"Maximum Articles per User: {stats['maxArticlesPerUser']}")
        print(f"Minimum Articles per User: {stats['minArticlesPerUser']}")

def find_user_by_email(email):
    """Find and display specific user data."""
    user = users_collection.find_one({"email": email})
    if user:
        print(f"\n=== Data for User: {email} ===")
        if 'articles' in user and user['articles']:
            article_count = sum(len(articles) for articles in user['articles'].values())
            print(f"Total Articles: {article_count}")

            for date, article_ids in user['articles'].items():
                print(f"\nDate: {date}")
                print(f"Articles count: {len(article_ids)}")
                for article_id in article_ids:
                    article = articles_collection.find_one({"_id": article_id})
                    if article:
                        print(f"  - {article['metadata']['title']}")
        else:
            print("No articles found for this user")
    else:
        print(f"No user found with email: {email}")

def analyze_user_statistics():
    """Analyze and display user statistics with articles by date."""
    total_users = users_collection.count_documents({})
    users_with_articles = users_collection.count_documents({"articles": {"$ne": {}}})

    print("\n=== User Statistics ===")
    print(f"Total Users: {total_users}")
    print(f"Users with Articles: {users_with_articles}")

    # Analyze article distribution
    pipeline = [
        {"$project": {
            "email": 1,
            "articleCount": {"$size": {"$objectToArray": "$articles"}}
        }},
        {"$group": {
            "_id": None,
            "avgArticlesPerUser": {"$avg": "$articleCount"},
            "maxArticlesPerUser": {"$max": "$articleCount"},
            "minArticlesPerUser": {"$min": "$articleCount"}
        }}
    ]

    stats = list(users_collection.aggregate(pipeline))
    if stats:
        stats = stats[0]
        print(f"Average Articles per User: {stats['avgArticlesPerUser']:.2f}")
        print(f"Maximum Articles per User: {stats['maxArticlesPerUser']}")
        print(f"Minimum Articles per User: {stats['minArticlesPerUser']}")

    # Display articles by user and date
    print("\n=== Articles by User and Date ===")
    users = users_collection.find({"articles": {"$ne": {}}})

    for user in users:
        print(f"\nUser: {user['email']}")
        for date, article_ids in user['articles'].items():
            print(f"  Date: {date} - {len(article_ids)} articles")
            for article_id in article_ids:
                article = articles_collection.find_one({"_id": article_id})
                if article:
                    print(f"    - Title: {article['metadata']['title']}")
                    print(f"      URL: {article['metadata']['url']}")
                    print(f"      Processing Date: {article['metadata']['processing_date']}")

def main():
    """Main function to run all analyses."""
    print("\n=== MongoDB Schema Analysis ===")

    # Analyze collection schemas
    aggregate_schema(users_collection)
    aggregate_schema(articles_collection)

    # Analyze nested schemas
    aggregate_nested_schema('users', 'articles')
    aggregate_nested_schema('articles', 'metadata')
    aggregate_nested_schema('articles', 'summary')

    # Display user data and statistics with article details by date
    display_user_data()
    analyze_user_statistics()

if __name__ == "__main__":
    main()



=== MongoDB Schema Analysis ===

Keys found in users:
['_id', 'email', 'articles']

Keys found in articles:
['metadata', 'summary', '_id']

All keys in articles of collection users:
['29102024', '30102024', '01112024', '28102024']

All keys in metadata of collection articles:
['processing_date',
 'url',
 'authors',
 'published_date',
 'content',
 'detailed_processing_date',
 'keywords',
 'title']

All keys in summary of collection articles:
['text', 'model_used']

=== User Data Analysis ===

User: joe@gmail.com
  Article Dates:
    28102024: 163 articles
    Articles:

User: 
  Article Dates:
    28102024: 1 articles
    Articles:
    29102024: 6 articles
    Articles:

User: amrtyilmaz@gmail.com
  Article Dates:
    28102024: 1 articles
    Articles:
    29102024: 189 articles
    Articles:
    30102024: 2 articles
    Articles:
    01112024: 3 articles
    Articles:

=== User Statistics ===
Total Users: 3
Users with Articles: 3
Average Articles per User: 2.33
Maximum Articles per Us

In [63]:
import json
from bson import ObjectId

def display_article_by_id(article_id):
    """
    Display the details of an article by its ID.

    Args:
        article_id (str): The ID of the article to be displayed.
    """
    # Convert the article_id to ObjectId if it's a valid format
    try:
        article_id = ObjectId(article_id)
    except Exception as e:
        print(f"Invalid article ID format: {article_id}")
        return

    # Fetch the article from the collection using the provided ID
    article = articles_collection.find_one({"_id": article_id})

    # Check if the article exists and print its details
    if article:
        print(f"\n=== Article Details for ID: {article_id} ===")
        print(json.dumps(article, indent=4, default=str))
    else:
        print(f"No article found with ID: {article_id}")

# Example usage
# display_article_by_id("some_article_id")


In [66]:
display_article_by_id("671f5763fc1603d19ae9352b")


=== Article Details for ID: 671f5763fc1603d19ae9352b ===
{
    "_id": "671f5763fc1603d19ae9352b",
    "metadata": {
        "url": "https://bensbites.com/blog/top-free-ai-tools-for-teachers-in-2024",
        "title": "Top free AI tools for teachers in 2024",
        "content": "Teachers today have more tools at their disposal than ever before. Many AI apps also offer free features that can make daily tasks easier. But how can you navigate this noisy marketplace and find the right tools for your teaching needs?\n\nThis post covers the best free AI tools available to teachers. We'll look at the key features of each tool and how they can be useful in the classroom. Side note: many of these tools have optional paid plans, but you can get started with the free version and pay to upgrade if/when you need.\n\nLet's dive in.\n\nBest free AI teacher tools for creating resources and lesson plans\n\n\ud83d\udcdd Diffit\n\nKey features:\n\nDiffit uses AI to help teachers quickly generate differen

In [68]:
display_article_by_id("67253d7e497f59407b1b8ee6")


=== Article Details for ID: 67253d7e497f59407b1b8ee6 ===
{
    "_id": "67253d7e497f59407b1b8ee6",
    "metadata": {
        "url": "http://chatgpt.com/search",
        "title": "SearchGPT Prototype",
        "content": "OpenAI is testing SearchGPT, a prototype designed to enhance AI search capabilities by providing fast and relevant answers sourced from the web. This initiative aims to improve user experience by allowing conversational interactions and real-time information retrieval. The prototype is being launched to a select group of users and publishers for feedback, with plans to integrate successful features into ChatGPT. The focus is on creating a symbiotic relationship between technology and journalism, ensuring that high-quality content is highlighted in search results.",
        "authors": [],
        "published_date": "July 25, 2024",
        "processing_date": "2024-11-01",
        "detailed_processing_date": "01112024",
        "keywords": [
            "ai-search",
     