In [1]:
# Import necessary libraries
import os
import shlex
import subprocess

import pysolr
import wikipedia

In [2]:
# Constants
# Address of the machine that hosts the Solr server.
VM_IP = "35.202.185.173"
# Initial Solr core name; the connection cell rebinds CORE_NAME before it is used.
CORE_NAME = "IRF23P1"

In [3]:
# Create a Solr connection
# NOTE: this rebinds CORE_NAME from the constants cell, so every later cell
# (including the default argument of setup_solr_core) targets "mycol1".
CORE_NAME = "mycol1"
# Build the URL from VM_IP so the host is configured in exactly one place.
# always_commit=True makes every add/delete issued through this client commit.
# NOTE(review): the timeout is presumably in seconds, which makes it
# effectively unbounded - confirm this is intended.
solr = pysolr.Solr(f"http://{VM_IP}:8983/solr/{CORE_NAME}", always_commit=True, timeout=5000000)

# Function to empty the Solr index and (re)create the core
def setup_solr_core(core=CORE_NAME):
    """Remove every document from the connected index, then try to create ``core``.

    The deletion goes through the shared ``solr`` client (query ``*:*``), so it
    clears documents rather than dropping the core itself. The creation step
    runs the Solr CLI locally as the ``solr`` user and is best-effort: a failure
    is reported on stdout but never raised.

    Args:
        core: Name of the core passed to ``solr create -c``. Defaults to the
            value ``CORE_NAME`` had when this function was defined.
    """
    print("Deleting all documents from the Solr core...")
    solr.delete(q="*:*")

    print("Creating a new Solr core...")
    # The command string is still interpreted by the shell that `su -c` starts,
    # so the core name is quoted to keep it a single, literal argument.
    create_core_command = (
        f"/opt/solr/bin/solr create -c {shlex.quote(str(core))} -n data_driven_schema_configs"
    )
    try:
        # An argument list avoids a second layer of shell parsing around the command.
        completed = subprocess.run(
            ["sudo", "su", "-", "solr", "-c", create_core_command],
            check=False,
        )
    except OSError as e:
        # e.g. `sudo` is not installed on this machine; keep the step best-effort.
        print(f"Could not run the Solr create command: {e}")
        return

    if completed.returncode != 0:
        # Typical causes: the core already exists, or Solr is not installed locally.
        print(f"Solr create command exited with status {completed.returncode}")

# Function to index a document
def index_document(doc):
    """Send a single document to Solr through the shared ``solr`` client.

    Args:
        doc: The document to add; it is wrapped in a one-element list because
            ``solr.add`` is called with a list of documents.
    """
    single_doc_batch = [doc]
    solr.add(single_doc_batch)

In [4]:
# Function to scrape Wikipedia data and index it

def scrape_and_index(topic, num_documents, max_search_results=500, min_content_length=200):
    """Collect up to ``num_documents`` unique Wikipedia pages for ``topic`` and index them.

    A single search for ``"<topic> Wikipedia"`` is issued and its results are
    fetched in order until enough pages are collected or the result list is
    exhausted. The function therefore always terminates: when fewer than
    ``num_documents`` usable pages are available it indexes what it found and
    reports the shortfall instead of repeating the same search forever.

    Args:
        topic: Topic used to build the search query.
        num_documents: Maximum number of documents to collect and index.
        max_search_results: Number of titles requested from ``wikipedia.search``.
            NOTE(review): 500 is believed to be the MediaWiki search limit for
            regular clients - confirm before raising it.
        min_content_length: Pages whose content has fewer characters than this
            are skipped.
    """
    print(f"Scraping {num_documents} documents for the topic: {topic}...")

    documents = []
    unique_documents = set()  # page contents already collected, for de-duplication

    # Refine the search query to be more specific
    search_query = f"{topic} Wikipedia"

    search_results = []
    if num_documents > 0:
        try:
            search_results = wikipedia.search(search_query, results=max_search_results)
        except Exception as e:
            # Repeating the identical query cannot be expected to help; report and move on.
            print(f"Error in search query: {search_query}, Error: {str(e)}")

    for result in search_results:
        # Stop when we have enough documents
        if len(documents) >= num_documents:
            break
        try:
            page = wikipedia.page(result, auto_suggest=False)
            document = {
                "title": page.title,
                "content": page.content
            }
        except Exception as e:
            # Best-effort: one unreadable page must not abort the whole topic.
            print(f"Error scraping page: {result}, Error: {str(e)}")
            continue

        content = document["content"]
        # Check for uniqueness and minimum length
        if len(content) >= min_content_length and content not in unique_documents:
            unique_documents.add(content)
            documents.append(document)

    if len(documents) < num_documents:
        print(f"Only found {len(documents)} of {num_documents} requested documents for the topic: {topic}.")

    print(f"Indexing {len(documents)} unique documents for the topic: {topic}...")

    for doc in documents:
        index_document(doc)



In [5]:
if __name__ == "__main__":
    # Every topic is scraped for the same number of documents, so the mapping
    # is built from the topic names and one shared count.
    topics_and_counts = dict.fromkeys(
        (
            "Health",
            "Environment",
            "Technology",
            "Economy",
            "Entertainment",
            "Sports",
            "Politics",
            "Education",
            "Travel",
            "Food",
        ),
        500,
    )

    # Prepare Solr before any document is indexed.
    setup_solr_core()

    # Scrape and index the topics one after another, in the order listed above.
    for topic, num_documents in topics_and_counts.items():
        scrape_and_index(topic, num_documents)

    print("Scraping and indexing completed.")

Deleting the Solr core...
Creating a new Solr core...
Scraping 500 documents for the topic: Health...


KeyboardInterrupt: ignored