# Main part

In [1]:
import wikipedia
import pysolr
import re
import requests

In [2]:
CORE_NAME = "testing"
VM_IP = "34.121.33.106"

In [3]:
class Indexer:
    def __init__(self):
        self.solr_url = f'https://{VM_IP}:8983/solr/'
        self.connection = pysolr.Solr(f"http://{VM_IP}:8983/solr/" + CORE_NAME, always_commit = True, timeout = 10000)

    def create_documents(self, docs):
        print(self.connection.add(docs))

In [4]:
def clean_summary(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned_text

In [5]:
def scrape_and_index_wikipedia(main_topic, subtopics, min_unique_documents=1250, min_summary_length=200):
    documents = []
    unique_titles = set()

    # Scrape data for the main topic
    main_topic_documents = []
    search_results_main = wikipedia.search(main_topic, results=min_unique_documents)
    for page_title in search_results_main:
        try:
            page = wikipedia.page(page_title)

            if page.title not in unique_titles and len(page.summary) >= min_summary_length:
                cleaned_summary = clean_summary(page.summary)
                if cleaned_summary:
                    document = {
                        "revision_id": str(page.revision_id),
                        "title": page.title,
                        "summary": cleaned_summary,
                        "url": page.url,
                        "topic": main_topic,
                    }
                    main_topic_documents.append(document)
                    unique_titles.add(page.title)
                    print(f"Scraped {len(main_topic_documents)} unique documents for {main_topic}: {page.title}...")
        except wikipedia.exceptions.PageError as e:
            pass
        except wikipedia.exceptions.DisambiguationError as e:
            pass

    documents.extend(main_topic_documents)

    # Distribute the remaining documents equally among subtopics
    documents_per_subtopic = (min_unique_documents - len(main_topic_documents)) // len(subtopics)

    for subtopic in subtopics:
        topic = f"{main_topic} - {subtopic}"

        # Scrape data for each subtopic
        search_results_sub = wikipedia.search(subtopic, results=min_unique_documents)
        for page_title in search_results_sub:
            try:
                page = wikipedia.page(page_title)

                if page.title not in unique_titles and len(page.summary) >= min_summary_length:
                    cleaned_summary = clean_summary(page.summary)
                    if cleaned_summary:
                        document = {
                            "revision_id": str(page.revision_id),
                            "title": page.title,
                            "summary": cleaned_summary,
                            "url": page.url,
                            "topic": main_topic,
                        }
                        documents.append(document)
                        unique_titles.add(page.title)
                        print(f"Scraped {len(documents)} unique documents for {topic}: {page.title}...")
            except wikipedia.exceptions.PageError as e:
                pass
            except wikipedia.exceptions.DisambiguationError as e:
                pass

    # Index the scraped documents
    indexer = Indexer()
    print(f"Indexing {len(documents)} documents for {main_topic} in Solr...")
    indexer.create_documents(documents)

In [6]:
topics = [
    ("Health", ["Common diseases", "global health statistics", "mental health trends"]),
    ("Environment", ["Global warming", "endangered species", "deforestation rates"]),
    ("Technology", ["Emerging technologies", "Artificial intelligence", "Software"]),
    ("Economy", ["Stock market", "job markets", "cryptocurrency"]),
    ("Entertainment", ["Music industry", "cultural events", "streaming platforms"]),
    ("Sports", ["Cricket", "Football", "sports analytics"]),
    ("Politics", ["Elections", "international relations", "Political parties"]),
    ("Education", ["Literacy rates", "online education", "student loans"]),
    ("Travel", ["Top tourist destinations", "airline industry", "travel trends"]),
    ("Food", ["Famines", "global hunger", "food security"])
]

In [7]:
len(topics)

10

In [8]:
for main_topic, subtopics in topics:
    print(f"Scraping data for {main_topic}...")
    scrape_and_index_wikipedia(main_topic, subtopics, min_unique_documents=5)

Scraping data for Health...
Scraped 1 unique documents for Health: Death...
Scraped 2 unique documents for Health: Health (film)...
Scraped 3 unique documents for Health: Mental health...
Scraped 4 unique documents for Health: Health care...




  lis = BeautifulSoup(html).find_all('li')


Scraped 5 unique documents for Health - Common diseases: Common disease-common variant...
Scraped 6 unique documents for Health - Common diseases: List of causes of death by rate...
Scraped 7 unique documents for Health - Common diseases: Disease...
Scraped 8 unique documents for Health - Common diseases: Mangosteen...
Scraped 9 unique documents for Health - Common diseases: Autoimmune disease...
Scraped 10 unique documents for Health - global health statistics: Global health...
Scraped 11 unique documents for Health - global health statistics: Institute for Health Metrics and Evaluation...
Scraped 12 unique documents for Health - global health statistics: Global Burden of Disease Study...
Scraped 13 unique documents for Health - global health statistics: Global Health Corps...
Scraped 14 unique documents for Health - global health statistics: Global Health Observatory...
Scraped 15 unique documents for Health - mental health trends: Mental disorder...
Scraped 16 unique documents for H

KeyboardInterrupt: 

# Verification

In [9]:
import pysolr

CORE_NAME = "testing"
VM_IP = "34.121.33.106"

solr = pysolr.Solr(f"http://{VM_IP}:8983/solr/{CORE_NAME}", always_commit=True, timeout = 1000)

query = "*:*"

results = solr.search(query)

lst = []

print(f"Total documents found: {results.hits}")

for result in results:
    print("Document:")
    print(f"  Revision ID: {result['revision_id']}")
    print(f"  Title: {result['title']}")
    print(f"  Summary: {result['summary']}")
    print(f"  URL: {result['url']}")
    print(f"  Topic: {result['topic']}")
    print("\n")
    lst.append(result['topic'])

Total documents found: 19
Document:
  Revision ID: [1187205041]
  Title: ['Death']
  Summary: ['Death is the irreversible cessation of all biological functions that sustain an organism For organisms with a brain death can also be defined as the irreversible cessation of functioning of the whole brain including the brainstem  Brain death is sometimes used as a legal definition of death The remains of a former organism normally begins to decompose shortly after death Death is an inevitable process that eventually occurs in all organisms Some organisms such as Turritopsis dohrnii are biologically immortal However they can still die from means other than agingDetermining when someone has definitively died has proven difficult Initially death was defined as occurring when breathing and the heartbeat ceased a status still known as clinical death However the development of CPR meant it was no longer strictly irreversible Brain death was the next option but several definitions exist for this S

# JSON creataion

In [16]:
import pysolr
import json

CORE_NAME = "sample_test"
VM_IP = "34.121.33.106"

solr = pysolr.Solr(f"http://{VM_IP}:8983/solr/{CORE_NAME}", always_commit = True, timeout = 100000)

start = 0
rows = 100

documents = []

while True:

    query = "*:*"

    results = solr.search(query, start = start, rows = rows)

    for result in results:
        document = {
            "revision_id": result['revision_id'],
            "title": result['title'],
            "summary": result['summary'],
            "url": result['url'],
            "topic": result['topic']
        }
        documents.append(document)

    if len(results) < rows:
        break

    start += rows

json_file_path = "indexed_documents.json"

with open(json_file_path, "w", encoding="utf-8") as json_file:
    json.dump(documents, json_file, ensure_ascii=False, indent=4)

print(f"Indexed documents saved to {json_file_path}")

Indexed documents saved to indexed_documents.json
