In [1]:
# Search seeds for the scraper: each top-level topic maps to the list of
# sub-topic strings that are used as Wikipedia search queries for that topic.
topics = {
    "Health": [
        "Medicine",
        "Nutrition",
        "Mental Health Trends",
        "Epidemiology",
        "Vaccination",
        "Health Insurance",
        "Genetic Disorders",
        "Pharmaceutical Industry",
        "Global Health Organizations",
        "Medical Devices",
        "Chronic Illnesses",
        "Addiction Medicine",
        "Sleep Disorders",
    ],
    "Environment": [
        "Climate Change",
        "Pollution",
        "Recycling",
        "Deforestation",
        "Endangered Species",
        "Natural Disasters",
        "Sustainable Agriculture",
        "Renewable Energy",
        "Ecotourism",
        "Oceans and Marine Life",
        "Conservation Biology",
        "Carbon Footprint",
        "Volcanoes",
    ],
    "Technology": [
        "Artificial Intelligence",
        "Machine Learning",
        "Cybersecurity",
        "Quantum Computing",
        "5G",
        "Blockchain",
        "Augmented Reality",
        "Internet of Things",
        "Biotechnology",
        "Nanotechnology",
        "Space Exploration Technologies",
        "Computer Vision",
        "Human-Computer Interaction",
    ],
    "Economy": [
        "Stock Market",
        "Inflation",
        "Unemployment",
        "GDP",
        "Consumer Price Index",
        "Interest Rates",
        "Microeconomics",
        "Macroeconomics",
        "Income Inequality",
        "Housing Market",
        "Fiscal Policy",
        "Venture Capital",
        "Labor Market Trends",
    ],
    "Entertainment": [
        "Music",
        "E Sports",
        "Video Games",
        "Artists",
        "Youtube",
        "Streaming Services",
        "Netflix",
        "Film Festivals",
        "Celebrity Culture",
        "Pop Culture",
        "Theater and Performing Arts",
        "Stand-up Comedy",
        "Animation",
        "Reality TV",
        "Anime",
        "Virtual Reality",
    ],
    "Sports": [
        "Football",
        "Basketball",
        "Soccer",
        "Baseball",
        "Hockey",
        "Tennis",
        "Golf",
        "Sports Events",
        "Olympics",
        "Martial Arts",
        "Winter Sports",
        "Extreme Sports",
        "NFL",
        "NBA",
    ],
    "Politics": [
        "Election",
        "Public Policy",
        "Political Parties",
        "Government",
        "Political Leaders",
        "Political Movements",
        "International Relations",
        "Constitutional Law",
        "Political Ideologies",
        "Diplomacy",
        "Human Rights",
        "Global Governance",
        "National Security",
        "Political Theories",
        "Voting Systems",
        "Civic Participation",
        "Policy Analysis",
        "Civil Liberties",
        "Geopolitical Conflicts",
        "Regional Alliances",
    ],
    "Education": [
        "Literacy Rate",
        "Masters Degree",
        "Universities",
        "Online Learning",
        "Education Statistics",
        "University at Buffalo",
        "Women In STEM",
        "Philosophy",
        "Physics",
        "Mathematics",
    ],
    "Travel": [
        "Tourists",
        "Airline Industry",
        "Railway",
        "Cruise Ships",
        "Travel Insurance",
        "Backpacking Culture",
        "Travel Blogs",
        "Adventure Tourism",
        "Travel Photography",
        "National Parks",
        "Visa Policies",
        "International Travel Regulations",
    ],
    "Food": [
        "Indian Food",
        "Spices",
        "Street Food",
        "Fast Food",
        "Nutrition Science",
        "Dietary Supplements",
        "Food Preservation",
        "Ethnic Cuisines",
        "Baking and Pastry",
        "Food Allergies",
        "Healthy Eating",
        "Fermentation Techniques",
        "Food and Culture",
    ],
}

In [15]:
import wikipedia
import time
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import logging

# Send log records at INFO level and above to scraping.log; each line carries
# the timestamp, level, message and the name of the thread that emitted it.
logging.basicConfig(
    filename='scraping.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s %(threadName)s',
)


def getTopicPages(searchQueries: list, topicName: str, nPages=1200) -> list:
    """Collect up to ``nPages`` unique Wikipedia pages for one topic.

    Every query in ``searchQueries`` is passed to ``wikipedia.search`` with an
    equal share of the page budget (``nPages // len(searchQueries)``, at least
    1). Each search hit is then fetched on a worker thread. A page is dropped
    when its summary is shorter than 200 characters or when its URL has already
    been collected during this call.

    Args:
        searchQueries: Sub-topic search strings for this topic.
        topicName: Label stored under the ``"topic"`` key of every page dict.
        nPages: Maximum number of pages to return.

    Returns:
        A list of dicts with the keys ``revision_id``, ``title``, ``url``,
        ``summary``, ``topic`` and ``content``. The list is empty when there
        are no queries, when ``nPages`` is not positive, or when an unexpected
        exception aborts the run (the exception is logged, not raised).
    """
    # Nothing to do; also avoids dividing by zero when computing the per-query share.
    if not searchQueries or nPages <= 0:
        logging.warning(f"No search queries or non-positive page limit for topic '{topicName}'")
        return []

    pages = []
    resultsPerSubtopic = max(1, nPages // len(searchQueries))
    uniquePagesUrls = set()
    # Guards uniquePagesUrls, which is read and written by the worker threads.
    lock = threading.Lock()

    def processSearchResult(searchResult):
        """Fetch one page by title; return its data dict, or None if it is rejected."""
        pageData = {}
        disambiguationErrorCount = 0
        retry_attempts = 0
        max_retries = 4

        while retry_attempts < max_retries:
            try:
                page = wikipedia.page(searchResult, auto_suggest=False, preload=True)
                time.sleep(0.25)  # small pause between requests
                if len(page.summary) < 200:
                    logging.info(f"Summary too short for {searchResult}")
                    return None
                with lock:
                    if page.url in uniquePagesUrls:
                        logging.info(f"Page already added: {page.title}")
                        return None
                    uniquePagesUrls.add(page.url)
                pageData["revision_id"] = page.revision_id
                pageData["title"] = page.title
                pageData["url"] = page.url
                pageData["summary"] = page.summary
                pageData["topic"] = topicName
                pageData["content"] = page.content
                logging.info(f"Page added: {pageData['title']}")
                return pageData

            except (TimeoutError, ConnectionError, OSError) as e:
                logging.warning(f"TimeoutError on Page: {searchResult}, Error: {e}")
                retry_attempts += 1
                # Exponential back-off (1s, 2s, 4s); no point sleeping after the last attempt.
                if retry_attempts < max_retries:
                    time.sleep(2 ** (retry_attempts - 1))

            except wikipedia.exceptions.DisambiguationError as e:
                disambiguationErrorCount += 1
                logging.warning(f"DisambiguationError on Page: {searchResult}, Options: {e.options}")
                if disambiguationErrorCount > 3:
                    logging.warning(f"DisambiguationError count exceeded 3 for {searchResult}")
                    return None
                if not e.options:
                    # No candidate to fall back to; indexing here would raise IndexError.
                    return None
                # Retry with the first suggested title.
                searchResult = e.options[0]

            except wikipedia.exceptions.PageError as e:
                # logging.error(f"PageError on Page: {searchResult}, Error: {e}")
                return None

            except Exception as e:
                logging.error(f"Unexpected error on Page: {searchResult}, Error: {e}")
                return None

        logging.error(f"Failed to process {searchResult} after {max_retries} retries")
        return None

    try:
        with ThreadPoolExecutor(max_workers=12) as executor:
            futures = []
            for searchQuery in searchQueries:
                try:
                    searchResults = wikipedia.search(searchQuery, results=resultsPerSubtopic)
                except Exception as e:
                    logging.error(f"Exception during search for query '{searchQuery}' in topic '{topicName}': {e}")
                    continue

                logging.info(f"{len(searchResults)} results found for '{searchQuery}' in topic '{topicName}'")

                # dict.fromkeys removes duplicate titles while keeping the search order.
                for searchResult in dict.fromkeys(searchResults):
                    futures.append(executor.submit(processSearchResult, searchResult))

            # Collect results as they complete. `pages` is only touched by this
            # thread, so it needs no lock.
            for future in as_completed(futures):
                try:
                    result = future.result()
                except Exception as e:
                    logging.error(f"Exception in processing future in topic '{topicName}': {e}")
                    continue

                if result is not None:
                    pages.append(result)

                if len(pages) >= nPages:
                    # Budget reached: drop the fetches that have not started yet so
                    # leaving the executor block does not wait for work we will discard.
                    for pending in futures:
                        pending.cancel()
                    break

        logging.info(f"Total pages collected for topic '{topicName}': {len(pages)}")
        return pages

    except Exception as e:
        logging.error(f"Exception in getTopicPages for topic '{topicName}': {e}")
        return []

def scrapeAndSave(topicMap=None, outputPath="data5.json"):
    """Scrape pages for every topic concurrently and write them to a JSON file.

    One ``getTopicPages`` call is submitted per topic to a pool of three worker
    threads. Topics that yield no pages, or whose scrape raises, are logged and
    left out of the result.

    Args:
        topicMap: Mapping of topic name to its list of search queries. When
            omitted, the module-level ``topics`` dict is used.
        outputPath: Path of the JSON file to write.

    Returns:
        A dict mapping each topic name to the list of page dicts collected for
        it. The same dict is what gets written to ``outputPath``.
    """
    if topicMap is None:
        topicMap = topics

    dataDict = dict()
    with ThreadPoolExecutor(max_workers=3) as executor:
        future_to_topic = {
            executor.submit(getTopicPages, subtopics, topicName): topicName
            for topicName, subtopics in topicMap.items()
        }
        for future in as_completed(future_to_topic):
            topicName = future_to_topic[future]
            try:
                pages = future.result()
            except Exception as exc:
                logging.error(f"Exception in scrapeAndSave for topic '{topicName}': {exc}")
                continue

            if pages:
                dataDict[topicName] = pages
                logging.info(f"Collected {len(pages)} pages for topic '{topicName}'")
            else:
                logging.warning(f"No pages collected for topic '{topicName}'")

    # json.dump escapes non-ASCII by default; the explicit encoding just keeps
    # the file independent of the platform's locale.
    with open(outputPath, "w", encoding="utf-8") as f:
        json.dump(dataDict, f, indent=4)

    return dataDict

# Start the scrape only when this code is executed as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    scrapeAndSave()



  lis = BeautifulSoup(html).find_all('li')


In [None]:
import json
# Read the scraped pages back in: topic name -> list of page dicts.
with open("data.json", "r") as fh:
    data = json.load(fh)

# De-duplicate by URL across ALL topics (the first topic to list a URL keeps
# the page) and keep only pages whose summary is longer than 200 characters.
urls = set()
for topic in data.keys():
    keptPages = []
    for page in data[topic]:
        # Skip repeats first; the summary is only inspected for unseen URLs.
        if page["url"] in urls or len(page["summary"]) <= 200:
            continue
        keptPages.append(page)
        urls.add(page["url"])
    data[topic] = keptPages

# Report how many pages survived for each topic.
for topic, pages in data.items():
    print("Number of pages Unique for ", topic, " : ", len(pages))


# Check for number of pages with summary less than 200 characters for each topic
# NOTE(review): only the per-topic counter reset is visible in this view. The
# recorded output prints a count for every topic, so the counting and printing
# statements appear to be missing here -- confirm against the original notebook.

for topic in data.keys():
    count = 0  # tally for the current topic, reset on every iteration


Number of pages Unique for  Economy  :  5305
Number of pages Unique for  Health  :  5649
Number of pages Unique for  Environment  :  5277
Number of pages Unique for  Technology  :  5559
Number of pages Unique for  Entertainment  :  5764
Number of pages Unique for  Sports  :  5844
Number of pages Unique for  Travel  :  5041
Number of pages Unique for  Education  :  5325
Number of pages Unique for  Politics  :  5264
Number of pages Unique for  Food  :  5209
Number of pages with summary less than 200 characters for  Economy  :  0
Number of pages with summary less than 200 characters for  Health  :  0
Number of pages with summary less than 200 characters for  Environment  :  0
Number of pages with summary less than 200 characters for  Technology  :  0
Number of pages with summary less than 200 characters for  Entertainment  :  0
Number of pages with summary less than 200 characters for  Sports  :  0
Number of pages with summary less than 200 characters for  Travel  :  0
Number of pages wit