## Web Scrpaing

### Import Necessary Libraries

In [1]:
import pandas as pd
import wikipedia
import time
import requests
from bs4 import BeautifulSoup, FeatureNotFound

# UB IT name based user agent
user = "amuthusu/IR (https://www.linkedin.com/in/anirudhmuthusundaram/; anirudhms247@gmail.com)"

# User agent Assignment
wikipedia.set_user_agent(user)

# API endpoint
url = "https://www.mediawiki.org/w/api.php"
headers = {
    "User-Agent": user
}

### Health

In [10]:
# Define the topics and subtopics
title = {
    'Health': [
        'Common diseases',
        'Global health statistics',
        'Mental health trends',
        'Cardiovascular health',
        'Diabetes management',
        'Cancer types and treatments',
        'Childhood vaccinations',
        'Epidemiology of infectious diseases',
        'Nutrition and dietetics',
        'Mental disorders and therapies',
        'Public health policies',
        'Healthcare systems worldwide',
        'Women’s health issues',
        'Men’s health issues',
        'Pediatric health and diseases',
        'Geriatric care and aging',
        'Dental health and hygiene',
        'Eye health and vision care',
        'Orthopedics and musculoskeletal disorders',
        'Dermatological conditions and skincare',
        'Neurological disorders',
        'Gastrointestinal diseases',
        'Respiratory diseases and treatment',
        'Endocrinology and hormonal disorders',
        'Physical fitness and exercise',
        'Mental wellness and mindfulness',
        'Alternative and complementary medicine',
        'Pharmacology and medication',
        'Environmental health and pollution',
        'Occupational health and safety',
        'Global pandemics and outbreaks',
        'Vaccination and immunization',
        'First aid and emergency medicine',
        'Radiology and imaging',
        'Surgical advancements and techniques',
        'Telemedicine and digital health',
        'Addiction and rehabilitation',
        'Reproductive health and fertility',
        'Healthcare technology and innovations',
        'Public health education and awareness',
        'Health insurance and economics',
        'Epidemics and their control',
        'Sleep health and disorders',
        'Allergies and autoimmune diseases',
        'Health effects of climate change',
        'Genetics in medicine',
        'Global health initiatives'
    ]
}

# Initialize an empty list to store the scraped data
data_list = []

# Each and Every Heading and Side Headings related to the title gets looped.
# Webpages with unique titles and Summary over 250 characters are chosen.
# More than 5600 documents are set as a limit to extract information.
# Webpages are appended onto a list that later can be used to get a DataFrame.
# Wikipedia Disambiguation, HTTP timeout and Page errors are handled to enhance the process.
for heading, side_heading in title.items():
    short_summary_count = 0
    unique_titles = set()

    for iterate in side_heading:
        wikipedia.set_lang("en")
        search_results = wikipedia.search(iterate, results=5600)
        for webpage_heading in search_results:
            retry_count = 0
            while retry_count < 3:  # Retry up to 3 times
                try:
                    if webpage_heading in unique_titles:
                        break
                    page = wikipedia.page(webpage_heading)
                    summary = page.summary
                    if len(summary) >= 250:
                        data_list.append({
                            'Topic': heading,
                            'Title': page.title,
                            'Summary': summary,
                            'URL': page.url,
                            'Revision ID': page.revision_id  
                        })
                        unique_titles.add(webpage_heading)
                    break  # Exit retry loop if successful
                except wikipedia.exceptions.DisambiguationError:
                    break  # No retry for disambiguation errors
                except wikipedia.exceptions.PageError:
                    print(f"PageError for title: {webpage_heading}")
                    break  # No retry for page errors
                except (wikipedia.exceptions.HTTPTimeoutError, requests.exceptions.ConnectionError):
                    print(f"Error for title: {webpage_heading}, retrying ({retry_count + 1}/3)")
                    time.sleep(5)  # Wait 5 seconds before retrying
                    retry_count += 1
                except Exception as e:
                    print(f"Unhandled exception for title: {webpage_heading}: {e}")
                    break  # Exit retry loop for any other exception

            if len(unique_titles) >= 5600:
                break

        if len(unique_titles) >= 5600:
            break


PageError for title: Common cold
PageError for title: Pogona
PageError for title: Nonsense mutation
PageError for title: Infection
PageError for title: Tetter
PageError for title: Lupus
PageError for title: Nobel disease
PageError for title: ALS
PageError for title: Chagas disease
PageError for title: Bronchus
PageError for title: Human body
PageError for title: Canavan disease
PageError for title: Batten disease
PageError for title: List of rye diseases
PageError for title: Diseases of the foot
PageError for title: Kuru (disease)
PageError for title: Enteritis
PageError for title: Tomato
PageError for title: Atypical pneumonia
PageError for title: Gallstone
PageError for title: Longan
PageError for title: Common nighthawk
PageError for title: Herd immunity
PageError for title: Canada
PageError for title: Sustainable Development Goal 3
PageError for title: Average human height by country
PageError for title: Statistics of the COVID-19 pandemic in Brazil
PageError for title: Allied heal

In [11]:
# Create the CSV from the DataFrame
h_df = pd.DataFrame(data_list)
csv_file_name = "./Data/Health.csv"
h_df.to_csv(csv_file_name, index=False)

# Check with the csv file.
Health = pd.read_csv("./Data/Health.csv")
Health.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5600 entries, 0 to 5599
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        5600 non-null   object
 1   Title        5600 non-null   object
 2   Summary      5600 non-null   object
 3   URL          5600 non-null   object
 4   Revision ID  5600 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 218.9+ KB


### Environment

In [2]:
# Define the topics and subtopics
title = {
    'Environment': [
        'Global warming', 'Endangered species', 'Deforestation rates',
        'Climate change effects', 'Sea level rise', 'Ocean acidification',
        'Carbon footprint reduction', 'Renewable energy sources', 'Greenhouse gas emissions',
        'Wildlife conservation', 'Habitat loss', 'Invasive species',
        'Biodiversity hotspots', 'Pollinator decline', 'Marine life protection',
        'Plastic pollution', 'Air quality and smog', 'Water pollution',
        'Hazardous waste disposal', 'Recycling and upcycling', 'E-waste management',
        'Sustainable agriculture', 'Water scarcity and management', 'Fossil fuels vs. renewable energy',
        'Deforestation and reforestation', 'Soil degradation', 'Solar and wind energy advancements',
        'Environmental laws and regulations', 'Corporate sustainability practices', 'Environmental activism',
        'Impact of urbanization', 'Eco-friendly products and innovations', 'Environmental education and awareness',
        'Amazon rainforest conservation', 'Great Barrier Reef deterioration', 'Arctic ice melt',
        'Air quality in major cities', 'Effects of specific environmental disasters', 'Conservation efforts in specific countries',
        'Climate change predictions', 'Future of renewable energy', 'Long-term conservation strategies',
        'Environmental impact of technology', 'Sustainable urban planning', 'Population growth and environmental impact'
    ],
}
# Initialize an empty list to store the scraped data
data_list = []

# Each and Every Heading and Side Headings related to the title gets looped.
# Webpages with unique titles and Summary over 250 characters are chosen.
# More than 5100 documents are set as a limit to extract information.
# Webpages are appended onto a list that later can be used to get a DataFrame.
# Wikipedia Disambiguation, HTTP timeout and Page errors are handled to enhance the process.
for heading, side_heading in title.items():
    short_summary_count = 0
    unique_titles = set()

    for iterate in side_heading:
        wikipedia.set_lang("en")
        search_results = wikipedia.search(iterate, results=5600)
        for webpage_heading in search_results:
            retry_count = 0
            while retry_count < 3:  # Retry up to 3 times
                try:
                    if webpage_heading in unique_titles:
                        break
                    page = wikipedia.page(webpage_heading)
                    summary = page.summary
                    if len(summary) >= 250:
                        data_list.append({
                            'Topic': heading,
                            'Title': page.title,
                            'Summary': summary,
                            'URL': page.url,
                            'Revision ID': page.revision_id  
                        })
                        unique_titles.add(webpage_heading)
                    break  # Exit retry loop if successful
                except wikipedia.exceptions.DisambiguationError:
                    break  # No retry for disambiguation errors
                except wikipedia.exceptions.PageError:
                    print(f"PageError for title: {webpage_heading}")
                    break  # No retry for page errors
                except (wikipedia.exceptions.HTTPTimeoutError, requests.exceptions.ConnectionError):
                    print(f"Error for title: {webpage_heading}, retrying ({retry_count + 1}/3)")
                    time.sleep(5)  # Wait 5 seconds before retrying
                    retry_count += 1
                except Exception as e:
                    print(f"Unhandled exception for title: {webpage_heading}: {e}")
                    break  # Exit retry loop for any other exception

            if len(unique_titles) >= 5600:
                break

        if len(unique_titles) >= 5600:
            break



  lis = BeautifulSoup(html).find_all('li')


PageError for title: Drought
PageError for title: Indus River
PageError for title: Climate of Mars
PageError for title: Arctic
PageError for title: John Hagee
PageError for title: Polar bear
PageError for title: James Spann
PageError for title: Frank Luntz
PageError for title: Environmental issues
PageError for title: Maximum Ride
PageError for title: Deforestation
PageError for title: Hurricane Sandy
PageError for title: El Niño
PageError for title: Rat snake
PageError for title: Richard Branson
PageError for title: An Inconvenient Truth
PageError for title: Greta Thunberg
PageError for title: Steven Tyler
PageError for title: Environmental science
PageError for title: R-410A
PageError for title: Shale gas
PageError for title: Titanic II (film)
PageError for title: Pollution
PageError for title: Water scarcity
PageError for title: R-454B
PageError for title: Al Gore
PageError for title: Swami Sundaranand
PageError for title: City
PageError for title: The Washington Times
PageError for

In [3]:
# Create the CSV from the DataFrame
env_df = pd.DataFrame(data_list)
csv_file_name = "./Data/Environment.csv"
env_df.to_csv(csv_file_name, index=False)

# Check with the csv file.
environment = pd.read_csv("./Data/Environment.csv")
environment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5600 entries, 0 to 5599
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        5600 non-null   object
 1   Title        5600 non-null   object
 2   Summary      5600 non-null   object
 3   URL          5600 non-null   object
 4   Revision ID  5600 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 218.9+ KB


### Technology

In [4]:
# Define the topics and subtopics
title = {
    'Technology': [
        'Emerging technologies', 'AI advancements', 'Bitcoin',
        'Blockchain applications', 'Quantum computing', '5G technology',
        'Virtual reality (VR) developments', 'Augmented reality (AR) trends', 'Internet of Things (IoT)',
        'Big data analytics', 'Cloud computing', 'Cybersecurity trends',
        'Biotechnology breakthroughs', 'Nanotechnology', 'Renewable energy technology',
        'Electric vehicles', 'Autonomous driving', 'Space exploration technology',
        'Robotics and automation', 'Wearable technology', 'Smart home devices',
        '3D printing applications', 'Artificial neural networks', 'Machine learning algorithms',
        'Digital currency trends', 'Fintech innovations', 'Smart cities',
        'Telemedicine and e-health', 'Sustainable tech solutions', 'Tech industry ethics',
        'Data privacy and protection', 'Social media technology', 'Tech startups and entrepreneurship',
        'E-commerce technology trends', 'Gaming technology advancements', 'Streaming media technology'
    ],
}


# Initialize an empty list to store the scraped data
data_list = []

# Each and Every Heading and Side Headings related to the title gets looped.
# Webpages with unique titles and Summary over 250 characters are chosen.
# More than 5100 documents are set as a limit to extract information.
# Webpages are appended onto a list that later can be used to get a DataFrame.
# Wikipedia Disambiguation, HTTP timeout and Page errors are handled to enhance the process.
for heading, side_heading in title.items():
    short_summary_count = 0
    unique_titles = set()

    for iterate in side_heading:
        wikipedia.set_lang("en")
        search_results = wikipedia.search(iterate, results=5600)
        for webpage_heading in search_results:
            retry_count = 0
            while retry_count < 3:  # Retry up to 3 times
                try:
                    if webpage_heading in unique_titles:
                        break
                    page = wikipedia.page(webpage_heading)
                    summary = page.summary
                    if len(summary) >= 250:
                        data_list.append({
                            'Topic': heading,
                            'Title': page.title,
                            'Summary': summary,
                            'URL': page.url,
                            'Revision ID': page.revision_id  
                        })
                        unique_titles.add(webpage_heading)
                    break  # Exit retry loop if successful
                except wikipedia.exceptions.DisambiguationError:
                    break  # No retry for disambiguation errors
                except wikipedia.exceptions.PageError:
                    print(f"PageError for title: {webpage_heading}")
                    break  # No retry for page errors
                except (wikipedia.exceptions.HTTPTimeoutError, requests.exceptions.ConnectionError):
                    print(f"Error for title: {webpage_heading}, retrying ({retry_count + 1}/3)")
                    time.sleep(5)  # Wait 5 seconds before retrying
                    retry_count += 1
                except Exception as e:
                    print(f"Unhandled exception for title: {webpage_heading}: {e}")
                    break  # Exit retry loop for any other exception

            if len(unique_titles) >= 5600:
                break

        if len(unique_titles) >= 5600:
            break

PageError for title: Transhumanism
PageError for title: Special Competitive Studies Project




  lis = BeautifulSoup(html).find_all('li')


PageError for title: 3D printing
PageError for title: Wendell Wallach
PageError for title: Daphne Koller
PageError for title: SD-WAN
PageError for title: Loitering munition
PageError for title: Timeline of historic inventions
PageError for title: Bioconvergence
PageError for title: Vertical farming
PageError for title: Society 5.0
PageError for title: Superhuman
PageError for title: Accenture
PageError for title: Gavin Andresen
PageError for title: 3CX
PageError for title: Designer baby
PageError for title: Anthony Dunne
PageError for title: Upgrade
PageError for title: Flying car
PageError for title: Schmidt Futures
Error for title: Bone ash, retrying (1/3)
PageError for title: Nanoengineering
PageError for title: Transrapid
PageError for title: DevOps
PageError for title: Metaliteracy
PageError for title: Nick Bostrom
PageError for title: Beggars in Spain
PageError for title: AI boom
PageError for title: ServiceNow
PageError for title: Generative artificial intelligence
PageError for

In [5]:
# Create the CSV from the DataFrame
tech_df = pd.DataFrame(data_list)
csv_file_name = "./Data/Technology.csv"
tech_df.to_csv(csv_file_name, index=False)

# Check with the csv file.
Technology = pd.read_csv("./Data/Technology.csv")
Technology.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5600 entries, 0 to 5599
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        5600 non-null   object
 1   Title        5600 non-null   object
 2   Summary      5600 non-null   object
 3   URL          5600 non-null   object
 4   Revision ID  5600 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 218.9+ KB


## Economy

In [6]:
# Define the topics and subtopics
title = {
    'Economy': [
        'Stock market performance', 'Job markets', 'Cryptocurrency trends',
        'Global economic growth', 'Inflation rates', 'Unemployment trends',
        'Foreign exchange markets', 'Commodity prices', 'Real estate market trends',
        'Central bank policies', 'Fiscal policy', 'Monetary policy',
        'Economic impacts of COVID-19', 'E-commerce growth', 'Gig economy',
        'Trade wars and tariffs', 'Emerging market economies', 'Economic inequality',
        'Sustainable economic development', 'Consumer spending trends', 'Retail industry analysis',
        'Technology impact on economy', 'Green economy', 'Economic forecasting',
        'Business cycle analysis', 'Supply chain disruptions', 'Labor market dynamics',
        'Global investment trends', 'Financial regulation', 'Public debt and deficits',
        'Entrepreneurship trends', 'Venture capital markets', 'Corporate mergers and acquisitions',
        'Economic indicators', 'Energy market dynamics', 'Healthcare economics'
    ],
}

# Initialize an empty list to store the scraped data
data_list = []

# Each and Every Heading and Side Headings related to the title gets looped.
# Webpages with unique titles and Summary over 250 characters are chosen.
# More than 5100 documents are set as a limit to extract information.
# Webpages are appended onto a list that later can be used to get a DataFrame.
# Wikipedia Disambiguation, HTTP timeout and Page errors are handled to enhance the process.
for heading, side_heading in title.items():
    short_summary_count = 0
    unique_titles = set()

    for iterate in side_heading:
        wikipedia.set_lang("en")
        search_results = wikipedia.search(iterate, results=5600)
        for webpage_heading in search_results:
            retry_count = 0
            while retry_count < 3:  # Retry up to 3 times
                try:
                    if webpage_heading in unique_titles:
                        break
                    page = wikipedia.page(webpage_heading)
                    summary = page.summary
                    if len(summary) >= 250:
                        data_list.append({
                            'Topic': heading,
                            'Title': page.title,
                            'Summary': summary,
                            'URL': page.url,
                            'Revision ID': page.revision_id  
                        })
                        unique_titles.add(webpage_heading)
                    break  # Exit retry loop if successful
                except wikipedia.exceptions.DisambiguationError:
                    break  # No retry for disambiguation errors
                except wikipedia.exceptions.PageError:
                    print(f"PageError for title: {webpage_heading}")
                    break  # No retry for page errors
                except (wikipedia.exceptions.HTTPTimeoutError, requests.exceptions.ConnectionError):
                    print(f"Error for title: {webpage_heading}, retrying ({retry_count + 1}/3)")
                    time.sleep(5)  # Wait 5 seconds before retrying
                    retry_count += 1
                except Exception as e:
                    print(f"Unhandled exception for title: {webpage_heading}: {e}")
                    break  # Exit retry loop for any other exception

            if len(unique_titles) >= 5600:
                break

        if len(unique_titles) >= 5600:
            break



  lis = BeautifulSoup(html).find_all('li')


PageError for title: S&P 500
PageError for title: Russell 3000 Index
PageError for title: Russell 1000 Index
PageError for title: Nasdaq-100
PageError for title: Danone
PageError for title: Abnormal return
PageError for title: ISO 9000
PageError for title: KOSPI
PageError for title: Nifty Fifty
PageError for title: Wilshire 5000
PageError for title: CAC 40
PageError for title: Money market
PageError for title: P/B ratio
PageError for title: Dow Jones
PageError for title: Russell 2500 Index
PageError for title: Rights issue
PageError for title: EURO STOXX 50
PageError for title: S&P 600
PageError for title: PTC India
PageError for title: AEX index
PageError for title: Jerome Powell
PageError for title: Jordan Belfort
PageError for title: Hero MotoCorp
PageError for title: Hedge fund
PageError for title: Smith & Wesson M&P15
PageError for title: PSI-20
PageError for title: Ray Dalio
PageError for title: Bill Ackman
PageError for title: Amer Sports
PageError for title: Maxar Technologies


In [7]:
# Create the CSV from the DataFrame
eco_df = pd.DataFrame(data_list)
csv_file_name = "./Data/Economy.csv"
eco_df.to_csv(csv_file_name, index=False)

# Check with the csv file.
Economy = pd.read_csv("./Data/Economy.csv")
Economy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5600 entries, 0 to 5599
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        5600 non-null   object
 1   Title        5600 non-null   object
 2   Summary      5600 non-null   object
 3   URL          5600 non-null   object
 4   Revision ID  5600 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 218.9+ KB


## Entertainment

In [8]:
# Define the topics and subtopics
title = {
    'Entertainment': [
        'Music industry', 'Popular cultural events', 'Streaming platforms',
        'Film industry trends', 'Television series popularity', 'Celebrity influence',
        'Video gaming industry', 'Virtual concerts and events', 'Social media influencers',
        'Theater and Broadway shows', 'Reality TV impact', 'Podcasting trends',
        'Book publishing industry', 'Comic books and graphic novels', 'Animation and CGI advancements',
        'Fashion trends and events', 'Art exhibitions and galleries', 'Stand-up comedy and live shows',
        'Sports entertainment', 'Award shows and ceremonies', 'Fan culture and fandoms',
        'Dance and performance arts', 'Photography in the digital age', 'Mobile gaming',
        'VR/AR in entertainment', 'Esports growth', 'Influencer marketing',
        'Cultural festivals', 'Film festivals and screenings', 'Music streaming trends',
        'Digital art and NFTs', 'Youth culture and entertainment', 'Classic vs. modern entertainment',
        'Interactive entertainment', 'Cable TV vs. streaming wars', 'Celebrity fashion and style trends'
    ],
}

# Initialize an empty list to store the scraped data
data_list = []

# Each and Every Heading and Side Headings related to the title gets looped.
# Webpages with unique titles and Summary over 250 characters are chosen.
# More than 5100 documents are set as a limit to extract information.
# Webpages are appended onto a list that later can be used to get a DataFrame.
# Wikipedia Disambiguation, HTTP timeout and Page errors are handled to enhance the process.
for heading, side_heading in title.items():
    short_summary_count = 0
    unique_titles = set()

    for iterate in side_heading:
        wikipedia.set_lang("en")
        search_results = wikipedia.search(iterate, results=5600)
        for webpage_heading in search_results:
            retry_count = 0
            while retry_count < 3:  # Retry up to 3 times
                try:
                    if webpage_heading in unique_titles:
                        break
                    page = wikipedia.page(webpage_heading)
                    summary = page.summary
                    if len(summary) >= 250:
                        data_list.append({
                            'Topic': heading,
                            'Title': page.title,
                            'Summary': summary,
                            'URL': page.url,
                            'Revision ID': page.revision_id  
                        })
                        unique_titles.add(webpage_heading)
                    break  # Exit retry loop if successful
                except wikipedia.exceptions.DisambiguationError:
                    break  # No retry for disambiguation errors
                except wikipedia.exceptions.PageError:
                    print(f"PageError for title: {webpage_heading}")
                    break  # No retry for page errors
                except (wikipedia.exceptions.HTTPTimeoutError, requests.exceptions.ConnectionError):
                    print(f"Error for title: {webpage_heading}, retrying ({retry_count + 1}/3)")
                    time.sleep(5)  # Wait 5 seconds before retrying
                    retry_count += 1
                except Exception as e:
                    print(f"Unhandled exception for title: {webpage_heading}: {e}")
                    break  # Exit retry loop for any other exception

            if len(unique_titles) >= 5600:
                break

        if len(unique_titles) >= 5600:
            break



  lis = BeautifulSoup(html).find_all('li')


PageError for title: Music
PageError for title: Record producer
PageError for title: Pop music
PageError for title: Tim Farriss
PageError for title: 2023 in Latin music
PageError for title: Andrew Farriss
PageError for title: G Flip
PageError for title: Music of Turkey
PageError for title: C-pop
PageError for title: MTV
PageError for title: Angèle (singer)
PageError for title: NF (rapper)
PageError for title: Tina Arena
PageError for title: Sony Music
PageError for title: Blackout Tuesday
PageError for title: Simon Cowell
PageError for title: Goa trance
PageError for title: Tommy Emmanuel
PageError for title: Midnight Oil
PageError for title: Kris Kross
PageError for title: 1989 (album)
PageError for title: MAMA Awards
PageError for title: Music publisher
PageError for title: Thom Yorke
PageError for title: Tones and I
PageError for title: Russell Simmons
PageError for title: Taylor Swift
PageError for title: Alicia Keys
PageError for title: UK Singles Chart
PageError for title: Martin

In [9]:
# Create the CSV from the DataFrame
entr_df = pd.DataFrame(data_list)
csv_file_name = "./Data/Entertainment.csv"
entr_df.to_csv(csv_file_name, index=False)

# Check with the csv file.
Entertainment = pd.read_csv("./Data/Entertainment.csv")
Entertainment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5600 entries, 0 to 5599
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        5600 non-null   object
 1   Title        5600 non-null   object
 2   Summary      5600 non-null   object
 3   URL          5600 non-null   object
 4   Revision ID  5600 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 218.9+ KB


## Sports

In [10]:
# Define the topics and subtopics
title = {
    'Sports': [
        'Cricket','CricketT20', 'Cricket World Cup'
        'Major sporting events', 'Sports analytics', 'Sports history',
        'Olympic Games developments', 'World Cup soccer', 'Tennis Grand Slams',
        'NBA and basketball trends', 'NFL and American football updates', 'MLB and baseball news',
        'NHL and hockey insights', 'Golf tournaments', 'Boxing and MMA events',
        'Track and field athletics', 'Swimming competitions', 'Gymnastics championships',
        'Extreme sports', 'E-sports and competitive gaming', 'Cricket updates',
        'Rugby league and union', 'Formula 1 and motorsports', 'Cycling events like Tour de France',
        'Winter sports and skiing', 'Marathons and endurance races', 'Horse racing',
        'Sports nutrition and fitness trends', 'Sports psychology', 'Youth and college sports',
        'Sports technology advancements', 'Sports medicine and injuries', 'Women in sports',
        'Sports governance and policies', 'International sports diplomacy', 'Sports and environmental sustainability',
        'Athlete endorsements and sponsorships', 'Sports broadcasting and media rights', 'Fan cultures and sports communities'
    ],
}


# Initialize an empty list to store the scraped data
data_list = []

# Each and Every Heading and Side Headings related to the title gets looped.
# Webpages with unique titles and Summary over 250 characters are chosen.
# More than 5100 documents are set as a limit to extract information.
# Webpages are appended onto a list that later can be used to get a DataFrame.
# Wikipedia Disambiguation, HTTP timeout and Page errors are handled to enhance the process.
for heading, side_heading in title.items():
    short_summary_count = 0
    unique_titles = set()

    for iterate in side_heading:
        wikipedia.set_lang("en")
        search_results = wikipedia.search(iterate, results=5600)
        for webpage_heading in search_results:
            retry_count = 0
            while retry_count < 3:  # Retry up to 3 times
                try:
                    if webpage_heading in unique_titles:
                        break
                    page = wikipedia.page(webpage_heading)
                    summary = page.summary
                    if len(summary) >= 250:
                        data_list.append({
                            'Topic': heading,
                            'Title': page.title,
                            'Summary': summary,
                            'URL': page.url,
                            'Revision ID': page.revision_id  
                        })
                        unique_titles.add(webpage_heading)
                    break  # Exit retry loop if successful
                except wikipedia.exceptions.DisambiguationError:
                    break  # No retry for disambiguation errors
                except wikipedia.exceptions.PageError:
                    print(f"PageError for title: {webpage_heading}")
                    break  # No retry for page errors
                except (wikipedia.exceptions.HTTPTimeoutError, requests.exceptions.ConnectionError):
                    print(f"Error for title: {webpage_heading}, retrying ({retry_count + 1}/3)")
                    time.sleep(5)  # Wait 5 seconds before retrying
                    retry_count += 1
                except Exception as e:
                    print(f"Unhandled exception for title: {webpage_heading}: {e}")
                    break  # Exit retry loop for any other exception

            if len(unique_titles) >= 5600:
                break

        if len(unique_titles) >= 5600:
            break

PageError for title: 2023 Cricket World Cup
PageError for title: 2007 Cricket World Cup
PageError for title: 2015 Cricket World Cup
PageError for title: 1992 Cricket World Cup
PageError for title: 2011 Cricket World Cup
PageError for title: 1975 Cricket World Cup
PageError for title: 1983 Cricket World Cup
PageError for title: 2023 Cricket World Cup Qualifier
PageError for title: Twenty20
PageError for title: Shubman Gill
PageError for title: Shan Masood
PageError for title: Kane Williamson




  lis = BeautifulSoup(html).find_all('li')


PageError for title: 1987 Cricket World Cup squads
PageError for title: 2006 Under-19 Cricket World Cup
PageError for title: Jiminy Cricket
PageError for title: Pat Cummins
PageError for title: 1979 Cricket World Cup squads
PageError for title: Asia Cup
PageError for title: Babar Azam
PageError for title: Jay Shah
PageError for title: 2004 Under-19 Cricket World Cup
PageError for title: 2022 Under-19 Cricket World Cup
PageError for title: Mohammed Shami
PageError for title: Viv Richards
PageError for title: Mole cricket
PageError for title: Mohammad Kaif
PageError for title: Tamim Iqbal
PageError for title: MS Dhoni
PageError for title: Brian Lara Cricket (1998 video game)
PageError for title: 2020 Under-19 Cricket World Cup squads
PageError for title: ICC Women's T20 World Cup
PageError for title: 2002 Under-19 Cricket World Cup
PageError for title: Gray-Nicolls
PageError for title: Mitchell Starc
PageError for title: Adam Zampa
PageError for title: The Gabba
PageError for title: 2015

In [11]:
# Create the CSV from the DataFrame
sprs_df = pd.DataFrame(data_list)
csv_file_name = "./Data/Sports.csv"
sprs_df.to_csv(csv_file_name, index=False)

# Check with the csv file.
Sports = pd.read_csv("./Data/Sports.csv")
Sports.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5600 entries, 0 to 5599
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        5600 non-null   object
 1   Title        5600 non-null   object
 2   Summary      5600 non-null   object
 3   URL          5600 non-null   object
 4   Revision ID  5600 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 218.9+ KB


## Politics

In [12]:
# Define the topics and subtopics
title = {
    'Politics': [
        'World Political Elections', 'Public policy analysis', 'International relations',
        'Political campaign strategies', 'Voting behavior and trends', 'Political party dynamics',
        'Governmental systems and structures', 'Legislative processes', 'Diplomacy and foreign policy',
        'Political ideologies and theories', 'Geopolitical conflicts', 'Human rights and international law',
        'Election laws and reforms', 'Political corruption and ethics', 'Global governance and institutions',
        'Environmental policy and politics', 'Economic policy debates', 'Healthcare policy',
        'Education policy', 'Immigration policy and debates', 'Defense and military policy',
        'Technology impact on politics', 'Social movements and activism', 'Media role in politics',
        'Political history and developments', 'Regional political analysis', 'Urban and rural politics',
        'Political psychology', 'Political sociology', 'Political economy',
        'Cybersecurity and politics', 'National security issues', 'Trade policies and agreements',
        'Political leadership and figures', 'Public opinion and polls', 'Civic engagement and participation'
    ],
}


# Initialize an empty list to store the scraped data
data_list = []

# Each and Every Heading and Side Headings related to the title gets looped.
# Webpages with unique titles and Summary over 250 characters are chosen.
# More than 5100 documents are set as a limit to extract information.
# Webpages are appended onto a list that later can be used to get a DataFrame.
# Wikipedia Disambiguation, HTTP timeout and Page errors are handled to enhance the process.
for heading, side_heading in title.items():
    short_summary_count = 0
    unique_titles = set()

    for iterate in side_heading:
        wikipedia.set_lang("en")
        search_results = wikipedia.search(iterate, results=5600)
        for webpage_heading in search_results:
            retry_count = 0
            while retry_count < 3:  # Retry up to 3 times
                try:
                    if webpage_heading in unique_titles:
                        break
                    page = wikipedia.page(webpage_heading)
                    summary = page.summary
                    if len(summary) >= 250:
                        data_list.append({
                            'Topic': heading,
                            'Title': page.title,
                            'Summary': summary,
                            'URL': page.url,
                            'Revision ID': page.revision_id  
                        })
                        unique_titles.add(webpage_heading)
                    break  # Exit retry loop if successful
                except wikipedia.exceptions.DisambiguationError:
                    break  # No retry for disambiguation errors
                except wikipedia.exceptions.PageError:
                    print(f"PageError for title: {webpage_heading}")
                    break  # No retry for page errors
                except (wikipedia.exceptions.HTTPTimeoutError, requests.exceptions.ConnectionError):
                    print(f"Error for title: {webpage_heading}, retrying ({retry_count + 1}/3)")
                    time.sleep(5)  # Wait 5 seconds before retrying
                    retry_count += 1
                except Exception as e:
                    print(f"Unhandled exception for title: {webpage_heading}: {e}")
                    break  # Exit retry loop for any other exception

            if len(unique_titles) >= 5600:
                break

        if len(unique_titles) >= 5600:
            break

PageError for title: Independent politician
PageError for title: 1940 United States presidential election




  lis = BeautifulSoup(html).find_all('li')


PageError for title: Gabon
PageError for title: Shas
PageError for title: Women's suffrage
PageError for title: Accountability
PageError for title: Political views of Generation Z
PageError for title: 1944 United States presidential election
PageError for title: Green party
PageError for title: Aftermath of World War I
PageError for title: Political repression
PageError for title: Centrism
PageError for title: 1956 United States presidential election
PageError for title: Amit Shah
PageError for title: 2022 United States House of Representatives elections
PageError for title: 2004 Indian general election
PageError for title: 2024 Salvadoran general election
PageError for title: 1952 United States presidential election
PageError for title: 1976 United States presidential election
PageError for title: 1994 South African general election
PageError for title: 2024 United States House of Representatives elections in Florida
PageError for title: 2023 United States elections
PageError for titl

In [13]:
# Create the CSV from the DataFrame
plts_df = pd.DataFrame(data_list)
csv_file_name = "./Data/Politics.csv"
plts_df.to_csv(csv_file_name, index=False)

# Check with the csv file.
Politics = pd.read_csv("./Data/Politics.csv")
Politics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5600 entries, 0 to 5599
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        5600 non-null   object
 1   Title        5600 non-null   object
 2   Summary      5600 non-null   object
 3   URL          5600 non-null   object
 4   Revision ID  5600 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 218.9+ KB


## Education

In [14]:
# Define the topics and subtopics
title = {
    'Education': [
        'Literacy rates', 'Education trends', 'Online education',
        'Early childhood education', 'K-12 education systems', 'Higher education developments',
        'Special education and inclusivity', 'STEM education', 'Arts and humanities education',
        'Educational technology advancements', 'Distance learning methodologies', 'Blended learning models',
        'Student assessment and testing', 'Teacher training and professional development', 'Education policy and reform',
        'International education systems', 'Educational psychology', 'Career and technical education',
        'Lifelong learning and adult education', 'Vocational training', 'Scholarships and financial aid',
        'Educational leadership and administration', 'Learning disabilities and support', 'Bilingual and language education',
        'Gender equality in education', 'Education in developing countries', 'Impact of COVID-19 on education',
        'Academic research and publications', 'School safety and bullying', 'Extracurricular activities and student engagement',
        'Parental involvement in education', 'School funding and resources', 'Education and social justice',
        'Environmental education', 'Mental health in education', 'Innovative teaching methods'
    ],
}


# Initialize an empty list to store the scraped data
data_list = []

# Each and Every Heading and Side Headings related to the title gets looped.
# Webpages with unique titles and Summary over 250 characters are chosen.
# More than 5100 documents are set as a limit to extract information.
# Webpages are appended onto a list that later can be used to get a DataFrame.
# Wikipedia Disambiguation, HTTP timeout and Page errors are handled to enhance the process.
for heading, side_heading in title.items():
    short_summary_count = 0
    unique_titles = set()

    for iterate in side_heading:
        wikipedia.set_lang("en")
        search_results = wikipedia.search(iterate, results=5600)
        for webpage_heading in search_results:
            retry_count = 0
            while retry_count < 3:  # Retry up to 3 times
                try:
                    if webpage_heading in unique_titles:
                        break
                    page = wikipedia.page(webpage_heading)
                    summary = page.summary
                    if len(summary) >= 250:
                        data_list.append({
                            'Topic': heading,
                            'Title': page.title,
                            'Summary': summary,
                            'URL': page.url,
                            'Revision ID': page.revision_id  
                        })
                        unique_titles.add(webpage_heading)
                    break  # Exit retry loop if successful
                except wikipedia.exceptions.DisambiguationError:
                    break  # No retry for disambiguation errors
                except wikipedia.exceptions.PageError:
                    print(f"PageError for title: {webpage_heading}")
                    break  # No retry for page errors
                except (wikipedia.exceptions.HTTPTimeoutError, requests.exceptions.ConnectionError):
                    print(f"Error for title: {webpage_heading}, retrying ({retry_count + 1}/3)")
                    time.sleep(5)  # Wait 5 seconds before retrying
                    retry_count += 1
                except Exception as e:
                    print(f"Unhandled exception for title: {webpage_heading}: {e}")
                    break  # Exit retry loop for any other exception

            if len(unique_titles) >= 5600:
                break

        if len(unique_titles) >= 5600:
            break

PageError for title: Gender inequality in India




  lis = BeautifulSoup(html).find_all('li')


PageError for title: Digital literacy
PageError for title: Manor, India
PageError for title: Education in Mali
PageError for title: Munger district
PageError for title: Magadi
PageError for title: Financial literacy
PageError for title: Kachhi District
PageError for title: River Gee County
PageError for title: World Literacy Foundation
PageError for title: Malvi language
PageError for title: Banga, India
PageError for title: Bomdila
PageError for title: Higher education
PageError for title: Female education
PageError for title: XHamster
PageError for title: Bloom's taxonomy
PageError for title: Dark academia
PageError for title: Title IX
PageError for title: Scott Plous
PageError for title: Google Classroom
PageError for title: Embassy of Austria, Washington, D.C.
PageError for title: Jeremy Rifkin
PageError for title: Generation X
PageError for title: Average human height by country
PageError for title: Inflation
PageError for title: Educational psychology
PageError for title: Google 

In [15]:
# Create the CSV from the DataFrame
edcs_df = pd.DataFrame(data_list)
csv_file_name = "./Data/Education.csv"
edcs_df.to_csv(csv_file_name, index=False)

# Check with the csv file.
Education = pd.read_csv("./Data/Education.csv")
Education.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5600 entries, 0 to 5599
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        5600 non-null   object
 1   Title        5600 non-null   object
 2   Summary      5600 non-null   object
 3   URL          5600 non-null   object
 4   Revision ID  5600 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 218.9+ KB


## Travel

In [16]:
# Define the topics and subtopics
title = {
    'Travel': [
        'Top tourist destinations', 'Airline industry data', 'Travel trends',
        'Eco-tourism and sustainable travel', 'Cruise industry insights', 'Adventure travel destinations',
        'Cultural and heritage tourism', 'Luxury travel experiences', 'Budget and backpacking travel',
        'Hotel and hospitality industry', 'Travel technology advancements', 'Travel insurance and safety',
        'Solo travel trends', 'Family travel and vacations', 'Business travel dynamics',
        'Road trips and car travel', 'Rail travel and train tours', 'Food and culinary tourism',
        'Health and wellness retreats', 'Travel photography and blogging', 'Digital nomad lifestyle',
        'Travel during COVID-19', 'Visa and immigration policies', 'Accessible travel for disabled travelers',
        'Destination weddings and honeymoons', 'Travel gear and accessories', 'Frequent flyer programs and loyalty schemes',
        'Travel booking platforms and apps', 'Cultural etiquette and customs', 'Language and communication in travel',
        'Wildlife and nature tourism', 'Historical travel and tours', 'Seasonal travel trends',
        'Volunteer and educational travel', 'Space tourism', 'Emerging travel destinations'
    ],
}


# Initialize an empty list to store the scraped data
data_list = []

# Each and Every Heading and Side Headings related to the title gets looped.
# Webpages with unique titles and Summary over 250 characters are chosen.
# More than 5100 documents are set as a limit to extract information.
# Webpages are appended onto a list that later can be used to get a DataFrame.
# Wikipedia Disambiguation, HTTP timeout and Page errors are handled to enhance the process.
for heading, side_heading in title.items():
    short_summary_count = 0
    unique_titles = set()

    for iterate in side_heading:
        wikipedia.set_lang("en")
        search_results = wikipedia.search(iterate, results=5600)
        for webpage_heading in search_results:
            retry_count = 0
            while retry_count < 3:  # Retry up to 3 times
                try:
                    if webpage_heading in unique_titles:
                        break
                    page = wikipedia.page(webpage_heading)
                    summary = page.summary
                    if len(summary) >= 250:
                        data_list.append({
                            'Topic': heading,
                            'Title': page.title,
                            'Summary': summary,
                            'URL': page.url,
                            'Revision ID': page.revision_id  
                        })
                        unique_titles.add(webpage_heading)
                    break  # Exit retry loop if successful
                except wikipedia.exceptions.DisambiguationError:
                    break  # No retry for disambiguation errors
                except wikipedia.exceptions.PageError:
                    print(f"PageError for title: {webpage_heading}")
                    break  # No retry for page errors
                except (wikipedia.exceptions.HTTPTimeoutError, requests.exceptions.ConnectionError):
                    print(f"Error for title: {webpage_heading}, retrying ({retry_count + 1}/3)")
                    time.sleep(5)  # Wait 5 seconds before retrying
                    retry_count += 1
                except Exception as e:
                    print(f"Unhandled exception for title: {webpage_heading}: {e}")
                    break  # Exit retry loop for any other exception

            if len(unique_titles) >= 5600:
                break

        if len(unique_titles) >= 5600:
            break

PageError for title: Heho Airport
PageError for title: Big Sur
PageError for title: Hiroshima
PageError for title: Zaan
PageError for title: Sipalay




  lis = BeautifulSoup(html).find_all('li')


PageError for title: Île-de-France
PageError for title: Kanyam
PageError for title: Belize
PageError for title: Disneyland Paris
PageError for title: Sex tourism
PageError for title: Boracay
PageError for title: Tourist attractions in Vienna
PageError for title: Axum
PageError for title: Overtourism
PageError for title: Top Withens
PageError for title: Tourism in Egypt
PageError for title: Dalaman Airport
PageError for title: Camm Morton
PageError for title: Ubud
PageError for title: Elephanta Island
PageError for title: Gay village
PageError for title: Cliffs of Moher
PageError for title: CityPASS
PageError for title: Warsaw
PageError for title: Pai, Thailand
PageError for title: Funchal
PageError for title: Riviera Maya
PageError for title: Space tourism
PageError for title: Ko Samui
PageError for title: Spiderhead
PageError for title: Metro Manila
PageError for title: Santa Claus, Indiana
PageError for title: Victoria Falls
PageError for title: Colca Canyon
PageError for title: Telu

In [17]:
# Create the CSV from the DataFrame
Trvl_df = pd.DataFrame(data_list)
csv_file_name = "./Data/Travel.csv"
Trvl_df.to_csv(csv_file_name, index=False)

# Check with the csv file.
Travel = pd.read_csv("./Data/Travel.csv")
Travel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5600 entries, 0 to 5599
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        5600 non-null   object
 1   Title        5600 non-null   object
 2   Summary      5600 non-null   object
 3   URL          5600 non-null   object
 4   Revision ID  5600 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 218.9+ KB


## Food

In [20]:
# Define the topics and subtopics
title = {
    'Food': [
        'Crop yield statistics', 'Global hunger and food security', 'Top world dishes',
        'Sustainable agriculture', 'Organic farming trends', 'Genetically modified organisms (GMOs)',
        'Food industry innovations', 'Nutrition and dietary trends', 'Veganism and plant-based diets',
        'Food safety and regulations', 'Food supply chain management', 'Impact of climate change on agriculture',
        'Aquaculture and fisheries', 'Food waste reduction strategies', 'Local food movements',
        'Culinary tourism', 'Street food culture', 'Fusion cuisine trends',
        'Food technology and startups', 'Superfoods and health trends', 'Food policy and legislation',
        'Agricultural technology (AgTech)', 'Food marketing and branding', 'Celebrity chefs and influencers',
        'Food in popular culture', 'Cooking techniques and trends', 'Gastronomy and fine dining',
        'Ethical eating and food justice', 'Childrens nutrition and school meals', 'Food allergies and intolerances',
        'Wine and beverage industry', 'Food festivals and events', 'Traditional cooking methods and preservation',
        'Food anthropology and history', 'Farming and rural development', 'Food and economic development'
    ],
}


# Initialize an empty list to store the scraped data
data_list = []

# Each and Every Heading and Side Headings related to the title gets looped.
# Webpages with unique titles and Summary over 250 characters are chosen.
# More than 5100 documents are set as a limit to extract information.
# Webpages are appended onto a list that later can be used to get a DataFrame.
# Wikipedia Disambiguation, HTTP timeout and Page errors are handled to enhance the process.
for heading, side_heading in title.items():
    short_summary_count = 0
    unique_titles = set()

    for iterate in side_heading:
        wikipedia.set_lang("en")
        search_results = wikipedia.search(iterate, results=5600)
        for webpage_heading in search_results:
            retry_count = 0
            while retry_count < 3:  # Retry up to 3 times
                try:
                    if webpage_heading in unique_titles:
                        break
                    page = wikipedia.page(webpage_heading)
                    summary = page.summary
                    if len(summary) >= 250:
                        data_list.append({
                            'Topic': heading,
                            'Title': page.title,
                            'Summary': summary,
                            'URL': page.url,
                            'Revision ID': page.revision_id  
                        })
                        unique_titles.add(webpage_heading)
                    break  # Exit retry loop if successful
                except wikipedia.exceptions.DisambiguationError:
                    break  # No retry for disambiguation errors
                except wikipedia.exceptions.PageError:
                    print(f"PageError for title: {webpage_heading}")
                    break  # No retry for page errors
                except (wikipedia.exceptions.HTTPTimeoutError, requests.exceptions.ConnectionError):
                    print(f"Error for title: {webpage_heading}, retrying ({retry_count + 1}/3)")
                    time.sleep(5)  # Wait 5 seconds before retrying
                    retry_count += 1
                except Exception as e:
                    print(f"Unhandled exception for title: {webpage_heading}: {e}")
                    break  # Exit retry loop for any other exception

            if len(unique_titles) >= 5600:
                break

        if len(unique_titles) >= 5600:
            break

PageError for title: Cereal
PageError for title: Agriculture in India




  lis = BeautifulSoup(html).find_all('li')


PageError for title: Organic farming
PageError for title: Alfalfa
PageError for title: Vertical farming
PageError for title: Corn production in the United States
PageError for title: Sesame
PageError for title: Pearl millet
PageError for title: Pigeon pea
PageError for title: Injera
PageError for title: Almond
PageError for title: Agriculture in South Africa
PageError for title: Pollination
PageError for title: Pumpkin
PageError for title: Ginger
PageError for title: Proso millet
PageError for title: Food security
PageError for title: Yuan Longping
PageError for title: Shiso
PageError for title: Machine learning
PageError for title: Haber process
PageError for title: Hop production in the United States
PageError for title: Farmall
PageError for title: Ricky Ponting
PageError for title: Economy of Canada
PageError for title: Gilded Age
PageError for title: Q (disambiguation)
PageError for title: Jordan Peterson
PageError for title: Isabela (province)
PageError for title: Belize
PageErro

In [21]:
# Create the CSV from the DataFrame
food_df = pd.DataFrame(data_list)
csv_file_name = "./Data/Food.csv"
food_df.to_csv(csv_file_name, index=False)

# Check with the csv file.
Food = pd.read_csv("./Data/Food.csv")
Food.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5600 entries, 0 to 5599
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        5600 non-null   object
 1   Title        5600 non-null   object
 2   Summary      5600 non-null   object
 3   URL          5600 non-null   object
 4   Revision ID  5600 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 218.9+ KB
