In [None]:
import requests
import xml.etree.ElementTree as ET
import csv

# URL of the XML sitemap
sitemap_url = 'https://www.ey.com/en_in/sitemap/topics.xml'

# Fetch the XML content from the URL.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to the XML parser below.
response = requests.get(sitemap_url, timeout=30)
response.raise_for_status()
xml_content = response.content

# Parse the XML content
root = ET.fromstring(xml_content)

# Namespace used by the sitemap protocol; every tag lookup below is qualified with it
namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

# Write one (Website, Date) row per <url> entry
with open('sitemap_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Header row; later cells read the 'Website' column by name
    csv_writer.writerow(['Website', 'Date'])

    for url in root.findall('.//ns:url', namespace):
        # findtext() returns the default instead of raising when a child tag
        # is absent, so an entry without <lastmod> still yields a row.
        loc = url.findtext('ns:loc', default='', namespaces=namespace).strip()
        lastmod = url.findtext('ns:lastmod', default='', namespaces=namespace).strip()
        if not loc:
            # An entry without a location has nothing to scrape later
            continue
        csv_writer.writerow([loc, lastmod])

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

# Helper: turn the children of a rich-text <div> into Markdown-style text
def _elements_to_markdown(elements):
    """Render an iterable of parsed HTML nodes as Markdown-style lines.

    ``h1``/``h2`` become ``# ``/``## `` headings, ``p`` becomes plain text,
    each direct ``li`` child of a ``ul`` becomes a ``- `` bullet, and a bare
    ``li`` becomes a ``- `` bullet too. Nodes with any other ``name`` are
    skipped. Lines are joined with a single newline.
    """
    lines = []
    for element in elements:
        tag = element.name
        if tag == 'h1':
            lines.append(f"# {element.get_text(strip=True)}")
        elif tag == 'h2':
            lines.append(f"## {element.get_text(strip=True)}")
        elif tag == 'p':
            lines.append(f"{element.get_text(strip=True)}")
        elif tag == 'ul':
            # recursive=False: nested lists are not flattened into this one
            for li in element.find_all('li', recursive=False):
                lines.append(f"- {li.get_text(strip=True)}")
        elif tag == 'li':
            lines.append(f"- {element.get_text(strip=True)}")
    return "\n".join(lines)


# Function to extract content from a blog URL
def extract_blog_content(url, timeout=30):
    """Download one page and extract its author, topics, PDF links and body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the calling loop indefinitely.

    Returns:
        A tuple ``(author_name, related_topics, pdf_links, page_content)``:
        the author link text, the related-topic link texts joined with
        ``', '``, the unique PDF hrefs as a list in first-seen order, and the
        rich-text blocks rendered as Markdown-style text joined with blank
        lines. Any part that is not found is empty. A non-200 response or any
        exception yields ``('', '', [], '')``; the reason is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Skipping {url}: HTTP {response.status_code}")
            return '', '', [], ''

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract author name
        author_name = ''
        author_div = soup.find('div', class_='surfaceProfile-author-description')
        if author_div:
            author_name_tag = author_div.find('a')
            if author_name_tag:
                author_name = author_name_tag.get_text(strip=True)

        # Extract related topics
        related_topics = ''
        related_topics_div = soup.find('div', class_='col-xs-12 col-sm-8 col-sm-offset-1 col-md-12 col-md-offset-0 col-lg-offset-0 default-style')
        if related_topics_div:
            related_topics = ', '.join(
                tag.get_text(strip=True) for tag in related_topics_div.find_all('a')
            )

        # Extract page content: one Markdown chunk per rich-text block
        page_content = ''
        main_div = soup.find('div', class_='optional-components paragraphSystem')
        if main_div:
            chunks = []
            for content_div in main_div.find_all('div', class_='richText component section richText-copy-block col-xs-12'):
                inner_div = content_div.find('div', class_='component-content')
                if not inner_div:
                    continue
                rich_text_div = inner_div.find('div', class_='richText-content')
                if rich_text_div:
                    chunks.append(_elements_to_markdown(rich_text_div.children))
            page_content = "\n\n".join(chunks)

        # Extract unique PDF links. A dict is used as an ordered set so the
        # result keeps document order and is the same on every run.
        pdf_links = {}
        for pdf_div in soup.find_all('div', class_='fileList-download'):
            pdf_link_tag = pdf_div.find('a', class_='fileList-download-link')
            if pdf_link_tag:
                pdf_link = pdf_link_tag.get('href')
                if pdf_link:
                    pdf_links[pdf_link] = None

        return author_name, related_topics, list(pdf_links), page_content
    except Exception as exc:
        # Best-effort scraping: one bad page must not stop the batch, but the
        # failure is reported so empty rows can be traced back to a cause.
        print(f"Failed to extract {url}: {exc}")
        return '', '', [], ''

# Load the sitemap table written by the previous cell; results go to a new file
input_file = 'sitemap_data.csv'
output_file = 'blogs_data_500.csv'
df = pd.read_csv(input_file)

# Visit every listed page, print how long each fetch took, and attach the
# extracted fields to the row the URL came from
for index, website_url in df['Website'].items():
    start_time = time.time()
    author_name, related_topics, pdf_links, page_content = extract_blog_content(website_url)
    print(time.time()-start_time)

    extracted_fields = {
        'author_name': author_name,
        'related_topics': related_topics,
        'pdf_links': ', '.join(pdf_links),  # list flattened to one CSV cell
        'page_content': page_content,
    }
    for column_name, value in extracted_fields.items():
        df.at[index, column_name] = value


# Persist the enriched table without the index column
df.to_csv(output_file, index=False)
print("Content extraction complete. Data saved to:", output_file)

In [None]:
def data_loading_unstructured():
    """Load the text of every page listed in ``blogs_data_500.csv``.

    Reads the ``Website`` column, passes the URLs to ``UnstructuredURLLoader``
    and returns the ``page_content`` of each loaded document as a list, in the
    order the loader returns them.

    NOTE(review): ``UnstructuredURLLoader`` is not imported in the visible
    part of this notebook; confirm an earlier cell provides it.
    NOTE(review): presumably the loader returns one document per URL; if it
    can skip URLs, the list will be shorter than the CSV. Verify, because the
    caller pairs the two by position.
    """
    import pandas as pd

    websites = pd.read_csv('blogs_data_500.csv')['Website'].tolist()
    documents = UnstructuredURLLoader(websites).load()
    return [document.page_content for document in documents]

def _boilerplate_patterns():
    """Compile the site-boilerplate fragments to strip, in removal order.

    Order matters: the long fragments come before the short ones they
    contain (the combined "Read more / Connect" fragment precedes the bare
    "Read more" and "Connect " entries), so the long forms are removed whole.
    """
    import re  # local import so this cell does not depend on another one

    footer = "with us\n\nOur locations\n\nMy EY\n\nSite map\n\nLegal and privacy\n\nOpen Facebook profile\n\nOpen X profile\n\nOpen LinkedIn profile\n\nOpen Youtube profile\n\nEY refers to the global organization, and may refer to one or more, of the member firms of Ernst & Young Global Limited, each of which is a separate legal entity. Ernst & Young Global Limited, a UK company limited by guarantee, does not provide services to clients.\n\nYou are visiting EY in (en)\n\nin en"

    # The navigation fragment was one string literal split across two physical
    # lines, which Python cannot parse. The character at the break cannot be
    # recovered from here, so the two halves are matched with any amount of
    # whitespace between them. NOTE(review): confirm against real page text.
    nav_head = "EY Logo\n\nInsights\n\nAsking the better questions that unlock new answers to the working world's most complex issues.\n\nExplore\n\nTrending topics\n\nSee more\n\nSee more\n\nSpotlight\n\nAI insights\n\nCEO agenda\n\nCFO agenda\n\nEY Center for board matters\n\nEY podcasts\n\nEY webcasts\n\nOperations leaders\n\nTechnology leaders\n\nSee more\n\nServices\n\nEY helps clients create long-term value for all stakeholders. Enabled by data and technology, our services and solutions provide trust through assurance and help clients transform, grow and operate.\n\nExplore\n\nSee more\n\nSpotlight\n\nEY.ai - A unifying platform\n\nStrategy, transaction and transformation consulting\n\nTechnology transformation\n\nTax function operations\n\nClimate change and sustainability services\n\nEY Ecosystems\n\nEY Nexus: business transformation platform\n\nIndustries\n\nDiscover how EY insights and services are helping to reframe the future of your industry.\n\nExplore\n\nSee more\n\nCase studies\n\nAdvanced Manufacturing\n\nHow a manufacturer eliminates cost and value leakages with AI-ML\n\n03 Jul 2024Vinayak vipul\n\nConsulting\n\nHow a young cement company grew 2.5x with organizational and functional transformation\n\n05 Apr 2024EY India\n\nAI\n\nHow a state government transformed into an ecotourism haven\n\n12 Mar 2024EY India\n\nCareers\n\nWe bring together extraordinary people, like you, to build a better working world.\n\nExplore\n\nSee more\n\nSpotlight\n\nExperienced professionals\n\nEY-Parthenon careers\n\nStudent and entry level programs\n\nTalent community\n\nAbout us\n\nAt EY, our purpose is building a better working world."
    nav_tail = "The insights and services we provide help to create long-term value for clients, people and society, and to build trust in the capital markets.\n\nExplore\n\nSee more\n\nSpotlight\n\nEY expands its EY ESG Compass platform with new innovative use-cases\n\n06 Aug 2024EY India\n\nAddressing customs, payment, and logistics challenges key to stronger e-commerce exports growth from India: EY-ASSOCHAM report\n\n24 Jul 2024EY India\n\nPE/VC investments in May 2024 exceeded US$6.9 billion across 115 deals, 54% growth Y-o-Y: EY-IVCA report\n\n01 Jul 2024EY India\n\nSearch\n\nSee all results for ' '\n\nNo results have been found\n\nTopics\n\nSee All\n\nGeneral\n\nSee All\n\nPeople\n\nSee All\n\nRecent Searches\n\nTrending\n\nUnion Budget 2024-25: Accelerating fiscal consolidation for sustained growth\n\nUnion Budget 2024-25: Drive fiscal consolidation for lower interest rates, boost private investment, and job growth.\n\n26 Jul 2024 Tax\n\nHow India Inc. can navigate the road to financial resilience\n\nExplore key findings from the 2024 Cost of Capital Survey, revealing insights into India Inc.'s financial resilience and strategic growth.\n\n17 Jul 2024\n\nImpact of new GST law on skill-based online games\n\nExplore the effects of the new GST law on skill-based online gaming. Understand the implications for players and industry with our in-depth analysis.\n\n04 Jul 2024 Tax\n\nMy EY My EY\n\nSelect your location\n\nLocal"

    short_fragments = [
        "Facebook\n\nTwitter\n\nLinkedIn\n\nLink Copied",
        "Related articles",
        "Rebecca Dabbs\n\nHow EY can help\n\nEnvironment, health and safety\n\nEY CCaSS teams can help reduce the risk of EHS incidents and support decision-making to improve outcomes associated with EHS. Find out how.\n\nRead more\n\nConnect",
        "\n\nRead more\n\nConnect ",
        " sites\n\n",
        "Related topics",
        "Read more",
        "Connect ",
        "Subscribe",
        "Contact us to learn more",
        "Like what you’ve seen? Get in touch to learn more.",
        "Direct to your inbox",
        "Stay up to date with our Editor‘s picks newsletter."
    ]

    expressions = [
        re.escape(footer),
        re.escape(nav_head) + r"\s*" + re.escape(nav_tail),
    ]
    expressions.extend(re.escape(fragment) for fragment in short_fragments)
    return [re.compile(expression) for expression in expressions]


def data_preprocessing():
    """Build the cleaned article table from the scraped CSV and loader text.

    Re-reads ``blogs_data_500.csv``, replaces its ``page_content`` column with
    the list returned by ``data_loading_unstructured()``, casts the four text
    columns to ``str``, drops rows whose page content is exactly the 404
    placeholder text, renumbers the index, and removes the known boilerplate
    fragments from every remaining page.

    Returns:
        pandas.DataFrame: a new frame with a fresh 0-based index whose
        ``page_content`` has the boilerplate removed.
    """
    content = data_loading_unstructured()
    df = pd.read_csv('blogs_data_500.csv')
    new_data = pd.DataFrame({'page_content': content})
    df = df.drop('page_content', axis=1)
    # NOTE(review): rows and loaded texts are paired purely by position; if
    # `content` is shorter than the CSV the trailing rows get the text 'nan'.
    concate_df = pd.concat([df, new_data], axis=1)

    # Each column is cast from itself. The original filled 'pdf_links' from
    # 'related_topics', which discarded the scraped PDF links. str() turns a
    # missing value into the text 'nan', as before.
    for column in ('author_name', 'related_topics', 'pdf_links', 'page_content'):
        concate_df[column] = concate_df[column].apply(str)

    concate_df = concate_df[concate_df['page_content'] != "A custom errorhandler for 404 responses"]
    concate_df = concate_df.reset_index(drop=True)

    # Compiled once, not once per row
    patterns = _boilerplate_patterns()

    def strip_boilerplate(text):
        # Apply the removals one after another, in list order
        for pattern in patterns:
            text = pattern.sub('', text)
        return text

    unstructured_data = concate_df.copy()
    # Select the text by column name, not by position 5 in the row
    unstructured_data['page_content'] = unstructured_data['page_content'].apply(strip_boilerplate)
    return unstructured_data