# Web Scraping Article Text from a List of URLs

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import os


In [8]:


def extract_article_text(url_id, url, output_folder, timeout=30):
    """Download a web page and save its title and paragraph text to a file.

    The page at ``url`` is fetched and parsed with BeautifulSoup. The text of
    every ``<p>`` element (except the known site-wide boilerplate paragraph)
    and of every ``<ol>`` element is joined with single spaces and written,
    below the page title, to ``{output_folder}/blackassign_{url_id}.txt``.

    Args:
        url_id: Identifier interpolated into the output file name.
        url: Address of the page to download.
        output_folder: Directory that receives the text file. It is not
            created here, so it must already exist.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Defaults to 30.

    Returns:
        None. A success or failure message is printed instead; request
        failures (including HTTP error statuses and timeouts) are reported
        and swallowed so a caller looping over many URLs can continue.
    """
    try:
        # Without a timeout a stalled server would block the run indefinitely;
        # a timeout raises a RequestException subclass, handled below.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # soup.title is None when the page has no <title> element; fall back to
        # an empty title instead of raising AttributeError.
        article_title = soup.title.text.strip() if soup.title is not None else ''

        undesired_content = 'We provide intelligence, accelerate innovation and implement technology with extraordinary breadth and depth global insights into the big data,data-driven dashboards, applications development, and information management for organizations through combining unique, specialist services and high-lvel human expertise.'
        # The paragraph text is lower-cased before the comparison, so the marker
        # has to be lower-cased as well or the exclusion can never match.
        undesired_lower = undesired_content.lower()

        def is_content_valid(tag):
            """Accept every <ol>, and every <p> that is not the boilerplate paragraph."""
            if tag.name == 'ol':
                return True
            return tag.name == 'p' and undesired_lower not in tag.get_text(strip=True).lower()

        paragraphs = [tag.get_text(strip=True) for tag in soup.find_all(is_content_valid)]

        article_text = ' '.join(paragraphs)

        file_name = f'{output_folder}/blackassign_{url_id}.txt'
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(f'{article_title}\n\n{article_text}')

        print(f'Successfully extracted data from {url} and saved to {file_name}')

    except requests.exceptions.RequestException as e:
        print(f'Failed to retrieve the webpage. Error: {e}')

In [9]:
def main(input_excel_path='input.xlsx', output_folder='output'):
    """Scrape every URL listed in a spreadsheet into per-article text files.

    Args:
        input_excel_path: Path of the Excel workbook to read. It must contain
            the columns ``URL_ID`` and ``URL``. Defaults to ``'input.xlsx'``.
        output_folder: Directory that receives the text files; created if it
            does not exist. Defaults to ``'output'``.

    Raises:
        KeyError: If the workbook lacks a ``URL_ID`` or ``URL`` column.
    """
    os.makedirs(output_folder, exist_ok=True)

    df = pd.read_excel(input_excel_path)

    # Only two columns are needed, so walk them in lockstep instead of building
    # a full Series for every row with iterrows().
    for url_id, url in zip(df['URL_ID'], df['URL']):
        extract_article_text(url_id, url, output_folder)




In [10]:
# Entry point: run the scraper when this code is executed directly. The output
# recorded below this cell shows the guard is true when run in the notebook.
if __name__ == "__main__":
    main()

Successfully extracted data from https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/ and saved to output/blackassign_blackassign0001.txt
Successfully extracted data from https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/ and saved to output/blackassign_blackassign0002.txt
Successfully extracted data from https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/ and saved to output/blackassign_blackassign0003.txt
Successfully extracted data from https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/ and saved to output/blackassign_blackassign0004.txt
Successfully extracted data from https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-entertainment-industry-in-future/ and saved to output/blackassign_blackassign0005.tx

Successfully extracted data from https://insights.blackcoffer.com/all-you-need-to-know-about-online-marketing/ and saved to output/blackassign_blackassign0046.txt
Successfully extracted data from https://insights.blackcoffer.com/evolution-of-advertising-industry/ and saved to output/blackassign_blackassign0047.txt
Successfully extracted data from https://insights.blackcoffer.com/how-data-analytics-can-help-your-business-respond-to-the-impact-of-covid-19/ and saved to output/blackassign_blackassign0048.txt
Failed to retrieve the webpage. Error: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
Successfully extracted data from https://insights.blackcoffer.com/environmental-impact-of-the-covid-19-pandemic-lesson-for-the-future/ and saved to output/blackassign_blackassign0050.txt
Successfully extracted data from https://insights.blackcoffer.com/how-data-analytics-and-ai-are-used-to-halt-the-covid-19-pandemic/ and saved to ou

Successfully extracted data from https://insights.blackcoffer.com/gaming-disorder-and-effects-of-gaming-on-health/ and saved to output/blackassign_blackassign0094.txt
Successfully extracted data from https://insights.blackcoffer.com/what-is-the-repercussion-of-the-environment-due-to-the-covid-19-pandemic-situation/ and saved to output/blackassign_blackassign0095.txt
Successfully extracted data from https://insights.blackcoffer.com/what-is-the-repercussion-of-the-environment-due-to-the-covid-19-pandemic-situation-2/ and saved to output/blackassign_blackassign0096.txt
Successfully extracted data from https://insights.blackcoffer.com/impact-of-covid-19-pandemic-on-office-space-and-co-working-industries/ and saved to output/blackassign_blackassign0097.txt
Successfully extracted data from https://insights.blackcoffer.com/contribution-of-handicrafts-visual-arts-literature-in-the-indian-economy/ and saved to output/blackassign_blackassign0098.txt
Successfully extracted data from https://insig