In [1]:
from bs4 import BeautifulSoup
import requests
import time
import datetime
import smtplib
import pandas as pd
import warnings
import json

# Silence every warning category for the remainder of the kernel session
warnings.simplefilter("ignore")

# One shared HTTP session object, reused by the scraping cells further down
session = requests.Session()

# Scraping Daily Bitcoin Headlines

In [9]:
# Fetch the Bitcoin news listing page using a browser-like User-Agent.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"}
url = "https://cryptonews.com/news/bitcoin-news/"
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
page = session.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx rather than parsing an error page into an empty frame.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Featured headline links: <a> elements carrying all four CSS classes.
articles = soup.select('a.article__title.article__title--lg.article__title--featured.mb-20')
# Summary blocks: <div> elements matched on the full class string.
ext_articles = soup.find_all('div', class_='mb-25 d-none d-md-block')

# Extract the text of every matched element.
headline_lst = [article.text for article in articles]
ext_news_lst = [ext_article.text for ext_article in ext_articles]

# The two selectors are evaluated independently, so their result counts can
# differ if the page layout changes. pd.DataFrame raises ValueError on columns
# of unequal length, so pad the shorter one with None and report the mismatch.
n_rows = max(len(headline_lst), len(ext_news_lst))
if len(headline_lst) != len(ext_news_lst):
    print(f"Warning: found {len(headline_lst)} headlines but {len(ext_news_lst)} summaries; "
          "rows may be misaligned and missing values are filled with None.")

# Create a dictionary of equal-length columns
data = {
    'Headline': headline_lst + [None] * (n_rows - len(headline_lst)),
    'News': ext_news_lst + [None] * (n_rows - len(ext_news_lst)),
}

# Create a dataframe from the dictionary
bitcoin_news = pd.DataFrame(data)

bitcoin_news

Unnamed: 0,Headline,News
0,This New Tool Track’s Bitcoin’s Price Without ...,The Bitcoin blockchain alone is now capable of...
1,"Bitcoin Gains Recognition As A Unique, Non-Rep...",The Shanghai Second Intermediate People's Cour...
2,Bitcoin Price Prediction: MicroStrategy's Accu...,Bitcoin's price is slightly up at just above $...
3,Bitwise Draws Close to Spot Bitcoin Fund Appro...,NYSE Arca Exchange has submitted an updated ap...


# Scraping Daily Ethereum Headlines

In [10]:
# Fetch the Ethereum news listing page using a browser-like User-Agent.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"}
url = "https://cryptonews.com/news/ethereum-news/"
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
page = session.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx rather than parsing an error page into an empty frame.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Featured headline links: <a> elements carrying all four CSS classes.
articles = soup.select('a.article__title.article__title--lg.article__title--featured.mb-20')
# Summary blocks: <div> elements matched on the full class string.
ext_articles = soup.find_all('div', class_='mb-25 d-none d-md-block')

# Extract the text of every matched element.
headline_lst = [article.text for article in articles]
ext_news_lst = [ext_article.text for ext_article in ext_articles]

# The two selectors are evaluated independently, so their result counts can
# differ if the page layout changes. pd.DataFrame raises ValueError on columns
# of unequal length, so pad the shorter one with None and report the mismatch.
n_rows = max(len(headline_lst), len(ext_news_lst))
if len(headline_lst) != len(ext_news_lst):
    print(f"Warning: found {len(headline_lst)} headlines but {len(ext_news_lst)} summaries; "
          "rows may be misaligned and missing values are filled with None.")

# Create a dictionary of equal-length columns
data = {
    'Headline': headline_lst + [None] * (n_rows - len(headline_lst)),
    'News': ext_news_lst + [None] * (n_rows - len(ext_news_lst)),
}

# Create a dataframe from the dictionary
# (variable name keeps the original spelling so later references still work)
etherium_news = pd.DataFrame(data)

etherium_news

Unnamed: 0,Headline,News
0,"Vitalik Deposits 400 ETH to Coinbase, Total 24...",Spot On Chain data revealed that 400 ETH worth...
1,High Gas Fees Alert: Binance Wallet Records $8...,Ethereum experienced a surge of abnormally hig...
2,Ethereum Devs Put Forth ERC-7512 Standard to R...,Ethereum (ETH) developers have introduced a ne...
3,Ethereum Price Prediction as Grayscale Files F...,Ether (ETH)'s near-term price outlook will tak...


# Scraping Daily Altcoin Headlines

In [11]:
# Fetch the altcoin news listing page using a browser-like User-Agent.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"}
url = "https://cryptonews.com/news/altcoin-news/"
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
page = session.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx rather than parsing an error page into an empty frame.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Featured headline links: <a> elements carrying all four CSS classes.
articles = soup.select('a.article__title.article__title--lg.article__title--featured.mb-20')
# Summary blocks: <div> elements matched on the full class string.
ext_articles = soup.find_all('div', class_='mb-25 d-none d-md-block')

# Extract the text of every matched element.
headline_lst = [article.text for article in articles]
ext_news_lst = [ext_article.text for ext_article in ext_articles]

# The two selectors are evaluated independently, so their result counts can
# differ if the page layout changes. pd.DataFrame raises ValueError on columns
# of unequal length, so pad the shorter one with None and report the mismatch.
n_rows = max(len(headline_lst), len(ext_news_lst))
if len(headline_lst) != len(ext_news_lst):
    print(f"Warning: found {len(headline_lst)} headlines but {len(ext_news_lst)} summaries; "
          "rows may be misaligned and missing values are filled with None.")

# Create a dictionary of equal-length columns
data = {
    'Headline': headline_lst + [None] * (n_rows - len(headline_lst)),
    'News': ext_news_lst + [None] * (n_rows - len(ext_news_lst)),
}

# Create a dataframe from the dictionary
altcoin_news = pd.DataFrame(data)

altcoin_news

Unnamed: 0,Headline,News
0,Biggest Crypto Gainers Today on DEXTools – POF...,With blue chip crypto markets at risk of exper...
1,HTX Hacker Makes Off With $8 Million,Crypto exchange HTX experienced a security bre...
2,Trader Joe Sends Grant Proposal to Arbitrum,"Trader Joe, a major decentralized exchange (DE..."
3,Is Another US Government Shutdown on The Horiz...,"Over the next week, the United States governme..."


# Scraping Multiyear Data

In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Accumulate one record per article across all pages and build the DataFrame
# once at the end. DataFrame.append was removed in pandas 2.0, and growing a
# frame inside a loop re-copies it on every iteration.
records = []

# Loop through page numbers (e.g., 1, 2, 3, ...)
for page_number in range(1, 6):
    url = f"https://cryptonews.com/paged/newsbitcoin-news-{page_number}.json"

    # Send a GET request for this page; the timeout prevents an indefinite hang
    response = requests.get(url, timeout=30)

    # Skip pages that did not return HTTP 200, but report them
    if response.status_code != 200:
        print(f"Failed to retrieve data for page {page_number}. Status code: {response.status_code}")
        continue

    # The payload is decoded as JSON and iterated; each item is parsed below
    # as an HTML fragment describing one article
    articles = json.loads(response.text)

    for article_html in articles:
        # Parse the HTML content of the article
        soup = BeautifulSoup(article_html, 'html.parser')

        # Title: text of the first <h4>, with a placeholder when absent
        title_element = soup.find('h4')
        title = title_element.text.strip() if title_element else 'No Title'

        # Link: href of the first <a> that has one. Stored under its own name
        # so the page-level `url` above is not overwritten inside the loop.
        url_element = soup.find('a', href=True)
        article_url = url_element['href'] if url_element else 'No URL'

        # Date: the data-utctime attribute of the date badge, if present
        date_element = soup.find('div', class_='article__badge-date')
        date = date_element.get('data-utctime', 'No Date') if date_element else 'No Date'

        records.append({'Title': title, 'URL': article_url, 'Date': date})

# Explicit columns keep the frame well-formed even when no page succeeded
all_data = pd.DataFrame(records, columns=['Title', 'URL', 'Date'])
all_data

Unnamed: 0,Title,URL,Date
0,This New Tool Track’s Bitcoin’s Price Without ...,/news/new-tool-tracks-bitcoins-price-without-u...,2023-09-26 18:30:00
1,"Bitcoin Gains Recognition As A Unique, Non-Rep...",/news/bitcoin-gains-recognition-as-a-unique-no...,2023-09-26 14:21:00
2,Bitcoin Price Prediction: MicroStrategy's Accu...,/news/bitcoin-price-prediction-microstrategys-...,2023-09-26 07:42:00
3,Bitwise Draws Close to Spot Bitcoin Fund Appro...,/news/bitwise-draws-close-spot-bitcoin-fund-ap...,2023-09-25 22:55:00
4,Bitcoin (BTC) Price Prediction: Is BTCBSC's Po...,/news/bitcoin-btc-price-prediction-btcbscs-pop...,2023-09-25 19:48:00
...,...,...,...
2495,"First Volcano Bitcoin Mined, Says El Salvador’...",/news/first-volcano-bitcoin-mined-says-el-salv...,2021-10-01 10:43:00
2496,"Bitcoin, Ethereum, and Major Altcoins Stuck In...",/news/bitcoin-ethereum-and-major-altcoins-stuc...,2021-10-01 04:36:00
2497,KYC Headaches on the Horizon for South Korean ...,/news/kyc-headaches-on-the-horizon-for-south-k...,2021-10-01 00:00:00
2498,South Korean Regulators Deserting Jobs for Pos...,/news/south-korean-regulators-deserting-jobs-f...,2021-09-30 20:00:00
