In [1]:
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
from bs4.element import Comment
import pandas as pd
import requests

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
#from webdriver_manager.chrome import ChromeDriverManager
#from webdriver_manager.core.os_manager import ChromeType


import time
from datetime import datetime

import os
import shutil
import subprocess


API_URL = "https://api-inference.huggingface.co/models/1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase"
headers = {"Authorization": ""}

def hf_query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()

def true_case_annotator(original_texts):
    inputs = """"""
    for i in range(len(original_texts)):
        original_texts[i] = original_texts[i].lower()
        inputs = inputs + original_texts[i] + '<ZXCSDZ>.'
    output = hf_query({"inputs":inputs})
    if 'error' in output:
        return False
    output = output[0]['generated_text']
    output_arr = output.split('<ZXCSDZ>.')[:-1]
    output_arr = [ele.replace("\\n", "").strip().strip('.') for ele in output_arr]

    return output_arr

In [2]:
def scrape_venturebeat_rss(no_uid=True):
    url = "https://venturebeat.com/category/ai/feed"
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    
    xml_data = requests.get(url,headers=headers).content
    soup = BeautifulSoup(xml_data, "xml")

    res = soup.find_all('item')

    arr = []
    for item in res:
        title = item.find('title').text.strip()
        link = item.find('link').text.strip()
        pub_date = item.find('pubDate').text.strip()
        pub_date = datetime.strptime(pub_date, "%a, %d %b %Y %H:%M:%S %z").date()
        description = item.find('description').text.strip()


        uid = len(arr)+1
        obj = {
            'title': title,
            'description': description,
            'pub_date': pub_date,
            'link': link,
            'source': 'VentureBeat'
        }
        
        if not no_uid: 
            obj['uid'] = uid
        arr.append(obj)
        
    return arr

In [3]:
scrape_venturebeat_rss()

[{'title': 'Google DeepMind unveils ‘superhuman’ AI system that excels in fact-checking, saving costs and improving accuracy',
  'description': "Google DeepMind researchers have developed a new AI system that excels in fact-checking, outperforming human annotators and saving costs, but critics question what 'superhuman' really means in this context.",
  'pub_date': datetime.date(2024, 3, 28),
  'link': 'https://venturebeat.com/ai/google-deepmind-unveils-superhuman-ai-system-that-excels-in-fact-checking-saving-costs-and-improving-accuracy/',
  'source': 'VentureBeat'},
 {'title': 'SambaNova announces new AI Samba-CoE v0.2 that already beats Databricks DBRX',
  'description': 'What makes this achievement particularly notable is the efficiency of the model—it achieves these speeds without compromising on precision.',
  'pub_date': datetime.date(2024, 3, 28),
  'link': 'https://venturebeat.com/ai/sambanova-announces-new-ai-samba-coe-v0-2-that-already-beats-databricks-dbrx/',
  'source': 'V

In [4]:
def scrape_techcrunch_rss(no_uid=True):
    url = "https://techcrunch.com/feed"
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    
    xml_data = requests.get(url,headers=headers).content
    soup = BeautifulSoup(xml_data, "xml")

    res = soup.find_all('item')

    arr = []
    for item in res:
        title = item.find('title').text.strip()
        link = item.find('link').text.strip()
        pub_date = item.find('pubDate').text.strip()
        pub_date = datetime.strptime(pub_date, "%a, %d %b %Y %H:%M:%S %z").date()
        description_raw = item.find('description').text.strip()
        parsed_html = BeautifulSoup(description_raw, 'lxml')
        p_tags = parsed_html.find_all('p')
        description = p_tags[0].text.strip() if p_tags else description_raw


        uid = len(arr)+1
        obj = {
            'title': title,
            'description': description,
            'pub_date': pub_date,
            'link': link,
            'source': 'TechCrunch'
        }
        
        if not no_uid: 
            obj['uid'] = uid
        arr.append(obj)
        
    return arr

In [5]:
scrape_techcrunch_rss()

[{'title': 'Byju’s founder floats share offer to make peace with estranged investors',
  'description': 'Byju Raveendran, the founder of embattled edtech group Byju’s, has made a last-ditch attempt to placate disgruntled investors. He has just informed them that the board is weighing an offer of renounced shares — shares that a group of investors chose not to buy recently in protest —\xa0to prevent the dilution of their holdings ahead […]',
  'pub_date': datetime.date(2024, 3, 29),
  'link': 'https://techcrunch.com/2024/03/28/byjus-founder-floats-share-offer-to-make-peace-with-estranged-investors/',
  'source': 'TechCrunch'},
 {'title': 'X’s Grok chatbot will soon get an upgraded model, Grok-1.5',
  'description': 'X.ai, Elon Musk’s AI startup, has revealed its latest generative AI model, Grok-1.5. Set to power social network X’s Grok chatbot in the not-to-distant future (“in the coming days,” X.ai writes in a blog post), Grok-1.5 appears to be a measurable upgrade over its predecessor

In [6]:
def scrape_ft_rss(no_uid=True):
    url = "https://www.ft.com/artificial-intelligence?format=rss"
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    
    xml_data = requests.get(url,headers=headers).content
    soup = BeautifulSoup(xml_data, "xml")

    res = soup.find_all('item')

    arr = []
    for item in res:
        title = item.find('title').text.strip()
        link = item.find('link').text.strip()
        pub_date = item.find('pubDate').text.strip()
        pub_date = datetime.strptime(pub_date, "%a, %d %b %Y %H:%M:%S %Z").date()
        description = item.find('description').text.strip()
        description = description + '.' if len(description)>0 and description[-1] != '.' else description


        uid = len(arr)+1
        obj = {
            'title': title,
            'description': description,
            'pub_date': pub_date,
            'link': link,
            'source': 'Financial Times'
        }
        
        if not no_uid: 
            obj['uid'] = uid
        arr.append(obj)
        
    return arr

In [7]:
scrape_ft_rss()

[{'title': 'AI boom broadens out across Wall Street',
  'description': 'Reflected glory of chipmaker Nvidia has caused other stocks to surge.',
  'pub_date': datetime.date(2024, 3, 28),
  'link': 'https://www.ft.com/content/d91c9cd4-889c-4810-94ba-08b87a8397ef',
  'source': 'Financial Times'},
 {'title': 'AI advertising start-up valued at $4bn after fundraising',
  'description': 'The Brandtech Group will use funds to disrupt industry with machine-generated content and artificial intelligence.',
  'pub_date': datetime.date(2024, 3, 28),
  'link': 'https://www.ft.com/content/4c7bee10-51d3-489b-873a-765157af8aac',
  'source': 'Financial Times'},
 {'title': 'How Silicon Valley’s ‘Oppenheimer’ found lucrative trade in AI weapons',
  'description': 'Anduril Industries’ Palmer Luckey is leading the start-ups infiltrating the US government’s war machine.',
  'pub_date': datetime.date(2024, 3, 28),
  'link': 'https://www.ft.com/content/ce6f96f8-6ab8-4089-b7db-f99db22c2071',
  'source': 'Financ

In [8]:
def scrape_mitnews_rss(no_uid=True):
    url = "https://www.ft.com/artificial-intelligence?format=rss"
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    
    xml_data = requests.get(url,headers=headers).content
    soup = BeautifulSoup(xml_data, "xml")

    res = soup.find_all('item')

    arr = []
    for item in res:
        title = item.find('title').text.strip()
        link = item.find('link').text.strip()
        pub_date = item.find('pubDate').text.strip()
        pub_date = datetime.strptime(pub_date, "%a, %d %b %Y %H:%M:%S %Z").date()
        description = item.find('description').text.strip()
        description = description + '.' if len(description)>0 and description[-1] != '.' else description

        uid = len(arr)+1
        obj = {
            'title': title,
            'description': description,
            'pub_date': pub_date,
            'link': link,
            'source': 'MIT News'
        }
        
        if not no_uid: 
            obj['uid'] = uid
        arr.append(obj)
        
    return arr

In [9]:
scrape_mitnews_rss()

[{'title': 'AI boom broadens out across Wall Street',
  'description': 'Reflected glory of chipmaker Nvidia has caused other stocks to surge.',
  'pub_date': datetime.date(2024, 3, 28),
  'link': 'https://www.ft.com/content/d91c9cd4-889c-4810-94ba-08b87a8397ef',
  'source': 'MIT News'},
 {'title': 'AI advertising start-up valued at $4bn after fundraising',
  'description': 'The Brandtech Group will use funds to disrupt industry with machine-generated content and artificial intelligence.',
  'pub_date': datetime.date(2024, 3, 28),
  'link': 'https://www.ft.com/content/4c7bee10-51d3-489b-873a-765157af8aac',
  'source': 'MIT News'},
 {'title': 'How Silicon Valley’s ‘Oppenheimer’ found lucrative trade in AI weapons',
  'description': 'Anduril Industries’ Palmer Luckey is leading the start-ups infiltrating the US government’s war machine.',
  'pub_date': datetime.date(2024, 3, 28),
  'link': 'https://www.ft.com/content/ce6f96f8-6ab8-4089-b7db-f99db22c2071',
  'source': 'MIT News'},
 {'title

In [13]:
def rss_scraper(omit):
    to_scrape = ['VentureBeat', 'TechCrunch', 'Financial Times', 'MIT News']
    source_to_scraper_mapper = {'VentureBeat': scrape_venturebeat_rss(), 'TechCrunch': scrape_techcrunch_rss(), 
                                'Financial Times': scrape_ft_rss(), 'MIT News': scrape_mitnews_rss()}
    for source in to_scrape:
        if source in omit: 
            to_scrape.remove(source)
            
    news_df = pd.DataFrame()
    
    for source in to_scrape:
        arr_of_obj = source_to_scraper_mapper[source]
        df = pd.DataFrame(arr_of_obj)
        news_df = pd.concat([news_df, df])
    news_df = news_df[news_df['description'].str.len() > 25]
    return news_df.drop_duplicates(subset='title').reset_index(drop=True)
        
def generate_curated_news(curr_date_str, earliest_date_str=None, omit=[]):
    df = rss_scraper(omit)
    if earliest_date_str is not None:
        df = df[df['pub_date']>=datetime.strptime(earliest_date_str, "%d %b %Y").date()]
    df = df.sort_values('title').reset_index(drop=True)
    return df, curr_date_str


In [14]:
generate_curated_news('29 Mar 2024','23 Mar 2024')

(                                                title  \
 0   AI advertising start-up valued at $4bn after f...   
 1             AI boom broadens out across Wall Street   
 2   AI fuels startup success: 86% of founders repo...   
 3   AI is accelerating the energy transition, say ...   
 4   AI risk management startup ValidMind raises $8...   
 ..                                                ...   
 60  X’s Grok chatbot will soon get an upgraded mod...   
 61  YouTube now lets creators share exclusive Shor...   
 62  Zscaler finds enterprise AI adoption soars 600...   
 63  ‘Humanoid’ robot wave signals change on the pr...   
 64  ‘ShadowRay’ vulnerability on Ray framework exp...   
 
                                           description    pub_date  \
 0   The Brandtech Group will use funds to disrupt ...  2024-03-28   
 1   Reflected glory of chipmaker Nvidia has caused...  2024-03-28   
 2   New HubSpot report finds 86% of startup founde...  2024-03-27   
 3   The technology is