In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
import pymongo

In [2]:
def finding_raw_urls_from_base(url):
    """Collect the unique Moneycontrol business-news links found on a page.

    Loads ``url`` in a fresh headless Chrome session, waits up to 60 seconds
    for anchor tags to be present, then parses the rendered HTML and keeps
    every ``href`` that starts with
    ``https://www.moneycontrol.com/news/business/``.

    Args:
        url: Address of the page to load.

    Returns:
        list[str]: The distinct matching hrefs. Duplicates are removed through
        a ``set``, so the order of the result is not guaranteed.

    Raises:
        Whatever Selenium raises while loading the page or waiting (for
        example a timeout when no anchor appears within 60 seconds). The
        browser is shut down before the exception propagates.
    """
    service = Service("./chromedriver/chromedriver.exe")
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)

        WebDriverWait(driver, 60).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, 'a'))
        )

        html = driver.page_source
    finally:
        # Quit even when loading or waiting fails; otherwise every failed call
        # leaves a headless Chrome process running.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    business_prefix = "https://www.moneycontrol.com/news/business/"
    unique_urls = {
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith(business_prefix)
    }
    return list(unique_urls)
    

In [3]:
# Scrape the links on page 3 of the stocks listing. To start from the first
# listing page instead, pass "https://www.moneycontrol.com/news/business/stocks/".
raw_urls = finding_raw_urls_from_base("https://www.moneycontrol.com/news/business/stocks/page-3")

In [4]:
# Inspect the de-duplicated business-news links returned by the scrape.
raw_urls

['https://www.moneycontrol.com/news/business/stocks/page-2/',
 'https://www.moneycontrol.com/news/business/markets/chartist-talks-sbi-securities-sudeep-shah-picks-these-2-stocks-for-january-12903479.html',
 'https://www.moneycontrol.com/news/business/companies',
 'https://www.moneycontrol.com/news/business/mutual-funds/',
 'https://www.moneycontrol.com/news/business/ipo/standard-glass-lining-ipo-sails-through-within-20-minutes-on-day-1-gmp-surges-to-nearly-70-12903966.html',
 'https://www.moneycontrol.com/news/business/stocks/page-30/',
 'https://www.moneycontrol.com/news/business/markets/bajaj-finance-q3-biz-update-aum-rises-28-records-highest-ever-quarterly-customer-additions-12903288.html',
 'https://www.moneycontrol.com/news/business/markets/nishant-pitti-says-easy-trip-planners-share-sale-for-personal-reasons-no-plan-to-sell-further-stake-12902971.html',
 'https://www.moneycontrol.com/news/business/ipo/',
 'https://www.moneycontrol.com/news/business/ipo/indo-farm-equipment-share-p

In [5]:
# Module-level buckets filled by extract_urls(); later cells read them directly.
stock_urls = []
market_urls = []
next_page_urls = []
other_urls = []
ipo_urls = []


def extract_urls(all_urls):
    """Sort Moneycontrol links into the module-level bucket lists.

    Each link in ``all_urls`` is appended to exactly one of
    ``next_page_urls``, ``stock_urls``, ``market_urls``, ``ipo_urls`` or
    ``other_urls``, checked in that order.

    Args:
        all_urls: Iterable of URL strings to classify.

    Returns:
        None. The results are the appended entries in the bucket lists; the
        lists are not cleared first, so calling this again accumulates.
    """
    # Pagination links look like ".../stocks/page-<number>" with an optional
    # trailing slash. Requiring the digits keeps article slugs that merely
    # begin with "page" (e.g. "page-industries-...") in the stocks bucket.
    pagination_pattern = re.compile(
        r"https://www\.moneycontrol\.com/news/business/stocks/page-\d+/?(?:[?#].*)?$"
    )

    for link in all_urls:
        if pagination_pattern.match(link):
            next_page_urls.append(link)
        elif link.startswith("https://www.moneycontrol.com/news/business/stocks/"):
            stock_urls.append(link)
        elif link.startswith("https://www.moneycontrol.com/news/business/markets/"):
            market_urls.append(link)
        elif link.startswith("https://www.moneycontrol.com/news/business/ipo/"):
            ipo_urls.append(link)
        else:
            other_urls.append(link)

In [6]:
# Fill the module-level bucket lists. The function appends without clearing,
# so re-running this cell adds the same links again.
extract_urls(raw_urls)

In [7]:
# Inspect the links that were placed in the stocks bucket.
stock_urls

['https://www.moneycontrol.com/news/business/stocks/buy-radico-khaitan-target-of-rs-2996-sharekhan-12902869.html',
 'https://www.moneycontrol.com/news/business/stocks/these-50-smallcaps-gain-between-10-40-as-broader-indices-outperform-12903245.html',
 'https://www.moneycontrol.com/news/business/stocks/',
 'https://www.moneycontrol.com/news/business/stocks/ntpc-renewable-energy-bags-1-000-mw-solar-power-project-12903522.html',
 'https://www.moneycontrol.com/news/business/stocks/iti-shares-hit-20-upper-circuit-amid-high-volumes-to-post-biggest-single-day-gain-in-9-months-12902808.html']

In [8]:

# Keep only links whose path segment after ".../stocks/" carries a file
# extension (2-6 lowercase letters); the bare section index has none and is
# dropped by the match.
regex = r'https:\/\/www\.moneycontrol\.com\/news\/business\/stocks\/[^\/\s]+(?:\.[a-z]{2,6})(?:[\/\?].*)?'

final_stocks_urls = list(filter(re.compile(regex).match, stock_urls))

In [9]:
# Inspect the stock links that passed the article-URL regex filter.
final_stocks_urls

['https://www.moneycontrol.com/news/business/stocks/buy-radico-khaitan-target-of-rs-2996-sharekhan-12902869.html',
 'https://www.moneycontrol.com/news/business/stocks/these-50-smallcaps-gain-between-10-40-as-broader-indices-outperform-12903245.html',
 'https://www.moneycontrol.com/news/business/stocks/ntpc-renewable-energy-bags-1-000-mw-solar-power-project-12903522.html',
 'https://www.moneycontrol.com/news/business/stocks/iti-shares-hit-20-upper-circuit-amid-high-volumes-to-post-biggest-single-day-gain-in-9-months-12902808.html']

In [10]:
# Handles for the MongoDB server on localhost: client -> database -> collection.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient.get_database("new_db")
mycol = mydb.get_collection('raw_news')

In [11]:
# One headless Chrome session kept at module level so it can be reused across
# page loads; it stays open until driver.quit() is called.
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.199 Safari/537.36")

# Path to the chromedriver binary, relative to the working directory.
service = Service("./chromedriver/chromedriver.exe")

driver = webdriver.Chrome(options=options, service=service)

In [12]:
def _stripped_text(tag):
    """Return the whitespace-stripped text of a parsed tag, or None if the tag is missing."""
    return tag.text.strip() if tag is not None else None


def extract_data(url):
    """Scrape one article page into a plain dict.

    Loads ``url`` in the module-level ``driver``, waits up to 60 seconds for
    ``document.readyState`` to become ``"complete"``, and parses the rendered
    HTML.

    Args:
        url: Address of the article page to scrape.

    Returns:
        dict: Always contains ``"content"`` (a list of paragraph strings,
        possibly empty). The keys ``"title"``, ``"desc"``, ``"date"``,
        ``"datetime"`` and ``"stock_name"`` are included only when the
        corresponding element exists on the page, so a page with a different
        layout yields a sparser dict instead of raising ``AttributeError``.

    Note:
        The driver is not closed here; the caller owns the browser session.
    """
    driver.get(url)

    WebDriverWait(driver, 60).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    news = {}

    title = _stripped_text(soup.find('h1', class_="article_title"))
    if title is not None:
        news["title"] = title

    desc = _stripped_text(soup.find('h2', class_='article_desc'))
    if desc is not None:
        news["desc"] = desc

    date_time_div = soup.find('div', class_="article_schedule")
    if date_time_div is not None:
        # "date" is the text of the first <span> inside the schedule block;
        # "datetime" is the text of the whole block.
        date = _stripped_text(date_time_div.find('span'))
        if date is not None:
            news["date"] = date
        news["datetime"] = date_time_div.text.strip()

    paragraphs_list = []
    paragraphs_div = soup.find('div', class_="content_wrapper")
    if paragraphs_div is not None:
        for p in paragraphs_div.find_all('p'):
            para_text = p.text.strip()
            # Skip short fragments (fewer than 50 characters).
            if len(para_text) < 50:
                continue
            # Skip paragraphs containing any of these boilerplate keywords.
            if re.search(r"(click\s+here|disclaimer|modal|window|advertisement|investment\s+tips)", para_text, re.IGNORECASE):
                continue
            paragraphs_list.append(para_text)

    news["content"] = paragraphs_list

    stock_name = _stripped_text(soup.find('a', class_="stock-name"))
    if stock_name is not None:
        news["stock_name"] = stock_name

    return news

In [13]:
# Scrape every filtered article URL and store the resulting dict as one
# document in the `raw_news` collection, printing the id of each insert.
# NOTE(review): nothing here checks for an existing document, so re-running
# this cell presumably stores the same articles again — confirm before re-running.
for url in final_stocks_urls :
    data = extract_data(url)
    x = mycol.insert_one(data)
    print(x.inserted_id)

677cb7bce75b8c80b424082e


677cb7e7e75b8c80b424082f


677cb7f5e75b8c80b4240830


677cb7fbe75b8c80b4240831


In [14]:
# Shut down the shared browser session now that scraping is finished.
driver.quit()   