In [2]:
! pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting beautifulsoup4 (from bs4)
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->bs4)
  Downloading soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
Downloading soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4

   ------------- -------------------------- 1/3 [beautifulsoup4]
   ---------------------------------------- 3/3 [bs4]

Successfully installed beautifulsoup4-4.13.4 bs4-0.0.2 soupsieve-2.7


In [5]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime

url = "https://in.investing.com/news/stock-market-news/nifty-rangebound-ahead-of-rbi-policy-sebi-ras-expect-subdued-expiry-session-4863820"

# Browser-like request headers sent with the page fetch.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    # "br" is intentionally not advertised: requests only decodes Brotli when an
    # optional Brotli package is installed. Without it a Brotli-compressed body
    # would reach BeautifulSoup undecoded and every lookup below would be empty.
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Connection": "keep-alive",
    "Referer": "https://www.google.com/"
}

# Bound the wait so a stalled connection cannot hang the cell indefinitely.
response = requests.get(url, headers=headers, timeout=30)

# Surface blocked or failed requests instead of silently parsing an error page.
if not response.ok:
    print(f"Request failed: HTTP {response.status_code} for {url}")

soup = BeautifulSoup(response.text, "html.parser")

# Extract date
# The date is expected inside a span with class "articleTime".
date_span = soup.find("span", class_="articleTime")
if date_span:
    date_str = date_span.text.strip()
    # Example date_str: "Jun 4, 2025 3:22PM IST"
    try:
        published_date = datetime.strptime(date_str, "%b %d, %Y %I:%M%p IST")
    except ValueError:
        published_date = date_str  # fallback to raw text if parsing fails
else:
    # No matching span in the parsed HTML.
    published_date = None

# Extract paragraphs inside the article container (empty list if it is missing).
article_div = soup.find("div", id="article")
paragraphs = article_div.find_all("p") if article_div else []

# One paragraph per line, with surrounding whitespace stripped.
full_text = "\n".join(p.get_text(strip=True) for p in paragraphs)

print("Published Date:", published_date)
print("Article Text:\n", full_text)


Published Date: None
Article Text:
 


In [None]:
@track_performance
    def get_closing_prices1(self, ticker):
        """Scrape the Yahoo Finance history table for ``ticker``.

        Opens a Chrome WebDriver configured with ``self.chrome_options``, loads
        the ticker's ``/history`` page, reads every row of the history table and
        builds a DataFrame whose first scraped row supplies the column names.

        Args:
            ticker: Symbol inserted into the Yahoo Finance quote URL.

        Returns:
            A DataFrame with two columns: ``date`` (parsed from text such as
            "Jun 04, 2025") and ``close`` (float, thousands separators removed).
            Returns ``None`` when a ``CustomException`` is caught.

        NOTE(review): only ``CustomException`` is handled here; any other
        exception propagates to the caller. The browser is closed either way.
        """
        # Sentinel so the finally clause knows whether a browser was started.
        driver = None
        try:
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=self.chrome_options)
            url = f'https://finance.yahoo.com/quote/{ticker}/history'
            driver.get(url)
            table = driver.find_element(By.XPATH, "//div[@class='container' and @data-testid='history-table']//table")
            rows = table.find_elements(By.TAG_NAME, 'tr')

            data = []
            for row in rows:
                cols = row.find_elements(By.TAG_NAME, 'td')
                if not cols:
                    # Rows without <td> cells are read through their <th> cells.
                    cols = row.find_elements(By.TAG_NAME, 'th')
                data.append([col.text.strip() for col in cols])

            df = pd.DataFrame(data)
            # Promote the first scraped row to column names, then drop it.
            df.columns = df.iloc[0]
            # rename() returns a new frame, so the column assignments below do
            # not write into a slice of the original frame.
            df = df[1:].rename(columns={'Date': 'date', 'Close': 'close'})

            df['date'] = pd.to_datetime(df['date'].apply(lambda x: datetime.strptime(x, "%b %d, %Y")))
            df['close'] = df['close'].apply(lambda x: float(x.replace(",", "")))

            return df[['date', 'close']]
        except CustomException as e:
            logger.error(f"[{ticker}] Failed to scrape data: {e}")
            return None
        finally:
            # Always release the browser, including when scraping or parsing fails.
            if driver is not None:
                driver.quit()

In [None]:
 @track_performance
    def extract_text(self, url):
        """Load ``url`` in Chrome and pull out the publication date and body text.

        Waits up to 10 seconds for the article paragraphs to be present and for
        the span whose text contains "Published" to be visible.

        Args:
            url: Address of the article page to open.

        Returns:
            ``[extracted_date, full_text]`` where ``extracted_date`` is a
            ``YYYY-MM-DD`` string and ``full_text`` is the non-blank paragraph
            texts joined by single spaces; ``None`` if a ``CustomException``
            is caught.
        """
        # Stays None until a browser has actually been started.
        browser = None
        try:
            browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=self.chrome_options)
            browser.get(url)

            paragraph_locator = (By.XPATH, "//*[@id='article'][@class='article_container']//p")
            paragraphs = WebDriverWait(browser, 10).until(
                EC.presence_of_all_elements_located(paragraph_locator)
            )

            date_locator = (By.XPATH, "//span[contains(text(),'Published')]")
            date_element = WebDriverWait(browser, 10).until(
                EC.visibility_of_element_located(date_locator)
            )
            date_text = date_element.text

            # The second space-separated token is parsed as a day-month-year date.
            raw_date = date_text.split(" ")[1].strip().replace(",", "")
            extracted_date = datetime.strptime(raw_date, "%d-%m-%Y").strftime("%Y-%m-%d")

            # Keep only paragraphs that contain something besides whitespace.
            kept_paragraphs = []
            for paragraph in paragraphs:
                if paragraph.text.strip():
                    kept_paragraphs.append(paragraph.text)
            full_text = " ".join(kept_paragraphs)

            return [extracted_date, full_text]
        except CustomException as e:
            logger.error(f"[SCRAPE ERROR] URL: {url}, Error: {e}")
            return None
        finally:
            # Close the browser only if it was started.
            if browser is not None:
                browser.quit()

In [6]:
import sqlite3
import pandas as pd

# Step 1: Connect to the SQLite database
conn = sqlite3.connect("Data/data.db")

# Step 2: Fetch the table into a DataFrame.
# Step 3: Close the connection. The finally clause releases it even when the
# query raises, so a failed run does not leave the connection open.
table_name = "news_data"
try:
    df = pd.read_sql_query(f"SELECT * FROM {table_name}", conn)
finally:
    conn.close()

# Step 4: Preview the first and last rows
print(df.head())
print(df.tail())

         date  mean_sentiment_score
0  2020-09-03              0.456088
1  2020-09-04              0.496634
2  2020-09-07              0.579715
3  2020-09-08              0.526186
4  2020-09-09              0.186779
            date  mean_sentiment_score
1402  2025-05-30              0.701223
1403  2025-06-02              0.624525
1404  2025-06-03              0.608422
1405  2025-06-04              0.573783
1406  2025-06-05              0.433324


In [3]:
from Model_Utils.feature_splitting_scaling import ScalingWithSplitStrategy
import pandas as pd
# Build the project's split-and-scale helper.
splitter = ScalingWithSplitStrategy()

# Load the preprocessed dataset, using the first CSV column as the index.
df = pd.read_csv("Data/processed_data/preprocessed_data.csv", index_col=[0])

# Leave the 'date' column out of the frame handed to the splitter.
df1 = df.drop(columns='date')

# apply() hands back six objects, unpacked here in the order it returns them.
(X_train, X_val, X_test,
 y_train, y_val, y_test) = splitter.apply(df1)
        

[2025-06-07 23:12:21,728] INFO - Running 'apply'...


[2025-06-07 23:12:21,908] INFO - Successfully applied scaling and data splitting.
[2025-06-07 23:12:21,918] INFO - 'apply' completed in 0.1902 sec
[2025-06-07 23:12:21,921] INFO - Memory used: 640.91 KB (peak: 878.40 KB)


In [4]:
# Display only the last row of df.
df.tail(1)

Unnamed: 0,date,mean_sentiment_score,nasdaq,sp500,dj30,crude_oil,gold,usd_inr,10yb,vix,...,sp500_pct_chg,dj30_pct_chg,crude_oil_pct_chg,gold_pct_chg,usd_inr_pct_chg,10yb_pct_chg,vix_pct_chg,nsebank_pct_chg,nsei_pct_chg,target
1147,2025-06-06,0.481459,19529.95,6000.36,42762.87,60.292708,0.23187,85.88,4.51,1.391643,...,1.028067,1.0471,2.163219,0.0,-0.023283,2.733485,-1.179332,1.466172,1.018751,


In [30]:
# Drop the 'date' column; errors='ignore' makes this a no-op when it is absent.
features = df.drop(columns=['date'], errors='ignore')
# Double brackets keep the last row as a one-row DataFrame rather than a Series.
last_row = features.iloc[[-1]]
# NOTE(review): the recorded run of this cell fails inside apply() with
# "ValueError: Found array with 0 sample(s) ... required by StandardScaler".
# Elsewhere in this notebook apply() is unpacked into six train/val/test
# objects, so it presumably fits the scaler and splits its input; a single row
# cannot be split that way. Presumably this row should be transformed with the
# already-fitted scaler instead. TODO confirm against ScalingWithSplitStrategy.
last_row_scaled = splitter.apply(last_row)
print(last_row)

[2025-06-07 12:12:49,471] INFO - Running 'apply'...


ValueError: Found array with 0 sample(s) (shape=(0, 22)) while a minimum of 1 is required by StandardScaler.