## Scrape News data

In [5]:
# import statements
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver.common.by import By
import re
from datetime import datetime, timedelta
import pandas as pd
import torch # for finbert model
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Set Chrome options
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run in headless mode (no UI)

driver = webdriver.Chrome(service = Service(ChromeDriverManager().install()), options=chrome_options)

In [15]:
driver.get("https://news.google.com/search?q=infosys&hl=en-IN&gl=IN&ceid=IN%3Aen")
time.sleep(3)  # Wait for the page to load

In [16]:
# Extract article titles
articles = driver.find_elements(By.CLASS_NAME, "JtKRv")  # Target anchor tags with class "JtKRv"

# Extract and print the titles
for article in articles[:5]:
    title = article.text  # Extract the text inside the <a> tag
    print(title)

Infosys terminates more trainees from Mysuru campus; offers alternative career path
Infosys to trainees as it announces more layoffs at Mysuru campus: Please be informed that you have not m
Infosys layoffs: Bad news for Infosys employees as Narayana Murthy's company fires...., move set to affect...
Cognizant has message for TCS, Infosys, Wipro, HCL Tech & other peers
Infosys, Wipro, TCS on US hiring blacklist? Leaked recruiter email says…


In [17]:
# Extract dates
date_elements = driver.find_elements(By.CLASS_NAME, "hvbAAd")  # Target <time> elements

# Extract and print the dates
for date_element in date_elements[:5]:
    date_text = date_element.text  # Extract the visible text (e.g., "2 days ago")
    date_time = date_element.get_attribute("datetime")  # Extract the actual datetime attribute
    print(f"Published: {date_text} | Datetime: {date_time}")

Published: 2 days ago | Datetime: 2025-03-27T03:01:25Z
Published: 2 days ago | Datetime: 2025-03-27T09:20:00Z
Published: 2 days ago | Datetime: 2025-03-27T03:42:50Z
Published: 2 days ago | Datetime: 2025-03-27T05:53:56Z
Published: Yesterday | Datetime: 2025-03-28T06:46:32Z


In [10]:
driver.quit()

In [18]:
# Get Current Date for conversions
current_date = datetime.today()

In [19]:
# Function to convert date formats
def convert_date(date_element):
    """Convert various date formats to 'YYYY-MM-DD'."""
    try:
        # Extract datetime attribute if available
        if hasattr(date_element, "get_attribute"):
            datetime_attr = date_element.get_attribute("datetime")
            if datetime_attr:
                return datetime_attr[:10]  # Extract 'YYYY-MM-DD'

            date_text = date_element.text.strip()
        else:
            date_text = date_element.strip()

        # Handle empty dates
        if not date_text:
            return "Unknown"

        # Handle 'X days ago'
        days_match = re.match(r'(\d+)\s+days?\s+ago', date_text)
        if days_match:
            return (datetime.today() - timedelta(days=int(days_match.group(1)))).strftime("%Y-%m-%d")

        # Handle 'X hours ago', 'X minutes ago', 'Just now'
        if any(unit in date_text for unit in ["hour", "minute", "just now"]):
            return datetime.today().strftime("%Y-%m-%d")

        # Handle 'DD-MMM' format (e.g., '10-Mar')
        try:
            return datetime.strptime(f"{date_text}-2024", "%d-%b-%Y").strftime("%Y-%m-%d")
        except ValueError:
            return "Unknown"

    except Exception as e:
        print(f"Error parsing date: {e}")  # Debugging
        return "Unknown"

In [20]:
# Store extracted data
news_data = []

# Ensure we don't exceed the number of available elements
for i in range(min(len(articles), len(date_elements))):
    title = articles[i].text.strip()  # Extract news title
    raw_date = date_elements[i].text.strip()  # Extract human-readable date
    formatted_date = convert_date(raw_date)  # Convert date to 'YYYY-MM-DD' format
    news_data.append({"Title": title, "Published Date": formatted_date})

In [21]:
news_data[:2]

[{'Title': 'Infosys terminates more trainees from Mysuru campus; offers alternative career path',
  'Published Date': '2025-03-27'},
 {'Title': 'Infosys to trainees as it announces more layoffs at Mysuru campus: Please be informed that you have not m',
  'Published Date': '2025-03-27'}]

In [22]:
# Create DataFrame
df = pd.DataFrame(news_data)
df.head()

Unnamed: 0,Title,Published Date
0,Infosys terminates more trainees from Mysuru c...,2025-03-27
1,Infosys to trainees as it announces more layof...,2025-03-27
2,Infosys layoffs: Bad news for Infosys employee...,2025-03-27
3,"Cognizant has message for TCS, Infosys, Wipro,...",2025-03-27
4,"Infosys, Wipro, TCS on US hiring blacklist? Le...",Unknown


In [23]:
df['Published Date'].value_counts().sort_values(ascending=False)

Published Date
2025-03-27    35
Unknown       20
2025-03-29     8
2025-03-26     8
2025-03-25     8
2025-03-07     4
2025-03-19     4
2025-03-06     3
2025-03-21     3
2025-03-13     2
2025-03-20     2
2025-03-17     1
2025-03-15     1
2025-03-22     1
2025-03-14     1
2025-03-24     1
Name: count, dtype: int64

There are 15 unknown date values. so removing them, we get

In [24]:
df = df[df["Published Date"] != "Unknown"] # remove 'unknown'
df.to_csv('Data/news_dataset.csv', index=False)
df.shape

(82, 2)

## Sentiment Analysis

Now that we have scraped the news on INFOSYS and stored in a dataframe along with dates published, we move to sentiment analysis using finvert model

In [25]:
# load the news_data
news_data = pd.read_csv('Data/news_dataset.csv')
news_data.head()

Unnamed: 0,Title,Published Date
0,Infosys terminates more trainees from Mysuru c...,2025-03-27
1,Infosys to trainees as it announces more layof...,2025-03-27
2,Infosys layoffs: Bad news for Infosys employee...,2025-03-27
3,"Cognizant has message for TCS, Infosys, Wipro,...",2025-03-27
4,"TCS, Infosys, HDFC Bank, HCL Tech among 5 key ...",2025-03-29


In [26]:
# Load FinBERT model
MODEL_NAME = "ProsusAI/finbert"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME)

# Load the pre-trained FinBERT model
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


In [16]:
# Perform sentiment analysis
headlines = news_data["Title"].dropna().tolist()  # Remove NaN values before analysis
results = sentiment_analyzer(headlines)
print(results[:1]) # for DEMO

[{'label': 'negative', 'score': 0.7016247510910034}]


In [17]:
# Extract sentiment labels and scores
labels = [result["label"] for result in results]
scores = [result["score"] for result in results]

# Add results to the DataFrame
news_data.loc[news_data["Title"].notna(), "Sentiment"] = labels
news_data.loc[news_data["Title"].notna(), "Sentiment Score"] = scores

news_data.head()

Unnamed: 0,Title,Published Date,Sentiment,Sentiment Score
0,"Infosys, Wipro, TCS among tech giants blacklis...",2025-03-25,negative,0.701625
1,"Dividend stocks in India: Infosys, HDFC Bank t...",2025-03-22,positive,0.559317
2,Jefferies says 'buy' Infosys on attractive ris...,2025-03-25,neutral,0.517249
3,"Infosys, Wipro, TCS may struggle in FY26 as US...",2025-03-22,negative,0.937908
4,Infosys shares rise nearly 3% despite Jefferie...,2025-03-25,positive,0.918781


In [18]:
# Perform One-Hot Encoding on 'Sentiment'
news_data = pd.get_dummies(news_data, columns=["Sentiment"], prefix="Sentiment")
news_data.to_csv("Data/news_data_with_sentiment.csv", index=False)
news_data.head()

Unnamed: 0,Title,Published Date,Sentiment Score,Sentiment_negative,Sentiment_neutral,Sentiment_positive
0,"Infosys, Wipro, TCS among tech giants blacklis...",2025-03-25,0.701625,True,False,False
1,"Dividend stocks in India: Infosys, HDFC Bank t...",2025-03-22,0.559317,False,False,True
2,Jefferies says 'buy' Infosys on attractive ris...,2025-03-25,0.517249,False,True,False
3,"Infosys, Wipro, TCS may struggle in FY26 as US...",2025-03-22,0.937908,True,False,False
4,Infosys shares rise nearly 3% despite Jefferie...,2025-03-25,0.918781,False,False,True


## Load Historical Data And Merge With News Data to Prepare the Final Data

In [None]:
# Load the dataset obtained from NSE website
nse_stock_data = pd.read_csv('Data/Quote-Equity-INFY-EQ-25-03-2024-to-25-03-2025.csv')
nse_stock_data.head()

Unnamed: 0,Date,series,OPEN,HIGH,LOW,PREV. CLOSE,ltp,close,vwap,52W H,52W L,VOLUME,VALUE,No of trades
0,25-Mar-2025,EQ,1605.0,1636.15,1605.0,1592.75,1619.25,1628.45,1626.09,2006.45,1358.35,9890140,16082264118.35,266112
1,24-Mar-2025,EQ,1597.95,1607.1,1572.7,1592.55,1593.95,1592.75,1588.76,2006.45,1358.35,8857726,14072823625.15,250512
2,21-Mar-2025,EQ,1577.95,1603.9,1563.65,1615.55,1595.4,1592.55,1591.34,2006.45,1358.35,10074677,16032262527.35,241185
3,20-Mar-2025,EQ,1592.0,1631.9,1592.0,1586.55,1609.05,1615.55,1616.31,2006.45,1358.35,7186750,11616004800.3,229276
4,19-Mar-2025,EQ,1603.0,1603.0,1572.8,1609.35,1589.1,1586.55,1583.14,2006.45,1358.35,7387068,11694751809.1,250311


In [24]:
# Load the dataset news data with sentiment
news_data = pd.read_csv("Data/news_data_with_sentiment.csv")
news_data.head()

Unnamed: 0,Title,Published Date,Sentiment Score,Sentiment_negative,Sentiment_neutral,Sentiment_positive
0,"Infosys, Wipro, TCS among tech giants blacklis...",2025-03-25,0.701625,True,False,False
1,"Dividend stocks in India: Infosys, HDFC Bank t...",2025-03-22,0.559317,False,False,True
2,Jefferies says 'buy' Infosys on attractive ris...,2025-03-25,0.517249,False,True,False
3,"Infosys, Wipro, TCS may struggle in FY26 as US...",2025-03-22,0.937908,True,False,False
4,Infosys shares rise nearly 3% despite Jefferie...,2025-03-25,0.918781,False,False,True


In [None]:
nse_stock_data.columns # inspect column names

Index(['Date ', 'series ', 'OPEN ', 'HIGH ', 'LOW ', 'PREV. CLOSE ', 'ltp ',
       'close ', 'vwap ', '52W H ', '52W L ', 'VOLUME ', 'VALUE ',
       'No of trades '],
      dtype='object')

In [33]:
# remove the trailing whitespaces
nse_stock_data.columns = nse_stock_data.columns.str.strip()

In [35]:
nse_stock_data.columns # inspect column names

Index(['Date', 'series', 'OPEN', 'HIGH', 'LOW', 'PREV. CLOSE', 'ltp', 'close',
       'vwap', '52W H', '52W L', 'VOLUME', 'VALUE', 'No of trades'],
      dtype='object')

In [37]:
# convert date columns from both datasets into same datetime format
news_data["Published Date"] = pd.to_datetime(news_data["Published Date"], format="%Y-%m-%d")
nse_stock_data["Date"] = pd.to_datetime(nse_stock_data["Date"], format="%d-%b-%Y")

In [39]:
# Print the dtype of date columns
print("news_data 'Published Date' dtype:", news_data["Published Date"].dtype)
print("nse_stock_data 'Date' dtype:", nse_stock_data["Date"].dtype)

news_data 'Published Date' dtype: datetime64[ns]
nse_stock_data 'Date' dtype: datetime64[ns]


In [42]:
# inner join on the date column
merged_data = pd.merge(nse_stock_data, news_data, left_on="Date", right_on="Published Date", how="inner")
merged_data.to_csv("Data/final_merged_data.csv", index=False) # save the df
merged_data.head()

Unnamed: 0,Date,series,OPEN,HIGH,LOW,PREV. CLOSE,ltp,close,vwap,52W H,52W L,VOLUME,VALUE,No of trades,Title,Published Date,Sentiment Score,Sentiment_negative,Sentiment_neutral,Sentiment_positive
0,2025-03-25,EQ,1605.0,1636.15,1605.0,1592.75,1619.25,1628.45,1626.09,2006.45,1358.35,9890140,16082264118.35,266112,"Infosys, Wipro, TCS among tech giants blacklis...",2025-03-25,0.701625,True,False,False
1,2025-03-25,EQ,1605.0,1636.15,1605.0,1592.75,1619.25,1628.45,1626.09,2006.45,1358.35,9890140,16082264118.35,266112,Jefferies says 'buy' Infosys on attractive ris...,2025-03-25,0.517249,False,True,False
2,2025-03-25,EQ,1605.0,1636.15,1605.0,1592.75,1619.25,1628.45,1626.09,2006.45,1358.35,9890140,16082264118.35,266112,Infosys shares rise nearly 3% despite Jefferie...,2025-03-25,0.918781,False,False,True
3,2025-03-25,EQ,1605.0,1636.15,1605.0,1592.75,1619.25,1628.45,1626.09,2006.45,1358.35,9890140,16082264118.35,266112,"TCS, HCLTech, Infosys, Wipro: Why IT stocks ar...",2025-03-25,0.713755,False,True,False
4,2025-03-25,EQ,1605.0,1636.15,1605.0,1592.75,1619.25,1628.45,1626.09,2006.45,1358.35,9890140,16082264118.35,266112,"Stock Recommendations Today: Suzlon Energy, LI...",2025-03-25,0.951364,False,True,False


In [41]:
merged_data.shape

(72, 20)