In [1]:
import pymongo
from datetime import datetime

In [2]:
# Open a client against the MongoDB server at localhost:27017, then select the
# 'new_db' database and its 'raw_news' collection, which is read below.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client['new_db']
raw_collection = db['raw_news']

In [3]:
# Materialise every document of `raw_collection` into an in-memory list.
raw_data = list(raw_collection.find())

In [4]:
# Number of documents fetched from `raw_collection`.
len(raw_data)

252

In [5]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# import spacy


In [6]:
import nltk
# Fetch the NLTK data packages this notebook relies on: the 'punkt' /
# 'punkt_tab' tokenizer data, the 'stopwords' corpus and 'wordnet'.
# The value of the last call is what the cell displays.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AjayPatil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\AjayPatil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AjayPatil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\AjayPatil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Single shared lemmatizer instance; preprocess_news applies it to every token.
lemmatizer = WordNetLemmatizer()

In [8]:
# Compiled once instead of being looked up on every call.
_NON_LETTER_RE = re.compile(r"[^a-zA-Z\s]")
_WHITESPACE_RE = re.compile(r"\s+")
# Filled on first use so the stop-word corpus is read once, not once per article.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_news(news):
    """Build a cleaned, lemmatized text string from one news document.

    The title, description and content pieces are concatenated, stripped of
    every character that is not an ASCII letter or whitespace, lower-cased,
    tokenized, filtered against the English stop-word list and lemmatized with
    the notebook-level ``lemmatizer``.

    Args:
        news: Mapping with the keys ``'title'``, ``'desc'`` and ``'content'``;
            ``'content'`` is joined with single spaces, so it is expected to be
            an iterable of strings.

    Returns:
        The remaining lemmatized tokens joined by single spaces.

    Raises:
        KeyError: If one of the three keys is missing.
    """
    text = f"{news['title']} {news['desc']} {' '.join(news['content'])}"

    # NOTE(review): unwanted characters are deleted, not replaced by a space, so
    # "single-day" becomes "singleday" and digits vanish entirely — confirm
    # that this merging of tokens is intended.
    text = _NON_LETTER_RE.sub("", text)
    text = _WHITESPACE_RE.sub(" ", text).strip()

    text = text.lower()

    tokens = word_tokenize(text)

    stop_words = _english_stop_words()
    kept_tokens = [word for word in tokens if word not in stop_words]

    return " ".join(lemmatizer.lemmatize(token) for token in kept_tokens)

In [9]:
def preprocess_date(date):
    """Convert a date like ``"January 02, 2025"`` into ``"2025-01-02"``.

    The input is parsed with the format ``"%B %d, %Y"`` (full month name, day,
    four-digit year) and re-emitted as ``"%Y-%m-%d"``. ``datetime.strptime``
    raises ``ValueError`` when the input does not match that format.
    """
    return datetime.strptime(date, "%B %d, %Y").strftime("%Y-%m-%d")

In [10]:
def preprocess_datetime(raw_datetime):
    """Normalise a timestamp like ``"January 02, 2025 / 17:53 IST"``.

    The text before the first ``/`` is parsed as a ``"%B %d, %Y"`` date and
    rewritten as ``"%Y-%m-%d"``. The text between the first and second ``/``
    is taken as the time, with the literal ``"IST"`` removed.

    Args:
        raw_datetime: String of the form ``"<Month> <day>, <year> / <time> IST"``.
            The ``/ <time>`` part may be absent.

    Returns:
        ``"YYYY-MM-DD <time>"``, or just ``"YYYY-MM-DD"`` when there is no
        time part.

    Raises:
        ValueError: If the date part does not match ``"%B %d, %Y"``.
    """
    parts = raw_datetime.split("/")
    formatted_date = datetime.strptime(parts[0].strip(), "%B %d, %Y").strftime("%Y-%m-%d")
    # Without a "/" there is no second element; indexing it unconditionally
    # used to raise IndexError, so fall back to an empty time instead.
    time_text = parts[1].strip().replace("IST", "") if len(parts) > 1 else ""
    # The final strip removes the space left behind by "IST" or by an empty time.
    return f"{formatted_date} {time_text}".strip()

# print(preprocess_datetime("January 02, 2025 / 17:53 IST"))

In [11]:
from yahooquery import search

def extract_ticker_name(company_name):
    """Look up a ticker symbol for a company name via ``yahooquery.search``.

    The stripped name is passed to ``search``. The first entry of the result's
    ``'quotes'`` list that has a ``'symbol'`` and whose ``'longname'`` contains
    the name (case-insensitive substring match) supplies the ticker.

    Args:
        company_name: Company name to look up. ``None``, non-string values and
            blank strings yield ``None`` without calling ``search``.

    Returns:
        The matching quote's ``'symbol'``, or ``None`` when nothing matches.
    """
    # A missing or non-text name used to crash on ``.strip()``; there is
    # nothing to look up in that case.
    if not isinstance(company_name, str):
        return None

    company_name = company_name.strip()
    if not company_name:
        return None

    results = search(company_name)
    if not results or 'quotes' not in results:
        return None

    needle = company_name.lower()
    for quote in results['quotes']:
        if 'symbol' not in quote:
            continue
        # ``longname`` can be present but empty or None; skip such quotes
        # instead of failing on ``.lower()``.
        long_name = quote.get('longname')
        if long_name and needle in long_name.lower():
            return quote['symbol']
    return None

In [12]:
def get_complete_raw_news(news):
    """Return the title, description and content of ``news`` as one string.

    The pieces of ``news['content']`` are joined with single spaces, and the
    three parts are likewise separated by single spaces. No cleaning is applied.
    """
    body = ' '.join(news['content'])
    return f"{news['title']} {news['desc']} {body}"

In [13]:
# Accumulator for the processed documents; the loop over `raw_data` fills it.
processed_data = []

In [14]:
# Start from an empty list so that re-running this cell rebuilds the result
# instead of appending a second copy of every document.
processed_data = []
# stock_name -> ticker symbol (or None); each distinct name is looked up once
# rather than once per article that mentions it.
ticker_by_stock_name = {}

for news in raw_data:
    processed_text = preprocess_news(news)
    formatted_date = preprocess_date(news['date'])
    formatted_datetime = preprocess_datetime(news['datetime'])
    raw_news = get_complete_raw_news(news)

    # Documents without a "stock_name" key keep ticker_name = None.
    ticker_name = None
    if "stock_name" in news:
        stock_name = news["stock_name"]
        if stock_name not in ticker_by_stock_name:
            ticker_by_stock_name[stock_name] = extract_ticker_name(stock_name)
        ticker_name = ticker_by_stock_name[stock_name]

    # One record per raw document; `_id` is carried over from the raw document.
    processed_data.append({
        "_id": news["_id"],
        "raw_news": raw_news,
        "processed_text": processed_text,
        "date": formatted_date,
        "datetime": formatted_datetime,
        "stock_name": news.get("stock_name"),
        "ticker_name": ticker_name,
    })

In [15]:
# Number of processed records; the loop appends exactly one per raw document.
len(processed_data)

252

In [20]:
# Inspect the last processed record. Indexing from the end keeps this cell
# working when the number of documents changes (251 was len - 1 for 252 items).
processed_data[-1]

{'_id': ObjectId('677cb7fbe75b8c80b4240831'),
 'raw_news': "ITI shares hit 20% upper circuit amid high volumes to post biggest single-day gain in 9 months ITI shares rose a whopping 70% since April 2024. Shares of ITI Ltd. witnessed a significant surge today to post biggest single-day gain in 9 months\xa0at Rs 456.5\xa0apiece,\xa0driven by high trading volumes that neared 9 crore shares.\xa0The shares rose a whopping 70% since April 2024. On January 3, the stock soared by a substantial 20 percent i.e. adding Rs 76.20 to its value from its opening on Friday. This strong performance has pushed the company's market capitalisation to Rs 43,936.56 crore. The company had also made a recent announcement of securing a contract worth approximately Rs 95 crore from the Directorate of Geology & Mining, Government of Uttarakhand. This contract is for the implementation of a Mining Digital Transformation & Surveillance System (MDTSS) project. As per exchange filing, the MDTSS project aims to modern

In [21]:
# Write all processed records to the 'processed_news' collection in one batch.
# NOTE(review): every record reuses the `_id` of its raw document, so running
# this cell again against an already-populated collection will presumably fail
# on duplicate `_id` values — confirm before re-running.
collection = db['processed_news']
x = collection.insert_many(processed_data)

In [22]:
# The `_id` values reported back by insert_many.
x.inserted_ids

[ObjectId('6777a09f8581574bb8d03ac9'),
 ObjectId('6777a0a68581574bb8d03aca'),
 ObjectId('6777a0d08581574bb8d03acb'),
 ObjectId('6777a0d38581574bb8d03acc'),
 ObjectId('6777a0d78581574bb8d03acd'),
 ObjectId('6777a0d98581574bb8d03ace'),
 ObjectId('6777a0db8581574bb8d03acf'),
 ObjectId('6777a0e78581574bb8d03ad0'),
 ObjectId('6777a1448581574bb8d03ad2'),
 ObjectId('6777a1498581574bb8d03ad3'),
 ObjectId('6777a14c8581574bb8d03ad4'),
 ObjectId('6777a17b8581574bb8d03ad5'),
 ObjectId('6777a17d8581574bb8d03ad6'),
 ObjectId('6777a17f8581574bb8d03ad7'),
 ObjectId('6777a1d68581574bb8d03ad9'),
 ObjectId('6777a1d98581574bb8d03ada'),
 ObjectId('6777a1dc8581574bb8d03adb'),
 ObjectId('6777a1df8581574bb8d03adc'),
 ObjectId('6777a1e68581574bb8d03add'),
 ObjectId('6777a1e98581574bb8d03ade'),
 ObjectId('6777a2148581574bb8d03adf'),
 ObjectId('6777a2178581574bb8d03ae0'),
 ObjectId('6777a2738581574bb8d03ae2'),
 ObjectId('6777a2798581574bb8d03ae3'),
 ObjectId('6777a2a58581574bb8d03ae4'),
 ObjectId('6777a2a8858157

In [65]:
# Close the MongoDB client once all reads and writes are done.
client.close()