In [None]:
import yfinance as yf
import pandas as pd
import requests
from datetime import datetime
import re
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Step 1: Download Historical Stock Prices
# `ticker` is the SENSEX symbol handed to yfinance. The two ISO-formatted
# date strings bound the analysis window and are reused later when the daily
# sentiment calendar is built.
ticker = '^BSESN'
start_date, end_date = '2020-01-01', '2023-01-01'

In [None]:
# Fetch the price history for `ticker` over the configured window and, in the
# same expression, discard every row that contains a missing value.
sensex_data = yf.download(ticker, start=start_date, end=end_date).dropna()

[*********************100%%**********************]  1 of 1 completed


In [None]:
import pandas as pd

# Load the news headlines into `data`.
#
# The CSV is parsed in chunks of `chunksize` rows. The chunks are collected in
# a list and concatenated once at the end; concatenating inside the loop would
# re-copy everything read so far on every iteration.
chunksize = 100000  # Adjust the chunk size based on your system's memory capacity

try:
    # quoting=3 is csv.QUOTE_NONE: quote characters are kept as literal text,
    # which is why the headlines printed below still carry their '"' marks.
    # on_bad_lines='skip' silently drops rows the parser cannot tokenize.
    # The context manager closes the underlying file even if parsing fails.
    with pd.read_csv('india-news-headlines.csv', chunksize=chunksize,
                     on_bad_lines='skip', quoting=3) as reader:
        chunks = list(reader)
except pd.errors.ParserError as e:
    # Report and re-raise: continuing with an empty or partially loaded frame
    # would only surface later as a confusing KeyError or as silently
    # truncated data.
    print("Error parsing CSV file:", e)
    raise

# Guard the empty case because pd.concat raises on an empty list.
data = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
print("Data loaded successfully.")

# Display the first few rows of the dataset
print(data.head())


Data loaded successfully.
   publish_date headline_category  \
0      20010102           unknown   
1      20010102           unknown   
2      20010102           unknown   
3      20010102           unknown   
4      20010102           unknown   

                                       headline_text  
0  "Status quo will not be disturbed at Ayodhya; ...  
1              "Fissures in Hurriyat over Pak visit"  
2            "America's unwanted heading for India?"  
3               "For bigwigs; it is destination Goa"  
4             "Extra buses to clear tourist traffic"  


In [None]:
# Parse the YYYYMMDD publish dates into datetimes. Values that do not match
# the format become NaT (errors='coerce') instead of raising.
parsed_dates = pd.to_datetime(data['publish_date'], format='%Y%m%d', errors='coerce')
data['publish_date'] = parsed_dates

**Step 3: Data Preprocessing**

In [None]:
# Clean text data
def clean_text(text):
    """Normalize a headline string before sentiment scoring.

    Punctuation is removed first and whitespace is collapsed afterwards, so
    gaps left behind by deleted punctuation (e.g. ``"a , b"``) do not survive
    as double spaces. Leading and trailing whitespace is trimmed as well.

    Args:
        text: The raw headline as a ``str``.

    Returns:
        The lowercased text with every character that is neither a word
        character (``\\w``, which includes ``_``) nor whitespace removed, and
        each whitespace run reduced to a single space.
    """
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse whitespace runs, trim the ends
    return text.lower()  # Convert to lowercase

In [None]:
# Coerce each headline to `str` first (the regex calls inside clean_text
# reject non-string values), then normalize it.
data['headline_text'] = data['headline_text'].map(lambda headline: clean_text(str(headline)))

In [None]:
# Perform Sentiment Analysis
def get_sentiment_score(text):
    """Return the TextBlob sentiment polarity computed for ``text``."""
    return TextBlob(text).sentiment.polarity


# Score every cleaned headline and store the result alongside it.
data['sentiment_score'] = data['headline_text'].map(get_sentiment_score)

**Step 4: Feature Engineering**

In [None]:
# Averaging sentiment scores per day
# Group on normalized Timestamps (time set to midnight) rather than on
# `datetime.date` objects: the grouped index is then already a DatetimeIndex,
# so the reindex below matches it against the date_range directly instead of
# relying on pandas promoting an object-dtype index of dates.
daily_sentiment = data.groupby(data['publish_date'].dt.normalize())['sentiment_score'].mean()
# Expand to one row per calendar day in the analysis window. Days with no
# headlines are given a sentiment of 0 rather than NaN.
daily_sentiment = daily_sentiment.reindex(pd.date_range(start=start_date, end=end_date), fill_value=0)

In [None]:
# Attach the daily sentiment to the price table. The series is aligned
# explicitly on the price index, so each price row receives the sentiment
# recorded for its own date.
sensex_data['Sentiment'] = daily_sentiment.reindex(sensex_data.index)

In [None]:
# Discard any row that picked up a NaN while the sentiment column was attached.
sensex_data = sensex_data.dropna()

In [None]:
# Features and target variable
# Each row's features are paired with the `Close` of the following row. The
# final row has no following row, so it is left out of both X and y.
feature_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Sentiment']
X = sensex_data[feature_columns].iloc[:-1]
y = sensex_data['Close'].shift(-1).dropna()

In [None]:
# Split the data into training and testing sets
# The rows form a time series, so the split must be chronological: the first
# 80% of rows train the model and the last 20% are held out. The default
# shuffled split mixes future days into the training set, which leaks
# look-ahead information and inflates the evaluation scores.
# NOTE(review): assumes `sensex_data` is sorted by date ascending - confirm.
# Metrics recorded from an earlier shuffled run will change after re-running.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
# Standardize the features
# The scaler's statistics are learned from the training rows only and the same
# fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Step 5: Model Development
# Fit a linear regression on the standardized training features; the target
# `y_train` holds the next row's `Close` for each training row.
model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [None]:
# Step 6: Prediction
# Predict the target for the held-out rows, using the features scaled with the
# scaler that was fitted on the training split.
y_pred = model.predict(X_test_scaled)

In [None]:
# Step 7: Model Evaluation
# Compare the held-out targets with the predictions and report both metrics.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}', f'R^2 Score: {r2}', sep='\n')

Mean Squared Error: 573243.4050014545
R^2 Score: 0.9942249884606744


In [None]:
# Optional: Display the first few predictions
# Start from the held-out targets (keeping their date index) and add the
# model's predictions as a second column next to them.
predictions_df = y_test.to_frame('Actual').assign(Predicted=y_pred)
print(predictions_df.head())

                  Actual     Predicted
Date                                  
2022-06-17  51597.839844  51403.130100
2022-03-07  53424.089844  52859.536564
2022-10-19  59202.898438  59101.252554
2021-06-03  52100.050781  52248.737050
2020-09-14  39044.351562  38756.116237
