In [9]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import yfinance as yf
from datetime import datetime


rss_feeds = {
"MarketWatch": "https://feeds.marketwatch.com/marketwatch/topstories",
"CNBC": "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000664"
}
news_data = []
for source, url in rss_feeds.items():
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'xml')
    items = soup.find_all('item')
for item in items:
    headline = item.title.text
    pub_date = pd.to_datetime(item.pubDate.text).date()
    news_data.append([source, headline, pub_date])
news_df = pd.DataFrame(news_data, columns=['source', 'headline', 'date'])
news_df.head()


stock = yf.download("AAPL", start="2024-01-01", end="2025-01-01")
# Flatten MultiIndex columns if present
if isinstance(stock.columns, pd.MultiIndex):
    stock.columns = stock.columns.get_level_values(0)

stock.reset_index(inplace=True)
stock['date'] = stock['Date'].dt.date
stock.head()

stock['Close_price'] = stock['Close']
stock['next_close'] = stock['Close_price'].shift(-1)
stock['price_up'] = (stock['next_close'] > stock['Close_price']).astype(int)
stock = stock.dropna()

from textblob import TextBlob
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity
news_df['sentiment'] = news_df['headline'].apply(get_sentiment)


sentiment_daily = news_df.groupby('date').agg(
sentiment_score=('sentiment', 'mean'),
positive_count=('sentiment', lambda x: (x > 0).sum()),
negative_count=('sentiment', lambda x: (x < 0).sum()),
neutral_count=('sentiment', lambda x: (x == 0).sum())
).reset_index()
sentiment_daily.head()


data = pd.merge(stock, sentiment_daily, on='date', how='left')
data.fillna(0, inplace=True)


features = ['sentiment_score', 'positive_count', 'negative_count', 'neutral_count']
target = 'price_up'
train = data[data['date'] < datetime(2024, 10, 1).date()]
test = data[data['date'] >= datetime(2024, 10, 1).date()]
X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print("Logistic Regression")
print(classification_report(y_test, y_pred_lr))


from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest")
print(classification_report(y_test, y_pred_rf))


from xgboost import XGBClassifier
xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("XGBoost")
print(classification_report(y_test, y_pred_xgb))


results = pd.DataFrame({
'Model': ['Logistic Regression', 'Random Forest', 'XGBoost'],
'Accuracy': [
accuracy_score(y_test, y_pred_lr),
accuracy_score(y_test, y_pred_rf),
accuracy_score(y_test, y_pred_xgb)
]
})
results

'''
10. Results and Insights
XGBoost achieved the highest accuracy, indicating its strength in capturing nonlinear patterns.
Logistic Regression served as a strong baseline but lacked complexity.
Random Forest improved over the baseline but was slightly less effective than boosting.
Sentiment-based features show predictive value but market noise limits performance.
'''

[*********************100%***********************]  1 of 1 completed


Logistic Regression
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        25
           1       0.60      1.00      0.75        38

    accuracy                           0.60        63
   macro avg       0.30      0.50      0.38        63
weighted avg       0.36      0.60      0.45        63



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        25
           1       0.60      1.00      0.75        38

    accuracy                           0.60        63
   macro avg       0.30      0.50      0.38        63
weighted avg       0.36      0.60      0.45        63

XGBoost


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        25
           1       0.60      1.00      0.75        38

    accuracy                           0.60        63
   macro avg       0.30      0.50      0.38        63
weighted avg       0.36      0.60      0.45        63



'\n10. Results and Insights\nXGBoost achieved the highest accuracy, indicating its strength in capturing nonlinear patterns.\nLogistic Regression served as a strong baseline but lacked complexity.\nRandom Forest improved over the baseline but was slightly less effective than boosting.\nSentiment-based features show predictive value but market noise limits performance.\n'

In [6]:

!pip install textblob


Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 624.3/624.3 kB 5.1 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.19.0
