In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import joblib

In [4]:
# 1) Load the raw CSV, parse 'Date' (unparseable values become NaT) and
#    order the rows chronologically with a fresh 0..n-1 index.
df = (
    pd.read_csv("/Users/anubhavpawan/Downloads/Data.csv", encoding='latin1')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], errors='coerce'))
    .sort_values('Date')
    .reset_index(drop=True)
)

In [5]:
# 2) Combine Top1..Top25 into a single 'news' field
top_cols = [c for c in df.columns if c.lower().startswith('top')]


def _join_headlines(row):
    """Join one row's headlines into a single space-separated string.

    Missing values are removed *before* converting to text, so the result does
    not depend on how ``astype(str)`` renders NaN (newer pandas keeps missing
    values missing, which would otherwise leak literal 'nan' tokens into the
    text). Empty strings and the literal text 'nan' are skipped as well, which
    matches what the previous implementation filtered out.
    """
    texts = (str(value) for value in row if pd.notna(value))
    return ' '.join(text for text in texts if text and text != 'nan')


df['news'] = df[top_cols].apply(_join_headlines, axis=1)

In [6]:

# 3) Chronological hold-out: the first 80% of rows (df is already sorted by
#    Date) are used for training and the remaining 20% for testing.
split_idx = int(0.8 * len(df))
train, test = df.iloc[:split_idx].copy(), df.iloc[split_idx:].copy()

# Text features (missing text replaced by '') and integer labels per split.
X_train, y_train = train['news'].fillna(''), train['Label'].astype(int)
X_test, y_test = test['news'].fillna(''), test['Label'].astype(int)

In [7]:
# 4) Fast baseline: TF-IDF features (unigrams + bigrams, capped at 2000 terms,
#    English stop words removed) fed into a logistic regression.
tfidf = TfidfVectorizer(
    max_features=2000,
    ngram_range=(1,2),
    stop_words='english',
)
# The vocabulary is learned from the training split only; the test split is
# merely projected onto it.
Xtr = tfidf.fit_transform(X_train)
Xte = tfidf.transform(X_test)

lr = LogisticRegression(max_iter=1000, random_state=42).fit(Xtr, y_train)

# Hard class predictions and the probability of class 1 for the test split.
pred = lr.predict(Xte)
proba = lr.predict_proba(Xte)[:,1]

In [8]:
# 5) Metrics
# Scores computed from the hard predictions, printed in a fixed order.
_label_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
)
for _title, _score_fn in _label_metrics:
    print(_title, _score_fn(y_test, pred))

# ROC AUC is the only score here that uses the class-1 probabilities.
print("ROC AUC:", roc_auc_score(y_test, proba))

print("\nClassification Report:\n", classification_report(y_test, pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, pred))

Accuracy: 0.5225334957369062
Precision: 0.5391156462585034
Recall: 0.723744292237443
F1: 0.6179337231968811
ROC AUC: 0.4936931459160437

Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.29      0.36       383
           1       0.54      0.72      0.62       438

    accuracy                           0.52       821
   macro avg       0.51      0.51      0.49       821
weighted avg       0.51      0.52      0.50       821

Confusion Matrix:
 [[112 271]
 [121 317]]


In [9]:
# 6) Persist the fitted vectorizer and classifier together as one tuple so
#    they can be reloaded as a matching pair.
model_bundle = (tfidf, lr)
joblib.dump(model_bundle, "tfidf_lr_tuple.pkl")
print("Saved model to tfidf_lr_tuple.pkl")

Saved model to tfidf_lr_tuple.pkl


In [10]:

# 7) OPTIONAL: If you have price history (DataFrame 'prices' with 'Date','Open','High','Low','Close','Volume'):
#    you can compute technical indicators and merge with the news features before training.
def compute_technical_indicators(prices, sma_windows=(5, 10), ema_span=10, rsi_window=14):
    """Add moving-average and RSI columns derived from the 'Close' price.

    Parameters
    ----------
    prices : pandas.DataFrame
        Must contain a 'Date' column (used for ordering) and a 'Close' column.
        The input frame is not modified; a copy is used.
    sma_windows : iterable of int, default (5, 10)
        One simple-moving-average column ``SMA_<window>`` is added per entry.
    ema_span : int, default 10
        Span of the exponential moving average column ``EMA_<span>``.
    rsi_window : int, default 14
        Window of the RSI column ``RSI_<window>``. Gains and losses are
        averaged with a plain rolling mean.

    Returns
    -------
    pandas.DataFrame
        A copy sorted by 'Date' with the indicator columns appended. Every row
        containing a NaN in *any* column is dropped (this removes the warm-up
        rows of the rolling windows) and the index is reset to 0..n-1.

    Raises
    ------
    KeyError
        If 'Date' or 'Close' is missing from ``prices``.

    Notes
    -----
    With the defaults the output columns are SMA_5, SMA_10, EMA_10 and RSI_14.
    A small epsilon guards the RSI division, so a window without any losses
    gives an RSI just below 100, while a window with no price movement at all
    gives an RSI of 0.
    """
    # Fail early with a readable message instead of a bare pandas KeyError.
    missing = [col for col in ('Date', 'Close') if col not in prices.columns]
    if missing:
        raise KeyError(f"prices is missing required column(s): {missing}")

    p = prices.copy().sort_values('Date')
    close = p['Close']

    for window in sma_windows:
        p[f'SMA_{window}'] = close.rolling(window).mean()
    p[f'EMA_{ema_span}'] = close.ewm(span=ema_span, adjust=False).mean()

    # Simple RSI: split day-over-day changes into gains and (positive) losses.
    delta = close.diff()
    up = delta.clip(lower=0)
    down = -1 * delta.clip(upper=0)
    roll_up = up.rolling(rsi_window).mean()
    roll_down = down.rolling(rsi_window).mean()
    rs = roll_up / (roll_down + 1e-8)  # epsilon avoids division by zero
    p[f'RSI_{rsi_window}'] = 100.0 - (100.0 / (1.0 + rs))

    p = p.dropna().reset_index(drop=True)
    return p

In [5]:
import joblib

# Reload the (vectorizer, classifier) pair written by the save cell.
# NOTE: joblib.load unpickles arbitrary Python objects; only load model files
# that were produced by this notebook or another trusted source.
tfidf, lr = joblib.load("tfidf_lr_tuple.pkl")

# Ask user for a single news headline
headline = input("Enter a news headline: ")

# Transform and predict. The result is kept in its own variable so the
# test-set prediction array `pred` used by the metrics cell is not overwritten.
X_new = tfidf.transform([headline])
headline_pred = lr.predict(X_new)[0]

# An all-zero feature vector means none of the headline's terms are in the
# fitted vocabulary; the classifier then sees no evidence from the text.
if X_new.nnz == 0:
    print("Note: no known vocabulary terms in this headline; prediction reflects only the model's baseline.")

# Show result in friendly form
if headline_pred == 1:
    print("Dow & Jones likely to go UP")
else:
    print("Dow & Jones likely to go DOWN")

Enter a news headline:  A company went bankrupt


Dow & Jones likely to go UP
