## Sentiment analysis
- Currently this does not work very well 
- Seems like the correct analysis is applied ~50% of the time

In [4]:
import mysql.connector
from transformers import pipeline
import numpy as np
from credentials import ipCred, usernameCred, passwordCred, databaseCred

# Use your fine-tuned model (change paths as needed)
classifier = pipeline(
    task="text-classification", 
    model="./finbert-finetuned", 
    tokenizer="./finbert-finetuned", 
    device=-1
)

ticker = 'AAPL'
db_config = {
    'host': ipCred,
    'user': usernameCred,
    'password': passwordCred,
    'database': databaseCred
}

conn = mysql.connector.connect(**db_config)

# --- Cursor A: Fetch rows that need sentiment scores ---
cursor_fetch = conn.cursor()
fetch_query = f"""
    SELECT news_id, summary
    FROM {ticker}_news
    WHERE sentiment IS NULL
    LIMIT 10000;
"""
cursor_fetch.execute(fetch_query)
rows = cursor_fetch.fetchall()
cursor_fetch.close()  # Close fetch cursor

if not rows:
    print("No rows to update.")
    conn.close()
    exit()

# Filter out rows with empty summaries and unzip IDs and summaries
id_summary_pairs = [(news_id, summary) for news_id, summary in rows if summary]
if not id_summary_pairs:
    print("No valid summaries found.")
    conn.close()
    exit()

news_ids, summaries = zip(*id_summary_pairs)

# Process summaries in batches
batch_size = 32
results = []
for i in range(0, len(summaries), batch_size):
    batch = list(summaries[i:i+batch_size])
    batch_results = classifier(batch, truncation=True)
    results.extend(batch_results)

# --- Cursor B: Update rows with both numeric and label sentiment ---
cursor_update = conn.cursor()
update_query = f"""
    UPDATE {ticker}_news
    SET sentiment = %s, sentiment_label = %s
    WHERE news_id = %s
"""

# Process each result: store numeric score and its corresponding label.
for news_id, result in zip(news_ids, results):
    r = result[0] if isinstance(result, list) else result
    label = r['label'].upper()  # e.g., "POSITIVE", "NEGATIVE", "NEUTRAL"
    score = r['score']
    
    # Compute bipolar sentiment for the numeric column:
    if label == "POSITIVE":
        sentiment_score = score
    elif label == "NEGATIVE":
        sentiment_score = -score
    else:
        sentiment_score = 0.0

    # Update both columns: numeric sentiment and the label string.
    cursor_update.execute(update_query, (sentiment_score, label, news_id))

conn.commit()
cursor_update.close()
conn.close()

print("Sentiment scores and labels updated successfully.")

Device set to use cpu


Sentiment scores and labels updated successfully.


### Possible Chat - PAID

In [None]:
import openai

openai.api_key = "YOUR_OPENAI_API_KEY"

def get_sentiment(text):
    prompt = f"Classify the sentiment of the following news summary as either POSITIVE, NEUTRAL, or NEGATIVE:\n\n{text}\n\nAnswer:"
    response = openai.Completion.create(
        engine="text-davinci-003",  # or "gpt-3.5-turbo" if using ChatCompletion API
        prompt=prompt,
        max_tokens=1,
        temperature=0.0,  # low temperature for deterministic output
    )
    label = response.choices[0].text.strip().upper()
    return label

# Example usage:
summary = "The company reported record earnings this quarter and shares soared."
print(get_sentiment(summary))  # Expected output: POSITIVE

## FINE TUNING FINBERT
# Using https://arc.net/l/quote/zbvxjftu this data set

### note: please do not run this if you do not have a beefy computer with a GPU you will cook your machine (actually) (genuinely)(this is warning)
##### Currently takes .5 hours on m4 mac mini pro w 24GB vRAM

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Convert CSV into Pandas Data frame
df = pd.read_csv("labeledNews.csv")  # columns: "Sentence", "Sentiment"

# convert pandas dataframe to hugging face dataset
dataset = Dataset.from_pandas(df)

#splitting the data randomly into training and testing (currently 80% train and 20% test)
train_test = dataset.train_test_split(test_size=0.2)
dataset_train = train_test["train"]
dataset_val = train_test["test"]

# cleaning and mapping the data
def cleanData(example):
    example["text"] = example["Sentence"]
    return example

# mapping the cleanData function to the dataset
dataset_train = dataset_train.map(cleanData)
dataset_val = dataset_val.map(cleanData)

labelToID = {"negative": 0, "neutral": 1, "positive": 2}

def encode_labels(example):
    example["label"] = labelToID[example["Sentiment"].lower()]
    return example

dataset_train = dataset_train.map(encode_labels)
dataset_val = dataset_val.map(encode_labels)

# tokenize the data
model_checkpoint = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)

dataset_train = dataset_train.map(tokenize_function, batched=True)
dataset_val = dataset_val.map(tokenize_function, batched=True)

# loading up the model
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=3,
    id2label={0: "negative", 1: "neutral", 2: "positive"},
    label2id={"negative": 0, "neutral": 1, "positive": 2},
    problem_type="single_label_classification"
)

# 7. Set up training args & Trainer
training_args = TrainingArguments(
    output_dir="./finbert-finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5, #learning rate higher means speeding up the training, but may overconverge (overshoot) where you want to be
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01, # does not let the model overfit the data
    logging_dir='./logs',
    logging_steps=50,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    tokenizer=tokenizer,
)

# 8. Train, save, evaluate
trainer.train()
model.save_pretrained("./finbert-finetuned")
tokenizer.save_pretrained("./finbert-finetuned")

results = trainer.evaluate()
print(results)

Map:   0%|          | 0/4673 [00:00<?, ? examples/s]

Map:   0%|          | 0/1169 [00:00<?, ? examples/s]

Map:   0%|          | 0/4673 [00:00<?, ? examples/s]

Map:   0%|          | 0/1169 [00:00<?, ? examples/s]

Map:   0%|          | 0/4673 [00:00<?, ? examples/s]

Map:   0%|          | 0/1169 [00:00<?, ? examples/s]

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'