In [33]:
# Data manipulation and analysis
import pandas as pd

# Natural language processing
import nltk
from nltk.corpus import stopwords

# Sentiment analysis
from textblob import TextBlob                                          # Textblob
from transformers import BertTokenizer, BertForSequenceClassification  # Bert
from nltk.sentiment.vader import SentimentIntensityAnalyzer            # Vader
import openai                                                          # ChatGPT

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Custom modules
from fetch_news import MarketNews                                      # Fetch news
from utils import process_text                            # Processing text

# Plot styling: apply the seaborn grid theme first, then override individual
# matplotlib defaults afterwards so the theme does not replace them.
sns.set_style("whitegrid")
plt.rcParams.update({
    "lines.linewidth": 1,
    "axes.edgecolor": "k",
})

In [6]:
# Download the NLTK resources this notebook relies on: the VADER lexicon
# (for SentimentIntensityAnalyzer) and the stopwords corpus, which
# stopwords.words("english") needs when the headlines are pre-processed.
# Without the stopwords corpus that call raises LookupError on a fresh machine.
# Packages that are already up to date are not downloaded again.
nltk.download('vader_lexicon')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aungs_tko91wk\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [7]:
# Show the signature and docstring of the MarketNews class using IPython's `?`
# introspection (this is IPython syntax, not plain Python).
# The cell assigns nothing, so it is a development aid only.
MarketNews?

Init signature: MarketNews(api_key)
Docstring:     
Fetches market news for a given stock ticker and index.

Attributes:
    url (str): URL endpoint for news API.
    query (dict): Parameters for API request.
    articles (list): List of fetched articles.
    data (DataFrame): Processed articles in DataFrame format.
    _api_key (str): API key for accessing the news API.
Init docstring: Initializes MarketNews with an API key.
File:           c:\users\aungs_tko91wk\eigen\wip[sentimentanalysis]\fetch_news\news.py
Type:           type
Subclasses:     

In [8]:
# Ticker under analysis. It is also used as the name of the raw-headline column
# and as part of the cached headlines CSV file name.
ticker = "DIS"

# The live news download below is disabled; the notebook reads headlines from a
# saved CSV instead. To re-fetch, uncomment the block and supply a real API key
# (prefer reading it from an environment variable over pasting it here).
# index = "NYSE"
# api_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxx"
# base_url = "https://api.markets.sh/api/v1/symbols"

# start = "2022-09-17"
# end = "2023-09-17"

# news = MarketNews(api_key=api_key)

# news.get_news(
#     index,
#     ticker,
#     base_url=base_url,
#     start=start,
#     end=end,
#     limit=10000
# )

In [9]:
# Load the saved headlines; the first CSV column becomes a parsed datetime index.
headlines = pd.read_csv(
    f"data/{ticker}_headlines.csv",
    index_col=0,
    parse_dates=True,
)

# Report the period covered by the headlines.
news_start, news_end = headlines.index.min(), headlines.index.max()
news_span = news_end - news_start

print(f"News start date: {news_start}")
print(f"News end date: {news_end}")
print(f"News timeframe: {news_span.days} days")

News start date: 2022-10-06 16:25:00
News end date: 2023-09-14 22:25:20
News timeframe: 343 days


In [10]:
# Discard every row that has a missing value in any column, then preview the result.
headlines = headlines.dropna(axis="index", how="any")
headlines.head()

Unnamed: 0,DIS
2022-10-06 16:25:00,JAKKS Pacific: Toy Company On The Mend; Watch ...
2022-10-06 17:06:00,Disney: Sanderson Sisters Lead The Fourth Quarter
2022-10-06 23:25:00,JAKKS Pacific: Relative Undervaluation Makes I...
2022-10-06 23:41:09,ESPN Nears Large New Partnership With DraftKings
2022-10-06 23:57:00,DraftKings Shares Gain 8% After Report of Poss...


In [11]:
# Clean every headline with the project's process_text helper, asking it to
# lower-case the text and remove English stop words, and store the result in a
# new "processed_text" column next to the original headline.
stop_words = stopwords.words("english")
cleaning_options = {
    "lower_case": True,
    "remove_stopwords": True,
    "stop_words": stop_words,
}
headlines["processed_text"] = headlines[ticker].apply(process_text, **cleaning_options)

headlines.head()

Unnamed: 0,DIS,processed_text
2022-10-06 16:25:00,JAKKS Pacific: Toy Company On The Mend; Watch ...,jakks pacific toy company mend watch upcoming
2022-10-06 17:06:00,Disney: Sanderson Sisters Lead The Fourth Quarter,disney sanderson sisters lead fourth quarter
2022-10-06 23:25:00,JAKKS Pacific: Relative Undervaluation Makes I...,jakks pacific relative undervaluation makes buy
2022-10-06 23:41:09,ESPN Nears Large New Partnership With DraftKings,espn nears large new partnership draftkings
2022-10-06 23:57:00,DraftKings Shares Gain 8% After Report of Poss...,draftkings shares gain report possible espn pa...


In [12]:
# TextBlob scoring is disabled in this cell: the helpers below are kept commented
# out for reference and saved results are read from CSV instead.
# The disabled code scores "processed_text" and labels polarity > 0 as Positive,
# < 0 as Negative and exactly 0 as Neutral.
# NOTE(review): presumably data/textblob_sentiment.csv was written by this
# commented-out code — confirm before relying on it.

# def tb_sentiment(text):
#     sentiment_analysis = TextBlob(text)
#     return sentiment_analysis.sentiment

# def tb_extract_scores(text, aspect):
#     if aspect == "polarity":
#         return tb_sentiment(text)[0]
#     elif aspect == "subjectivity":
#         return tb_sentiment(text)[1]

# def tb_extract_sentiment(polarity):
#     return "Positive" if polarity > 0 else "Negative" if polarity < 0 else "Neutral"

# headlines_textblob = headlines.copy()
# headlines_textblob["textblob_polarity"] = headlines_textblob["processed_text"].apply(tb_extract_scores, aspect="polarity")
# headlines_textblob["textblob_subjectivity"] = headlines_textblob["processed_text"].apply(tb_extract_scores, aspect="subjectivity")
# headlines_textblob["textblob_sentiment"] = headlines_textblob["textblob_polarity"].apply(tb_extract_sentiment)

# Load the saved TextBlob results; the first CSV column becomes a parsed datetime index.
headlines_textblob = pd.read_csv("data/textblob_sentiment.csv", index_col=0, parse_dates=True)

In [13]:
# Preview the first rows of the loaded TextBlob results.
headlines_textblob.head()

Unnamed: 0,DIS,processed_text,textblob_polarity,textblob_subjectivity,textblob_sentiment
2022-10-06 16:25:00,JAKKS Pacific: Toy Company On The Mend; Watch ...,jakks pacific toy company mend watch upcoming,0.0,0.0,Neutral
2022-10-06 17:06:00,Disney: Sanderson Sisters Lead The Fourth Quarter,disney sanderson sisters lead fourth quarter,0.0,0.0,Neutral
2022-10-06 23:25:00,JAKKS Pacific: Relative Undervaluation Makes I...,jakks pacific relative undervaluation makes buy,0.0,0.0,Neutral
2022-10-06 23:41:09,ESPN Nears Large New Partnership With DraftKings,espn nears large new partnership draftkings,0.175325,0.441558,Positive
2022-10-06 23:57:00,DraftKings Shares Gain 8% After Report of Poss...,draftkings shares gain report possible espn pa...,0.0,1.0,Neutral


In [14]:
# BERT scoring (nlptown/bert-base-multilingual-uncased-sentiment) is disabled in
# this cell: the code below is kept commented out for reference and saved results
# are read from CSV instead.
# The disabled code scores the raw headline column (not "processed_text"), takes
# argmax of the logits + 1, and labels > 3 as Positive, < 3 as Negative, 3 as Neutral.
# NOTE(review): before re-enabling, add `import torch` (the code uses torch.no_grad
# and torch.argmax, but the import cell does not import torch); also note that
# bert_preprocess reads `model` as a global instead of receiving it as an argument.
# NOTE(review): presumably data/bert_sentiment.csv was written by this
# commented-out code — confirm before relying on it.

# def bert_preprocess(text, tokenizer):
#     tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
#     with torch.no_grad():
#         outputs = model(**tokens)
#     return int(torch.argmax(outputs.logits)) + 1

# def bert_sentiment(logit):
#     return "Positive" if logit > 3 else "Negative" if logit < 3 else "Neutral"

# def bert_extract_sentiment(text, tokenizer):
#     logit = bert_preprocess(text, tokenizer)
#     return bert_sentiment(logit)

# tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
# model = BertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# headlines_bert = headlines.copy()
# headlines_bert["bert_sentiment"] = headlines_bert[ticker].apply(bert_extract_sentiment, tokenizer=tokenizer)

# Load the saved BERT results; the first CSV column becomes a parsed datetime index.
headlines_bert = pd.read_csv("data/bert_sentiment.csv", index_col=0, parse_dates=True)

In [15]:
# Preview the first rows of the loaded BERT results.
headlines_bert.head()

Unnamed: 0,DIS,processed_text,bert_sentiment
2022-10-06 16:25:00,JAKKS Pacific: Toy Company On The Mend; Watch ...,jakks pacific toy company mend watch upcoming,Positive
2022-10-06 17:06:00,Disney: Sanderson Sisters Lead The Fourth Quarter,disney sanderson sisters lead fourth quarter,Positive
2022-10-06 23:25:00,JAKKS Pacific: Relative Undervaluation Makes I...,jakks pacific relative undervaluation makes buy,Neutral
2022-10-06 23:41:09,ESPN Nears Large New Partnership With DraftKings,espn nears large new partnership draftkings,Positive
2022-10-06 23:57:00,DraftKings Shares Gain 8% After Report of Poss...,draftkings shares gain report possible espn pa...,Negative


In [16]:
# VADER scoring is disabled in this cell: the code below is kept commented out
# for reference and saved results are read from CSV instead.
# The disabled code scores "processed_text" and labels a compound score > 0.05
# as Positive, < -0.05 as Negative, and anything in between as Neutral.
# NOTE(review): presumably data/vader_sentiment.csv was written by this
# commented-out code — confirm before relying on it.

# vader = SentimentIntensityAnalyzer()

# def vader_extract_scores(text, vader):
#     return vader.polarity_scores(text)

# def vader_get_score(text, vader, score):
#     return vader_extract_scores(text, vader)[score]

# def vader_extract_sentiment(text, vader):
#     score = vader_get_score(text, vader, score="compound")
#     return "Positive" if score > 0.05 else "Negative" if score < -0.05 else "Neutral"

# headlines_vader = headlines.copy()
# headlines_vader["vader_neu"] = headlines_vader["processed_text"].apply(vader_get_score, vader=vader, score="neu")
# headlines_vader["vader_pos"] = headlines_vader["processed_text"].apply(vader_get_score, vader=vader, score="pos")
# headlines_vader["vader_neg"] = headlines_vader["processed_text"].apply(vader_get_score, vader=vader, score="neg")
# headlines_vader["vader_compound"] = headlines_vader["processed_text"].apply(vader_get_score, vader=vader, score="compound")
# headlines_vader["vader_sentiment"] = headlines_vader["processed_text"].apply(vader_extract_sentiment, vader=vader)

# Load the saved VADER results (first CSV column becomes a parsed datetime index) and preview them.
headlines_vader = pd.read_csv("data/vader_sentiment.csv", index_col=0, parse_dates=True)
headlines_vader.head()

Unnamed: 0,DIS,processed_text,vader_neu,vader_pos,vader_neg,vader_compound,vader_sentiment
2022-10-06 16:25:00,JAKKS Pacific: Toy Company On The Mend; Watch ...,jakks pacific toy company mend watch upcoming,1.0,0.0,0.0,0.0,Neutral
2022-10-06 17:06:00,Disney: Sanderson Sisters Lead The Fourth Quarter,disney sanderson sisters lead fourth quarter,1.0,0.0,0.0,0.0,Neutral
2022-10-06 23:25:00,JAKKS Pacific: Relative Undervaluation Makes I...,jakks pacific relative undervaluation makes buy,1.0,0.0,0.0,0.0,Neutral
2022-10-06 23:41:09,ESPN Nears Large New Partnership With DraftKings,espn nears large new partnership draftkings,1.0,0.0,0.0,0.0,Neutral
2022-10-06 23:57:00,DraftKings Shares Gain 8% After Report of Poss...,draftkings shares gain report possible espn pa...,0.472,0.528,0.0,0.6808,Positive


In [17]:
# Disabled: combines the TextBlob, BERT and VADER label columns with the headlines
# into a single `sentiment_analysis` frame. The notebook instead loads
# `sentiment_analysis` from data/sentiment_analysis.csv in a later cell.
# tb_sentiment = headlines_textblob["textblob_sentiment"]
# bert_sentiment = headlines_bert["bert_sentiment"]
# vader_sentiment = headlines_vader["vader_sentiment"]

# sentiments = pd.concat([tb_sentiment, bert_sentiment, vader_sentiment], axis=1)
# sentiment_analysis = pd.concat([headlines, sentiments], axis=1)

# sentiment_analysis

In [18]:
def gpt_sentiment(headline, api_key, model="gpt-3.5-turbo-16k"):
    """
    Classify the sentiment of a news headline with ChatGPT.

    Parameters:
    - headline (str): The headline text to classify.
    - api_key (str): OpenAI API key; assigned to ``openai.api_key`` before the request.
    - model (str): The chat model to use, default is "gpt-3.5-turbo-16k".

    Returns:
    - str: The assistant's reply with surrounding whitespace removed. The system
      prompt asks for 'Positive', 'Negative', or 'Neutral', but the reply is not
      validated here.
    """

    openai.api_key = api_key

    # Build the instruction from adjacent string literals. A backslash
    # continuation inside a single literal would embed the next source line's
    # indentation in the prompt text.
    system_prompt = (
        "You are a helpful assistant who will be performing sentiment analysis. "
        "Your task is to respond with 'Positive', 'Negative', or 'Neutral', and nothing else."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"What is the sentiment of the following headline: {headline}"},
    ]

    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
    )

    # Extract the assistant's reply; strip stray whitespace/newlines so the label
    # compares cleanly against 'Positive' / 'Negative' / 'Neutral' downstream.
    answer = response['choices'][0]['message']['content']
    return answer.strip()

# OPENAI_API_KEY = "xxxxxxxxxxxxxxxxxxxxxxxx"

In [19]:
# The ChatGPT labelling step is disabled: as written it would call gpt_sentiment
# once per headline and needs OPENAI_API_KEY to be defined. Saved results are
# loaded from CSV instead.
# NOTE(review): the commented line expects an existing `sentiment_analysis` frame
# (built in the disabled concatenation cell) — confirm the CSV matches that pipeline.
# sentiment_analysis["chatgpt_sentiment"] = sentiment_analysis[ticker].apply(gpt_sentiment, api_key=OPENAI_API_KEY)

# Load the combined labels from all four models; first CSV column becomes a parsed datetime index.
sentiment_analysis = pd.read_csv("data/sentiment_analysis.csv", index_col=0, parse_dates=True)
sentiment_analysis.head()

Unnamed: 0,DIS,processed_text,textblob_sentiment,bert_sentiment,vader_sentiment,chatgpt_sentiment
2022-10-06 16:25:00,JAKKS Pacific: Toy Company On The Mend; Watch ...,jakks pacific toy company mend watch upcoming,Neutral,Positive,Neutral,Neutral
2022-10-06 17:06:00,Disney: Sanderson Sisters Lead The Fourth Quarter,disney sanderson sisters lead fourth quarter,Neutral,Positive,Neutral,Neutral
2022-10-06 23:25:00,JAKKS Pacific: Relative Undervaluation Makes I...,jakks pacific relative undervaluation makes buy,Neutral,Neutral,Neutral,Positive
2022-10-06 23:41:09,ESPN Nears Large New Partnership With DraftKings,espn nears large new partnership draftkings,Positive,Positive,Neutral,Neutral
2022-10-06 23:57:00,DraftKings Shares Gain 8% After Report of Poss...,draftkings shares gain report possible espn pa...,Neutral,Negative,Positive,Positive


In [20]:
def different_sentiments(row):
    """Return True when TextBlob, BERT and VADER give three distinct labels for `row`.

    `row` is any mapping-like object indexable by the three "*_sentiment" column names.
    """
    model_columns = ("textblob_sentiment", "bert_sentiment", "vader_sentiment")
    distinct_labels = {row[column] for column in model_columns}
    return len(distinct_labels) == 3

# Work on a copy of the results without the "processed_text" column.
zoom_in = sentiment_analysis.drop(columns="processed_text")

# One boolean mask per ChatGPT label.
chatgpt_label = zoom_in["chatgpt_sentiment"]
chatgpt_negative = chatgpt_label == "Negative"
chatgpt_neutral = chatgpt_label == "Neutral"
chatgpt_positive = chatgpt_label == "Positive"

# Row-wise flag: True where TextBlob, BERT and VADER each give a different label.
disagreeing_other_models = zoom_in.apply(different_sentiments, axis=1)

In [21]:
# Number of example rows to draw from each comparison group.
n_samples = 3

# Rows that ChatGPT labelled "Neutral" while the other three models all gave different labels.
neutral_and_disputed = chatgpt_neutral & disagreeing_other_models
comparison_1 = zoom_in.loc[neutral_and_disputed]

# Draw the examples with a fixed seed so the same rows are selected on every run.
sample_1 = comparison_1.sample(n=n_samples, random_state=42)

sample_1

Unnamed: 0,DIS,textblob_sentiment,bert_sentiment,vader_sentiment,chatgpt_sentiment
2023-05-11 14:41:18,Remote and hybrid work look set to continue ac...,Negative,Positive,Neutral,Neutral
2023-04-03 20:53:41,Disney-DeSantis war of words heats up at annua...,Neutral,Positive,Negative,Neutral
2022-11-25 16:52:42,Apple Buying Manchester United? More Like Chin...,Neutral,Negative,Positive,Neutral


In [22]:
# The table above truncates long headlines, so list the full text of each
# headline in sample 1.
sample_1[ticker].to_list()

['Remote and hybrid work look set to continue across Europe',
 'Disney-DeSantis war of words heats up at annual meeting',
 'Apple Buying Manchester United? More Like Chinese Art Marrying French Rap']

In [23]:
# Rows that ChatGPT labelled "Positive" while the other three models all gave different labels.
positive_and_disputed = chatgpt_positive & disagreeing_other_models
comparison_2 = zoom_in.loc[positive_and_disputed]

# Draw the examples with a fixed seed so the same rows are selected on every run.
sample_2 = comparison_2.sample(n=n_samples, random_state=42)

sample_2

Unnamed: 0,DIS,textblob_sentiment,bert_sentiment,vader_sentiment,chatgpt_sentiment
2023-08-15 18:02:54,Linear TV viewing sinks below 50% as streaming...,Positive,Negative,Neutral,Positive
2023-06-24 05:21:51,Directors Guild of America votes to ratify new...,Positive,Negative,Neutral,Positive
2022-10-06 23:57:00,DraftKings Shares Gain 8% After Report of Poss...,Neutral,Negative,Positive,Positive


In [24]:
# The table above truncates long headlines, so list the full text of each
# headline in sample 2.
sample_2[ticker].to_list()

['Linear TV viewing sinks below 50% as streaming soars to new heights',
 'Directors Guild of America votes to ratify new contract with studios',
 'DraftKings Shares Gain 8% After Report of Possible ESPN Partnership']

In [25]:
# Rows that ChatGPT labelled "Negative" while the other three models all gave different labels.
negative_and_disputed = chatgpt_negative & disagreeing_other_models
comparison_3 = zoom_in.loc[negative_and_disputed]

# Draw the examples with a fixed seed so the same rows are selected on every run.
sample_3 = comparison_3.sample(n=n_samples, random_state=42)

sample_3

Unnamed: 0,DIS,textblob_sentiment,bert_sentiment,vader_sentiment,chatgpt_sentiment
2023-03-30 18:23:39,ABC News lays off several senior executives am...,Neutral,Positive,Negative,Negative
2022-11-29 04:36:40,Hoping to beat the tourist crowd on your trip ...,Neutral,Negative,Positive,Negative
2023-04-05 00:08:55,How Disney and DeSantis started feuding,Neutral,Positive,Negative,Negative


In [26]:
# The table above truncates long headlines, so list the full text of each
# headline in sample 3.
sample_3[ticker].to_list()

['ABC News lays off several senior executives amid broader Disney cost-cutting in ‘shock’ to newsroom',
 'Hoping to beat the tourist crowd on your trip to Japan? That ship has sailed',
 'How Disney and DeSantis started feuding']

In [32]:
# Keep only the headline, its processed text and the ChatGPT label (renamed to
# "sentiment"); the ChatGPT label is what this notebook saves as the reference label.
other_model_columns = ["textblob_sentiment", "bert_sentiment", "vader_sentiment"]
true_labels = (
    sentiment_analysis
    .drop(columns=other_model_columns)
    .rename(columns={"chatgpt_sentiment": "sentiment"})
)

# Preview the labelled data and write it to disk.
display(true_labels.head())
true_labels.to_csv("data/true_labels_gpt.csv")

Unnamed: 0,DIS,processed_text,sentiment
2022-10-06 16:25:00,JAKKS Pacific: Toy Company On The Mend; Watch ...,jakks pacific toy company mend watch upcoming,Neutral
2022-10-06 17:06:00,Disney: Sanderson Sisters Lead The Fourth Quarter,disney sanderson sisters lead fourth quarter,Neutral
2022-10-06 23:25:00,JAKKS Pacific: Relative Undervaluation Makes I...,jakks pacific relative undervaluation makes buy,Positive
2022-10-06 23:41:09,ESPN Nears Large New Partnership With DraftKings,espn nears large new partnership draftkings,Neutral
2022-10-06 23:57:00,DraftKings Shares Gain 8% After Report of Poss...,draftkings shares gain report possible espn pa...,Positive
