In [104]:
##### Importing the necessary libraries #####
import os
import time

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from openai import OpenAI
from pydantic import BaseModel

In [179]:
### Load only the first 1,000,000 rows of All_external.csv (nrows caps the read)
### This contains the links to the external websites
df = pd.read_csv('./data/All_external.csv', nrows=1_000_000)

In [181]:
### Parse the 'Date' column into pandas datetime values
df['Date'] = df['Date'].pipe(pd.to_datetime)


In [183]:
#### Keep only the news rows for Apple.
#### Apple's ticker symbol is 'AAPL' (not 'APPL'), consistent with the AAPL.csv
#### price history loaded further down in this notebook.
is_apple_row = df['Stock_symbol'] == 'AAPL'
df_appl = df[is_apple_row].reset_index(drop=True)

In [216]:
### Load the AAPL price-history CSV (its 'date' and 'high' columns are read later on)
price_history_path = "./data/full_history/AAPL.csv"
df_a = pd.read_csv(price_history_path)

In [185]:
#### Function to get the content of a website
def get_website_content(url, timeout=10):
    """Download a web page and return its readable text.

    Extraction is attempted in this order:
      1. the text inside ``<div class="article-content-body">``,
      2. the text of every ``<p>`` tag,
      3. all text of ``<body>`` (or of the whole document if it has no body).

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so that one
            unresponsive site cannot stall a long scraping loop.

    Returns:
        The extracted text as a single space-joined string, or ``None`` when
        the request fails (connection error, timeout, HTTP error status, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching website content: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Preferred source: the div with class "article-content-body"
    article_content = soup.find('div', class_='article-content-body')
    if article_content:
        return ' '.join(article_content.stripped_strings)

    # Fallback 1: the text of all <p> tags
    paragraphs = soup.find_all('p')
    if paragraphs:
        return ' '.join([p.get_text() for p in paragraphs])

    # Fallback 2: every string in <body>; soup.body is None for documents
    # without a body tag, in which case the whole document is used instead
    root = soup.body if soup.body is not None else soup
    return ' '.join(root.stripped_strings)

In [None]:
#### Download the article text for each row, starting at row 1000
#### (rows 0-999 keep the empty-string placeholder set on the next line)
#### NOTE(review): the reason for starting at 1000 is not visible here - confirm
df_appl['Content'] = ""
content_position = df_appl.columns.get_loc('Content')
for row_number in range(1000, len(df_appl)):
    article_text = get_website_content(df_appl['Url'][row_number])
    df_appl.iloc[row_number, content_position] = article_text
    # wait one second before the next request
    time.sleep(1)

In [None]:
#####  Filter the data to only include the rows where the content is not empty
#### The mask is assigned back to df_appl; a bare filter expression on its own
#### line would compute the result and throw it away.
has_content = df_appl['Content'].notna() & (df_appl['Content'] != "")
df_appl = df_appl[has_content]
#### Ensure that the content contains the substring 'appl' (case-insensitive)
df_appl = df_appl[df_appl['Content'].str.lower().str.contains("appl")].reset_index(drop=True)

In [89]:
##### Write the selected article columns to a csv file (without the index)
export_columns = ['Date','Article_title','Content','Url']
df_appl[export_columns].to_csv('./data/AAPL_articles.csv', index=False)

In [217]:
#### Read the saved articles back in and derive a calendar-date column
df_appl = pd.read_csv('./data/AAPL_articles.csv')
article_timestamps = pd.to_datetime(df_appl['Date'])
df_appl['Date_Only'] = article_timestamps.dt.date

In [295]:
### Put both join keys on the same datetime64 dtype before merging.
### 'Date_Only' is built with .dt.date, i.e. plain Python date objects in an
### object column; pandas raises a ValueError when asked to merge an object
### key with a datetime64 key, so it is converted as well.
df_a['date'] = pd.to_datetime(df_a['date'])
df_appl['Date_Only'] = pd.to_datetime(df_appl['Date_Only'])
### Merge the two dataframes (default inner join: rows without a match are dropped)
df_appl = pd.merge(df_appl, df_a, left_on='Date_Only', right_on='date')

In [None]:
### Plot the 'high' column against the date
price_dates = df_a['date']
price_highs = df_a['high']
plt.plot(price_dates, price_highs)

In [225]:
#### System prompt sent along with every article: an instruction line followed by
#### 14 Yes/No questions about the article.
#### NOTE(review): the FinancialAnalysis response model in this notebook declares
#### 13 boolean fields, one fewer than the questions listed here - confirm the
#### intended question-to-field mapping.
system_prompt = '''
Analyze the article and answer the following questions based on the content:
Are there indications that recent or upcoming policy decisions could support market growth? (Yes/No)
Do statements from central banks suggest optimism about the economic outlook? (Yes/No)
Are there emerging trends or patterns that suggest a shift in market sentiment? (Yes/No)
Is there evidence of key technical levels acting as support for major indices? (Yes/No)
Are certain sectors or industries showing stronger performance compared to others? (Yes/No)
Do shifts in investor interest suggest a move toward specific sectors, such as technology or energy? (Yes/No)
Do recent economic data releases (e.g., employment, inflation, consumer sentiment) point toward growth? (Yes/No)
Are any indicators flashing signals that typically correlate with significant market moves (e.g., yield curves, commodity prices)? (Yes/No)
Is there evidence of a “risk-on” approach among investors? (Yes/No)
Do recent market movements suggest increased interest in safe-haven assets like gold or bonds? (Yes/No)
Are there global or geopolitical events mentioned that could influence market volatility? (Yes/No)
Could changes in international markets or currencies impact domestic market trends? (Yes/No)
Are recent corporate earnings or business announcements likely to influence market sentiment? (Yes/No)
Do specific companies or sectors appear to be driving recent market gains? (Yes/No)
'''

In [226]:

# Paste a key here for a quick local run, or leave it empty so the client falls
# back to the OPENAI_API_KEY environment variable (keeps the secret out of the notebook).
OPENAI_API_KEY = "" # Add your OpenAI API key here
# The OpenAI constructor accepts keyword arguments only, so the key has to be
# passed as api_key=...; an empty placeholder is turned into None to enable the
# environment-variable fallback.
client = OpenAI(api_key=OPENAI_API_KEY or None)
#### Structure the response
# Pydantic model passed as response_format to the chat-completion call, so the
# model's answer is returned as one boolean per field below.
# The field names appear to mirror the Yes/No questions of system_prompt, but
# nothing in the code ties a field to a specific question - the mapping is by
# name only (NOTE(review): 13 fields here vs. 14 questions in the prompt).
# Field order matters: the plotting cell takes the fields in declaration order.
class FinancialAnalysis(BaseModel):
    support_growth: bool
    optimism: bool
    support_indices: bool
    strong_sectors: bool
    investor_interest: bool
    economic_data: bool
    significant_moves: bool
    risk_approach: bool
    safe_haven_assets: bool
    global_events: bool
    international_impact: bool
    market_sentiment: bool
    driving_factors: bool

#### Function to analyze the article
def analyze_article(article_content: str):
    """Ask the chat model the system_prompt questions about one article.

    The article text is sent as the user message and ``system_prompt`` as the
    system message; ``FinancialAnalysis`` is supplied as the response format.

    Args:
        article_content: Full text of the article to analyse.

    Returns:
        The completion object returned by ``client.beta.chat.completions.parse``
        (the whole response, not a ``FinancialAnalysis`` instance). A later cell
        reads ``.dict()['choices'][0]['message']['content']`` from it, so the
        return value is intentionally left as the raw completion.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": article_content},
    ]
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini-2024-07-18",
        messages=messages,
        response_format=FinancialAnalysis,
    )
    return completion
#### Run the analysis on every article (one API call per row)
df_appl['Completition'] = df_appl['Content'].apply(analyze_article)
#### Keep the date and the raw completion, plus the 'close' price when it is
#### present: later cells compute the price ratio from df_appl['close'] and
#### would fail with a KeyError if the column were dropped here.
columns_to_keep = ['Date', 'Completition']
if 'close' in df_appl.columns:
    columns_to_keep.insert(1, 'close')
df_appl = df_appl[columns_to_keep]

In [290]:
##### Decode the JSON text of the first choice of every completion into a dict
import json
raw_answers = df_appl['Completition'].apply(
    lambda completion: completion.dict()['choices'][0]['message']['content']
)
df_appl['Completition_JSON'] = raw_answers.apply(json.loads)

In [291]:
#### Expand each answer dict into its own columns and append them to df_appl
answer_columns = df_appl['Completition_JSON'].apply(pd.Series)
df_appl = pd.concat([df_appl, answer_columns], axis=1)

In [297]:
### Close price of each row divided by the close price of the row before it
### NOTE(review): shift(1) moves by one row of df_appl; this equals "the previous
### day" only if df_appl has exactly one row per trading day in date order - confirm
previous_close = df_appl['close'].shift(1)
df_appl['ratio'] = df_appl['close'] / previous_close

In [298]:
### Parse the 'Date' column into pandas datetime values (used as the x axis below)
parsed_dates = pd.to_datetime(df_appl['Date'])
df_appl['Date'] = parsed_dates

In [299]:
### Row-to-row change of the ratio (current value minus the previous row's value)
df_appl['ratio_diff'] = df_appl['ratio'].diff()

In [None]:
### Plot the ratio difference together with two of the yes/no answers over time
fig, ax = plt.subplots()

# (column in df_appl, legend label) for each line drawn on the shared axes
plotted_series = [
    ('ratio_diff', 'AAPL'),
    ('support_growth', 'support_growth'),
    ('optimism', 'optimism'),
]
for column, legend_label in plotted_series:
    ax.plot(df_appl['Date'], df_appl[column], label=legend_label)
ax.set_ylabel('Close ratio')
ax.set_xlabel('Date')
# The label= values are only shown once a legend is drawn
ax.legend()

In [None]:
### replace the NaN values with 0
### Assign the result back instead of calling fillna(inplace=True) on the selected
### column: with pandas copy-on-write the in-place call acts on a temporary
### object and leaves df_appl unchanged.
df_appl['ratio_diff'] = df_appl['ratio_diff'].fillna(0)

In [None]:
##### Main plot: one box plot of 'ratio_diff' per yes/no feature

### Apply the seaborn theme first: sns.set() also writes font-related rcParams,
### so calling it after the update below would overwrite the chosen font size.
sns.set(style="darkgrid")
## use tex (requires a working LaTeX installation)
matplotlib.rcParams.update({'text.usetex': True, 'font.size': 14})
### replace _ with space and capitalise every word
clean_label = lambda x: x.replace('_', ' ').title()

n_rows, n_cols = 3, 4
fig, ax = plt.subplots(n_rows, n_cols, figsize=(20,14))
### Field names of the response model, in declaration order
list_features = list(FinancialAnalysis.__annotations__)
### ax.flat walks the grid row by row; zip stops at the shorter input, so with
### more fields than panels (currently 13 fields, 12 panels) the trailing
### fields are not drawn.
for panel, feature in zip(ax.flat, list_features):
    # order= pins False to tick 0 and True to tick 1, so the No/Yes tick labels
    # stay correct even when a feature only ever takes one of the two values
    sns.boxplot(y='ratio_diff', x=feature, data=df_appl, ax=panel, order=[False, True])
    panel.set_xlabel(clean_label(feature), fontsize=16)
    panel.set_ylabel('', fontsize=12)
    panel.set_xticks([0,1])
    panel.set_xticklabels(['No','Yes'], fontsize=16)
### set bounding box (leave the top 5% of the figure free for the title)
plt.tight_layout(rect=[0, 0, 1, 0.95])
ax[1,0].set_ylabel('Difference of Ratio of Close Prices',fontsize=24)
### figure title
plt.suptitle('Effect of Different Features Obtained from LLMA on the Ratio of Close Prices', fontsize=24)
### make sure the output folder exists, otherwise savefig fails
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/LLMA_features_AAPL.pdf',bbox_inches='tight')
plt.show()
