In [None]:
#Author: Yiying Jiao
#Note on step 4: adjusted prices from Yahoo Finance are dynamically updated,
#so step 4 needs to be rerun from time to time to pick up the latest values

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
#Special case: the new data probably covers only a short period of time,
#which is too short for metrics like MACD to capture trends.
#So we append it to the whole year of 2025 and analyze all the dates together.
#meta_df=pd.concat([prev_df,curr_df])
#early_df = pd.read_csv("24_20250304_probs_df.csv")
#early_df = early_df.drop(['sentimen_probs'], axis=1)
#early_df['sentiment_score']=early_df['pos']-early_df['neg']
#late_df = pd.read_csv("part2_probs_df.csv")
#combo_df = pd.concat([early_df, late_df], ignore_index=True)

#last_df = pd.read_csv("20250405-20250512_probs_df.csv")
#meta_df = pd.concat([combo_df, last_df], ignore_index=True)
#meta_df.to_csv("/Users/jiao/projects/24_250509_may_25_df.csv", index=False)

In [3]:
# Collapse the per-row sentiment file into one averaged row per date.
# numeric_only=True restricts the mean to numeric columns, so any text column
# in the CSV cannot make the aggregation raise on newer pandas versions.
# as_index=False keeps 'date' as a regular column (no separate reset_index needed).
senti_df = pd.read_csv("24_250509_may_25_df.csv")
senti_df = senti_df.groupby('date', as_index=False).mean(numeric_only=True)
print(senti_df)

           date       Close        High         Low        Open      Volume  \
0    2024-01-02  184.532074  187.315366  182.792518  186.033057  82488700.0   
1    2024-01-03  183.150391  184.770668  182.335277  183.120571  58414500.0   
2    2024-01-04  180.824356  181.997307  179.800504  181.062914  71983600.0   
3    2024-01-05  180.098709  181.669281  179.094742  180.903888  62303300.0   
4    2024-01-08  184.452560  184.492330  180.416793  181.003268  59144500.0   
..          ...         ...         ...         ...         ...         ...   
334  2025-05-05  198.889999  204.100006  198.210007  203.100006  69018500.0   
335  2025-05-06  198.509995  200.649994  197.020004  198.210007  51216500.0   
336  2025-05-07  196.250000  199.440002  193.250000  199.169998  68536700.0   
337  2025-05-08  197.490005  200.050003  194.679993  197.720001  50478900.0   
338  2025-05-09  198.529999  200.539993  197.539993  199.000000  36415700.0   

          pos      neut       neg  sentiment_score 

In [4]:
# Get the most up-to-date adjusted prices from Yahoo Finance.
# Note: adjusted prices are dynamically updated, so this cell needs to be
# rerun from time to time.
import yfinance as yf

# Define the ticker and date range
ticker = "AAPL"
start_date = "2024-01-01"
end_date = "2025-05-10"  # NOTE(review): yfinance documents `end` as exclusive - confirm the last row is 2025-05-09

# Download the data. auto_adjust is passed explicitly: the library changed its
# default to True (see the warning in the recorded output), and pinning it keeps
# the price columns adjusted regardless of the installed yfinance version.
prices_df = yf.download(ticker, start=start_date, end=end_date, auto_adjust=True)

# Fail loudly here rather than letting an empty frame flow into the merge below.
if prices_df.empty:
    raise RuntimeError(
        f"yfinance returned no rows for {ticker} between {start_date} and {end_date}"
    )

# Flatten the column index to its top level (the field names) when the download
# comes back with multi-level columns, then expose the date index as a 'date' column.
if isinstance(prices_df.columns, pd.MultiIndex):
    prices_df.columns = prices_df.columns.get_level_values(0)
prices_df = prices_df.reset_index()
prices_df = prices_df.rename(columns={'Date': 'date'})

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


In [5]:
# Prepare both frames for the merge: give the 'date' key the same datetime
# dtype on each side.
# `unit` is intentionally not passed: it only describes numeric epoch values,
# whereas these columns hold timestamps / date strings.
prices_df['date'] = pd.to_datetime(prices_df['date'])
senti_df['date'] = pd.to_datetime(senti_df['date'])

# The sentiment frame carries its own copy of the price columns; drop them so
# the merged frame uses only the freshly downloaded prices.
# errors='ignore' makes the cell safe to re-run after the columns are gone.
senti_df = senti_df.drop(columns=['Close', 'High', 'Low', 'Open', 'Volume'], errors='ignore')


In [6]:
# Join prices and daily sentiment on the date key, keeping only dates present
# in both frames (inner join, same as the pd.merge default).
# validate='one_to_one' raises if either side has duplicate dates, which would
# otherwise silently multiply rows.
# The result is put in chronological order with a fresh 0..n-1 index because
# the rolling / shift / lag features computed afterwards are position-based.
meta_df = (
    pd.merge(prices_df, senti_df, on='date', how='inner', validate='one_to_one')
    .sort_values('date')
    .reset_index(drop=True)
)

In [7]:
# Feature engineering: sentiment features
# Both statistics share one trailing 3-row window over the daily sentiment
# score; the first two rows have no full window and therefore come out as NaN.
sentiment_window = meta_df["sentiment_score"].rolling(window=3)
meta_df["rolling_sentiment"] = sentiment_window.mean()
meta_df["sentiment_volatility"] = sentiment_window.std()


In [8]:
# Feature engineering: stock indicator features, all derived from Close
close_prices = meta_df["Close"]
meta_df["price_change"] = close_prices.pct_change()   # row-over-row fractional change
meta_df["volatility"] = close_prices.rolling(5).std()  # std over a trailing 5-row window
meta_df["momentum"] = close_prices.diff(5)             # 5-day momentum: Close minus Close 5 rows earlier


In [9]:
# Technical indicators computed with pandas_ta on the Close column:
#   rsi  - Relative Strength Index (length 14), detects overbought/oversold conditions
#   macd - the "MACD_12_26_9" column of the MACD output, tracks trend strength
#   ema  - Exponential Moving Average (length 10), smooths price trends

import pandas_ta as ta

close_series = meta_df["Close"]
macd_frame = ta.macd(close_series)

meta_df["rsi"] = ta.rsi(close_series, length=14)
meta_df["macd"] = macd_frame["MACD_12_26_9"]
meta_df["ema"] = ta.ema(close_series, length=10)



        date       Close        High         Low        Open    Volume  \
0 2024-01-02  184.290421  187.070068  182.553143  185.789438  82488700   
1 2024-01-03  182.910538  184.528693  182.096492  182.880757  58414500   
2 2024-01-04  180.587524  181.758939  179.565014  180.825770  71983600   
3 2024-01-05  179.862839  181.431354  178.860187  180.666963  62303300   
4 2024-01-08  184.210999  184.250716  180.180517  180.766224  59144500   

        pos      neut       neg  sentiment_score  rolling_sentiment  \
0  0.714635  0.052634  0.232731         0.481904                NaN   
1  0.778517  0.057405  0.164078         0.614440                NaN   
2  0.848109  0.103794  0.048097         0.800012           0.632119   
3  0.841648  0.080877  0.077475         0.764172           0.726208   
4  0.817603  0.083643  0.098754         0.718849           0.761011   

   sentiment_volatility  price_change  volatility  momentum  
0                   NaN           NaN         NaN       NaN  
1   

In [10]:
# Lagged copies of the sentiment score and the close price for the previous
# 1..5 rows. Columns are created in the order
# sentiment_lag_1, price_lag_1, sentiment_lag_2, price_lag_2, ...
lag_sources = (("sentiment_lag", "sentiment_score"), ("price_lag", "Close"))
for days_back in range(1, 6):
    for prefix, source_col in lag_sources:
        meta_df[f"{prefix}_{days_back}"] = meta_df[source_col].shift(days_back)


In [11]:
# Target construction.
# next_day_return on row t is the fractional change of Close from row t to row t+1
# (pct_change shifted back by one row), so the final row has no value (NaN).
meta_df["next_day_return"] = meta_df["Close"].pct_change().shift(-1)

# Binary label, up or down:
#   1 -> stock goes up tomorrow (next_day_return > 0)
#   0 -> stock does not go up tomorrow (next_day_return <= 0)
#   <NA> -> tomorrow's return is unknown (the last row)
# A plain `(x > 0).astype(int)` would turn the NaN of the last row into 0 and
# record a "down" day that was never observed. The nullable Int64 dtype keeps
# that row missing, so it can be dropped before training.
has_next_return = meta_df["next_day_return"].notna()
meta_df["price_direction"] = (
    (meta_df["next_day_return"] > 0).astype("Int64").where(has_next_return)
)


In [12]:
# Write the engineered feature table to CSV without the row index.
# The target directory defaults to the original location; set the
# FEATURES_OUTPUT_DIR environment variable to write elsewhere (e.g. when the
# notebook is run on another machine).
output_dir = os.environ.get("FEATURES_OUTPUT_DIR", "/Users/jiao/projects")
meta_df.to_csv(os.path.join(output_dir, "24_250508_may_25_featuresdf.csv"), index=False)