In [None]:
from datetime import datetime, time, timedelta

import pandas as pd

from ib_async.ib import IB
from ib_async.contract import Stock
from ib_async import util

from tqdm.auto import tqdm

### Connect to Interactive Brokers (IB)


In [None]:
# Enable ib_async's event-loop integration for the notebook before connecting.
util.startLoop()
# Shared IB client; the price-fetching helpers below use this module-level `ib`.
ib = IB()
# NOTE(review): 4001 is presumably the local IB Gateway API port — confirm it
# matches the running gateway/TWS configuration.
ib.connect("127.0.0.1", 4001, clientId=1)

In [13]:
# Load the validated news items. Later cells read the `ticker`,
# `published_eastern`, `labels` and `topic` columns of this frame (or of the
# processed checkpoint that may replace it further down).
news_df = pd.read_csv(
    "/Users/akseljoonas/Documents/news-sentiment/data/raw/news_validated-new.csv"
)

### Import prices

In [14]:
def get_day_price(ticker, start_time):
    """Fetch 20-minute TRADES bars around the calendar day of ``start_time``.

    Args:
        ticker: Symbol used to build a ``Stock(ticker, "SMART", "USD")`` contract.
        start_time: Timestamp string in ``"%Y-%m-%d %H:%M:%S%z"`` format; only
            its date part is used.

    Returns:
        A DataFrame of bars (regular and extended hours), or an empty
        DataFrame when the contract cannot be qualified or no bars come back.
    """
    contract = Stock(ticker, "SMART", "USD")
    qualified = ib.qualifyContracts(contract)
    if not qualified:
        return pd.DataFrame()

    news_day = datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S%z").date()
    window_end = datetime.combine(news_day, time(23, 59, 59))

    # First try a 2-day window ending on the news day; if that yields nothing,
    # retry once with a 3-day window ending one day later.
    attempts = (
        (timedelta(0), "2 D"),
        (timedelta(days=1), "3 D"),
    )
    for shift, duration in attempts:
        raw_bars = ib.reqHistoricalData(
            contract,
            endDateTime=(window_end + shift).strftime("%Y%m%d %H:%M:%S"),
            durationStr=duration,
            barSizeSetting="20 mins",
            whatToShow="TRADES",
            useRTH=False,
        )
        bars = util.df(raw_bars)
        if bars is not None and not bars.empty:
            return bars

    return pd.DataFrame()

In [15]:
import yfinance as yf


def get_yf_data(ticker, start_date):
    """Look up share-count and short-interest fields for ``ticker`` via yfinance.

    Args:
        ticker: Symbol passed to ``yf.Ticker``.
        start_date: Unused; kept so existing call sites keep working.

    Returns:
        ``(float_shares, short_interest)``. Each element is ``None`` when the
        corresponding key is missing, and both are ``None`` when the lookup
        itself fails.
    """
    try:
        # `.info` is the attribute access that performs the lookup, so it has
        # to sit inside the guard together with the Ticker construction.
        info = yf.Ticker(ticker).info
    except Exception:
        return None, None
    if info is None:
        return None, None

    # NOTE(review): "sharesOutstanding" is returned as the float and
    # "dateShortInterest" as the short interest. The latter looks like a
    # report date rather than an amount — confirm the intended keys.
    float_shares = info.get("sharesOutstanding")
    short_interest = info.get("dateShortInterest")

    return float_shares, short_interest

In [16]:
def get_avg_volume(ticker, start_time):
    """Return the mean daily volume over the 10 days of bars ending at ``start_time``.

    Args:
        ticker: Symbol used to build a ``Stock(ticker, "SMART", "USD")`` contract.
        start_time: Object with ``strftime`` marking the end of the window.

    Returns:
        The mean of the ``volume`` column truncated to ``int``, or ``0`` when
        the request returns no bars.
    """
    daily_bars = ib.reqHistoricalData(
        Stock(ticker, "SMART", "USD"),
        endDateTime=start_time.strftime("%Y%m%d %H:%M:%S"),
        durationStr="10 D",
        barSizeSetting="1 day",
        whatToShow="TRADES",
        useRTH=False,
    )

    if daily_bars:
        volumes = util.df(daily_bars)["volume"]
        return int(volumes.mean())
    return 0

In [17]:
def get_ibb_trend(start_time):
    """Classify the direction of IBB over the 10 days of bars ending at ``start_time``.

    Args:
        start_time: Object with ``strftime`` marking the end of the window.

    Returns:
        ``1`` if the last daily close is above the first daily close in the
        window, ``0`` otherwise, or ``None`` when no bars are available.
    """
    contract = Stock("IBB", "SMART", "USD")
    raw_bars = ib.reqHistoricalData(
        contract,
        endDateTime=start_time.strftime("%Y%m%d %H:%M:%S"),
        durationStr="10 D",
        barSizeSetting="1 day",
        whatToShow="TRADES",
        useRTH=False,
    )
    bar_data = util.df(raw_bars)

    # util.df yields None (not an empty frame) for an empty bar list, so both
    # cases have to be treated as "no data".
    if bar_data is None or bar_data.empty:
        return None

    first_price = bar_data.iloc[0]["close"]
    last_price = bar_data.iloc[-1]["close"]

    # A flat window counts as "not positive".
    return 1 if last_price > first_price else 0

In [18]:
def get_times(published_time):
    """Map a publication timestamp to the session window it belongs to.

    The time of day is compared against 09:30 and 16:00 expressed in the
    timestamp's own ``tzinfo``; no timezone conversion is performed.

    Args:
        published_time: ``datetime``/``pd.Timestamp`` of the news item.

    Returns:
        ``(start_time, end_time)``:
        * before 09:30  -> previous day 16:00 to same day 09:30;
        * 09:30-16:00   -> ``(published_time, published_time)``;
        * 16:00 onwards -> same day 16:00 to next day 09:30.
    """
    tzinfo = published_time.tzinfo
    market_start = time(9, 30, tzinfo=tzinfo)
    market_end = time(16, 0, tzinfo=tzinfo)

    time_tz = published_time.time().replace(tzinfo=tzinfo)

    # The three ranges partition the whole day, so the final branch also covers
    # the last second before midnight (which an exclusive 23:59:59 bound would
    # reject).
    # NOTE(review): the "previous"/"next" day is plain calendar arithmetic;
    # weekends and holidays are not skipped.
    if time_tz < market_start:
        prev_date = published_time - timedelta(days=1)
        start_time = pd.to_datetime(datetime.combine(prev_date, market_end))
        end_time = pd.to_datetime(datetime.combine(published_time, market_start))
    elif time_tz < market_end:
        start_time = published_time
        end_time = published_time
    else:
        next_date = published_time + timedelta(days=1)
        start_time = pd.to_datetime(datetime.combine(published_time, market_end))
        end_time = pd.to_datetime(datetime.combine(next_date, market_start))

    return start_time, end_time

In [19]:
import yfinance as yf


def get_market_cap(ticker: str) -> float | None:

    try:
        stock = yf.Ticker(ticker)
        market_cap = stock.info["marketCap"]
        # Convert to millions and format with 2 decimal places
        market_cap_millions = round(market_cap / 1_000_000, 2)
        return market_cap_millions
    except Exception:   
        return None



In [20]:
def format_bars(df, target_time, ticker):
    """Aggregate the bars around a news timestamp and derive a 3-class label.

    The window runs from one minute before the start returned by
    ``get_times(target_time)`` to eight hours after its end.

    Args:
        df: Bar frame with ``date``, ``open``, ``high``, ``low``, ``close``,
            ``average``, ``volume`` and ``barCount`` columns.
        target_time: Publication time; parsed with ``pd.to_datetime``.
        ticker: Symbol used to look up the market cap for the label threshold.

    Returns:
        Dict with keys ``open``, ``high``, ``low``, ``close``, ``average``,
        ``volume``, ``trade_count``, ``vwap``, ``price_move`` and ``labels``.
        All values are ``None`` when no bar falls inside the window.
        ``labels`` is 2 (up), 0 (down) or 1 (within the threshold).
    """
    result_keys = (
        "open",
        "high",
        "low",
        "close",
        "average",
        "volume",
        "trade_count",
        "vwap",
        "price_move",
        "labels",
    )

    target_time = pd.to_datetime(target_time)
    session_start, session_end = get_times(target_time)

    window_start = session_start - timedelta(minutes=1)
    window_end = session_end + timedelta(hours=8)

    in_window = (df["date"] >= window_start) & (df["date"] <= window_end)
    window = df[in_window]

    if window.empty:
        return dict.fromkeys(result_keys)

    # Disabled features (relative daily / 5-minute volume, float rotation,
    # short interest, index trend) used to be sketched here; the helpers
    # get_yf_data, get_avg_volume and get_ibb_trend remain available for them.

    first_close = window["close"].iloc[0]
    first_open = window["open"].iloc[0]
    last_close = window["close"].iloc[-1]
    max_high = window["high"].max()
    min_low = window["low"].min()
    mean_average = window["average"].mean()
    total_volume = window["volume"].sum()
    total_trades = window["barCount"].sum()

    price_move = (last_close - first_close) / first_close

    market_cap = get_market_cap(ticker)
    if market_cap is None:
        market_cap = 0

    # (market-cap floor in millions, move threshold); larger caps need a
    # smaller relative move to count as up/down.
    cap_tiers = ((10000, 0.01), (2000, 0.02), (250, 0.03))
    threshold = next(
        (limit for floor, limit in cap_tiers if market_cap > floor),
        0.04,
    )

    if price_move > threshold:
        label = 2
    elif price_move < -threshold:
        label = 0
    else:
        label = 1

    # Volume-weighted price over the window, using each bar's average price.
    price_volume = window["average"] * window["volume"]
    window_vwap = sum(price_volume) / total_volume

    values = (
        first_open,
        max_high,
        min_low,
        last_close,
        mean_average,
        total_volume,
        total_trades,
        window_vwap,
        price_move,
        label,
    )
    return dict(zip(result_keys, values))

In [23]:
import math


def add_stock_data(
    df,
    checkpoint_path="/Users/akseljoonas/Documents/news-sentiment/data/processed/news+prices-new.csv",
    checkpoint_every=100,
):
    """Fill price features and labels into ``df`` for every unlabelled row.

    Rows that already carry a non-null ``labels`` value are skipped, so the
    function can resume from a previously saved checkpoint. A frame without a
    ``labels`` column is treated as entirely unlabelled.

    Args:
        df: News frame with ``ticker`` and ``published_eastern`` columns.
            Modified in place.
        checkpoint_path: CSV file the frame is written to periodically.
        checkpoint_every: A checkpoint is written after processing a row whose
            index is a multiple of this value.

    Returns:
        The same ``df`` object, with the columns returned by ``format_bars``
        filled in where bars were available.
    """
    print(len(df))
    for index, row in tqdm(df.iterrows(), total=len(df)):
        # pd.isna copes with NaN, None and (via .get) a missing column,
        # unlike math.isnan on a direct row["labels"] lookup.
        if not pd.isna(row.get("labels")):
            continue

        ticker = row["ticker"]
        start_time = row["published_eastern"]

        day_df = get_day_price(ticker, start_time)

        if not day_df.empty:
            print(f"{index} {ticker}")
            stock_info = format_bars(day_df, start_time, ticker)

            # Write field by field so columns that do not exist yet are created.
            for col, value in stock_info.items():
                df.loc[index, col] = value

        if index % checkpoint_every == 0:
            df.to_csv(checkpoint_path, index=False)

    return df

In [None]:
import os


# Resume from the processed checkpoint when one exists, so rows that already
# have a label are not fetched again.
processed_csv = (
    "/Users/akseljoonas/Documents/news-sentiment/data/processed/news+prices-new.csv"
)
if os.path.exists(processed_csv):
    print("File exists")
    news_df = pd.read_csv(processed_csv)

stock_df = add_stock_data(news_df)

# Keep only rows that received a label. This drops in place, and `stock_df`
# is the same object as `news_df`.
stock_df.dropna(subset=["labels"], axis=0, inplace=True)

# Normalize topic names: lowercase, spaces and slashes become underscores,
# ampersands and apostrophes are removed.
normalized_topic = stock_df["topic"].str.lower()
for raw_char, replacement in ((" ", "_"), ("/", "_"), ("&", ""), ("'", "")):
    normalized_topic = normalized_topic.str.replace(raw_char, replacement)
stock_df["topic"] = normalized_topic

In [None]:
# Sanity check: summary statistics of the labelled rows.
stock_df.describe()

In [26]:
# Persist the final frame (unlabelled rows dropped, topics normalized) to the
# same file the enrichment loop uses for its checkpoints.
stock_df.to_csv(
    "/Users/akseljoonas/Documents/news-sentiment/data/processed/news+prices-new.csv",
    index=False,
)