In [42]:
from datetime import datetime, time, timedelta

import pandas as pd

from ib_async.ib import IB
from ib_async.contract import Stock
from ib_async import util

from tqdm.auto import tqdm

### Connect IB


In [None]:
# Start ib_async's event-loop integration first, then open the API session
# that every later cell reuses through the module-level `ib` object.
util.startLoop()

# Connection settings of the locally running IB API endpoint.
ib_host = "127.0.0.1"
ib_port = 4001
ib_client_id = 1

ib = IB()
ib.connect(ib_host, ib_port, clientId=ib_client_id)

In [None]:
# Load the news dataset; later cells read its "ticker", "published_gmt" and
# "topic" columns.
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
news_df = pd.read_csv("/Users/akseljoonas/Documents/predtrade/news_data_nov.csv")

### Validate tickers


In [None]:
import yaml
def load_config(sector = 'biotech'):
    """Parse ``<sector>.yaml`` and return its content.

    Args:
        sector: Base name of the YAML file; ``.yaml`` is appended and the
            path is resolved relative to the current working directory.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file. The validation
        cell in this notebook iterates its keys as ticker symbols.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(f"{sector}.yaml", "r") as config_stream:
        return yaml.safe_load(config_stream)

In [None]:
# Ask IB to qualify a USD stock contract on SMART routing for every configured
# ticker, sorting each ticker into the valid or the rejected list.
rss_dict = load_config()
valid_tickers = []
rejected_tickers = []
for ticker in rss_dict.keys():
    contract = Stock(ticker, "SMART", "USD")
    qualified = ib.qualifyContracts(contract)
    (valid_tickers if qualified else rejected_tickers).append(ticker)

# Report the rejected tickers, then configured-vs-valid counts.
for rejected in rejected_tickers:
    print(rejected)
print(len(rss_dict), len(valid_tickers))

Remove rows whose ticker failed IB validation from `news_df`.


In [None]:
# Keep only rows whose ticker qualified with IB. The explicit .copy() makes
# news_df an independent frame, so the later in-place assignment to
# news_df["published_gmt"] does not write into a slice of the original frame
# (which pandas reports as SettingWithCopyWarning).
news_df = news_df[news_df["ticker"].isin(valid_tickers)].copy()


In [None]:
# Persist the ticker-filtered news (index column omitted).
news_df.to_csv("/Users/akseljoonas/Documents/predtrade/news_data_nov_validated.csv", index=False)

### Convert publication times to US/Eastern time

In [None]:
import pytz

utc = pytz.utc
eastern = pytz.timezone("US/Eastern")


def _to_eastern_string(raw_timestamp):
    """Convert one ISO-8601 timestamp string to a US/Eastern string.

    A timestamp without a UTC offset is taken to be UTC. Without this step
    ``datetime.astimezone`` treats a naive value as system-local time, so the
    result would depend on the machine running the notebook.

    Returns:
        The instant formatted as ``%Y-%m-%d %H:%M:%S%z`` in US/Eastern.
    """
    parsed = datetime.fromisoformat(raw_timestamp)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=utc)
    return parsed.astimezone(eastern).strftime("%Y-%m-%d %H:%M:%S%z")


# Convert 'published_gmt' from UTC to Eastern Time. The column keeps its name
# but holds Eastern-time strings from here on.
news_df["published_gmt"] = news_df["published_gmt"].apply(_to_eastern_string)
news_df.head()

In [None]:
# Overwrite the validated CSV so it carries the converted "published_gmt"
# strings (index column omitted).
news_df.to_csv(
    "/Users/akseljoonas/Documents/predtrade/news_data_nov_validated.csv", index=False
)

### Import prices

In [44]:
def get_day_price(ticker, start_time):
    """Fetch 1-minute TRADES bars (outside regular hours included) for a news day.

    Args:
        ticker: Stock symbol, requested as a USD contract on SMART routing.
        start_time: Timestamp string in ``%Y-%m-%d %H:%M:%S%z`` format; only
            its calendar date is used.

    Returns:
        DataFrame built by ``util.df`` from the IB bars, or an empty
        DataFrame when the contract cannot be qualified or both requests
        come back without data.
    """
    contract = Stock(ticker, "SMART", "USD")
    if not ib.qualifyContracts(contract):
        return pd.DataFrame()

    news_day = datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S%z").date()
    day_close = datetime.combine(news_day, time(23, 59, 59))

    # First ask for the 2 days ending on the news day; if that yields nothing,
    # retry with the end pushed one day later and a 3-day window.
    attempts = ((0, "2 D"), (1, "3 D"))
    for extra_days, duration in attempts:
        request_end = day_close + timedelta(days=extra_days)
        bars = util.df(
            ib.reqHistoricalData(
                contract,
                endDateTime=request_end.strftime("%Y%m%d %H:%M:%S"),
                durationStr=duration,
                barSizeSetting="1 min",
                whatToShow="TRADES",
                useRTH=False,
            )
        )
        if bars is not None and not bars.empty:
            return bars

    return pd.DataFrame()

In [45]:
import yfinance as yf


def get_yf_data(ticker, start_date):
    """Look up share-count and short-interest metadata for a ticker via yfinance.

    Args:
        ticker: Stock symbol passed to ``yf.Ticker``.
        start_date: Unused; kept so existing callers keep working.

    Returns:
        ``(float_shares, short_interest)``. Each element is ``None`` when the
        lookup fails or the corresponding key is missing.
    """
    try:
        # Read .info inside the try as well: constructing the Ticker alone is
        # not where a failed lookup surfaces (presumably .info performs the
        # actual request -- confirm against the yfinance docs).
        info = yf.Ticker(ticker).info
    except Exception:
        return None, None
    if info is None:
        return None, None

    # NOTE(review): "sharesOutstanding" is the total share count, not the
    # float, and "dateShortInterest" looks like the date of the short-interest
    # report rather than the short interest itself. yfinance also exposes
    # "floatShares" and "sharesShort" -- confirm which values are intended
    # before changing the keys, since rows already saved use the current ones.
    float_shares = info.get("sharesOutstanding")
    short_interest = info.get("dateShortInterest")

    return float_shares, short_interest

In [46]:
def get_avg_volume(ticker, start_time):
    """Average daily TRADES volume over the 10-day window ending at ``start_time``.

    Args:
        ticker: Stock symbol, requested as a USD contract on SMART routing.
        start_time: Object with ``strftime`` marking the end of the window.

    Returns:
        The mean of the daily ``volume`` values truncated to ``int``, or ``0``
        when IB returns no bars.
    """
    daily_bars = ib.reqHistoricalData(
        Stock(ticker, "SMART", "USD"),
        endDateTime=start_time.strftime("%Y%m%d %H:%M:%S"),
        durationStr="10 D",
        barSizeSetting="1 day",
        whatToShow="TRADES",
        useRTH=False,
    )

    if daily_bars:
        mean_volume = util.df(daily_bars)["volume"].mean()
        return int(mean_volume)
    return 0

In [47]:
def get_ibb_trend(start_time):
    """Direction of the IBB daily close over the 10-day window ending at ``start_time``.

    Args:
        start_time: Object with ``strftime`` marking the end of the window.

    Returns:
        ``1`` when the last daily close is above the first one, ``0``
        otherwise, and ``None`` when no bars are available.
    """
    contract = Stock("IBB", "SMART", "USD")
    bar_data = ib.reqHistoricalData(
        contract,
        endDateTime=start_time.strftime("%Y%m%d %H:%M:%S"),
        durationStr="10 D",
        barSizeSetting="1 day",
        whatToShow="TRADES",
        useRTH=False,
    )
    bar_data = util.df(bar_data)

    # util.df can hand back None instead of an empty frame (get_day_price
    # guards against the same case), so check for None before using .empty.
    if bar_data is None or bar_data.empty:
        return None  # No data available

    first_price = bar_data.iloc[0]["close"]
    last_price = bar_data.iloc[-1]["close"]

    # Return 1 if trend is positive, 0 otherwise
    return 1 if last_price > first_price else 0

In [48]:
def get_times(published_time):
    """Map a publication time to the window in which the market can react to it.

    The wall-clock time of ``published_time`` (in its own timezone) picks one
    of three cases:

    * before 09:30 -- previous calendar day 16:00 to same-day 09:30;
    * 09:30 up to but excluding 16:00 -- the publication time for both ends;
    * 16:00 or later -- same-day 16:00 to next calendar day 09:30.

    Every time of day falls into exactly one case, including midnight and the
    last second of the day.

    NOTE(review): "previous"/"next" day are plain calendar days; weekends and
    market holidays are not skipped.

    Args:
        published_time: ``datetime``-like value (e.g. ``pd.Timestamp``),
            naive or timezone-aware.

    Returns:
        ``(start_time, end_time)``; pandas timestamps for the pre-market and
        after-hours cases, ``published_time`` itself during market hours.
    """
    tzinfo = published_time.tzinfo
    market_open = time(9, 30, tzinfo=tzinfo)
    market_close = time(16, 0, tzinfo=tzinfo)

    prev_day = published_time - timedelta(days=1)
    next_day = published_time + timedelta(days=1)

    # Time of day with the same tzinfo as the boundaries, so the comparisons
    # below are between like-for-like values.
    clock = published_time.time().replace(tzinfo=tzinfo)

    if clock < market_open:
        start_time = pd.to_datetime(datetime.combine(prev_day, market_close))
        end_time = pd.to_datetime(datetime.combine(published_time, market_open))
    elif clock < market_close:
        start_time = published_time
        end_time = published_time
    else:
        start_time = pd.to_datetime(datetime.combine(published_time, market_close))
        end_time = pd.to_datetime(datetime.combine(next_day, market_open))

    return start_time, end_time

In [49]:
# Keys of the feature dict returned by format_bars, in output order.
_BAR_FEATURE_KEYS = (
    "open",
    "high",
    "low",
    "close",
    "average",
    "volume",
    "trade_count",
    "vwap",
    "relative_volume_daily",
    "relative_volume_5m",
    "float_rotation",
    "short_interest",
    "index_trend",
    "price_move",
    "increased_3_percent",
)


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or None if the denominator is missing, NaN or zero."""
    if denominator is None or pd.isna(denominator) or denominator == 0:
        return None
    return numerator / denominator


def format_bars(df, target_time, ticker):
    """Aggregate the minute bars around a news item into one feature dict.

    The window comes from ``get_times(target_time)``, widened by one minute
    before its start and five minutes after its end.

    Ratios whose denominator is unavailable (no float share count, zero
    average volume, no earlier bars, zero traded volume) are reported as
    ``None`` instead of raising or producing inf/NaN.

    Args:
        df: Bars with ``date``, ``open``, ``high``, ``low``, ``close``,
            ``average``, ``volume`` and ``barCount`` columns.
        target_time: Publication time; anything ``pd.to_datetime`` accepts.
        ticker: Stock symbol used for the metadata and average-volume lookups.

    Returns:
        Dict keyed by ``_BAR_FEATURE_KEYS``; every value is ``None`` when no
        bar falls inside the window.
    """
    target_time = pd.to_datetime(target_time)

    start_time, end_time = get_times(target_time)

    start_time = start_time - timedelta(minutes=1)
    end_time = end_time + timedelta(minutes=5)

    print(target_time)
    print(start_time)
    print(end_time)

    day_df = df
    daily_volume = day_df["volume"].sum()

    # Bars inside the reaction window
    window = day_df[(day_df["date"] >= start_time) & (day_df["date"] <= end_time)]
    if window.empty:
        return dict.fromkeys(_BAR_FEATURE_KEYS)

    float_shares, short_interest = get_yf_data(ticker, start_time)
    avg_daily_volume = get_avg_volume(ticker, start_time)
    index_trend = get_ibb_trend(start_time)

    first_close = window["close"].iloc[0]
    first_open = window["open"].iloc[0]
    last_close = window["close"].iloc[-1]
    max_high = window["high"].max()
    min_low = window["low"].min()
    average = window["average"].mean()
    sum_volume = window["volume"].sum()
    sum_trade_count = window["barCount"].sum()

    # get_avg_volume reports 0 when IB has no history, hence the guarded ratio.
    relative_volume_daily = _safe_ratio(
        daily_volume - avg_daily_volume, avg_daily_volume
    )

    # Mean per-bar volume between six minutes before publication and the
    # window start; NaN when no bar falls in that range.
    prev_5m_volume = day_df[
        (day_df["date"] >= target_time - pd.Timedelta(minutes=6))
        & (day_df["date"] <= start_time)
    ]["volume"].mean()
    relative_volume_5m = _safe_ratio(sum_volume - prev_5m_volume, prev_5m_volume)

    # float_shares is None whenever the yfinance lookup fails.
    float_rotation = _safe_ratio(daily_volume, float_shares)

    price_move = _safe_ratio(last_close - first_close, first_close)

    print(f"price move {price_move}")

    if price_move is None:
        increased_3_percent = None
    else:
        increased_3_percent = 1 if price_move >= 0.03 else 0

    pv = window["average"] * window["volume"]
    cumulative_pv = sum(pv)
    aggregated_vwap = _safe_ratio(cumulative_pv, sum_volume)

    return {
        "open": first_open,
        "high": max_high,
        "low": min_low,
        "close": last_close,
        "average": average,
        "volume": sum_volume,
        "trade_count": sum_trade_count,
        "vwap": aggregated_vwap,
        "relative_volume_daily": relative_volume_daily,
        "relative_volume_5m": relative_volume_5m,
        "float_rotation": float_rotation,
        "short_interest": short_interest,
        "index_trend": index_trend,
        "price_move": price_move,
        "increased_3_percent": increased_3_percent,
    }

In [50]:
import math


def add_stock_data(
    df,
    start_index=3500,
    checkpoint_path="/Users/akseljoonas/Documents/predtrade/stock_bars.csv",
    checkpoint_every=100,
):
    """Fill in the price features of every news row that has none yet.

    Rows are modified in place. A row is skipped when its index label is below
    ``start_index`` or when it already has a non-missing
    ``increased_3_percent`` value. The second rule also holds on a first run,
    when that column does not exist yet and every row counts as unprocessed.

    Args:
        df: News rows with ``ticker`` and ``published_gmt`` columns.
        start_index: Lowest index label to process. The default keeps the
            previously hard-coded resume point; pass ``0`` for a full run.
        checkpoint_path: CSV file the whole frame is written to periodically.
        checkpoint_every: A checkpoint is written after each processed row
            whose index label is a multiple of this value.

    Returns:
        The same ``df`` object with the feature columns added or updated.
    """
    print(len(df))
    for index, row in tqdm(df.iterrows(), total=len(df)):
        # Series.get returns None for a missing column, which notna treats as
        # "not labelled yet".
        already_labelled = pd.notna(row.get("increased_3_percent"))
        if index < start_index or already_labelled:
            continue
        ticker = row["ticker"]

        start_time = row["published_gmt"]
        day_df = get_day_price(ticker, start_time)

        if not day_df.empty:
            print(f"\n\n{index} {ticker}")
            stock_info = format_bars(day_df, start_time, ticker)

            # Update row
            for col, value in stock_info.items():
                df.loc[index, col] = value

        if index % checkpoint_every == 0:
            df.to_csv(checkpoint_path, index=False)

    return df

In [None]:
import os


# Resume from the checkpoint written by add_stock_data when one exists.
checkpoint_file = "/Users/akseljoonas/Documents/predtrade/stock_bars.csv"
if os.path.exists(checkpoint_file):
    news_df = pd.read_csv(checkpoint_file)

stock_df = add_stock_data(news_df)

# Drop rows that never received a label.
stock_df.dropna(subset=["increased_3_percent"], axis=0, inplace=True)

# Normalise topic names: lower-case, spaces and slashes become underscores,
# ampersands and apostrophes are removed.
topic_slug = stock_df["topic"].str.lower()
for old, new in ((" ", "_"), ("/", "_"), ("&", ""), ("'", "")):
    topic_slug = topic_slug.str.replace(old, new)
stock_df["topic"] = topic_slug

In [None]:
# Preview the first rows of the labelled frame.
stock_df.head()

In [53]:
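# TODO: disabled draft of technical-indicator features. `abstract` is not
# imported anywhere in this notebook (presumably TA-Lib's `talib.abstract` --
# confirm), and the computed indicators are never returned.
# def get_ta_indicators(df):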
#     EMA = abstract.Function("ema")
#     BBANDS = abstract.Function("bbands")
#     MACD = abstract.Function("macd")
#     RSI = abstract.Function("rsi")

#     macd, macdsignal, macdhist = MACD(
#         df["close"], fastperiod=12, slowperiod=26, signalperiod=9
#     )
#     bb_upper, bb_middle, bb_lower = BBANDS(
#         df["close"], timeperiod=5, nbdevup=2, nbdevdn=2, matype=0
#     )
#     real = RSI(df["close"], timeperiod=14)

#     ema_21 = EMA(df["close"], timeperiod=21)
#     ema_100 = EMA(df["close"], timeperiod=100)
#     ema_7 = EMA(df["close"], timeperiod=7)

#     exit(1)

In [54]:
# Persist the final labelled dataset (index column omitted).
stock_df.to_csv("/Users/akseljoonas/Documents/predtrade/stock_bars_final.csv", index=False)