In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
df_balance= pd.read_csv("./data/us-balance-quarterly.csv", sep=";")
df_cashflow = pd.read_csv("./data/us-cashflow-quarterly.csv", sep=";")
df_income = pd.read_csv("./data/us-income-quarterly.csv", sep=";")

In [4]:
df_prices_copy = pd.read_csv("./data/us-shareprices-daily.csv", sep=";")
df_prices = df_prices_copy.copy()
df_prices.loc['Date'] = pd.to_datetime(df_prices['Date'], errors = 'coerce')

In [5]:
from pandas.tseries.offsets import DateOffset

def get_price_on_or_after(df_prices, simfinid, target_date):
    prices = df_prices[df_prices['SimFinId'] == simfinid]
    return prices[prices['Date'] >= target_date].sort_values('Date').head(1)

def get_price_3m_later(df_prices, simfinid, publish_date):
    future_date = publish_date + DateOffset(months=3)
    return get_price_on_or_after(df_prices, simfinid, future_date)

In [3]:
df = pd.merge(df_balance, df_cashflow, on='SimFinId', how= 'outer')
df = pd.merge(df, df_income, on ='SimFinId', how='outer')

In [4]:
df.head(5)

Unnamed: 0,Ticker_x,SimFinId,Currency_x,Fiscal Year_x,Fiscal Period_x,Report Date_x,Publish Date_x,Restated Date_x,Shares (Basic)_x,Shares (Diluted)_x,...,Non-Operating Income (Loss),"Interest Expense, Net","Pretax Income (Loss), Adj.",Abnormal Gains (Losses),Pretax Income (Loss),"Income Tax (Expense) Benefit, Net",Income (Loss) from Continuing Operations,Net Extraordinary Gains (Losses),Net Income,Net Income (Common)
0,GOOG,18,USD,2019.0,Q2,2019-06-30,2019-07-26,2019-07-26,13879380000.0,14929580000.0,...,2967000000.0,627056859.0,12147000000.0,0.0,12147000000.0,-2200000000.0,9947000000.0,,9947000000.0,9947000000.0
1,GOOG,18,USD,2019.0,Q2,2019-06-30,2019-07-26,2019-07-26,13879380000.0,14929580000.0,...,-549000000.0,608046614.0,8628000000.0,0.0,8628000000.0,-1560000000.0,7068000000.0,,7068000000.0,7068000000.0
2,GOOG,18,USD,2019.0,Q2,2019-06-30,2019-07-26,2019-07-26,13879380000.0,14929580000.0,...,1438000000.0,603208347.0,10704000000.0,0.0,10704000000.0,-33000000.0,10671000000.0,,10671000000.0,10671000000.0
3,GOOG,18,USD,2019.0,Q2,2019-06-30,2019-07-26,2019-07-26,13879380000.0,14929580000.0,...,-220000000.0,563792119.0,7757000000.0,,7757000000.0,-921000000.0,6836000000.0,,6836000000.0,6836000000.0
4,GOOG,18,USD,2019.0,Q2,2019-06-30,2019-07-26,2019-07-26,13879380000.0,14929580000.0,...,1894000000.0,420441603.0,8277000000.0,,8277000000.0,-1318000000.0,6959000000.0,,6959000000.0,6959000000.0


In [8]:
df_income['Publish Date'] = pd.to_datetime(df_income['Publish Date'], errors = 'coerce')

In [None]:
yield_data = []

for idx, row in df_income.iterrows():
    simfinid = row['SimFinId']
    publish_date = row['Publish Date']
    
    try:
        price_t = get_price_on_or_after(df_prices, simfinid, publish_date)
        price_t3m = get_price_3m_later(df_prices, simfinid, publish_date)

        if not price_t.empty and not price_t3m.empty:
            p0 = price_t['Adj. Close'].values[0]
            p1 = price_t3m['Adj. Close'].values[0]
            yield_3m = (p1 - p0) / p0
        else:
            yield_3m = None

    except Exception as e:
        # if idx % 1000 == 0:
        #     print(f"Error at {idx}: {e}")
        yield_3m = None
    
    yield_data.append(yield_3m)

df['Yield_3M'] = yield_data

In [10]:
import yfinance as yf

In [11]:
tickers = df['Ticker'].unique().tolist()
tickers

['GOOG',
 'ATVI',
 'BLK',
 'ADBE',
 'MMM',
 'PPL',
 'TWTR',
 'NWY',
 'AMSC',
 'VRTU',
 'QRVO',
 'ASNA',
 'ENS',
 'NGL',
 'EGHT',
 'GWRE',
 'CBRL',
 'CPRI',
 'BOOT',
 'SHOS',
 'PLAB',
 'JVA',
 'EXAS',
 'SIOX',
 'SSI',
 'TESS',
 'THO',
 'MLAB',
 'GME',
 'PLAY',
 'SMRT_delisted',
 'ZUMZ',
 'A',
 'PVH',
 'URBN',
 'HPE',
 'BOX',
 'FL',
 'GSIT',
 'CAL',
 'CSWI',
 'BIG',
 'CIEN',
 'VRA',
 'AMBA',
 'ENZ',
 'MIK',
 'DCI',
 'EXPR',
 'SPLK',
 'FRED',
 'GCO',
 'CGRN',
 'UNFI',
 'CMD',
 'HQY',
 'PURE',
 'ONVO',
 'SPPI',
 'VIRC',
 'ADT',
 'EEI',
 'LXRX',
 'TSLA',
 'NATI',
 'WY',
 'ENG',
 'AZO',
 'DELL',
 'MSFT',
 'ITI',
 'FFIV',
 'BBBY',
 'TRNS',
 'MU',
 'ABBV',
 'ACN',
 'AGN',
 'AYI',
 'AES',
 'AMG',
 'AMZN',
 'SITE',
 'LZB',
 'BREW',
 'ABT',
 'INFOR',
 'MEI',
 'CYAN',
 'HLI',
 'ALXN',
 'ALK',
 'AMGN',
 'APH',
 'ADI',
 'AVB',
 'AVY',
 'AN',
 'T',
 'AJG',
 'ADM',
 'AMAT',
 'AIV',
 'AON',
 'APC',
 'AME',
 'AMP',
 'AMT',
 'AAL',
 'AEE',
 'ALLE',
 'AKAM',
 'IBM',
 'BXP',
 'BWA',
 'BA',
 'BIIB',
 'BBY',

In [None]:
for ticker in tickers:
    try:
        df_yf = yf.download(ticker, start="2019-01-01", end="2024-01-01", progress=False)
        df_yf = df.reset_index()
        df_yf['Ticker'] = ticker
        all_data.append(df_yf)
    except Exception as e:
        print(f"Error fetching {ticker}: {e}")

df_prices_yf = pd.concat(all_data, ignore_index=True)

Failed to get ticker 'GOOG' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116dcd160>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['GOOG']: ConnectionError(MaxRetryError('HTTPSConnectionPool(host=\'fc.yahoo.com\', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116bcf9d0>: Failed to resolve \'fc.yahoo.com\' ([Errno 8] nodename nor servname provided, or not known)"))'))


YF.download() has changed argument auto_adjust default to True


Failed to get ticker 'ATVI' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116df8690>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['ATVI']: ConnectionError(MaxRetryError('HTTPSConnectionPool(host=\'fc.yahoo.com\', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116df8910>: Failed to resolve \'fc.yahoo.com\' ([Errno 8] nodename nor servname provided, or not known)"))'))


Error fetching GOOG: name 'all_data' is not defined


Failed to get ticker 'BLK' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116df9590>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['BLK']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching ATVI: name 'all_data' is not defined


Failed to get ticker 'ADBE' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116df9a90>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['ADBE']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching BLK: name 'all_data' is not defined


Failed to get ticker 'MMM' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116df9f90>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['MMM']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching ADBE: name 'all_data' is not defined


Failed to get ticker 'PPL' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116bcfed0>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['PPL']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching MMM: name 'all_data' is not defined


Failed to get ticker 'TWTR' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116bce490>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['TWTR']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching PPL: name 'all_data' is not defined


Failed to get ticker 'NWY' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116bcf9d0>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['NWY']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching TWTR: name 'all_data' is not defined


Failed to get ticker 'AMSC' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116bce850>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['AMSC']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching NWY: name 'all_data' is not defined


Failed to get ticker 'VRTU' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116bcf4d0>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['VRTU']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching AMSC: name 'all_data' is not defined


Failed to get ticker 'QRVO' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116bcf110>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['QRVO']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching VRTU: name 'all_data' is not defined


Failed to get ticker 'ASNA' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116df9e50>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['ASNA']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching QRVO: name 'all_data' is not defined


Failed to get ticker 'ENS' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116df9a90>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['ENS']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching ASNA: name 'all_data' is not defined


Failed to get ticker 'NGL' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116df9950>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['NGL']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching ENS: name 'all_data' is not defined


Failed to get ticker 'EGHT' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116df9090>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['EGHT']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching NGL: name 'all_data' is not defined


Failed to get ticker 'GWRE' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116bcefd0>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['GWRE']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching EGHT: name 'all_data' is not defined


Failed to get ticker 'CBRL' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116bced50>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['CBRL']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching GWRE: name 'all_data' is not defined


Failed to get ticker 'CPRI' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116bcfb10>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['CPRI']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching CBRL: name 'all_data' is not defined


Failed to get ticker 'BOOT' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116bcf750>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['BOOT']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching CPRI: name 'all_data' is not defined


Failed to get ticker 'SHOS' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116bce5d0>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['SHOS']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching BOOT: name 'all_data' is not defined


Failed to get ticker 'PLAB' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116bcfc50>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['PLAB']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching SHOS: name 'all_data' is not defined


Failed to get ticker 'JVA' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116df9590>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['JVA']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching PLAB: name 'all_data' is not defined


Failed to get ticker 'EXAS' reason: HTTPSConnectionPool(host='fc.yahoo.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116df9d10>: Failed to resolve 'fc.yahoo.com' ([Errno 8] nodename nor servname provided, or not known)"))

1 Failed download:
['EXAS']: YFTzMissingError('possibly delisted; no timezone found')


Error fetching JVA: name 'all_data' is not defined
