# data availability and states

All dates in this notebook use the `yyyy-mm-dd` format (for example `2024-06-30`).

In [1]:
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle 
import random
import matplotlib.pyplot as plt
import datetime
from collections import defaultdict
from dateutil.relativedelta import relativedelta
from datetime import datetime
from datetime import datetime, timedelta

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from utils.evaluate import evaluate, r4
from utils.openai import OpenAIEmbedding
from utils.utils import human_format_number, human_format_number2
from utils.companies import company100_name, company100_name_shorten, company100_ticker

# Shared embedding client: process_news calls OAE.get_embedding on each news text and on the relevance query.
OAE = OpenAIEmbedding()

In [3]:
from DataProcessor.historic import get_target, get_momentums, historic_moving_average_6m
from DataProcessor.historic import historic_vol_moving_average_6m, historic_6m, historic_moving_std_6m

In [4]:
from DataProcessor.news import historic_news_wsj, historic_news_nyt, historic_news_cmin, historic_news_alphav

In [5]:
from DataProcessor.financials import financials_nq

## Directories

In [6]:
# Dataset locations (relative paths); every sub-folder is derived from the dataset root.
dataset_path = '../dataset/'
historic_data_path = dataset_path + 'Historical Price/'
financials_data_path = dataset_path + 'Financial Quarterly Reports/'

# News articles are stored in one sub-folder per provider below the news root.
news_data_path = dataset_path + 'News Articles/'
news_wsj_path = news_data_path + 'WSJ-Header/'
news_nyt_path = news_data_path + 'NYT/110/'
news_alphav_path = news_data_path + 'Alpha-V/without text 104/'
news_CMINUS_path = news_data_path + 'CMIN-US/'

# File extension appended to a ticker to build a CSV file name.
dotcsv = '.csv'

In [7]:
def get_dir_list(path):
    """Return the entries of `path` without the '.DS_Store' and '.ipynb_checkpoints' housekeeping entries."""
    ignored = ('.DS_Store', '.ipynb_checkpoints')
    return [entry for entry in os.listdir(path) if entry not in ignored]

### Historic Data

In [8]:
# Availability check for price history: call every historic-data accessor once per
# company and report the tickers for which any of them raises.
check_date = '2020-01-01'
for ticker, name in zip(company100_ticker, company100_name_shorten):
    try:
        # The return values are irrelevant here; only "does it raise?" matters.
        get_target(ticker, check_date, binn = False)
        get_momentums(ticker, check_date, binn = False)
        historic_moving_average_6m(ticker, check_date)
        historic_moving_std_6m(ticker, check_date)
        historic_vol_moving_average_6m(ticker, check_date)
        historic_6m(ticker, check_date)
    except Exception:
        # `Exception` (not a bare except) so that interrupting the kernel still stops the loop.
        print('no data for:', ticker, name)

no data for: BBL BHP
no data for: PTR PetroChina
no data for: RDS-B Royal Dutch


### News Data

In [9]:
# Availability check for news: query every enabled news loader once per company,
# report the (company, source) pairs that raise, and total the Alpha-V article counts.
# WSJ is not checked here: its loader call was disabled (and had been written with
# the invalid date '2024-06-31').
news_sources = [
    ('nyt', historic_news_nyt),
    ('cmin', historic_news_cmin),
    ('alphav', historic_news_alphav),
]

all_news_count = []
for ticker, name in zip(company100_ticker, company100_name_shorten):
    counts = {}
    for label, loader in news_sources:
        try:
            counts[label] = len(loader(ticker, '2024-06-30', '120m'))
        except Exception:
            # Count a failing source as 0 for this ticker instead of silently reusing
            # the count left over from the previous ticker.
            counts[label] = 0
            print('no', label, 'data for:', ticker, name)
    # Only the Alpha-V count is accumulated into the reported total.
    all_news_count.append(counts['alphav'])
print('total of', sum(all_news_count), 'articles.')

BBL 2024-06-30 error occured reading alphav
DD 2024-06-30 error occured reading nyt
SNPMF 2024-06-30 error occured reading nyt
PTR 2024-06-30 error occured reading alphav
RDS-B 2024-06-30 error occured reading alphav
JPM 2024-06-30 error occured reading alphav
total of 75541 articles.


### Financials

In [10]:
# Availability check for financial reports: request financials for every company
# (third argument 40 - presumably the number of quarters; confirm against
# DataProcessor.financials) and report the tickers for which the call raises.
for ticker, name in zip(company100_ticker, company100_name_shorten):
    try:
        financials_nq(ticker, '2024-12-01', 40)
    except Exception:
        # `Exception` (not a bare except) so that interrupting the kernel still stops the loop.
        print('no data for:', ticker, name)


no data for: FB facebook
no data for: BBL BHP
no data for: SNPMF China Petroleum
no data for: SPG Simon Property
no data for: C Citigroup
no data for: PTR PetroChina
no data for: RDS-B Royal Dutch
no data for: WFC Wells Fargo


### numbers

In [11]:
# Quick look at human_format_number on values of different magnitude and sign.
numbers = [50, 5000, -2500000, 750000000, -1234567890123]
formatted_numbers = list(map(human_format_number, numbers))
print(formatted_numbers)

['50.000', '5.0K', '-2.5M', '0.8B', '-1.2T']


### Create Movement Prompts

In [12]:
def process_historic(ticker, qdate, include_volume = 0.7):
    """Build the historic-price section of a prompt for `ticker` at `qdate`.

    The section always contains the series returned by
    `historic_moving_average_6m` (rendered through `r4`). With probability
    `include_volume` it also contains the series returned by
    `historic_vol_moving_average_6m` (rendered through `human_format_number2`).

    Parameters
    ----------
    ticker : company ticker forwarded to the historic-data accessors.
    qdate : query date string forwarded to the historic-data accessors.
    include_volume : float in [0, 1], probability that the volume line is added.

    Returns
    -------
    str starting with 'historic price data:\n', or -1 when the price series
    renders more than one 'nan' or when any accessor raises.
    """
    try:
        ma = ", ".join(r4(historic_moving_average_6m(ticker, qdate)))
        # More than one 'nan' in the rendered series means too many missing values.
        if ma.count('nan') > 1:
            return -1
        # NOTE(review): 'monts' is a typo for 'months'; the text is kept verbatim so that
        # newly generated prompts stay identical in wording to already generated ones.
        res = 'average market price for last 6 monts from old to new: ' + ma
        # Include the volume line with probability `include_volume`
        # (`rand()` is uniform on [0, 1), so 1.0 means always and 0.0 means never).
        if np.random.rand() < include_volume:
            volumes = [human_format_number2(num) for num in historic_vol_moving_average_6m(ticker, qdate)]
            res = res + '\n' + 'average volume of exchange for last 6 monts from old to new: ' + ", ".join(volumes)
    except Exception:
        # `Exception` (not a bare except) so that interrupting the kernel is not swallowed.
        print(ticker, qdate, 'grabber: no historic data for: ', ticker)
        return -1
    return 'historic price data:\n' + res

def process_financials(ticker, qdate, include = 2):
    """Build the financial-report section of a prompt for `ticker` at `qdate`.

    Calls `financials_nq(ticker, qdate, 4)`, samples `include` of the returned
    entries at random (never more than the number of usable entries minus one)
    and renders each one as '<key>: v1, v2, ...' through `human_format_number2`.

    Parameters
    ----------
    ticker : company ticker forwarded to `financials_nq`.
    qdate : query date string forwarded to `financials_nq`.
    include : int, number of entries to sample.

    Returns
    -------
    str starting with 'financial data for last four quarters from old to new:\n',
    or -1 when there are 10 or fewer usable entries, when a rendered line is
    shorter than 30 characters or contains more than two 'nan', or when any
    call raises.
    """
    try:
        fins = financials_nq(ticker, qdate, 4)
        # The first key is skipped - presumably a date/period column; confirm against financials_nq.
        keys = list(fins.keys())[1:]
        if len(keys) <= 10:
            return -1
        include = min(include, len(keys)-1)
        sampled_keys = random.sample(keys, include)
        lines = [
            key + ': ' + ", ".join(str(human_format_number2(num)) for num in fins[key])
            for key in sampled_keys
        ]
        for line in lines:
            # Reject the whole section when a line is nearly empty or mostly missing values.
            if len(line) < 30 or line.count('nan') > 2:
                return -1
    except Exception:
        # `Exception` (not a bare except) so that interrupting the kernel is not swallowed.
        print(ticker, qdate, 'grabber: no financials data for: ', ticker)
        return -1
    return 'financial data for last four quarters from old to new:\n' + '\n'.join(lines)

def _fetch_news(ticker, qdate, window):
    """Concatenate the articles returned by the four news loaders, passing `qdate` and `window` through unchanged."""
    articles = []
    for loader in (historic_news_nyt, historic_news_wsj, historic_news_alphav, historic_news_cmin):
        articles.extend(loader(ticker, qdate, window))
    return articles


def _news_text(article):
    """Return the text that is embedded for one article: headline, a space, then the abstract."""
    return article['headline'] + ' ' + str(article['abstract'])


def _format_news(articles):
    """Render articles as 'title: ...' / 'news summary: ...' line pairs joined by newlines."""
    return '\n'.join(
        'title: ' + article['headline'] + '\n' + 'news summary: ' + str(article['abstract'])
        for article in articles
    )


def process_news(ticker, qdate, include = 3):
    """Build up to two news sections for `ticker` at `qdate`, most relevant articles first.

    Articles are fetched with the '1m' window (falling back to '3m' when fewer
    than 10 are found), filtered by abstract/headline/text length, embedded with
    `OAE.get_embedding` and ranked by cosine similarity to a stock-relevance
    query about the company.

    Parameters
    ----------
    ticker : company ticker; must be present in `company100_ticker`.
    qdate : query date string forwarded to the news loaders.
    include : int >= 1, number of articles per returned section.

    Returns
    -------
    (first, second), where each element is a str starting with 'recent news: \n'
    or -1:
    * (-1, -1) when fewer than `include` articles survive filtering;
    * (first, -1) when fewer than 2 * `include` articles survive;
    * otherwise `first` holds the `include` most similar articles and `second`
      the next `include`.

    Raises
    ------
    ValueError when `include` is smaller than 1.
    """
    if include < 1:
        raise ValueError('include must be at least 1')

    news = _fetch_news(ticker, qdate, '1m')
    if len(news) < 10:
        # Too few recent articles: widen the window.
        news = _fetch_news(ticker, qdate, '3m')

    # Only drop short abstracts when there are plenty of articles to choose from.
    if len(news) > 10:
        news = [article for article in news if len(str(article['abstract'])) > 20]
    news = [article for article in news if len(str(article['headline'])) > 10]
    # The limit is in characters of headline + abstract.
    news = [article for article in news if len(_news_text(article)) < 500]

    if len(news) < include:
        return -1, -1

    enews = [OAE.get_embedding(_news_text(article)) for article in news]
    query = "investing in $$ company. influence on stock price. news that have information on the value of stock."
    query = query.replace('$$', company100_name[company100_ticker.index(ticker)])
    equery = OAE.get_embedding(query)

    # argsort is ascending, so reverse it to get article indices from most to least similar.
    similarity = cosine_similarity([equery], enews)[0]
    ranked = [int(idx) for idx in np.argsort(similarity)[::-1]]

    first = 'recent news: \n' + _format_news([news[idx] for idx in ranked[:include]])
    if len(news) < 2 * include:
        return first, -1

    second = 'recent news: \n' + _format_news([news[idx] for idx in ranked[include:2 * include]])
    return first, second

In [13]:
# Query dates: the first day of every month from start_date to end_date (both included),
# rendered as yyyy-mm-dd strings.
start_date, end_date = '2015-01-01', '2024-01-01'
date_range = pd.date_range(start_date, end_date, freq='MS')
formatted_dates = [stamp.strftime('%Y-%m-%d') for stamp in date_range]

# formatted_dates, company100_ticker

In [14]:
# Sampling knobs used by the prompt-generation loop.
news_included = 3  # passed to process_news as `include`
financials_included = 10  # NOTE(review): not referenced by the visible cells; process_financials is called with random.randint(13, 15)
volume_included = 0.4  # passed to process_historic as `include_volume`

In [22]:
def _csv_has_rows(csv_path, min_rows = 10):
    """Return True when `csv_path` can be read by pandas and holds more than `min_rows` rows."""
    try:
        return len(pd.read_csv(csv_path)) > min_rows
    except Exception:
        # Missing, empty or unparsable file: treat it as "no data"; the caller reports it.
        return False


def _add_sample(store, parts, target, target_bin, qdate, ticker):
    """Shuffle `parts` in place, join them with newlines and append one sample to `store`."""
    random.shuffle(parts)
    store['prompts'].append('\n'.join(parts))
    store['targets'].append(target)
    store['targets_bin'].append(target_bin)
    store['dates'].append(qdate)
    store['companies'].append(ticker)


# The output folder must exist before the first pickle is written.
os.makedirs('./prompts/', exist_ok=True)

for ticker in ['BBL', 'SPG', 'C', 'PTR', 'RDS-B']:
    print(ticker)
    prompts = []
    targets = []
    targets_bin = []
    dates = []
    companies = []
    # The dict shares the lists above, so it is complete once the date loop finishes.
    prompts_dict = {'prompts': prompts, 'targets': targets,
                    'targets_bin': targets_bin, 'dates': dates, 'companies': companies}

    # Prechecks: skip companies whose raw CSV files are missing or nearly empty.
    if not _csv_has_rows(financials_data_path + ticker + dotcsv):
        print('precheck: company does not have financials', ticker)
        continue
    if not _csv_has_rows(historic_data_path + ticker + dotcsv):
        print('precheck: company does not have historics', ticker)
        continue

    for qdate in tqdm(formatted_dates[1:]):

        target_bin = get_target(ticker, qdate, binn = True)
        target = get_target(ticker, qdate, binn = False)
        if target_bin == -1 or target == -1:
            print(qdate, ticker, 'target unavailable.')
            continue

        # volume_included is forwarded as process_historic's `include_volume`.
        historic = process_historic(ticker, qdate, volume_included)
        # Sample 13-15 financial entries (process_financials caps this to what is available).
        financials = process_financials(ticker, qdate, include = random.randint(13, 15))

        # Keep whichever numeric sections are available, historic first.
        fin = [section for section in (historic, financials) if section != -1]
        if not fin:
            print(qdate, ticker, 'main for: no financials.')
            continue

        news1, news2 = process_news(ticker, qdate, include = news_included)
        if news1 == -1:
            print(qdate, ticker, 'main for: no news data available.')
            continue

        if news2 == -1:
            prompt_lists = [fin + [news1]]
        else:
            # Two news sections: randomise which one is paired with the numeric sections first.
            news12 = [news1, news2]
            random.shuffle(news12)
            prompt_lists = [fin + [news_section] for news_section in news12]

        for prompt_list in prompt_lists:
            # Full prompt with every available section in random order.
            _add_sample(prompts_dict, prompt_list, target, target_bin, qdate, ticker)
            # With all three sections present, also emit a variant made of two random sections.
            if len(prompt_list) == 3:
                _add_sample(prompts_dict, random.sample(prompt_list, 2),
                            target, target_bin, qdate, ticker)

    print('-'*3, len(companies), 'data created for', ticker)

    with open('./prompts/' + ticker + '.pkl', 'wb') as file:
        pickle.dump(prompts_dict, file)

BBL
precheck: company does not have financials BBL
SPG
precheck: company does not have financials SPG
C
precheck: company does not have financials C
PTR
precheck: company does not have financials PTR
RDS-B
precheck: company does not have financials RDS-B


In [23]:
# Count the samples stored in every pickle under ./prompts and print the files that
# hold at most 200 samples; the cell's value is the total number of samples.
# NOTE: pickle.load can execute arbitrary code from the file - only load pickles
# produced by this notebook.
asdf = []
for i in get_dir_list('./prompts'):  # skips '.DS_Store' and '.ipynb_checkpoints'
    # `with` closes the file handle as soon as the pickle has been read.
    with open("./prompts/"+i, 'rb') as file:
        object_file = pickle.load(file)
    if len(object_file['targets']) <= 200:
        print(i)
    asdf.append(len(object_file['targets']))
sum(asdf)

PSA.pkl
PLD.pkl
SNPMF.pkl
WELL.pkl
DD.pkl
CMCSA.pkl
LOW.pkl


34276

### Create Sentiment Analysis Prompts