In [1]:
import os
import pandas as pd
import numpy as np
from openai import OpenAI
from loguru import logger
import datetime

In [5]:

os.environ['OPENAI_API_KEY'] = ''

In [6]:
def get_gpt_embeddings(client, text: str, model: str = "text-embedding-ada-002", mock: bool = False):
    """
    Get embeddings for a given text using OpenAI API.
    
    Args:
        client (object): OpenAI client object.
        text (str): The text to get embeddings for.
        model (str): The embedding model to use (default is "text-embedding-ada-002").
        mock (bool): If True, return a mock embedding with shape of 100 and all 0 values.
    
    Returns:
        list: The embeddings for the given text.
    """
    if mock:
        return [0] * 100
    
    if text:
        text = text.replace("\n", " ")
        try:
            response = client.embeddings.create(
                input=[text],
                model=model
            )
            return response.data[0].embedding
        
        except Exception as e:
            logger.error(f"An error occurred: {e}")
            return None
    else:
        return None


In [7]:
client = OpenAI()
logger.success("OpenAI client connected")

[32m2024-07-19 15:31:39.511[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [32m[1mOpenAI client connected[0m


In [8]:
x = "Hello, World!"
e = get_gpt_embeddings(client, x, model='text-embedding-ada-002')
e

[-0.002516599837690592,
 -0.0061294762417674065,
 -0.006764671765267849,
 -0.032759327441453934,
 -0.004462490789592266,
 0.003933698870241642,
 -0.021435433998703957,
 -0.008454227820038795,
 -0.0016943922964856029,
 -0.022725170478224754,
 0.032759327441453934,
 0.009021712467074394,
 -0.01750173419713974,
 -0.01563161611557007,
 0.013787291012704372,
 0.009273210540413857,
 0.026929713785648346,
 -0.009305453859269619,
 0.005942464340478182,
 0.009556952863931656,
 -0.004194870591163635,
 0.004591464530676603,
 0.015541333705186844,
 -0.00039961704169400036,
 -0.006964581087231636,
 -0.008312356658279896,
 0.0006883972673676908,
 -0.020622897893190384,
 0.03913062810897827,
 -0.026542792096734047,
 0.011801095679402351,
 -0.010704819113016129,
 -0.0033726629335433245,
 -0.021925533190369606,
 0.012465310283005238,
 -0.01895913854241371,
 0.0032710961531847715,
 -0.01449664682149887,
 0.018804369494318962,
 -0.013400370255112648,
 0.00274230376817286,
 0.00833815149962902,
 0.0003845

In [24]:
filtered_price = pd.read_csv('data/training/in/input/filtered_price.csv')
price_news_return = pd.read_csv('data/training/in/input/price_news_return.csv')

In [28]:
five_year_data = price_news_return[(price_news_return['Date'] >= '2015-06-01') & (price_news_return['Date'] < '2020-06-01')]
five_year_data

Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,Latest_News,Concatenated_News,1_day_return,2_day_return,3_day_return,4_day_return,7_day_return,10_day_return,30_day_return
41,2020-05-29,CSCO,41.665731,43.276611,41.095586,43.276611,43553600.0,Stocks to Buy in a Post-Pandemic World,Stocks to Buy in a Post-Pandemic World - US In...,-0.032829,-0.019833,-0.018747,-0.020487,0.004787,-0.061016,-0.033012
42,2020-05-29,JWN,15.714695,15.826675,14.930826,15.052138,25072100.0,"Nordstrom (JWN) Q1 Loss Wider Than Expected, S...","Nordstrom (JWN) Q1 Loss Wider Than Expected, S...",0.027728,0.092801,0.169840,0.243788,0.240226,0.085082,-0.050814
43,2020-05-29,KO,42.223438,42.684747,41.734991,42.223438,51125000.0,Learning from Warren Buffett's Decision to Buy...,Learning from Warren Buffett's Decision to Buy...,0.006597,0.004691,0.025470,0.025877,0.047347,-0.014468,-0.022314
44,2020-05-29,V,190.180375,192.370633,188.987469,190.903946,10211900.0,8 Stocks Viking Global Investors Continues to Buy,8 Stocks Viking Global Investors Continues to Buy,-0.004579,0.005704,0.008279,-0.008263,0.019289,-0.015500,-0.032907
45,2020-05-29,AAPL,78.254387,78.720112,77.572955,77.933281,153532400.0,DXC Technology (DXC) Q4 Earnings Top Estimates...,DXC Technology (DXC) Q4 Earnings Top Estimates...,0.012148,0.016701,0.022084,0.013589,0.075729,0.061570,0.167500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8831,2015-06-02,HLT,56.787502,57.099843,56.631330,56.982716,2869981.0,Hilton Worldwide building out Curio brand,Hilton Worldwide building out Curio brand,0.003414,0.002733,0.006805,-0.014951,-0.013542,-0.041384,-0.037314
8832,2015-06-01,HLT,56.807024,57.216972,56.611811,56.943672,3460136.0,Timeshares Trend: How Marriott Vacations Revam...,Timeshares Trend: How Marriott Vacations Revam...,0.000685,0.004097,0.003416,0.007485,-0.017440,-0.026390,-0.027837
8833,2015-06-01,MAR,72.173422,72.681038,71.398161,71.961151,1928200.0,Fitch ratings weighs in on lodging sector,Fitch ratings weighs in on lodging sector - Ti...,0.005865,0.017762,0.006498,0.005231,-0.009582,-0.012335,-0.009190
8834,2015-06-01,CL,54.954373,55.292126,54.666051,55.110893,3447100.0,"Dividend, Yield, And Price: Inseparably Linked","Dividend, Yield, And Price: Inseparably Linked",-0.002548,-0.004957,-0.014866,-0.021686,0.002535,-0.023875,0.005500


In [29]:
five_year_data['Concatenated_News_Embedding'] = five_year_data['Concatenated_News'].apply(lambda x: get_gpt_embeddings(client, x, model='text-embedding-ada-002'))
five_year_data.to_csv('data/training/in/input/price_news_return_embedding.csv', index=False)
five_year_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  five_year_data['Concatenated_News_Embedding'] = five_year_data['Concatenated_News'].apply(lambda x: get_gpt_embeddings(client, x, model='text-embedding-ada-002'))


Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,Latest_News,Concatenated_News,1_day_return,2_day_return,3_day_return,4_day_return,7_day_return,10_day_return,30_day_return,Concatenated_News_Embedding
41,2020-05-29,CSCO,41.665731,43.276611,41.095586,43.276611,43553600.0,Stocks to Buy in a Post-Pandemic World,Stocks to Buy in a Post-Pandemic World - US In...,-0.032829,-0.019833,-0.018747,-0.020487,0.004787,-0.061016,-0.033012,"[0.001997282262891531, -0.018077492713928223, ..."
42,2020-05-29,JWN,15.714695,15.826675,14.930826,15.052138,25072100.0,"Nordstrom (JWN) Q1 Loss Wider Than Expected, S...","Nordstrom (JWN) Q1 Loss Wider Than Expected, S...",0.027728,0.092801,0.169840,0.243788,0.240226,0.085082,-0.050814,"[-0.032860055565834045, -0.036573056131601334,..."
43,2020-05-29,KO,42.223438,42.684747,41.734991,42.223438,51125000.0,Learning from Warren Buffett's Decision to Buy...,Learning from Warren Buffett's Decision to Buy...,0.006597,0.004691,0.025470,0.025877,0.047347,-0.014468,-0.022314,"[-0.007852538488805294, -0.015758998692035675,..."
44,2020-05-29,V,190.180375,192.370633,188.987469,190.903946,10211900.0,8 Stocks Viking Global Investors Continues to Buy,8 Stocks Viking Global Investors Continues to Buy,-0.004579,0.005704,0.008279,-0.008263,0.019289,-0.015500,-0.032907,"[-0.02816659025847912, -0.03834887966513634, 0..."
45,2020-05-29,AAPL,78.254387,78.720112,77.572955,77.933281,153532400.0,DXC Technology (DXC) Q4 Earnings Top Estimates...,DXC Technology (DXC) Q4 Earnings Top Estimates...,0.012148,0.016701,0.022084,0.013589,0.075729,0.061570,0.167500,"[-0.015577776357531548, -0.005795145407319069,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8831,2015-06-02,HLT,56.787502,57.099843,56.631330,56.982716,2869981.0,Hilton Worldwide building out Curio brand,Hilton Worldwide building out Curio brand,0.003414,0.002733,0.006805,-0.014951,-0.013542,-0.041384,-0.037314,"[-0.003722486784681678, 0.005681690294295549, ..."
8832,2015-06-01,HLT,56.807024,57.216972,56.611811,56.943672,3460136.0,Timeshares Trend: How Marriott Vacations Revam...,Timeshares Trend: How Marriott Vacations Revam...,0.000685,0.004097,0.003416,0.007485,-0.017440,-0.026390,-0.027837,"[0.006758211646229029, -0.001204228145070374, ..."
8833,2015-06-01,MAR,72.173422,72.681038,71.398161,71.961151,1928200.0,Fitch ratings weighs in on lodging sector,Fitch ratings weighs in on lodging sector - Ti...,0.005865,0.017762,0.006498,0.005231,-0.009582,-0.012335,-0.009190,"[0.008254693821072578, -4.126824478589697e-06,..."
8834,2015-06-01,CL,54.954373,55.292126,54.666051,55.110893,3447100.0,"Dividend, Yield, And Price: Inseparably Linked","Dividend, Yield, And Price: Inseparably Linked",-0.002548,-0.004957,-0.014866,-0.021686,0.002535,-0.023875,0.005500,"[-0.02472446858882904, -0.021279681473970413, ..."


In [30]:
five_year_data['Latest_News_Embedding'] = five_year_data['Latest_News'].apply(lambda x: get_gpt_embeddings(client, x, model='text-embedding-ada-002'))
five_year_data.to_csv('data/training/in/input/price_news_return_embedding.csv', index=False)
five_year_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  five_year_data['Latest_News_Embedding'] = five_year_data['Latest_News'].apply(lambda x: get_gpt_embeddings(client, x, model='text-embedding-ada-002'))


Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,Latest_News,Concatenated_News,1_day_return,2_day_return,3_day_return,4_day_return,7_day_return,10_day_return,30_day_return,Concatenated_News_Embedding,Latest_News_Embedding
41,2020-05-29,CSCO,41.665731,43.276611,41.095586,43.276611,43553600.0,Stocks to Buy in a Post-Pandemic World,Stocks to Buy in a Post-Pandemic World - US In...,-0.032829,-0.019833,-0.018747,-0.020487,0.004787,-0.061016,-0.033012,"[0.001997282262891531, -0.018077492713928223, ...","[0.011239121668040752, -0.02143305167555809, -..."
42,2020-05-29,JWN,15.714695,15.826675,14.930826,15.052138,25072100.0,"Nordstrom (JWN) Q1 Loss Wider Than Expected, S...","Nordstrom (JWN) Q1 Loss Wider Than Expected, S...",0.027728,0.092801,0.169840,0.243788,0.240226,0.085082,-0.050814,"[-0.032860055565834045, -0.036573056131601334,...","[-0.032860055565834045, -0.036573056131601334,..."
43,2020-05-29,KO,42.223438,42.684747,41.734991,42.223438,51125000.0,Learning from Warren Buffett's Decision to Buy...,Learning from Warren Buffett's Decision to Buy...,0.006597,0.004691,0.025470,0.025877,0.047347,-0.014468,-0.022314,"[-0.007852538488805294, -0.015758998692035675,...","[-0.007852538488805294, -0.015758998692035675,..."
44,2020-05-29,V,190.180375,192.370633,188.987469,190.903946,10211900.0,8 Stocks Viking Global Investors Continues to Buy,8 Stocks Viking Global Investors Continues to Buy,-0.004579,0.005704,0.008279,-0.008263,0.019289,-0.015500,-0.032907,"[-0.02816659025847912, -0.03834887966513634, 0...","[-0.02816659025847912, -0.03834887966513634, 0..."
45,2020-05-29,AAPL,78.254387,78.720112,77.572955,77.933281,153532400.0,DXC Technology (DXC) Q4 Earnings Top Estimates...,DXC Technology (DXC) Q4 Earnings Top Estimates...,0.012148,0.016701,0.022084,0.013589,0.075729,0.061570,0.167500,"[-0.015577776357531548, -0.005795145407319069,...","[-0.013864366337656975, -0.014639061875641346,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8831,2015-06-02,HLT,56.787502,57.099843,56.631330,56.982716,2869981.0,Hilton Worldwide building out Curio brand,Hilton Worldwide building out Curio brand,0.003414,0.002733,0.006805,-0.014951,-0.013542,-0.041384,-0.037314,"[-0.003722486784681678, 0.005681690294295549, ...","[-0.003722486784681678, 0.005681690294295549, ..."
8832,2015-06-01,HLT,56.807024,57.216972,56.611811,56.943672,3460136.0,Timeshares Trend: How Marriott Vacations Revam...,Timeshares Trend: How Marriott Vacations Revam...,0.000685,0.004097,0.003416,0.007485,-0.017440,-0.026390,-0.027837,"[0.006758211646229029, -0.001204228145070374, ...","[-0.008595649152994156, -0.0059041185304522514..."
8833,2015-06-01,MAR,72.173422,72.681038,71.398161,71.961151,1928200.0,Fitch ratings weighs in on lodging sector,Fitch ratings weighs in on lodging sector - Ti...,0.005865,0.017762,0.006498,0.005231,-0.009582,-0.012335,-0.009190,"[0.008254693821072578, -4.126824478589697e-06,...","[0.019749974831938744, 0.00014178613491822034,..."
8834,2015-06-01,CL,54.954373,55.292126,54.666051,55.110893,3447100.0,"Dividend, Yield, And Price: Inseparably Linked","Dividend, Yield, And Price: Inseparably Linked",-0.002548,-0.004957,-0.014866,-0.021686,0.002535,-0.023875,0.005500,"[-0.02472446858882904, -0.021279681473970413, ...","[-0.02472446858882904, -0.021279681473970413, ..."
