In [3]:
import os

from dotenv import load_dotenv
import pandas as pd
import numpy as np
import openai
import json
import time

### API Key - This key is private. You can get your own API key

In [4]:
load_dotenv()
API_KEY = os.environ['OPENAI_API_KEY']

## Adding tanacity implement exponential backoff to help with Rate Limit

In [5]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

### importing the dataset

In [6]:
main_data = pd.read_csv("../res/nasdaq100.csv")
num_data = pd.read_csv("../res/nasdaq100_price_change.csv")
main_data

Unnamed: 0,symbol,name,headQuarter,dateFirstAdded,cik,founded
0,AAPL,Apple Inc.,"Cupertino, CA",,320193,1976-04-01
1,ABNB,Airbnb,"San Francisco, CA",,1559720,2008-08-01
2,ADBE,Adobe Inc.,"San Jose, CA",,796343,1982-12-01
3,ADI,Analog Devices,"Wilmington, MA",,6281,1965-01-01
4,ADP,ADP,"Roseland, NJ",,8670,1949-01-01
...,...,...,...,...,...,...
96,WBD,Warner Bros. Discovery,"New York, NY",,1437107,2022-04-08
97,WDAY,"Workday, Inc.","Pleasanton, CA",,1327811,2005-03-01
98,XEL,Xcel Energy,"Minneapolis, MN",,72903,1909-06-17
99,ZM,Zoom Video Communications,"San Jose, CA",,1585521,2011-04-21


In [7]:
num_data

Unnamed: 0,symbol,1D,5D,1M,3M,6M,ytd,1Y,3Y,5Y,10Y,max
0,AAPL,-1.7254,-8.30086,-6.20411,3.04200,15.64824,42.99992,8.479410,60.96299,245.42031,976.99441,139245.53954
1,ABNB,2.1617,-2.21919,9.88336,19.43286,19.64241,68.66902,23.640130,-1.04347,-1.04347,-1.04347,-1.04347
2,ADBE,0.5409,-1.77817,9.16191,52.04650,38.01522,57.22723,21.962060,17.83037,109.05718,1024.69214,251030.66399
3,ADI,0.9291,-4.03352,2.58486,3.65887,5.01602,17.02062,8.097350,63.42847,92.81874,286.77518,26012.63736
4,ADP,2.0589,2.35462,14.66581,16.40059,10.60546,5.53732,0.888943,81.76679,81.87224,248.40950,27613.11042
...,...,...,...,...,...,...,...,...,...,...,...,...
96,WBD,3.5791,13.98188,15.39075,12.25756,-5.60992,51.67715,3.209700,-31.02955,-43.36595,-65.85990,-12.30303
97,WDAY,1.4905,-1.04652,6.77548,28.69187,21.79922,37.90965,40.017990,31.59971,75.93220,221.38434,353.49515
98,XEL,-1.0432,-4.81045,-5.03734,-14.10091,-13.24042,-14.71386,-19.133960,-17.39010,26.61017,100.87395,724.27586
99,ZM,-0.5792,-3.99888,4.55307,8.29653,-18.89913,3.00030,-40.160360,-73.46268,10.74194,10.74194,10.74194


In [8]:
pd.merge(left=main_data,right=num_data['ytd'],how="left",on=main_data.symbol)

Unnamed: 0,key_0,symbol,name,headQuarter,dateFirstAdded,cik,founded,ytd
0,AAPL,AAPL,Apple Inc.,"Cupertino, CA",,320193,1976-04-01,42.99992
1,ABNB,ABNB,Airbnb,"San Francisco, CA",,1559720,2008-08-01,68.66902
2,ADBE,ADBE,Adobe Inc.,"San Jose, CA",,796343,1982-12-01,57.22723
3,ADI,ADI,Analog Devices,"Wilmington, MA",,6281,1965-01-01,17.02062
4,ADP,ADP,ADP,"Roseland, NJ",,8670,1949-01-01,5.53732
...,...,...,...,...,...,...,...,...
96,WBD,WBD,Warner Bros. Discovery,"New York, NY",,1437107,2022-04-08,51.67715
97,WDAY,WDAY,"Workday, Inc.","Pleasanton, CA",,1327811,2005-03-01,37.90965
98,XEL,XEL,Xcel Energy,"Minneapolis, MN",,72903,1909-06-17,-14.71386
99,ZM,ZM,Zoom Video Communications,"San Jose, CA",,1585521,2011-04-21,3.00030


## See all companies present in the data

In [9]:
for comany_name in main_data['name']:
    print(comany_name)

Apple Inc.
Airbnb
Adobe Inc.
Analog Devices
ADP
Autodesk
American Electric Power
Align Technology
Applied Materials
Advanced Micro Devices Inc.
Amgen
Amazon
Ansys
ASML Holding
Activision Blizzard
Broadcom Inc.
AstraZeneca
Biogen
Booking Holdings
Baker Hughes
Cadence Design Systems
Constellation Energy
Charter Communications
Comcast
Costco
Copart
CrowdStrike
Cisco
CoStar Group
CSX Corporation
Cintas
Cognizant
Datadog
Dollar Tree
DexCom
Electronic Arts
eBay
Enphase Energy
Exelon
Diamondback Energy
Fastenal
Fortinet
GE HealthCare
GlobalFoundries
Gilead Sciences
Alphabet Inc. (Class C)
Alphabet Inc. (Class A)
Honeywell
Idexx Laboratories
Illumina, Inc.
Intel
Intuit
Intuitive Surgical
JD.com
Keurig Dr Pepper
Kraft Heinz
KLA Corporation
Lucid Motors
Lam Research
Lululemon
Marriott International
Microchip Technology
Mondelēz International
MercadoLibre
Meta Platforms
Monster Beverage
Moderna
Marvell Technology
Microsoft
Micron Technology
Netflix
Nvidia
NXP
Old Dominion Freight Line
Onsemi
O'Re

In [10]:
#creating a sector column for data
main_data['sector'] = 'None'
main_data

Unnamed: 0,symbol,name,headQuarter,dateFirstAdded,cik,founded,sector
0,AAPL,Apple Inc.,"Cupertino, CA",,320193,1976-04-01,
1,ABNB,Airbnb,"San Francisco, CA",,1559720,2008-08-01,
2,ADBE,Adobe Inc.,"San Jose, CA",,796343,1982-12-01,
3,ADI,Analog Devices,"Wilmington, MA",,6281,1965-01-01,
4,ADP,ADP,"Roseland, NJ",,8670,1949-01-01,
...,...,...,...,...,...,...,...
96,WBD,Warner Bros. Discovery,"New York, NY",,1437107,2022-04-08,
97,WDAY,"Workday, Inc.","Pleasanton, CA",,1327811,2005-03-01,
98,XEL,Xcel Energy,"Minneapolis, MN",,72903,1909-06-17,
99,ZM,Zoom Video Communications,"San Jose, CA",,1585521,2011-04-21,


### Building API gateway

#### Initialize the openAPI client

In [11]:
client = openai.OpenAI(
    api_key=API_KEY
)

#### Testing OpenAPI prompt

In [12]:
test_prompt = f"""Classify the company Apple Inc into one of the following sectors: Technology, Consumer, Cyclical, Industrial, Utilities, Healthcare, Marketing, Communication, Energy, Consumer, Defense, Real Estate, or Finance. Please only answer with one sector."""


### Using wait_random_exponential to avoid 'Too many Requests' Error

In [13]:
@retry(wait=wait_random_exponential(min=20,max=60),stop=stop_after_attempt(3))
def completion_with_backoff(**kwargs):
    return client.completions.create(**kwargs)

In [15]:
chat_completion = client.completions.create(
    prompt = test_prompt,
    model = "gpt-3.5-turbo-instruct")

chat_completion

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for organization org-POBwWcMaawfgViEPp97KkkSO on requests per day (RPD): Limit 200, Used 200, Requested 1. Please try again in 7m12s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}

In [24]:
for chunks in chat_completion:
    print(chunks)

('id', 'cmpl-8zon3DZ9dbKUlL3EtfW3Uf5E9Q531')
('choices', [CompletionChoice(finish_reason='stop', index=0, logprobs=None, text='\n\nTechnology')])
('created', 1709743937)
('model', 'gpt-3.5-turbo-instruct')
('object', 'text_completion')
('system_fingerprint', None)
('usage', CompletionUsage(completion_tokens=2, prompt_tokens=49, total_tokens=51))


In [25]:
data_to_store = (chat_completion.choices[0].text).strip()
data_to_store

'Technology'

### Note
While using the an OpenAI API account with no payment method, it is quite frequent to encounter a 'RetryError' error. To counteract this, we will run the function of filling columns until there are no null values in the 'sector' field. When we encounter a 'RetryError' we will make our program wait for a few seconds to retry again. 

We will not fill the columns that have already been filled and will only fill columns that need filling

In [11]:
#This will find the rows with a 'sector' as None
n = "Cognizant"
if main_data.loc[main_data['name'] == n,'sector'].values[0] == 'None':
    print("Already classified")


Already classified


### Learnings and how the function was edited accordinly:
#### - Better loggings helps easy debugging the code
#### - Automating everything makes things easier - automate looping until total None fields is 0 and skip non-None fields
#### - if more than 50% of requests are resulting in error makes it useless to run the function. Therefore terminate it, save the results and try again later

In [1]:
while main_data.loc[main_data['sector'] == "None"].shape[0] != 0 and errors < main_data.loc[main_data['sector'] == "None"].shape[0]:
    errors = 0
    print("Looping through the dataset to classify the companies into sectors. This may take a while. Please be patient.")
    print("No of None values in the dataset is:", main_data.loc[main_data['sector'] == "None"].shape[0])
    for company_name in main_data['name']:
        print("At company: ", company_name)
        if main_data.loc[main_data['name'] == company_name,'sector'].values[0] == 'None':
            try:
            # The below line creates a prompt to input to API for every company_name in the dataset
                gpt_prompt = f"""Classify the company {company_name} into one of the following sectors: Technology, Consumer, Cyclical, Industrial, Utilities, Healthcare, Marketing, Communication, Energy, Consumer, Defense, Real Estate, or Finance. Please only answer with one sector"""

                result = client.completions.create(
                    prompt = gpt_prompt,
                    model = "gpt-3.5-turbo-instruct",
                    temperature= 0.0
                )
                
                print("Successfully fetched data. Moving on to the next company.")
                #here we are storing the fetched sector from chatGPT
                fetched_sector = (result.choices[0].text).strip()
                
                #saving the values inti the main dataset
                main_data.loc[main_data['name'] == company_name , 'sector'] = fetched_sector
            except Exception as e:
                errors += 1
                print("Too many Requests, Skipping and trying again later")
                time.sleep(20)
                continue
        else:
            print(f"{company_name} already classified")
            
            
if main_data.loc[main_data['sector'] == "None"].shape[0] != 0:
    print("Error Rate has crossed 50%. Save now and try later")
else:
    print("Done!")
    
print("Cleaning and saving data")
main_data['sector'] = main_data['sector'].str.replace('\.','',regex=True)

print("Saving data to csv")
main_data.to_csv("../res/updated_nasdaq100.csv")
print("Saved")
main_data

NameError: name 'main_data' is not defined

In [None]:
for sectors in main_data['sector'].unique():
    print(sectors)

None


### The below was written for test. Used the below to update the fetch function
#### As you can see, sometimes we can have few unwanted datafields like '.' in the answer. So let's fix them

In [None]:
main_data['sector'] = main_data['sector'].str.replace('\.','',regex=True)

In [2]:
for sectors in main_data['sector'].unique():
    print(sectors)

NameError: name 'main_data' is not defined

### Now we export the data and save it somewhere so we can easily use it for later

In [None]:
main_data.to_csv("../res/updated_nasdaq100.csv")

In [None]:
# main_data = pd.read_csv("../res/updated_nasdaq100.csv").drop(columns=['Unnamed: 0','dateFirstAdded'],axis=1)
main_data

Unnamed: 0,symbol,name,headQuarter,cik,founded,sector
0,AAPL,Apple Inc.,"Cupertino, CA",320193,1976-04-01,
1,ABNB,Airbnb,"San Francisco, CA",1559720,2008-08-01,
2,ADBE,Adobe Inc.,"San Jose, CA",796343,1982-12-01,
3,ADI,Analog Devices,"Wilmington, MA",6281,1965-01-01,
4,ADP,ADP,"Roseland, NJ",8670,1949-01-01,
...,...,...,...,...,...,...
96,WBD,Warner Bros. Discovery,"New York, NY",1437107,2022-04-08,
97,WDAY,"Workday, Inc.","Pleasanton, CA",1327811,2005-03-01,
98,XEL,Xcel Energy,"Minneapolis, MN",72903,1909-06-17,
99,ZM,Zoom Video Communications,"San Jose, CA",1585521,2011-04-21,


In [47]:
sectors_all = main_data["sector"].unique()
sectors_all

array(['Technology', 'Consumer', 'Finance', 'Utilities', 'Healthcare',
       'Energy', 'Communication', 'Cyclical', 'Real Estate', 'Industrial'],
      dtype=object)

### Converting the dataframe to dictionary to pass it to openAI API

In [57]:
main_data = main_data.sort_values(by='sector')
pass_dict = main_data.to_dict(orient='records')
pass_dict

[{'symbol': 'CHTR',
  'name': 'Charter Communications',
  'headQuarter': 'Stamford, CT',
  'cik': 1091667,
  'founded': '1993-01-01',
  'sector': 'Communication'},
 {'symbol': 'TMUS',
  'name': 'T-Mobile US',
  'headQuarter': 'Bellevue, WA',
  'cik': 1283699,
  'founded': '2001-09-02',
  'sector': 'Communication'},
 {'symbol': 'WBD',
  'name': 'Warner Bros. Discovery',
  'headQuarter': 'New York, NY',
  'cik': 1437107,
  'founded': '2022-04-08',
  'sector': 'Communication'},
 {'symbol': 'CMCSA',
  'name': 'Comcast',
  'headQuarter': 'Philadelphia, PA',
  'cik': 1166691,
  'founded': '1963-11-13',
  'sector': 'Communication'},
 {'symbol': 'NFLX',
  'name': 'Netflix',
  'headQuarter': 'Los Gatos, CA',
  'cik': 1065280,
  'founded': '1997-09-29',
  'sector': 'Communication'},
 {'symbol': 'SIRI',
  'name': 'Sirius XM',
  'headQuarter': 'New York, NY',
  'cik': 908937,
  'founded': '2008-07-29',
  'sector': 'Communication'},
 {'symbol': 'DLTR',
  'name': 'Dollar Tree',
  'headQuarter': 'Che

In [52]:
@retry(wait=wait_random_exponential(min=30,max=60),stop=stop_after_attempt(3))
def completion_with_backoff_better(**kwargs):
    return client.chat.completions.create(**kwargs)

In [55]:
# The below line creates a prompt to input to API for every company_name in the dataset
gpt_prompt = f"""Use the dataset passed with the prompt in form of a list converted from a dataframe to a records using pandas,to_dict() method. 
Please return me the best sector to invest in. Just give me the answer in one word.
Dataset Dictionary : {pass_dict}
"""

result = completion_with_backoff_better(
    messages=[
        {"role":"user",
         "content": gpt_prompt}
    ],
        model = "gpt-3.5-turbo-1106",
        temperature= 0.0
    )

print("The best sector to invest in is",result.choices[0].message.content)

The best sector to invest in is Technology


## The above makes sense because technology actually is the best sector

In [58]:
lis = []
for cur_sector in sectors_all:
# The below line creates a prompt to input to API for every company_name in the dataset
    gpt_prompt = f"""Give me top 3 companies in the {cur_sector} to invest in based on the dataset passed in the prompt.
    
    The output needs to be in the format of "1. company 1, 2. company 2 , 3. company" and only company names must be specified.
    Make sure all the companies specified are in lowercase and the first letter is in uppercase.
    
    Give me the output in either above specified format or just give me 'NULL' with the reason for why the answer is NULL.
    If there are less than 3 companies in the sector, then just give me the companies as the answer


    
    Dataset : {pass_dict}
    """

    result = completion_with_backoff_better(messages=[
        {
            "role":"system",
            "content":"""You will be given a dataset passed in the form of a list of dictionary. A single dictionary being a single record(row) of the dataset. The keys of the dictionary being the column of the dataset.
            All the data is grouped based upon their sectors. You must use this dataset to predict whatever is passed in the prompt.
            """},
        {
            "role": "user",
            "content": gpt_prompt
        }
    ], model="gpt-3.5-turbo-1106",
        temperature=0.6)
    
    mess = result.choices[0].message.content
    lis.append(mess)

    print(f"The top 3 companies to invest in the {cur_sector} sector are: ",mess)
    

The top 3 companies to invest in the Technology sector are:  NULL, The dataset provided does not contain any Technology sector companies.
The top 3 companies to invest in the Consumer sector are:  The dataset provided does not contain enough information related to the financial performance of the companies, such as stock prices, revenue, profit margin, or any other financial indicators that are generally used to evaluate and recommend investments. Therefore, I am unable to provide the top 3 companies in the Consumer sector to invest in based on the dataset provided. Hence, the answer is NULL.
The top 3 companies to invest in the Finance sector are:  NULL. The dataset provided does not contain any companies in the Finance sector.
The top 3 companies to invest in the Utilities sector are:  NULL, There are no companies in the Utilities sector in the dataset provided.
The top 3 companies to invest in the Healthcare sector are:  The dataset provided doesn't contain any companies in the Heal

### The main issue that arrises here is that there is not enough data to predict this. Therefore let's join tables to make enough data

In [None]:
pd.merge(left=main_data,right=num_data,how="left",on=main_data.symbol)
pass_dict = main_data.to_dict(orient='records')
pass_dict

Unnamed: 0,key_0,symbol_x,name,headQuarter,cik,founded,sector,symbol_y,1D,5D,1M,3M,6M,ytd,1Y,3Y,5Y,10Y,max
0,CHTR,CHTR,Charter Communications,"Stamford, CT",1091667,1993-01-01,Communication,AAPL,-1.7254,-8.30086,-6.20411,3.04200,15.64824,42.99992,8.479410,60.96299,245.42031,976.99441,139245.53954
1,TMUS,TMUS,T-Mobile US,"Bellevue, WA",1283699,2001-09-02,Communication,ABNB,2.1617,-2.21919,9.88336,19.43286,19.64241,68.66902,23.640130,-1.04347,-1.04347,-1.04347,-1.04347
2,WBD,WBD,Warner Bros. Discovery,"New York, NY",1437107,2022-04-08,Communication,ADBE,0.5409,-1.77817,9.16191,52.04650,38.01522,57.22723,21.962060,17.83037,109.05718,1024.69214,251030.66399
3,CMCSA,CMCSA,Comcast,"Philadelphia, PA",1166691,1963-11-13,Communication,ADI,0.9291,-4.03352,2.58486,3.65887,5.01602,17.02062,8.097350,63.42847,92.81874,286.77518,26012.63736
4,NFLX,NFLX,Netflix,"Los Gatos, CA",1065280,1997-09-29,Communication,ADP,2.0589,2.35462,14.66581,16.40059,10.60546,5.53732,0.888943,81.76679,81.87224,248.40950,27613.11042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,FTNT,FTNT,Fortinet,"Sunnyvale, CA",1262039,2000-01-01,Technology,WBD,3.5791,13.98188,15.39075,12.25756,-5.60992,51.67715,3.209700,-31.02955,-43.36595,-65.85990,-12.30303
97,INTU,INTU,Intuit,"Mountain View, CA",896878,1983-01-01,Technology,WDAY,1.4905,-1.04652,6.77548,28.69187,21.79922,37.90965,40.017990,31.59971,75.93220,221.38434,353.49515
98,AEP,AEP,American Electric Power,"Columbus, OH",4904,1906-01-01,Utilities,XEL,-1.0432,-4.81045,-5.03734,-14.10091,-13.24042,-14.71386,-19.133960,-17.39010,26.61017,100.87395,724.27586
99,XEL,XEL,Xcel Energy,"Minneapolis, MN",72903,1909-06-17,Utilities,ZM,-0.5792,-3.99888,4.55307,8.29653,-18.89913,3.00030,-40.160360,-73.46268,10.74194,10.74194,10.74194


### Now let's run the model again

In [60]:
lis = []
for cur_sector in sectors_all:
# The below line creates a prompt to input to API for every company_name in the dataset
    gpt_prompt = f"""Give me top 3 companies in the {cur_sector} to invest in based on the dataset passed in the prompt.
    
    The output needs to be in the format of "1. company 1, 2. company 2 , 3. company" and only company names must be specified.
    Make sure all the companies specified are in lowercase and the first letter is in uppercase.
    
    Give me the output in either above specified format or just give me 'NULL' with the reason for why the answer is NULL.
    If there are less than 3 companies in the sector, then just give me the companies as the answer


    
    Dataset : {pass_dict}
    """

    result = completion_with_backoff_better(messages=[
        {
            "role":"system",
            "content":"""You will be given a dataset passed in the form of a list of dictionary. A single dictionary being a single record(row) of the dataset. The keys of the dictionary being the column of the dataset.
            All the data is grouped based upon their sectors. You must use this dataset to predict whatever is passed in the prompt.
            """},
        {
            "role": "user",
            "content": gpt_prompt
        }
    ], model="gpt-3.5-turbo-1106",
        temperature=0.6)
    
    mess = result.choices[0].message.content
    lis.append(mess)

    print(f"The top 3 companies to invest in the {cur_sector} sector are: ",mess)
    

The top 3 companies to invest in the Technology sector are:  Based on the dataset provided, there are less than 3 companies in the Technology sector. Therefore, the answer is 'NULL' as there are not enough companies in the Technology sector to provide a top 3 list.
The top 3 companies to invest in the Consumer sector are:  The top 3 companies in the Consumer sector to invest in are:
1. Starbucks
2. JD.com
3. Dollar Tree
The top 3 companies to invest in the Finance sector are:  The dataset provided does not contain any companies in the Finance sector. Therefore, I am unable to provide you with the top 3 companies in the Finance sector to invest in. Hence, the answer is NULL.
The top 3 companies to invest in the Utilities sector are:  1. American Electric Power, 2. Xcel Energy, 3. Exelon
The top 3 companies to invest in the Healthcare sector are:  The dataset provided does not contain any companies in the Healthcare sector. Therefore, I cannot provide the top 3 companies in the Healthcar