### Installing dependencies

In [None]:
# Runtime dependencies: OpenAI client, PDF parsing (PyPDF2) and the xlsxwriter
# engine used for the Excel export further down.
# NOTE(review): yfinance and bs4 are imported later in this notebook but are not
# installed here - presumably they ship with the hosting runtime; confirm.
!pip install openai
!pip install PyPDF2
!pip install xlsxwriter
# Quiet (-q) installs of the packages used for loading the quantised model.
!pip install -q bitsandbytes datasets accelerate
# transformers and peft are installed from the main branch of their GitHub
# repositories rather than from a pinned release.
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# cd /content/drive/MyDrive/Stocks/raw_models/
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
# !git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf

In [None]:
# Switch the working directory to the web-scraping project folder.
# NOTE(review): this path lives under /content/drive, but the drive.mount(...)
# call earlier in the notebook is commented out - confirm the drive is mounted.
cd /content/drive/MyDrive/Stocks/web_scraping/

In [None]:
import requests
from bs4 import BeautifulSoup
import time
from datetime import date, timedelta
import pandas as pd
import numpy as np
import json
import os
import re
import glob
import io
import datetime
from PyPDF2 import PdfReader
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from peft import prepare_model_for_int8_training, prepare_model_for_kbit_training, get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, PrefixTuningConfig, LoraConfig, TaskType, PeftType, PeftConfig, PeftModel
import torch
import openai
import yfinance as yf
import warnings
warnings.filterwarnings("ignore")

In [None]:
# OpenAI credentials are read from the environment when available so that real
# keys never have to be typed into (and saved with) the notebook. The original
# placeholder strings are kept as fallbacks, so behaviour is unchanged when the
# variables are not set.
openai.organization = os.environ.get("OPENAI_ORGANIZATION", "org_key")
openai.api_key = os.environ.get("OPENAI_API_KEY", "api_key")

# Directory of the model/adapter to load for inference. Exactly one line should
# be active; the others are alternative checkpoints kept for quick switching.
# model_path = "/content/drive/MyDrive/Stocks/raw_models/Llama-2-7b-chat-hf"
# model_path = "/content/drive/MyDrive/Stocks/finetune_models/Llama-2-7b-chat-hf/ep1_bs1_lr0.001_rank8_maxtoken3000"
# model_path = "/content/drive/MyDrive/Stocks/finetune_models/Llama-2-7b-chat-hf/ep1_bs2_lr0.001_rank8_maxtoken2000"
model_path = "/content/drive/MyDrive/Stocks/finetune_models/Llama-2-7b-chat-hf/ep1.5_bs2_lr0.001_rank8_maxtoken2000"
# model_path = "/content/drive/MyDrive/Stocks/finetune_models/Llama-2-7b-chat-hf/ep2_bs2_lr0.001_rank8_maxtoken2000"
# model_path = "/content/drive/MyDrive/Stocks/raw_models/gpt2-xl"
# model_path = "/content/drive/MyDrive/Stocks/raw_models/falcon-7b-instruct"

# Folder holding the day's announcement export, and the export's base file name
# (without extension); ".csv" / ".xlsx" are appended where the files are read
# or written.
data_path = "/content/drive/MyDrive/Stocks/web_scraping/nse_data/19Oct23"
raw_file_name = "CF-AN-equities-19-10-2023-to-19-10-2023"

### Preprocessing the raw data

In [None]:
# Load the announcement export for the configured day.
raw_df = pd.read_csv(os.path.join(data_path, f"{raw_file_name}.csv"))

# Keep only rows whose attachment file name ends in the extension "pdf".
# The extension is derived with the vectorised .str accessor; a missing
# ATTACHMENT value (NaN) is treated as "not a PDF" instead of raising on .split.
attachment_ext = (
    raw_df['ATTACHMENT']
    .fillna('')
    .astype(str)
    .str.split('/').str[-1]
    .str.split('.').str[-1]
)
raw_df = raw_df[attachment_ext == 'pdf'].reset_index(drop=True)

# Announcement subjects that are excluded from the analysis.
excluding_sub_list = ["Acquisition-XBRL","Alteration Of Capital and Fund Raising-XBRL","Certificate under SEBI (Depositories and Participants) Regulations, 2018","Copy of Newspaper Publication","Incorporation-XBRL","Loss of Share Certificates","Notice Of Shareholders Meetings-XBRL","Outcome of Board Meeting-XBRL","Sale or disposal-XBRL","Statement of deviation(s) or variation(s) under Reg. 32","Financial Result Updates","Consolidated Result Updates - IFRS"]
raw_df = raw_df[~raw_df['SUBJECT'].isin(excluding_sub_list)].reset_index(drop=True)

# The same announcement can appear more than once; keep the first occurrence.
raw_df = raw_df.drop_duplicates(subset=['COMPANY NAME', 'SUBJECT', 'DETAILS']).reset_index(drop=True)

# Keep only announcements broadcast strictly after 15:30 (presumably the market
# close - confirm). The time of day is computed as an offset from midnight so
# the comparison is vectorised; unparseable/missing timestamps (NaT) compare as
# False and are dropped instead of raising.
raw_df['BROADCAST DATE/TIME'] = pd.to_datetime(raw_df['BROADCAST DATE/TIME'])
broadcast_time_of_day = raw_df['BROADCAST DATE/TIME'] - raw_df['BROADCAST DATE/TIME'].dt.normalize()
raw_df = raw_df[broadcast_time_of_day > pd.Timedelta(hours=15, minutes=30)].reset_index(drop=True)

In [None]:
def func_to_get_1d_return(historical_data):
  """Return the open-to-close percentage change of the first row.

  Args:
    historical_data: frame with 'Open' and 'Close' columns.

  Returns:
    The first row's (Close / Open - 1) * 100, rounded to two decimals.
  """
  opens = historical_data['Open']
  closes = historical_data['Close']
  pct_change = (closes / opens - 1) * 100
  first_value = pct_change.iloc[0]
  return first_value.round(2)

In [None]:
# Look up the market capitalisation of every announcement's symbol.
# Values are divided by 10,000,000 (one crore); failed or missing look-ups are
# recorded as 0 so the list stays aligned with raw_df's rows.
# NOTE(review): `basic_info` may be deprecated in newer yfinance releases in
# favour of `fast_info` - confirm against the installed version.
mark_cap_ls = []
mark_cap_cache = {}  # symbol -> value already fetched successfully in this run
for ct, i in enumerate(list(raw_df['SYMBOL']), start=1):
  # enumerate keeps the progress counter in step even when a look-up fails.
  if ct%10 == 0:
    print(f"Running iteration no {ct}")

  # A company often has several announcements per day; fetch each symbol once.
  if i in mark_cap_cache:
    mark_cap_ls.append(mark_cap_cache[i])
    continue

  try:
    stock = yf.Ticker(f"{i}.NS")
    mark_cap = stock.basic_info['marketCap']
    if pd.isnull(mark_cap):
      mark_cap = 0
    else:
      mark_cap = mark_cap/10000000
  except Exception as e:
    # Best-effort: report the symbol and the reason, then fall back to 0.
    # Failures are not cached so a later duplicate gets another attempt.
    print(f"Error: {i} ({e})")
    mark_cap_ls.append(0)
    continue

  mark_cap_cache[i] = mark_cap
  mark_cap_ls.append(mark_cap)

raw_df['market_cap'] = mark_cap_ls

In [None]:
# Drop rows whose market_cap value is not above 100 (this also removes the
# rows where the look-up fell back to 0).
raw_df = raw_df.loc[raw_df['market_cap'].gt(100)].reset_index(drop=True)

In [None]:
def extract_pdf_content_from_url(url, max_pages=2, timeout=30):
  """Download a PDF and return the text of its first pages.

  Args:
    url: address of the PDF document.
    max_pages: number of leading pages to extract text from (default 2).
    timeout: seconds to wait for the server before giving up (default 30).
      Without a timeout a stalled server would block the caller forever.

  Returns:
    The concatenated page text with surrounding whitespace stripped, or
    None when the server does not answer with HTTP 200.

  Raises:
    requests.RequestException: on connection problems or timeout.
    Any exception raised by PdfReader for a malformed document.
  """
  # The context manager closes the streamed response (and releases the
  # connection) on every path, including the early non-200 return.
  with requests.get(url, stream=True, headers={'user-agent': 'Mozilla/5.0'}, timeout=timeout) as response:
    if response.status_code != 200:
      return None
    pdf_bytes = response.content

  pdf_reader = PdfReader(io.BytesIO(pdf_bytes))

  page_texts = []
  for page_no, page in enumerate(pdf_reader.pages):
    if page_no >= max_pages:
      break
    # Guard against a page yielding no text object at all.
    page_texts.append(page.extract_text() or "")

  return "".join(page_texts).strip()

In [None]:
# Download and extract text for every attachment, keeping one entry per row.
pdf_extracted_data = []
for ct, i in enumerate(list(raw_df['ATTACHMENT']), start=1):
  # enumerate keeps the progress counter in step even when an extraction fails.
  if ct%10 == 0:
    print(ct)

  content = None
  try:
    if i.split('.')[-1] == 'pdf':
      content = extract_pdf_content_from_url(i)
  except Exception as e:
    # Best-effort: report the attachment and the reason, keep None for this row.
    print(f"Error: {i} ({e})")

  # Store the value itself rather than an f-string of it: formatting a None
  # result would produce the literal text "None", which the isna() filter
  # below cannot remove. Empty extractions are likewise recorded as missing.
  pdf_extracted_data.append(content if content else None)

raw_df['pdf_extracted_data'] = pdf_extracted_data
# Rows without extracted text are dropped before the data is persisted.
raw_df = raw_df[~raw_df['pdf_extracted_data'].isna()]
raw_df.to_excel(os.path.join(data_path, f"{raw_file_name}.xlsx"), engine='xlsxwriter', index=False)

#### Running inference with the fine-tuned model

In [None]:
# Alternative: load a plain (non-adapter) model directly from model_path.
# model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, device_map="auto")
# tokenizer = AutoTokenizer.from_pretrained(model_path)

# Read the PEFT configuration stored at model_path; it names the base model
# the adapter belongs to (presumably model_path is a PEFT adapter directory).
config = PeftConfig.from_pretrained(model_path)
# Load that base model with load_in_4bit=True and automatic device placement.
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, load_in_4bit=True, device_map="auto")
# Wrap the base model with the weights found at model_path.
model = PeftModel.from_pretrained(model, model_path)
# The tokenizer is loaded from model_path as well, not from the base model.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
def call_chatGPT(text, max_tokens = 1000):
  """Ask the OpenAI completions endpoint for a "Tl;dr" of `text`.

  The prompt is `text` followed by a newline and "Tl;dr"; the request uses
  temperature 0.0 and the "text-davinci-003" model.

  NOTE(review): this relies on the `openai.Completion` interface - confirm
  that the installed SDK version still provides it.

  Args:
    text: the document to summarise.
    max_tokens: upper bound on generated tokens (default 1000).

  Returns:
    The text of the first returned choice.
  """
  completion = openai.Completion.create(
    prompt=text + "\nTl;dr",
    temperature=0.0,
    max_tokens=max_tokens,
    model="text-davinci-003",
  )
  first_choice = completion['choices'][0]
  return first_choice['text']

In [None]:
def greedy_search_answer(input_text):
  """Generate a report for `input_text` with greedy decoding.

  Uses the module-level `tokenizer` and `model`. Generation runs with
  do_sample=False, num_beams=1 and at most 400 new tokens.

  Args:
    input_text: the fully formatted prompt.

  Returns:
    The newly generated text (prompt excluded), stripped of surrounding
    whitespace, or "" if anything goes wrong (the error is printed).
  """
  try:
    with torch.no_grad():
      inputs = tokenizer(input_text, return_tensors="pt")
      # Place the inputs on the same device as the model.
      input_ids = inputs["input_ids"].to(model.device)
      attention_mask = inputs["attention_mask"].to(model.device)

      # Greedy Search
      outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, do_sample=False, num_beams=1, max_new_tokens=400)

      # The output sequence starts with the prompt tokens. Decode only what
      # follows them instead of splitting the full text on "Report:", which
      # picks the wrong span whenever the prompt's context itself contains
      # "Report:".
      generated_ids = outputs[0][input_ids.shape[1]:]
      return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
  except Exception as e:
    # Best-effort: one failing document must not abort a long batch run.
    print(f"Error: {e}")
    return ""

In [None]:
def top_p_sampling_answer(input_text):
  """Generate a report for `input_text` with top-p (nucleus) sampling.

  Uses the module-level `tokenizer` and `model`. Generation samples with
  top_p=0.75, top_k=0, temperature=0.2, no_repeat_ngram_size=2 and at most
  400 new tokens, returning a single sequence.

  Args:
    input_text: the fully formatted prompt.

  Returns:
    The newly generated text (prompt excluded), stripped of surrounding
    whitespace, or "" if anything goes wrong (the error is printed).
  """
  try:
    with torch.no_grad():
      inputs = tokenizer(input_text, return_tensors="pt")
      # Place the inputs on the same device as the model.
      input_ids = inputs["input_ids"].to(model.device)
      attention_mask = inputs["attention_mask"].to(model.device)

      # top-p sampling
      outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                               max_new_tokens=400,
                               do_sample=True,
                               top_p=0.75,
                               top_k=0,
                               temperature=0.2,
                               num_return_sequences = 1,
                               no_repeat_ngram_size=2)

      # The output sequence starts with the prompt tokens. Decode only what
      # follows them instead of splitting the full text on "Report:", which
      # picks the wrong span whenever the prompt's context itself contains
      # "Report:".
      generated_ids = outputs[0][input_ids.shape[1]:]
      return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
  except Exception as e:
    # Best-effort: one failing document must not abort a long batch run.
    print(f"Error: {e}")
    return ""

In [None]:
# Prompt template for the report-generation model. The single "{input}"
# placeholder is filled with the extracted announcement text via
# prompt_format.format(input=...); the template ends with "### Report:" so the
# model continues from there.
# NOTE(review): the system block is opened with "<<SYS>>" but closed with
# "<</SYS>" (single ">"). Confirm whether this matches the prompt used during
# fine-tuning before changing it.
prompt_format = """<s>[INST] <<SYS>>
You are a stock market analyst working for a brokerage firm. You are going to help me in analyzing the corporate announcement document submitted to the Indian stock exchange by a company.
If you are not able to analyze, please don't share false information.
<</SYS>

Your task is to analyze the given context and generate a concise report as truthfully as possible by following the provided instructions.
### Instruction:
Extract important short points or keywords that can help me make a decision on whether to purchase or sell the stock of this company?
Based on your response can you also give me a one-liner sentiment(positive/neutral/negative) and a short and crisp conclusion on whether the stock price of the company will go upside or downside?
You have to follow the below format while generating the report.
```
### Key points: (mention the only key points here)

### Sentiment: (mention the sentiment here)

### Conclusion: (mention the final conclusion here)
```

### Context:
{input}

[/INST]
### Report:"""

In [None]:
# You are a stock market analyst working for a brokerage firm. You are going to help me in analyzing the corporate announcement document submitted to the Indian stock exchange by a company.
# If you are not able to analyze, please don't share false information.

# Your task is to analyze the given context and generate a concise report as truthfully as possible by following the provided instructions.
# ### Instruction:
# Extract important short points or keywords that can help me make a decision on whether to purchase or sell the stock of this company?
# Based on your response can you also give me a one-liner sentiment(positive/neutral/negative) and a short and crisp conclusion on whether the stock price of the company will go upside or downside?
# You have to follow the below format while generating the report.
# ```
# ### Key points: (mention the only key points here)

# ### Sentiment: (mention the sentiment here)

# ### Conclusion: (mention the final conclusion here)
# ```

# ### Context:
# {input}

# ### Report:

In [None]:
# Reload the announcements (with extracted PDF text) from the workbook written
# by the extraction step, so inference can start from the saved file.
extracted_xlsx_path = os.path.join(data_path, raw_file_name + ".xlsx")
data = pd.read_excel(extracted_xlsx_path)
# data = raw_df.copy()

# Only rows whose market_cap value is above 1000 are sent to the model.
above_cap_threshold = data['market_cap'] > 1000
data = data.loc[above_cap_threshold].reset_index(drop=True)
# data = data[(data['market_cap']>100) & (data['market_cap']<10000)].reset_index(drop=True)

In [None]:
def func_to_extract_sentiment_label(text):
  """Pull the sentiment word out of a generated report.

  Looks for the first "Sentiment:" that is followed (after optional
  whitespace) by a run of ASCII letters and returns that run.

  Args:
    text: the generated report. Non-string values (e.g. None or NaN for
      rows that have no report) are accepted.

  Returns:
    The matched word, or "Sentiment not found" when there is no match or
    `text` is not a string.
  """
  # re.search raises TypeError on None/NaN; rows without a report are
  # expected, so treat them as "no sentiment" rather than crashing.
  if not isinstance(text, str):
    return "Sentiment not found"

  sentiment_match = re.search(r'Sentiment:\s*([A-Za-z]+)', text)
  if sentiment_match:
    return sentiment_match.group(1)
  return "Sentiment not found"

In [None]:
# Generate a report for every announcement and derive its sentiment label.
model_summary = []

# Intermediate results are written here every 20 rows; create the folder up
# front so the first checkpoint does not fail on a missing directory.
checkpoint_dir = os.path.join(data_path, "checkpoints")
os.makedirs(checkpoint_dir, exist_ok=True)

for ct, i in enumerate(list(data['pdf_extracted_data']), start=1):
  if ct%10 == 0:
    print(f"Running iteration no {ct}")

  if pd.isnull(i):
    # No extracted text for this row: keep a placeholder so the list stays
    # aligned with the frame's rows.
    model_summary.append(None)
  else:
    complete_prompt = prompt_format.format(input=i)
    res = greedy_search_answer(complete_prompt)
    model_summary.append(res)

  if ct%20 == 0:
    # Checkpoint the rows processed so far.
    tmp_data = data.iloc[:ct].copy()
    tmp_data['model_summary'] = model_summary
    # Rows without a summary hold None; pass "" so sentiment extraction
    # reports "not found" instead of raising on a non-string.
    tmp_data['sentiment'] = tmp_data['model_summary'].apply(lambda a: func_to_extract_sentiment_label(a if isinstance(a, str) else ""))
    tmp_data.to_excel(os.path.join(checkpoint_dir, f"{raw_file_name}_after_{ct}.xlsx"), index=False)

data['model_summary'] = model_summary
data['sentiment'] = data['model_summary'].apply(lambda a: func_to_extract_sentiment_label(a if isinstance(a, str) else ""))

In [None]:
# Persist the final frame (including model_summary and sentiment) next to the
# input files, under the same base name with a "_v2" suffix.
data.to_excel(os.path.join(data_path, f"{raw_file_name}_v2.xlsx"), index=False)