This notebook downloads filing metadata and documents from the SEC, embeds their text with an embedding model, and uploads the resulting vectors to a Qdrant cluster in the cloud.

In [None]:
%pip install sec-downloader langchain sentence-transformers qdrant-client unstructured[all-docs] google-generativeai langchain-google-genai lark langchainhub

In [None]:
# %pip install sec-downloader

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/Anyone_AI/Final_Project

/content/drive/MyDrive/Anyone_AI/Final_Project


## SEC downloads

In [None]:
from sec_downloader import Downloader
from sec_downloader.types import RequestedFilings

# Shared downloader used by the metadata cells below.
# NOTE(review): the two arguments look like a requester name and e-mail — confirm against the sec-downloader docs.
dl = Downloader("Hagrid_Gryffindor", "hagrificent@gmail.com")

In [None]:
# Sample request: 10-K filing metadata for CIK 718877, capped by `limit=4`.
metadatas = dl.get_filing_metadatas(
    RequestedFilings(
        ticker_or_cik="718877",
        form_type="10-K",
        limit=4
    )
)

In [None]:
metadatas[0].__dict__

{'accession_number': '0001628280-23-004842',
 'form_type': '10-K',
 'primary_doc_url': 'https://www.sec.gov/Archives/edgar/data/718877/000162828023004842/atvi-20221231.htm',
 'items': '',
 'primary_doc_description': '10-K',
 'filing_date': '2023-02-23',
 'report_date': '2022-12-31',
 'cik': '0000718877',
 'company_name': 'Activision Blizzard, Inc.',
 'tickers': []}

In [None]:
def get_metadata(cik, form_type):
  """Return the metadata of one filing of `form_type` for `cik`.

  Args:
    cik: Value passed as `ticker_or_cik` to `RequestedFilings`.
    form_type: Form type to request, e.g. '10-K' or '8-K'.

  Returns:
    The `__dict__` of the first metadata object returned by
    `dl.get_filing_metadatas`, or the sentinel string 'No_data' when the
    lookup raises or returns no results.
  """
  try:
    data = dl.get_filing_metadatas(
        RequestedFilings(
            ticker_or_cik=cik,
            form_type=form_type,
            limit=1
        )
    )
    metadata = data[0].__dict__
  # `Exception` instead of a bare `except` so that KeyboardInterrupt and
  # SystemExit can still stop a long batch run. An empty result list
  # (IndexError on `data[0]`) is still mapped to the sentinel.
  except Exception:
    metadata = 'No_data'

  return metadata

In [None]:
get_metadata('718877', '8-K')

{'accession_number': '0001104659-23-109427',
 'form_type': '8-K',
 'primary_doc_url': 'https://www.sec.gov/Archives/edgar/data/718877/000110465923109427/tm2328508d1_8k.htm',
 'items': '8.01,9.01',
 'primary_doc_description': 'FORM 8-K',
 'filing_date': '2023-10-16',
 'report_date': '2023-10-16',
 'cik': '0000718877',
 'company_name': 'Activision Blizzard, Inc.',
 'tickers': []}

In [None]:
import time

In [None]:
form_types = ['10-K','S-1','8-K']

In [None]:
def get_metadatas(ciks):
  """Collect one `get_metadata` result per (cik, form type) combination.

  Args:
    ciks: Iterable of CIK values.

  Returns:
    A flat list ordered by cik first and then by the order of the
    module-level `form_types`; each entry is whatever `get_metadata`
    returned for that pair.
  """
  return [
      get_metadata(cik, form_type)
      for cik in ciks
      for form_type in form_types
  ]

## 100 Companies

In [None]:
from pathlib import Path
import requests, json
import pandas as pd

In [None]:
def make_request(url, headers):
    """Send a GET request to `url` with `headers` and return the response unchanged."""
    return requests.get(url, headers=headers)

In [None]:
response = make_request(url="https://www.sec.gov/files/company_tickers.json",
                        headers={'User-Agent': 'Andres_Vargas agvargasb@gmail.com'})

In [None]:
companies_dict = json.loads(response.content)

In [None]:
companies_dict

{'0': {'cik_str': 320193, 'ticker': 'AAPL', 'title': 'Apple Inc.'},
 '1': {'cik_str': 789019, 'ticker': 'MSFT', 'title': 'MICROSOFT CORP'},
 '2': {'cik_str': 1652044, 'ticker': 'GOOGL', 'title': 'Alphabet Inc.'},
 '3': {'cik_str': 1018724, 'ticker': 'AMZN', 'title': 'AMAZON COM INC'},
 '4': {'cik_str': 1045810, 'ticker': 'NVDA', 'title': 'NVIDIA CORP'},
 '5': {'cik_str': 1326801, 'ticker': 'META', 'title': 'Meta Platforms, Inc.'},
 '6': {'cik_str': 1318605, 'ticker': 'TSLA', 'title': 'Tesla, Inc.'},
 '7': {'cik_str': 1067983,
  'ticker': 'BRK-B',
  'title': 'BERKSHIRE HATHAWAY INC'},
 '8': {'cik_str': 59478, 'ticker': 'LLY', 'title': 'ELI LILLY & Co'},
 '9': {'cik_str': 1046179,
  'ticker': 'TSM',
  'title': 'TAIWAN SEMICONDUCTOR MANUFACTURING CO LTD'},
 '10': {'cik_str': 1403161, 'ticker': 'V', 'title': 'VISA INC.'},
 '11': {'cik_str': 1730168, 'ticker': 'AVGO', 'title': 'Broadcom Inc.'},
 '12': {'cik_str': 19617, 'ticker': 'JPM', 'title': 'JPMORGAN CHASE & CO'},
 '13': {'cik_str': 73

In [None]:
companies_df = pd.DataFrame(companies_dict).transpose()

In [None]:
ciks_first_100 = list(companies_df.head(100)['cik_str'].astype('str'))

In [None]:
ciks_first_100

# Data for 100 companies

In [None]:
%%time
metadatas_first_100 = get_metadatas(ciks_first_100)

CPU times: user 1min 7s, sys: 790 ms, total: 1min 8s
Wall time: 4min 28s


In [None]:
len(metadatas_first_100)

300

In [None]:
# Keep only the successful lookups: `get_metadata` returns a dict on success
# and the string 'No_data' on failure. A named comprehension variable avoids
# rebinding `_`, which IPython uses for the last cell output.
correct_metadatas = [
    metadata for metadata in metadatas_first_100 if isinstance(metadata, dict)
]

In [None]:
len(correct_metadatas)

169

In [None]:
correct_metadatas_100_df = pd.DataFrame(correct_metadatas)

In [None]:
correct_metadatas_100_df

Unnamed: 0,accession_number,form_type,primary_doc_url,items,primary_doc_description,filing_date,report_date,cik,company_name,tickers
0,0000320193-23-000106,10-K,https://www.sec.gov/Archives/edgar/data/320193...,,10-K,2023-11-03,2023-09-30,0000320193,Apple Inc.,"[Ticker(symbol='AAPL', exchange='Nasdaq')]"
1,0000320193-23-000104,8-K,https://www.sec.gov/Archives/edgar/data/320193...,"2.02,9.01",8-K,2023-11-02,2023-11-02,0000320193,Apple Inc.,"[Ticker(symbol='AAPL', exchange='Nasdaq')]"
2,0000950170-23-035122,10-K,https://www.sec.gov/Archives/edgar/data/789019...,,10-K,2023-07-27,2023-06-30,0000789019,MICROSOFT CORP,"[Ticker(symbol='MSFT', exchange='Nasdaq')]"
3,0001193125-23-291720,8-K,https://www.sec.gov/Archives/edgar/data/789019...,5.07,8-K,2023-12-08,2023-12-07,0000789019,MICROSOFT CORP,"[Ticker(symbol='MSFT', exchange='Nasdaq')]"
4,0001652044-23-000016,10-K,https://www.sec.gov/Archives/edgar/data/165204...,,10-K,2023-02-03,2022-12-31,0001652044,Alphabet Inc.,"[Ticker(symbol='GOOGL', exchange='Nasdaq'), Ti..."
...,...,...,...,...,...,...,...,...,...,...
164,0000891836-05-000421,8-K,https://www.sec.gov/Archives/edgar/data/863064...,4.01,CURRENT REPORT,2005-10-28,2005-10-14,0000863064,RIO TINTO PLC,"[Ticker(symbol='RIO', exchange='NYSE'), Ticker..."
165,0000101829-23-000009,10-K,https://www.sec.gov/Archives/edgar/data/101829...,,10-K,2023-02-07,2022-12-31,0000101829,RTX Corp,"[Ticker(symbol='RTX', exchange='NYSE')]"
166,0000101829-24-000003,8-K,https://www.sec.gov/Archives/edgar/data/101829...,5.02,8-K,2024-01-04,2024-01-04,0000101829,RTX Corp,"[Ticker(symbol='RTX', exchange='NYSE')]"
167,0000950170-23-004343,10-K,https://www.sec.gov/Archives/edgar/data/136474...,,10-K,2023-02-24,2022-12-31,0001364742,BlackRock Inc.,"[Ticker(symbol='BLK', exchange='NYSE')]"


In [None]:
correct_metadatas_100_df.to_csv('metadatas_first_100_df.csv')

In [None]:
correct_metadatas_100_df['cik'].isin(['0001585521']).sum()

0

In [None]:
correct_metadatas_100_df['company_name'][0:18]

0               Apple Inc.
1               Apple Inc.
2           MICROSOFT CORP
3           MICROSOFT CORP
4            Alphabet Inc.
5            Alphabet Inc.
6           AMAZON COM INC
7           AMAZON COM INC
8           AMAZON COM INC
9              NVIDIA CORP
10             NVIDIA CORP
11             NVIDIA CORP
12    Meta Platforms, Inc.
13    Meta Platforms, Inc.
14    Meta Platforms, Inc.
15             Tesla, Inc.
16             Tesla, Inc.
17             Tesla, Inc.
Name: company_name, dtype: object

In [None]:
correct_metadatas_100_df['company_name'][18:35]

18    BERKSHIRE HATHAWAY INC
19    BERKSHIRE HATHAWAY INC
20            ELI LILLY & Co
21            ELI LILLY & Co
22                 VISA INC.
23                 VISA INC.
24                 VISA INC.
25             Broadcom Inc.
26             Broadcom Inc.
27       JPMORGAN CHASE & CO
28       JPMORGAN CHASE & CO
29    UNITEDHEALTH GROUP INC
30    UNITEDHEALTH GROUP INC
31              Walmart Inc.
32              Walmart Inc.
33          EXXON MOBIL CORP
34          EXXON MOBIL CORP
Name: company_name, dtype: object

In [None]:
correct_metadatas_100_df.to_dict('index')[0]

{'accession_number': '0000320193-23-000106',
 'form_type': '10-K',
 'primary_doc_url': 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930.htm',
 'items': nan,
 'primary_doc_description': '10-K',
 'filing_date': '2023-11-03',
 'report_date': '2023-09-30',
 'cik': 320193,
 'company_name': 'Apple Inc.',
 'tickers': "[Ticker(symbol='AAPL', exchange='Nasdaq')]"}

In [None]:
len(correct_metadatas_100_df.to_dict('index').keys())

169

In [None]:
# Print every row label next to its primary document URL for a manual check
# of which URLs point at an actual file.
for k, v in correct_metadatas_100_df.to_dict('index').items():
  print(k, v['primary_doc_url'])

0 https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930.htm
1 https://www.sec.gov/Archives/edgar/data/320193/000032019323000104/aapl-20231102.htm
2 https://www.sec.gov/Archives/edgar/data/789019/000095017023035122/msft-20230630.htm
3 https://www.sec.gov/Archives/edgar/data/789019/000119312523291720/d636199d8k.htm
4 https://www.sec.gov/Archives/edgar/data/1652044/000165204423000016/goog-20221231.htm
5 https://www.sec.gov/Archives/edgar/data/1652044/000165204423000088/goog-20231018.htm
6 https://www.sec.gov/Archives/edgar/data/1018724/000101872423000004/amzn-20221231.htm
7 https://www.sec.gov/Archives/edgar/data/1018724/000089161897001309/
8 https://www.sec.gov/Archives/edgar/data/1018724/000110465923113444/tm2329405d1_8k.htm
9 https://www.sec.gov/Archives/edgar/data/1045810/000104581023000017/nvda-20230129.htm
10 https://www.sec.gov/Archives/edgar/data/1045810/000101287098000618/
11 https://www.sec.gov/Archives/edgar/data/1045810/000104581023000225/nvda-2023112

## Save html files and send to Qdrant

In [None]:
import os
from pathlib import Path
import requests
import pandas as pd
import re
import unicodedata

In [None]:
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.schema import Document

In [None]:
from qdrant_client import QdrantClient
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
from google.colab import userdata

In [None]:
# Reload the saved metadata and discard the 'Unnamed: 0' column that holds
# the index written by `to_csv`.
correct_metadatas_100_df = (
    pd.read_csv('metadatas_first_100_df.csv')
    .drop(columns=['Unnamed: 0'])
)

In [None]:
correct_metadatas_100_df.head(10)

Unnamed: 0,accession_number,form_type,primary_doc_url,items,primary_doc_description,filing_date,report_date,cik,company_name,tickers
0,0000320193-23-000106,10-K,https://www.sec.gov/Archives/edgar/data/320193...,,10-K,2023-11-03,2023-09-30,320193,Apple Inc.,"[Ticker(symbol='AAPL', exchange='Nasdaq')]"
1,0000320193-23-000104,8-K,https://www.sec.gov/Archives/edgar/data/320193...,"2.02,9.01",8-K,2023-11-02,2023-11-02,320193,Apple Inc.,"[Ticker(symbol='AAPL', exchange='Nasdaq')]"
2,0000950170-23-035122,10-K,https://www.sec.gov/Archives/edgar/data/789019...,,10-K,2023-07-27,2023-06-30,789019,MICROSOFT CORP,"[Ticker(symbol='MSFT', exchange='Nasdaq')]"
3,0001193125-23-291720,8-K,https://www.sec.gov/Archives/edgar/data/789019...,5.07,8-K,2023-12-08,2023-12-07,789019,MICROSOFT CORP,"[Ticker(symbol='MSFT', exchange='Nasdaq')]"
4,0001652044-23-000016,10-K,https://www.sec.gov/Archives/edgar/data/165204...,,10-K,2023-02-03,2022-12-31,1652044,Alphabet Inc.,"[Ticker(symbol='GOOGL', exchange='Nasdaq'), Ti..."
5,0001652044-23-000088,8-K,https://www.sec.gov/Archives/edgar/data/165204...,"2.02,5.02,9.01",8-K,2023-10-24,2023-10-18,1652044,Alphabet Inc.,"[Ticker(symbol='GOOGL', exchange='Nasdaq'), Ti..."
6,0001018724-23-000004,10-K,https://www.sec.gov/Archives/edgar/data/101872...,,10-K,2023-02-03,2022-12-31,1018724,AMAZON COM INC,"[Ticker(symbol='AMZN', exchange='Nasdaq')]"
7,0000891618-97-001309,S-1,https://www.sec.gov/Archives/edgar/data/101872...,,FORM S-1,1997-03-24,,1018724,AMAZON COM INC,"[Ticker(symbol='AMZN', exchange='Nasdaq')]"
8,0001104659-23-113444,8-K,https://www.sec.gov/Archives/edgar/data/101872...,"1.01,1.02,2.03,9.01",FORM 8-K,2023-11-01,2023-11-01,1018724,AMAZON COM INC,"[Ticker(symbol='AMZN', exchange='Nasdaq')]"
9,0001045810-23-000017,10-K,https://www.sec.gov/Archives/edgar/data/104581...,,10-K,2023-02-24,2023-01-29,1045810,NVIDIA CORP,"[Ticker(symbol='NVDA', exchange='Nasdaq')]"


In [None]:
# Local folder under which downloaded filing documents are stored (one subfolder per CIK).
html_files_folder = 'HTML_Raw_Data'

# The API key is read from Colab's `userdata` store instead of being hardcoded here.
QDRANT_API_KEY = userdata.get('QDRANT_API_KEY')
# Qdrant endpoint and the collection that receives the embedded chunks.
Qdrant_url = "https://d3c36c99-73dd-4df5-8340-93131b214e3d.us-east4-0.gcp.cloud.qdrant.io:6333"
collection_name = "financial_chatbot"

In [None]:
import torch

model_name = "BAAI/bge-large-en-v1.5"
# Use the GPU when one is visible; otherwise fall back to the CPU so this
# cell still runs on a runtime without CUDA.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [None]:
def clean_text(text):
  """Normalize `text` to NFKC and collapse every whitespace run to one space.

  Args:
    text: The string to clean.

  Returns:
    The NFKC-normalized string with newlines, tabs and repeated spaces
    replaced by single spaces and no leading or trailing whitespace.
  """
  normalized = unicodedata.normalize("NFKC", text)
  # The whitespace collapse must run on the normalized string; running it on
  # the raw `text` would throw the normalization away. `\s+` also matches
  # newlines, so no separate newline replacement is needed.
  return re.sub(r'\s+', ' ', normalized).strip()

In [None]:
def html_metadata_to_Qdrant(correct_metadata_dict):
  """Download one filing document, chunk its narrative text and upload it to Qdrant.

  The document at 'primary_doc_url' is saved under
  `html_files_folder/<cik>/`, parsed with `UnstructuredFileLoader` in
  "elements" mode, reduced to its 'NarrativeText' and 'ListItem' elements,
  joined per page number, cleaned with `clean_text`, split with
  `text_splitter` and uploaded to `collection_name` at `Qdrant_url`.

  Args:
    correct_metadata_dict: Mapping that provides the keys 'cik',
      'primary_doc_url' and 'company_name'.

  Returns:
    None. Nothing is uploaded when the document yields no text chunks.

  Raises:
    requests.HTTPError: If the download answers with an HTTP error status.
  """

  cik = str(correct_metadata_dict['cik'])
  html_url = correct_metadata_dict['primary_doc_url']
  company_name = correct_metadata_dict['company_name']

  folder_path = Path(html_files_folder) / cik
  os.makedirs(folder_path, exist_ok=True)

  # The local file is named after the last path segment of the URL.
  html_path = str(folder_path / html_url.split('/')[-1])

  response = requests.get(html_url,
                          headers={
                              "User-Agent": 'Elol_Iam eloliam@gmail.com'
                          }
  )
  # Fail loudly instead of saving and embedding an HTTP error page.
  response.raise_for_status()
  # Context manager so the file handle is closed before the loader reads it.
  with open(html_path, 'wb') as html_file:
    html_file.write(response.content)

  loader = UnstructuredFileLoader(html_path, mode="elements")
  docs = loader.load()

  docs_metadata = [{'page_content': doc.page_content,
                    'page_number': doc.metadata.get('page_number'),
                    'category': doc.metadata.get('category')
                    } for doc in docs]

  # Explicit columns keep the filter below valid even when `docs` is empty.
  docs_metadata_df = pd.DataFrame(docs_metadata,
                                  columns=['page_content', 'page_number', 'category'])
  docs_metadata_df = docs_metadata_df[docs_metadata_df['category'].isin(['NarrativeText', 'ListItem'])]

  # NOTE(review): groupby drops rows whose page_number is missing by default —
  # confirm the loader always sets it for these documents.
  docs_metadata_df = docs_metadata_df.groupby('page_number').agg({'page_content':lambda col: ' '.join(col)}).reset_index()
  docs_metadata_df['page_content'] = docs_metadata_df['page_content'].map(clean_text)

  new_docs = []

  for index, row in docs_metadata_df.iterrows():
    new_doc = Document(page_content=row['page_content'],
                       metadata={'html_url': html_url,
                                 'page_number': row['page_number'],
                                 'company_name': company_name,
                                 'cik': cik
                       }
    )
    new_docs.append(new_doc)

  texts = text_splitter.split_documents(new_docs)

  # Nothing to embed (no narrative text found): skip the upload.
  if not texts:
    return

  Qdrant.from_documents(
      documents=texts,
      embedding=embeddings,
      url=Qdrant_url,
      prefer_grpc=False,
      api_key=QDRANT_API_KEY,
      collection_name=collection_name
  )


In [None]:
len(correct_metadatas_100_df)

169

In [None]:
correct_metadatas_100_df.to_dict('index')[0]

{'accession_number': '0000320193-23-000106',
 'form_type': '10-K',
 'primary_doc_url': 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930.htm',
 'items': nan,
 'primary_doc_description': '10-K',
 'filing_date': '2023-11-03',
 'report_date': '2023-09-30',
 'cik': 320193,
 'company_name': 'Apple Inc.',
 'tickers': "[Ticker(symbol='AAPL', exchange='Nasdaq')]"}

In [None]:
html_metadata_to_Qdrant(correct_metadatas_100_df.to_dict('index')[0])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Keep only rows whose primary document URL ends in 'htm'; `reset_index()`
# renumbers the rows and keeps the previous labels in a new 'index' column.
more_correct_metadatas_100_df = correct_metadatas_100_df[correct_metadatas_100_df['primary_doc_url'].str.endswith('htm')].reset_index()

In [None]:
len(more_correct_metadatas_100_df['cik'].value_counts())

76

In [None]:
more_correct_metadatas_100_df['company_name'].value_counts()

T-Mobile US, Inc.              3
Mastercard Inc                 3
ServiceNow, Inc.               3
Salesforce, Inc.               3
Meta Platforms, Inc.           3
                              ..
COCA COLA CO                   2
BANK OF AMERICA CORP /DE/      2
BlackRock Inc.                 2
HSBC HOLDINGS PLC              1
INVESCO QQQ TRUST, SERIES 1    1
Name: company_name, Length: 76, dtype: int64

In [None]:
more_correct_metadatas_100_df.to_dict('index')

{0: {'index': 0,
  'accession_number': '0000320193-23-000106',
  'form_type': '10-K',
  'primary_doc_url': 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930.htm',
  'items': nan,
  'primary_doc_description': '10-K',
  'filing_date': '2023-11-03',
  'report_date': '2023-09-30',
  'cik': 320193,
  'company_name': 'Apple Inc.',
  'tickers': "[Ticker(symbol='AAPL', exchange='Nasdaq')]"},
 1: {'index': 1,
  'accession_number': '0000320193-23-000104',
  'form_type': '8-K',
  'primary_doc_url': 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000104/aapl-20231102.htm',
  'items': '2.02,9.01',
  'primary_doc_description': '8-K',
  'filing_date': '2023-11-02',
  'report_date': '2023-11-02',
  'cik': 320193,
  'company_name': 'Apple Inc.',
  'tickers': "[Ticker(symbol='AAPL', exchange='Nasdaq')]"},
 2: {'index': 2,
  'accession_number': '0000950170-23-035122',
  'form_type': '10-K',
  'primary_doc_url': 'https://www.sec.gov/Archives/edgar/data/789019/000

In [None]:
# Process every remaining filing in row order; the printed row key shows
# how far the run has progressed.
for k, v in more_correct_metadatas_100_df.to_dict('index').items():
  print(k)
  html_metadata_to_Qdrant(v)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159


# Data for 32 companies

In [1]:
from pathlib import Path
import requests, json
import pandas as pd

In [2]:
def make_request(url, headers):
    """Send a GET request to `url` with `headers` and return the response unchanged."""
    return requests.get(url, headers=headers)

In [3]:
response = make_request(url="https://www.sec.gov/files/company_tickers.json",
                        headers={'User-Agent': 'Yoshi_Tesaco yoxites@gmail.com'})

In [4]:
companies_dict = json.loads(response.content)

In [5]:
companies_dict

{'0': {'cik_str': 320193, 'ticker': 'AAPL', 'title': 'Apple Inc.'},
 '1': {'cik_str': 789019, 'ticker': 'MSFT', 'title': 'MICROSOFT CORP'},
 '2': {'cik_str': 1652044, 'ticker': 'GOOGL', 'title': 'Alphabet Inc.'},
 '3': {'cik_str': 1018724, 'ticker': 'AMZN', 'title': 'AMAZON COM INC'},
 '4': {'cik_str': 1045810, 'ticker': 'NVDA', 'title': 'NVIDIA CORP'},
 '5': {'cik_str': 1326801, 'ticker': 'META', 'title': 'Meta Platforms, Inc.'},
 '6': {'cik_str': 1067983,
  'ticker': 'BRK-B',
  'title': 'BERKSHIRE HATHAWAY INC'},
 '7': {'cik_str': 1318605, 'ticker': 'TSLA', 'title': 'Tesla, Inc.'},
 '8': {'cik_str': 59478, 'ticker': 'LLY', 'title': 'ELI LILLY & Co'},
 '9': {'cik_str': 1403161, 'ticker': 'V', 'title': 'VISA INC.'},
 '10': {'cik_str': 1046179,
  'ticker': 'TSM',
  'title': 'TAIWAN SEMICONDUCTOR MANUFACTURING CO LTD'},
 '11': {'cik_str': 1730168, 'ticker': 'AVGO', 'title': 'Broadcom Inc.'},
 '12': {'cik_str': 731766, 'ticker': 'UNH', 'title': 'UNITEDHEALTH GROUP INC'},
 '13': {'cik_str'

In [None]:
companies_df = pd.DataFrame(companies_dict).transpose()

In [None]:
ciks_first_100 = list(companies_df.head(100)['cik_str'].astype('str'))

In [None]:
ciks_first_100

In [None]:
import os
from pathlib import Path
import requests
import pandas as pd
import re
import unicodedata

In [None]:
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.schema import Document

In [None]:
from qdrant_client import QdrantClient
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
from google.colab import userdata

In [None]:
# Reload the saved metadata and discard the 'Unnamed: 0' column that holds
# the index written when the CSV was saved.
correct_metadatas_100_df = (
    pd.read_csv('metadatas_first_100_df.csv')
    .drop(columns=['Unnamed: 0'])
)

In [None]:
correct_metadatas_100_df.head(10)

In [None]:
# Local folder under which downloaded filing documents are stored (one subfolder per CIK).
html_files_folder = 'HTML_Raw_Data'

# The API key is read from Colab's `userdata` store instead of being hardcoded here.
QDRANT_API_KEY = userdata.get('QDRANT_API_KEY')
# Qdrant endpoint and the collection that receives the embedded chunks.
Qdrant_url = "https://d3c36c99-73dd-4df5-8340-93131b214e3d.us-east4-0.gcp.cloud.qdrant.io:6333"
collection_name = "financial_chatbot"

In [None]:
import torch

model_name = "BAAI/bge-large-en-v1.5"
# Use the GPU when one is visible; otherwise fall back to the CPU so this
# cell still runs on a runtime without CUDA.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [None]:
def clean_text(text):
  """Normalize `text` to NFKC and collapse every whitespace run to one space.

  Args:
    text: The string to clean.

  Returns:
    The NFKC-normalized string with newlines, tabs and repeated spaces
    replaced by single spaces and no leading or trailing whitespace.
  """
  normalized = unicodedata.normalize("NFKC", text)
  # The whitespace collapse must run on the normalized string; running it on
  # the raw `text` would throw the normalization away. `\s+` also matches
  # newlines, so no separate newline replacement is needed.
  return re.sub(r'\s+', ' ', normalized).strip()

In [None]:
def html_metadata_to_Qdrant(correct_metadata_dict):
  """Download one filing document, chunk its narrative text and upload it to Qdrant.

  The document at 'primary_doc_url' is saved under
  `html_files_folder/<cik>/`, parsed with `UnstructuredFileLoader` in
  "elements" mode, reduced to its 'NarrativeText' and 'ListItem' elements,
  joined per page number, cleaned with `clean_text`, split with
  `text_splitter` and uploaded to `collection_name` at `Qdrant_url`.

  Args:
    correct_metadata_dict: Mapping that provides the keys 'cik',
      'primary_doc_url' and 'company_name'.

  Returns:
    None. Nothing is uploaded when the document yields no text chunks.

  Raises:
    requests.HTTPError: If the download answers with an HTTP error status.
  """

  cik = str(correct_metadata_dict['cik'])
  html_url = correct_metadata_dict['primary_doc_url']
  company_name = correct_metadata_dict['company_name']

  folder_path = Path(html_files_folder) / cik
  os.makedirs(folder_path, exist_ok=True)

  # The local file is named after the last path segment of the URL.
  html_path = str(folder_path / html_url.split('/')[-1])

  response = requests.get(html_url,
                          headers={
                              "User-Agent": 'Elver_Gandalf elvergaf@gmail.com'
                          }
  )
  # Fail loudly instead of saving and embedding an HTTP error page.
  response.raise_for_status()
  # Context manager so the file handle is closed before the loader reads it.
  with open(html_path, 'wb') as html_file:
    html_file.write(response.content)

  loader = UnstructuredFileLoader(html_path, mode="elements")
  docs = loader.load()

  docs_metadata = [{'page_content': doc.page_content,
                    'page_number': doc.metadata.get('page_number'),
                    'category': doc.metadata.get('category')
                    } for doc in docs]

  # Explicit columns keep the filter below valid even when `docs` is empty.
  docs_metadata_df = pd.DataFrame(docs_metadata,
                                  columns=['page_content', 'page_number', 'category'])
  docs_metadata_df = docs_metadata_df[docs_metadata_df['category'].isin(['NarrativeText', 'ListItem'])]

  # NOTE(review): groupby drops rows whose page_number is missing by default —
  # confirm the loader always sets it for these documents.
  docs_metadata_df = docs_metadata_df.groupby('page_number').agg({'page_content':lambda col: ' '.join(col)}).reset_index()
  docs_metadata_df['page_content'] = docs_metadata_df['page_content'].map(clean_text)

  new_docs = []

  for index, row in docs_metadata_df.iterrows():
    new_doc = Document(page_content=row['page_content'],
                       metadata={'html_url': html_url,
                                 'page_number': row['page_number'],
                                 'company_name': company_name,
                                 'cik': cik
                       }
    )
    new_docs.append(new_doc)

  texts = text_splitter.split_documents(new_docs)

  # Nothing to embed (no narrative text found): skip the upload.
  if not texts:
    return

  Qdrant.from_documents(
      documents=texts,
      embedding=embeddings,
      url=Qdrant_url,
      prefer_grpc=False,
      api_key=QDRANT_API_KEY,
      collection_name=collection_name
  )


In [None]:
len(correct_metadatas_100_df)

In [None]:
# Keep only rows whose primary document URL ends in 'htm'; `reset_index()`
# renumbers the rows and keeps the previous labels in a new 'index' column.
more_correct_metadatas_100_df = correct_metadatas_100_df[correct_metadatas_100_df['primary_doc_url'].str.endswith('htm')].reset_index()

In [None]:
len(more_correct_metadatas_100_df['cik'].value_counts())

In [None]:
more_correct_metadatas_100_df.to_dict('index')

In [None]:
# Process every remaining filing in row order; the printed row key shows
# how far the run has progressed.
for k, v in more_correct_metadatas_100_df.to_dict('index').items():
  print(k)
  html_metadata_to_Qdrant(v)