In [None]:
import google.generativeai as genai
from pinecone import Pinecone, ServerlessSpec
import os
# API Keys
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")

# C·∫•u h√¨nh Google Gemini API
genai.configure(api_key=GOOGLE_API_KEY)

# Kh·ªüi t·∫°o Pinecone client m·ªõi
pc = Pinecone(api_key=PINECONE_API_KEY)

# Ki·ªÉm tra xem index ƒë√£ t·ªìn t·∫°i ch∆∞a
index_name = "chatbotfinance"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # K√≠ch th∆∞·ªõc vector c·ªßa "all-MiniLM-L6-v2" l√† 384
        metric="cosine"
    )

# K·∫øt n·ªëi ƒë·∫øn index
index = pc.Index(index_name)

# Ki·ªÉm tra th√¥ng tin index
print("‚úÖ Pinecone Index ƒë√£ s·∫µn s√†ng!")
print(index.describe_index_stats())



‚úÖ Pinecone Index ƒë√£ s·∫µn s√†ng!
{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [22]:
import pandas as pd

csv_path = "2022-Vietnam1.csv"
df = pd.read_csv(csv_path, encoding="utf-8-sig")

# ‚úÖ X·ª¨ L√ù D·ªÆ LI·ªÜU: X√≥a c·ªôt tr·ªëng, ƒë·∫∑t l·∫°i ti√™u ƒë·ªÅ
column_headers = df.iloc[0].tolist()  # L·∫•y ti√™u ƒë·ªÅ t·ª´ h√†ng ƒë·∫ßu ti√™n
df_cleaned = df.iloc[1:].reset_index(drop=True)  # X√≥a h√†ng ƒë·∫ßu ti√™n kh·ªèi d·ªØ li·ªáu ch√≠nh
print(df_cleaned.head())


  Unnamed: 0 Unnamed: 1         Unnamed: 2 Unnamed: 3 Unnamed: 4  \
0          1        ACB                ACB       HOSE  Ng√¢n h√†ng   
1          2        BCM        Becamex IDC       HOSE  T√†i ch√≠nh   
2          3        BID               BIDV       HOSE  Ng√¢n h√†ng   
3          4        BVH  T·∫≠p ƒëo√†n B·∫£o Vi·ªát       HOSE  T√†i ch√≠nh   
4          5        CTG         VietinBank       HOSE  Ng√¢n h√†ng   

     Unnamed: 5         Unnamed: 6         Unnamed: 7 C√ÇN ƒê·ªêI K·∫æ TO√ÅN  \
0     Ng√¢n h√†ng          Ng√¢n h√†ng          Ng√¢n h√†ng               5   
1  B·∫•t ƒë·ªông s·∫£n       B·∫•t ƒë·ªông s·∫£n       B·∫•t ƒë·ªông s·∫£n               5   
2     Ng√¢n h√†ng          Ng√¢n h√†ng          Ng√¢n h√†ng               5   
3      B·∫£o hi·ªÉm  B·∫£o hi·ªÉm nh√¢n th·ªç  B·∫£o hi·ªÉm nh√¢n th·ªç               5   
4     Ng√¢n h√†ng          Ng√¢n h√†ng          Ng√¢n h√†ng               5   

  Unnamed: 9  ...    Unnamed: 158          Unnamed: 159        Unnamed

In [23]:
import torch
from sentence_transformers import SentenceTransformer
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"‚úÖ ƒêang ch·∫°y tr√™n: {device.upper()}")

# ‚úÖ C·∫§U H√åNH BERT (Ch·∫°y tr√™n GPU n·∫øu c√≥)
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

‚úÖ ƒêang ch·∫°y tr√™n: CUDA


In [24]:
def get_bert_embedding(text, model, device="cuda"):
    embedding = model.encode(text, convert_to_tensor=True, device=device)
    return embedding.tolist()


In [25]:
from tqdm import tqdm
import time
# ‚úÖ CHIA NH·ªé D·ªÆ LI·ªÜU & T·∫†O EMBEDDINGS
batch_size = 1  # S·ªë d√≤ng g·ª≠i m·ªói l·∫ßn
total_rows = len(df_cleaned)
output_file = "embeddings_data.txt"

with open(output_file, "w", encoding="utf-8") as f:
    for i in tqdm(range(0, total_rows, batch_size)):
        batch_vectors = []
        batch = df_cleaned.iloc[i:i+batch_size]

        for idx, row in batch.iterrows():
            # üöÄ G·ªôp d·ªØ li·ªáu ƒë√∫ng format: "T√™n c·ªôt: Gi√° tr·ªã"
            text = " | ".join([f"{col_name}: {row.iloc[col_index]}" 
                               for col_index, col_name in enumerate(column_headers)])
            text = text.replace("#####", "MISSING_DATA")

            try:
                # üöÄ T·∫°o embedding t·ª´ BERT
                embedding = get_bert_embedding(text, bert_model, device)

                # üöÄ L∆∞u v√†o danh s√°ch v·ªõi metadata
                batch_vectors.append((str(idx), embedding, {"data": text}))

                # ‚úÖ Ghi v√†o file txt
                f.write(f"ID: {idx} | Text: {text} | Embedding: {embedding}\n")

            except Exception as e:
                print(f"‚ö†Ô∏è L·ªói x·ª≠ l√Ω d√≤ng {idx}: {str(e)}")
                continue

        # ‚úÖ G·ª≠i batch v√†o Pinecone
        if batch_vectors:
            try:
                index.upsert(vectors=batch_vectors)
            except Exception as e:
                print(f"‚ö†Ô∏è L·ªói khi upsert v√†o Pinecone: {str(e)}")
                continue

        time.sleep(1)

print(f"‚úÖ ƒê√£ th√™m {total_rows} vector embeddings v√†o Pinecone th√†nh c√¥ng!")
print("‚úÖ D·ªØ li·ªáu ƒë√£ ƒë∆∞·ª£c l∆∞u v√†o:", output_file)
print(index.describe_index_stats())

 34%|‚ñà‚ñà‚ñà‚ñé      | 565/1675 [21:08<41:32,  2.25s/it]   


KeyboardInterrupt: 

In [None]:
index.upsert(vectors)
print(f"‚úÖ ƒê√£ th√™m {len(vectors)} vector embeddings v√†o Pinecone th√†nh c√¥ng!")
print(index.describe_index_stats())