### Import Libraries

In [1]:
import pandas as pd
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
import pickle
import requests
import logging
from datetime import datetime

# Configure logging once for the whole notebook: records at INFO and above are
# emitted, each prefixed with its timestamp and severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

### Load Data

In [2]:
# Location of the line-delimited JSON dataset (one record per line)
file_path = './data/test_data.jsonl'

# Read every record into a DataFrame, then report its dimensions
logging.info("Loading dataset from file.")
data = pd.read_json(file_path, lines=True)
logging.info(
    f"Dataset loaded with {data.shape[0]} rows and {data.shape[1]} columns."
)

2024-12-19 14:27:36,723 - INFO - Loading dataset from file.
2024-12-19 14:27:36,748 - INFO - Dataset loaded with 10 rows and 56 columns.


### Embedding Data

In [3]:
# Fields that feed the per-row combined text
columns_to_embed = [
    'nama',
    'produk',
    'layanan',
    'alamat',
    'project_nama',
    'no_wo',
    'jenis_workorder',
    'jenis_order',
    'status_nodelink',
    'customer',
    'customer_direct',
    'channeling',
    'segmen',
    'start_kontrak',
    'end_kontrak',
]


def _describe_row(row):
    """Render one row as "column: value" fragments, skipping nulls and "-" placeholders."""
    fragments = []
    for col in columns_to_embed:
        value = row[col]
        if pd.notnull(value) and value != "-":
            fragments.append(f"{col}: {value}")
    return " \n ".join(fragments)


# One text blob per row, built from the selected columns
data['combined_text'] = data.apply(_describe_row, axis=1)

# Sentence-transformer model used to embed the texts
embedding_model_name = "all-MiniLM-L6-v2"
logging.info(f"Initializing embedding model: {embedding_model_name}")
embedding_model = SentenceTransformerEmbeddings(model_name=embedding_model_name)

2024-12-19 14:27:36,773 - INFO - Initializing embedding model: all-MiniLM-L6-v2
  embedding_model = SentenceTransformerEmbeddings(model_name=embedding_model_name)
  from .autonotebook import tqdm as notebook_tqdm
2024-12-19 14:27:43,423 - INFO - Use pytorch device_name: cpu
2024-12-19 14:27:43,423 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


### Create Vector Store

In [4]:
# Build one FAISS vector store per embedded column, keyed by column name
vector_stores = {}

logging.info("Creating vector stores for each column.")
for column in columns_to_embed:
    logging.info(f"Processing column: {column}")
    # Keep only rows holding a real value (not null, not the "-" placeholder)
    valid_rows = data[data[column].notnull() & (data[column] != "-")]
    if valid_rows.empty:
        # An index cannot be built from zero texts; skip the column instead of
        # letting one empty column abort the whole loop. Lookups against a
        # skipped column are reported by retrieve_context as a missing store.
        logging.warning(f"Skipping column '{column}': no non-empty values to embed.")
        continue

    # Cast to str so numeric or other non-string cells can still be embedded;
    # string cells are unchanged by the cast.
    texts = valid_rows[column].astype(str).tolist()
    # Remember which DataFrame row (and column) each embedded text came from
    metadata = [{"index": idx, "column": column} for idx in valid_rows.index]

    # Create FAISS vector store for the column
    vector_store = FAISS.from_texts(texts, embedding_model, metadatas=metadata)
    vector_stores[column] = vector_store
    logging.info(f"Vector store created for column: {column}, with {len(texts)} entries.")

# Persist the vector stores together with the DataFrame they were built from
save_data = {
    "vector_stores": vector_stores,
    "data": data
}

# NOTE(review): this file is a pickle; unpickling executes arbitrary code, so it
# must only ever be loaded from a trusted location.
vector_store_file = "vector_stores_by_column.pkl"
with open(vector_store_file, "wb") as f:
    pickle.dump(save_data, f)
logging.info(f"Vector stores saved to '{vector_store_file}'.")


2024-12-19 14:27:46,857 - INFO - Creating vector stores for each column.
2024-12-19 14:27:46,870 - INFO - Processing column: nama
2024-12-19 14:27:46,988 - INFO - Loading faiss with AVX512 support.
2024-12-19 14:27:46,988 - INFO - Could not load library with AVX512 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx512'")
2024-12-19 14:27:46,988 - INFO - Loading faiss with AVX2 support.
2024-12-19 14:27:47,037 - INFO - Successfully loaded faiss with AVX2 support.
2024-12-19 14:27:47,055 - INFO - Vector store created for column: nama, with 10 entries.
2024-12-19 14:27:47,055 - INFO - Processing column: produk
2024-12-19 14:27:47,121 - INFO - Vector store created for column: produk, with 10 entries.
2024-12-19 14:27:47,122 - INFO - Processing column: layanan
2024-12-19 14:27:47,174 - INFO - Vector store created for column: layanan, with 10 entries.
2024-12-19 14:27:47,174 - INFO - Processing column: alamat
2024-12-19 14:27:47,354 - INFO - Vector store created for co

### Helper Functions (Querying Ollama, Context Processing)

In [5]:
def query_ollama(prompt, timeout=120):
    """Send a single-turn chat request to the local Ollama API.

    Args:
        prompt: Text sent as the content of the one "user" message.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled server would block the notebook
            indefinitely.

    Returns:
        The stripped reply text when the response contains
        ``message.content``; the string
        "Error: Message or content not found in response." when the response
        lacks those keys; ``None`` when the request or its decoding raises
        (the error is logged).
    """
    logging.info("Sending query to Ollama API.")
    url = "http://localhost:11434/api/chat"
    headers = {"Content-Type": "application/json"}
    payload = {
        "model": "llama3.2:1b",
        "stream": False,
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "options": {
            "temperature": 0.5,
            "top_p": 0.95,
            # Ollama's option for capping generated tokens is "num_predict";
            # "max_tokens" is not an Ollama option, so the cap never applied.
            "num_predict": 150
        }
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        result = response.json()
        if 'message' in result and 'content' in result['message']:
            return result['message']['content'].strip()
        else:
            logging.warning("Unexpected response format from Ollama API.")
            return "Error: Message or content not found in response."
    except Exception as e:
        # Deliberately broad: this helper is best-effort and signals any
        # failure (connection, timeout, HTTP status, bad JSON) by returning None.
        logging.error(f"Error querying Ollama API: {e}")
        return None

def retrieve_context(query, vector_stores, target_column=None, data=None):
    """Retrieve documents relevant to ``query`` for one column.

    For ``target_column == 'no_wo'`` the text after the last ": " in the
    query (with surrounding whitespace and "?" stripped) is matched exactly
    against ``data['no_wo']``. Every other column uses a top-5 similarity
    search on that column's vector store.

    Args:
        query: The user's question.
        vector_stores: Mapping of column name to its vector store.
        target_column: Column to search; must be a key of ``vector_stores``.
        data: DataFrame used for the exact ``no_wo`` lookup. Required only
            on that path.

    Returns:
        A list of document-like objects exposing ``page_content`` and
        ``metadata`` (possibly empty), or ``None`` when the column has no
        vector store or when the ``no_wo`` lookup is requested without
        ``data``.
    """
    if target_column not in vector_stores:
        logging.error(f"No vector store found for column '{target_column}'.")
        return None

    # Extract exact value from query for no_wo
    if target_column == 'no_wo':
        if data is None:
            # `data` defaults to None; indexing it below would raise TypeError.
            # Report the problem the same way as a missing vector store.
            logging.error("Exact lookup on 'no_wo' requires the 'data' DataFrame.")
            return None

        # The work order number is whatever follows the last ": " in the query
        wo_number = query.split(": ")[-1].strip().strip("?")
        # Find exact matches in the data
        exact_matches = data[data[target_column] == wo_number].index.tolist()
        if exact_matches:
            # Build lightweight stand-ins exposing the same two attributes
            # (page_content, metadata) that retriever documents provide.
            docs = []
            for idx in exact_matches:
                docs.append(type('Document', (), {
                    'page_content': data.loc[idx, target_column],
                    'metadata': {'index': idx, 'column': target_column}
                })())
            logging.info(f"Found exact match for work order number: {wo_number}")
        else:
            logging.info(f"No exact match found for work order number: {wo_number}")
            docs = []
    else:
        # Use vector similarity for other columns
        retriever = vector_stores[target_column].as_retriever(
            search_type="similarity",
            search_kwargs={"k": 5}
        )
        docs = retriever.get_relevant_documents(query)

    # Log retrieved documents
    logging.info(f"Retrieved {len(docs)} documents for column '{target_column}'.")
    for i, doc in enumerate(docs):
        logging.info(f"Doc {i + 1}: Content: {doc.page_content}, Metadata: {doc.metadata}")

    return docs

def ask_dataset(query, vector_stores, data, target_column):
    """Answer ``query`` by sending matching dataset rows to the Ollama API.

    Documents are fetched with ``retrieve_context``; each one whose
    ``metadata['index']`` is a label in ``data.index`` contributes a
    "Result N" section listing that row's columns that are neither null nor
    the "-" placeholder. The sections and the question form the prompt
    passed to ``query_ollama``.

    Returns:
        Whatever ``query_ollama`` returns, or the string
        "Error: No relevant context found for the query." when no documents
        were retrieved.
    """
    docs = retrieve_context(query, vector_stores, target_column, data)
    if not docs:
        return "Error: No relevant context found for the query."

    # One text section per retrieved document that maps back to a row
    result_sections = []
    for doc in docs:
        row_label = doc.metadata.get('index')
        if row_label not in data.index:
            continue

        row = data.loc[row_label]
        fields = {}
        for col in data.columns:
            value = row[col]
            if pd.notnull(value) and value != "-":
                fields[col] = value

        # Numbering counts only the documents that were actually kept
        rank = len(result_sections) + 1
        field_lines = "\n".join([f"{k}: {v}" for k, v in fields.items()])
        result_sections.append(f"Result {rank}:\n" + field_lines)

    formatted_context = "\n\n".join(result_sections)

    # Assemble the prompt: retrieved rows first, then the question
    prompt = f"""Context:
{formatted_context}

Question: {query}
Please analyze the above results and provide a concise answer focusing on the exact matching information."""

    return query_ollama(prompt)

### Testing

In [6]:
# Restore the bundle written by the vector-store cell.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# a pickle that this notebook produced itself.
with open(vector_store_file, "rb") as f:
    saved_data = pickle.load(f)

vector_stores = saved_data["vector_stores"]
data = saved_data["data"]

# Example query: the column to search is chosen via target_column
query = "Which contracts has a working order number of : MI.0010/D1.200/MS.00/TSAT/05.2020?"
target_column = "no_wo"  # Can be any column from the data

response = ask_dataset(query, vector_stores, data, target_column)
print("Response from the dataset:\n", response)

2024-12-19 14:27:48,937 - INFO - Found exact match for work order number: MI.0010/D1.200/MS.00/TSAT/05.2020
2024-12-19 14:27:48,937 - INFO - Retrieved 1 documents for column 'no_wo'.
2024-12-19 14:27:48,937 - INFO - Doc 1: Content: MI.0010/D1.200/MS.00/TSAT/05.2020, Metadata: {'index': 6, 'column': 'no_wo'}
2024-12-19 14:27:48,937 - INFO - Sending query to Ollama API.


Response from the dataset:
 Based on the provided result, the contract with a working order number of "MI.0010/D1.200/MS.00/TSAT/05.2020" is associated with:

- Id: 571 (customer: SANATEL)
- Customer Direct: 571 (customer: SANATEL)
