### Garbage collection

In [2]:
# Clear IPython's global namespace
%reset -f

# Reimport gc module
import gc
# Run garbage collection
gc.collect()

# Clear all cell outputs
from IPython.display import clear_output
clear_output(wait=True)

### Reading data

In [3]:
import os
os.chdir('/home/manimala/Documents/satyakama/paper-farmer-chatbot/')

In [4]:
import polars as pl 
pl.Config.set_tbl_rows(1000)  # or whatever number of rows you want to see
pl.Config.set_tbl_cols(-1)  # Show all columns (-1 means no limit)
pl.Config.set_fmt_str_lengths(1000)  # Increase maximum string length

polars.config.Config

In [84]:
def preprocess_kcc_dataset(file_path: str) -> pl.DataFrame:
    """
    Preprocess the KCC dataset by cleaning and transforming the data.
    
    Args:
        file_path (str): Path to the KCC dataset CSV file
    
    Returns:
        pl.DataFrame: Cleaned and preprocessed DataFrame
    """
    
    # Read CSV with selected columns
    master_df = pl.read_csv(
        source=file_path,
        columns=[
            'Year', 'Month', 'Day', 'Crop', 'BlockName', 
            'DistrictName', 'QueryType', 'Season', 'Sector',
            'StateName', 'QueryText', 'KccAns'
        ],
        has_header=True,
        low_memory=True
    )

    print(f"Starting row count: {len(master_df)}")

    # First clean QueryText and KccAns
    master_df = master_df.with_columns([
        pl.col("QueryText").str.replace_all(r"\s+", " ").str.strip_chars().alias("QueryText"),
        pl.col("KccAns").str.replace_all(r"\s+", " ").str.strip_chars().alias("KccAns")
    ])

    # Then convert all to uppercase
    master_df = master_df.with_columns([
        pl.all().cast(pl.Utf8).str.to_uppercase()
    ])

    # Create Date column and drop date-related columns
    master_df = master_df.with_columns([
        pl.format("{}-{}-{}",
            pl.col("Day").cast(pl.Utf8).str.zfill(2),
            pl.col("Month").cast(pl.Utf8).str.zfill(2),
            pl.col("Year")
        ).str.strptime(pl.Date, format="%d-%m-%Y").alias("Date")
    ]).drop(['Day', 'Month', 'Year'])

    # Combine multiple filtering conditions
    initial_count = len(master_df)
    master_df = master_df.filter(
        # Basic text validations
        pl.col("QueryText").str.contains(r"[a-zA-Z0-9]") & 
        pl.col("KccAns").str.contains(r"[a-zA-Z0-9]") &
        # Remove specific QueryText values
        (pl.col("QueryType") != "WEATHER") &
        (pl.col("QueryText") != "TEST CALL") &
        (pl.col("QueryText") != "BLANK CALL") &
        ~pl.col("QueryText").str.contains("WEATHER") &
        # Remove whitespace-only entries
        ~pl.col("QueryText").str.contains(r"^\s*$") & 
        ~pl.col("KccAns").str.contains(r"^\s*$")
    )
    print(f"Rows removed after initial filtering: {initial_count - len(master_df)}")

    # Remove rows with digits in specific columns
    initial_count = len(master_df)
    for col in ['BlockName', 'Crop', 'QueryType', 'Sector']:
        master_df = master_df.filter(
            ~pl.col(col).str.contains(r"\d")
        )
    print(f"Rows removed after digit filtering: {initial_count - len(master_df)}")

    # Remove numeric-only entries
    initial_count = len(master_df)
    numeric_pattern = r"^[-]?[0-9]*\.?[0-9]+$"
    master_df = master_df.filter(
        ~pl.col("QueryText").str.contains(numeric_pattern) &
        ~pl.col("KccAns").str.contains(numeric_pattern)
    )
    print(f"Rows removed after numeric pattern filtering: {initial_count - len(master_df)}")

    # Handle Season column and null values
    master_df = master_df.with_columns([
        pl.when(pl.col("Season").is_null() | (pl.col("Season") == "0"))
        .then(pl.lit("UNSPECIFIED"))
        .otherwise(pl.col("Season"))
        .alias("Season")
    ]).drop_nulls()

    print(f"Final row count: {len(master_df)}")
    
    return master_df

# Usage example:
filtered_df = preprocess_kcc_dataset('dataset/original_dataset/kcc_dataset.csv')

Starting row count: 41987874
Rows removed after initial filtering: 21539071
Rows removed after digit filtering: 3267964
Rows removed after numeric pattern filtering: 395171
Final row count: 16785668


In [85]:
filtered_df.head()

BlockName,Crop,DistrictName,QueryType,Season,Sector,StateName,QueryText,KccAns,Date
str,str,str,str,str,str,str,str,str,date
"""MOHANPUR""","""COCONUT""","""SAMASTIPUR""","""FERTILIZER USE AND AVAILABILITY""","""KHARIF""","""HORTICULTURE""","""BIHAR""","""FERTILIZER DOSES OF COCONUT""","""FERTILIZER ARE NPK 1:2:2 KGPLANT""",2007-01-05
"""DOLONGGHAT""","""BANANA""","""NAGAON""","""FERTILIZER USE AND AVAILABILITY""","""JAYAD""","""HORTICULTURE""","""ASSAM""","""ASKING ABOUT THE FERTILIZER SCHEDULE FOR BANANA CULTIVATION""","""SUGGESTED TO APPLY UREA242GRAMPLANTSSP206GRAMPLANTMOP551GRAMPLANT AND COMPOST12KGPLANT IN TRENCH METHOD""",2009-09-29
"""DANIYAWAN""","""WHEAT""","""PATNA""","""FERTILIZER USE AND AVAILABILITY""","""KHARIF""","""AGRICULTURE""","""BIHAR""","""ASKING ABOUT FERTILISER DOSE OF WHEAT""","""ASKING ABOUT FERTILISER DOSE OF WHEAT ARE 120KG N60KG P40KG KHECT FOR SOWING STAGE""",2009-12-23
"""AKHORIGOLA""","""CABBAGE""","""ROHTAS""","""CULTURAL PRACTICES""","""KHARIF""","""HORTICULTURE""","""BIHAR""","""EARLY CULTIVAR OF CABBAGE""","""PUSA DRUM HEAD""",2009-02-22
"""HATHUA""","""GLADIOLUS""","""GOPALGANJ""","""CULTURAL PRACTICES""","""RABI""","""HORTICULTURE""","""BIHAR""","""METHOD OF GLADIOLUS CULTIVATION""","""ANSWER GIVEN IN DETAILS""",2009-05-28


In [87]:
# value_counts = (
#     filtered_df.get_column("QueryText")
#     .value_counts(parallel=True)
#     .with_columns([
#         (pl.col("count") / pl.col("count").sum() * 100).alias("percentage")  # Note: "count" not "counts"
#     ])
#     .sort("count", descending=True) 
#     .head(500)
# )

# print(value_counts)

### Component 1: data preprocessing

In [99]:
import polars as pl
from sentence_transformers import SentenceTransformer
from datetime import datetime
import numpy as np
from typing import List, Dict, Union
import logging
from tqdm.auto import tqdm

class DataProcessor:
    def __init__(self, 
                 model_name: str = 'BAAI/bge-large-en-v1.5',
                 device: str = 'cuda',
                 batch_size: int = 128):
        """
        Initialize the DataProcessor with a specified text encoder model.
        
        Args:
            model_name (str): Name of the sentence-transformer model to use
        """
        # Set up logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        
        # Initialize text encoder
        self.logger.info(f"Loading text encoder model: {model_name}")
        try:
            self.text_encoder = SentenceTransformer(model_name)
            self.text_encoder.to(device)
            self.device = device
            self.batch_size = batch_size
        except Exception as e:
            self.logger.error(f"Error loading text encoder: {str(e)}")
            raise
            
        # Initialize state variables
        self.categorical_columns = ['Crop', 'DistrictName', 'QueryType', 
                                  'Season', 'Sector', 'StateName']
        self.text_columns = ['QueryText', 'KccAns']
        self.processed_df = None
        self.embeddings_cache = {}
        
    def preprocess_dataframe(self, df: pl.DataFrame) -> pl.DataFrame:
        """
        Clean and preprocess the input dataframe.
        
        Args:
            df (pl.DataFrame): Input Polars DataFrame
            
        Returns:
            pl.DataFrame: Processed DataFrame
        """
        self.logger.info("Starting dataframe preprocessing")
        
        try:
            # Create a copy to avoid modifying original
            processed = df.clone()
            
            # Clean text fields
            self.logger.info("Cleaning text fields...")
            for col in tqdm(self.text_columns, desc="Processing text columns"):
                processed = processed.with_columns([
                    pl.col(col).str.strip_chars()
                    .str.replace_all(r'\s+', ' ')  # Replace multiple spaces
                    .str.to_lowercase()
                    .alias(col)
                ])
            
            # Clean categorical fields
            self.logger.info("Cleaning categorical fields...")
            for col in tqdm(self.categorical_columns, desc="Processing categorical columns"):
                processed = processed.with_columns([
                    pl.col(col).str.strip_chars()
                    .str.to_lowercase()
                    .alias(col)
                ])
            
            # Check if Date needs conversion
            if df.schema['Date'] == pl.Date:
                self.logger.info("Date column already in correct format")
                processed = processed
            else:
                self.logger.info("Converting Date column to datetime format")
                processed = processed.with_columns([
                    pl.col('Date').str.strptime(pl.Datetime, '%Y-%m-%d').alias('Date')
                ])
            
            # Add derived features from date
            processed = processed.with_columns([
                pl.col('Date').cast(pl.Date).dt.year().alias('Year'),
                pl.col('Date').cast(pl.Date).dt.month().alias('Month')
            ])
            
            self.processed_df = processed
            self.logger.info("Dataframe preprocessing completed successfully")
            return processed
            
        except Exception as e:
            self.logger.error(f"Error in preprocessing: {str(e)}")
            raise
            
    def generate_embeddings(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """
        Generate embeddings for a list of texts.
        
        Args:
            texts (List[str]): List of texts to encode
            batch_size (int): Batch size for encoding
            
        Returns:
            np.ndarray: Array of embeddings
        """
        self.logger.info(f"Generating embeddings for {len(texts)} texts")
        
        try:
            # Check cache first
            cached_embeddings = []
            texts_to_encode = []
            
            for text in texts:
                if text in self.embeddings_cache:
                    cached_embeddings.append(self.embeddings_cache[text])
                else:
                    texts_to_encode.append(text)
            
            # Generate new embeddings
            if texts_to_encode:
                self.logger.info(f"Generating new embeddings for {len(texts_to_encode)} texts")
                new_embeddings = self.text_encoder.encode(
                    texts_to_encode,
                    batch_size=self.batch_size,
                    show_progress_bar=True
                )
                
                # Update cache
                for text, embedding in zip(texts_to_encode, new_embeddings):
                    self.embeddings_cache[text] = embedding
                    cached_embeddings.append(embedding)
            
            return np.array(cached_embeddings)
            
        except Exception as e:
            self.logger.error(f"Error generating embeddings: {str(e)}")
            raise
            
    def create_categorical_features(self) -> Dict[str, np.ndarray]:
        """
        Create one-hot encoded features for categorical columns.
        
        Returns:
            Dict[str, np.ndarray]: Dictionary of categorical features
        """
        if self.processed_df is None:
            raise ValueError("Please run preprocess_dataframe first")
            
        self.logger.info("Creating categorical features")
        
        try:
            categorical_features = {}
            
            self.logger.info("Creating one-hot encodings for categorical features...")
            for col in tqdm(self.categorical_columns, desc="Processing categorical features"):
                # Get unique values
                unique_values = self.processed_df[col].unique().to_list()
                
                # Create mapping
                value_to_idx = {val: idx for idx, val in enumerate(unique_values)}
                
                # Create one-hot encodings
                n_values = len(unique_values)
                n_samples = len(self.processed_df)
                one_hot = np.zeros((n_samples, n_values))
                
                for idx, value in enumerate(self.processed_df[col]):
                    one_hot[idx, value_to_idx[value]] = 1
                    
                categorical_features[col] = one_hot
                
            self.logger.info("Categorical features created successfully")
            return categorical_features
            
        except Exception as e:
            self.logger.error(f"Error creating categorical features: {str(e)}")
            raise
            
    def create_temporal_features(self) -> np.ndarray:
        """
        Create temporal features from the Date column.
        
        Returns:
            np.ndarray: Array of temporal features
        """
        if self.processed_df is None:
            raise ValueError("Please run preprocess_dataframe first")
            
        self.logger.info("Creating temporal features")
        
        try:
            # Extract year and month
            years = self.processed_df['Year'].to_numpy()
            months = self.processed_df['Month'].to_numpy()
            
            # Normalize
            min_year = years.min()
            years_norm = (years - min_year) / 10  # Decade scale
            months_norm = months / 12
            
            # Combine features
            temporal_features = np.column_stack([years_norm, months_norm])
            
            self.logger.info("Temporal features created successfully")
            return temporal_features
            
        except Exception as e:
            self.logger.error(f"Error creating temporal features: {str(e)}")
            raise
            
    def process_all(self, df: pl.DataFrame, show_progress: bool = True) -> Dict[str, Union[pl.DataFrame, np.ndarray, Dict]]:
        """
        Run all processing steps and return combined results.
        
        Args:
            df (pl.DataFrame): Input DataFrame
            show_progress (bool): Whether to show progress bars
            
        Returns:
            Dict containing processed data and features
        """
        """
        Run all processing steps and return combined results.
        
        Args:
            df (pl.DataFrame): Input DataFrame
            
        Returns:
            Dict containing processed data and features
        """
        self.logger.info("Starting complete data processing pipeline")
        
        try:
            # Run all processing steps
            processed_df = self.preprocess_dataframe(df)
            
            # Generate text embeddings
            query_embeddings = self.generate_embeddings(processed_df['QueryText'].to_list())
            answer_embeddings = self.generate_embeddings(processed_df['KccAns'].to_list())
            
            # Create other features
            categorical_features = self.create_categorical_features()
            temporal_features = self.create_temporal_features()
            
            results = {
                'processed_df': processed_df,
                'query_embeddings': query_embeddings,
                'answer_embeddings': answer_embeddings,
                'categorical_features': categorical_features,
                'temporal_features': temporal_features
            }
            
            self.logger.info("Complete processing pipeline finished successfully")
            return results
            
        except Exception as e:
            self.logger.error(f"Error in processing pipeline: {str(e)}")
            raise

In [102]:
# Use your filtered DataFrame
df = filtered_df.head(10000)
processor = DataProcessor()
results = processor.process_all(df)
print("Processing completed successfully")

# Verify the results
processed_df = results['processed_df']
print("\nSample of processed data:")
print(processed_df.head())

INFO:__main__:Loading text encoder model: BAAI/bge-large-en-v1.5
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-large-en-v1.5
INFO:__main__:Starting complete data processing pipeline
INFO:__main__:Starting dataframe preprocessing
INFO:__main__:Cleaning text fields...


Processing text columns:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:__main__:Cleaning categorical fields...


Processing categorical columns:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:__main__:Date column already in correct format
INFO:__main__:Dataframe preprocessing completed successfully
INFO:__main__:Generating embeddings for 10000 texts
INFO:__main__:Generating new embeddings for 10000 texts


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

INFO:__main__:Generating embeddings for 10000 texts
INFO:__main__:Generating new embeddings for 9977 texts


Batches:   0%|          | 0/78 [00:00<?, ?it/s]

INFO:__main__:Creating categorical features
INFO:__main__:Creating one-hot encodings for categorical features...


Processing categorical features:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:__main__:Categorical features created successfully
INFO:__main__:Creating temporal features
INFO:__main__:Temporal features created successfully
INFO:__main__:Complete processing pipeline finished successfully


Processing completed successfully

Sample of processed data:
shape: (5, 12)
┌────────┬────────┬────────┬────────┬───────┬───────┬───────┬───────┬───────┬───────┬──────┬───────┐
│ BlockN ┆ Crop   ┆ Distri ┆ QueryT ┆ Seaso ┆ Secto ┆ State ┆ Query ┆ KccAn ┆ Date  ┆ Year ┆ Month │
│ ame    ┆ ---    ┆ ctName ┆ ype    ┆ n     ┆ r     ┆ Name  ┆ Text  ┆ s     ┆ ---   ┆ ---  ┆ ---   │
│ ---    ┆ str    ┆ ---    ┆ ---    ┆ ---   ┆ ---   ┆ ---   ┆ ---   ┆ ---   ┆ date  ┆ i32  ┆ i8    │
│ str    ┆        ┆ str    ┆ str    ┆ str   ┆ str   ┆ str   ┆ str   ┆ str   ┆       ┆      ┆       │
╞════════╪════════╪════════╪════════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪══════╪═══════╡
│ MOHANP ┆ coconu ┆ samast ┆ fertil ┆ khari ┆ horti ┆ bihar ┆ ferti ┆ ferti ┆ 2007- ┆ 2007 ┆ 1     │
│ UR     ┆ t      ┆ ipur   ┆ izer   ┆ f     ┆ cultu ┆       ┆ lizer ┆ lizer ┆ 01-05 ┆      ┆       │
│        ┆        ┆        ┆ use    ┆       ┆ re    ┆       ┆ doses ┆ are   ┆       ┆      ┆       │
│        ┆     

### Component 2

In [104]:
import numpy as np
import torch
import networkx as nx
from typing import Dict, List, Tuple, Union
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
import logging

class GraphBuilder:
    def __init__(
        self,
        similarity_threshold: float = 0.7,
        max_edges_per_node: int = 5,
        device: str = 'cuda'
    ):
        """
        Initialize the GraphBuilder.
        
        Args:
            similarity_threshold: Minimum similarity score to create an edge
            max_edges_per_node: Maximum number of edges per node
            device: 'cuda' or 'cpu'
        """
        # Set up logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        
        self.similarity_threshold = similarity_threshold
        self.max_edges_per_node = max_edges_per_node
        self.device = device
        self.graph = None
        
    def compute_text_similarity_edges(
        self,
        embeddings: np.ndarray,
        batch_size: int = 128
    ) -> List[Tuple[int, int, float]]:
        """
        Compute edges based on text embedding similarity.
        Uses batched processing to handle large matrices.
        
        Args:
            embeddings: numpy array of text embeddings
            batch_size: size of batches for similarity computation
            
        Returns:
            List of (source, target, weight) tuples
        """
        num_samples = len(embeddings)
        edges = []
        
        # Convert to torch tensors
        embeddings_tensor = torch.tensor(embeddings).to(self.device)
        
        # Normalize embeddings for cosine similarity
        embeddings_tensor = torch.nn.functional.normalize(embeddings_tensor, p=2, dim=1)
        
        self.logger.info("Computing text similarity edges...")
        for i in tqdm(range(0, num_samples, batch_size)):
            batch_end = min(i + batch_size, num_samples)
            batch = embeddings_tensor[i:batch_end]
            
            # Compute similarity for the batch
            similarities = torch.mm(batch, embeddings_tensor.t())
            
            # Get top k similar nodes for each node in batch
            values, indices = torch.topk(similarities, k=self.max_edges_per_node + 1)
            
            # Convert to CPU and numpy for processing
            values = values.cpu().numpy()
            indices = indices.cpu().numpy()
            
            # Create edges for nodes in batch
            for idx, (node_values, node_indices) in enumerate(zip(values, indices)):
                node_idx = i + idx
                
                # Skip self-loops and low similarity edges
                for sim, target in zip(node_values, node_indices):
                    if (sim > self.similarity_threshold and 
                        node_idx != target):
                        edges.append((node_idx, target.item(), sim.item()))
        
        return edges
    
    def compute_metadata_similarity(
        self,
        categorical_features: Dict[str, np.ndarray],
        temporal_features: np.ndarray
    ) -> List[Tuple[int, int, float]]:
        """
        Compute edges based on metadata similarity.
        
        Args:
            categorical_features: Dict of categorical feature matrices
            temporal_features: Matrix of temporal features
            
        Returns:
            List of (source, target, weight) tuples
        """
        edges = []
        num_samples = len(temporal_features)
        
        self.logger.info("Computing metadata similarity edges...")
        for feature_name, feature_matrix in tqdm(categorical_features.items(), 
                                               desc="Processing categorical features"):
            # Convert to torch tensors
            feature_tensor = torch.tensor(feature_matrix).float().to(self.device)
            
            # Compute similarity
            similarities = torch.mm(feature_tensor, feature_tensor.t())
            
            # Get edges where features match exactly
            matches = torch.where(similarities > 0.9)
            source_nodes = matches[0].cpu().numpy()
            target_nodes = matches[1].cpu().numpy()
            
            # Add edges with metadata type as weight
            for src, tgt in zip(source_nodes, target_nodes):
                if src != tgt:
                    edges.append((src, tgt, 0.5))  # Weight of 0.5 for metadata edges
        
        return edges
    
    def build_graph(
        self,
        processed_data: Dict[str, Union[np.ndarray, Dict]]
    ) -> nx.Graph:
        """
        Build the knowledge graph using processed features.
        
        Args:
            processed_data: Dictionary containing processed features
            
        Returns:
            NetworkX graph
        """
        self.logger.info("Starting graph construction...")
        
        # Initialize graph
        G = nx.Graph()
        
        # Add nodes with features
        for idx in range(len(processed_data['query_embeddings'])):
            G.add_node(
                idx,
                query_embedding=processed_data['query_embeddings'][idx],
                answer_embedding=processed_data['answer_embeddings'][idx]
            )
        
        # Compute edges based on text similarity
        text_edges = self.compute_text_similarity_edges(
            processed_data['query_embeddings']
        )
        
        # Compute edges based on metadata
        metadata_edges = self.compute_metadata_similarity(
            processed_data['categorical_features'],
            processed_data['temporal_features']
        )
        
        # Add all edges to graph
        self.logger.info("Adding edges to graph...")
        G.add_weighted_edges_from(text_edges)
        G.add_weighted_edges_from(metadata_edges)
        
        # Basic graph statistics
        self.logger.info(f"Graph constructed with {G.number_of_nodes()} nodes and "
                        f"{G.number_of_edges()} edges")
        
        self.graph = G
        return G
    
    def get_node_neighbors(
        self,
        node_idx: int,
        k: int = 5
    ) -> List[Tuple[int, float]]:
        """
        Get k-nearest neighbors for a node.
        
        Args:
            node_idx: Index of the node
            k: Number of neighbors to return
            
        Returns:
            List of (neighbor_idx, similarity) tuples
        """
        if self.graph is None:
            raise ValueError("Graph not built yet. Call build_graph first.")
            
        neighbors = []
        for neighbor in self.graph.neighbors(node_idx):
            similarity = self.graph[node_idx][neighbor]['weight']
            neighbors.append((neighbor, similarity))
        
        # Sort by similarity and return top k
        neighbors.sort(key=lambda x: x[1], reverse=True)
        return neighbors[:k]

    def visualize_subgraph(
        self,
        center_node: int,
        radius: int = 2
    ) -> nx.Graph:
        """
        Extract a subgraph centered around a node for visualization.
        
        Args:
            center_node: Index of the central node
            radius: Number of hops to include
            
        Returns:
            NetworkX subgraph
        """
        if self.graph is None:
            raise ValueError("Graph not built yet. Call build_graph first.")
            
        # Extract ego network
        subgraph = nx.ego_graph(self.graph, center_node, radius=radius)
        return subgraph


In [109]:
import numpy as np
import torch
import networkx as nx
from typing import Dict, List, Tuple, Union
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
import logging
import pandas as pd 

class GraphBuilder:
    def __init__(
        self,
        semantic_threshold: float = 0.85,
        metadata_threshold: float = 0.95,
        max_semantic_edges: int = 10,
        max_metadata_edges: int = 5,
        device: str = 'cuda'
    ):
        """
        Initialize the GraphBuilder.
        
        Args:
            similarity_threshold: Minimum similarity score to create an edge
            max_edges_per_node: Maximum number of edges per node
            device: 'cuda' or 'cpu'
        """
        # Set up logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        
        self.semantic_threshold = semantic_threshold
        self.metadata_threshold = metadata_threshold
        self.max_semantic_edges = max_semantic_edges
        self.max_metadata_edges = max_metadata_edges
        self.device = device
        self.graph = None
        
        # Feature weights for combining similarities
        self.weights = {
            'semantic': 0.7,    # Higher weight for semantic similarity
            'crop': 0.1,       # Weight for same crop
            'query_type': 0.1,  # Weight for same query type
            'location': 0.05,   # Weight for same location
            'season': 0.05      # Weight for same season
        }
        
    def compute_text_similarity_edges(
        self,
        embeddings: np.ndarray,
        batch_size: int = 128
    ) -> List[Tuple[int, int, float]]:
        """
        Compute edges based on text embedding similarity.
        Uses batched processing to handle large matrices.
        
        Args:
            embeddings: numpy array of text embeddings
            batch_size: size of batches for similarity computation
            
        Returns:
            List of (source, target, weight) tuples
        """
        num_samples = len(embeddings)
        edges = []
        
        # Convert to torch tensors
        embeddings_tensor = torch.tensor(embeddings).to(self.device)
        
        # Normalize embeddings for cosine similarity
        embeddings_tensor = torch.nn.functional.normalize(embeddings_tensor, p=2, dim=1)
        
        self.logger.info("Computing text similarity edges...")
        for i in tqdm(range(0, num_samples, batch_size)):
            batch_end = min(i + batch_size, num_samples)
            batch = embeddings_tensor[i:batch_end]
            
            # Compute similarity for the batch
            similarities = torch.mm(batch, embeddings_tensor.t())
            
            # Get top k similar nodes for each node in batch
            values, indices = torch.topk(similarities, k=self.max_semantic_edges + 1)
            
            # Convert to CPU and numpy for processing
            values = values.cpu().numpy()
            indices = indices.cpu().numpy()
            
            # Create edges for nodes in batch
            for idx, (node_values, node_indices) in enumerate(zip(values, indices)):
                node_idx = i + idx
                
                # Skip self-loops and low similarity edges
                for sim, target in zip(node_values, node_indices):
                    if (sim > self.semantic_threshold and 
                        node_idx != target):
                        edges.append((node_idx, target.item(), sim.item()))
        
        return edges
    
    def compute_metadata_similarity(
        self,
        categorical_features: Dict[str, np.ndarray],
        temporal_features: np.ndarray,
        processed_df: pd.DataFrame
    ) -> List[Tuple[int, int, float]]:
        """
        Compute edges based on weighted metadata similarity.
        """
        """
        Compute edges based on metadata similarity.
        
        Args:
            categorical_features: Dict of categorical feature matrices
            temporal_features: Matrix of temporal features
            
        Returns:
            List of (source, target, weight) tuples
        """
        edges = []
        num_samples = len(temporal_features)
        
        self.logger.info("Computing metadata similarity edges...")
        for feature_name, feature_matrix in tqdm(categorical_features.items(), 
                                               desc="Processing categorical features"):
            # Convert to torch tensors
            feature_tensor = torch.tensor(feature_matrix).float().to(self.device)
            
            # Compute similarity
            similarities = torch.mm(feature_tensor, feature_tensor.t())
            
            # Calculate weighted similarity
            if feature_name in ['Crop', 'QueryType']:
                weight = self.weights.get(feature_name.lower(), 0.05)
                
                # Get top matches for each node
                values, indices = torch.topk(similarities, k=self.max_metadata_edges + 1)
                values = values.cpu().numpy()
                indices = indices.cpu().numpy()
                
                # Add weighted edges
                for idx, (node_values, node_indices) in enumerate(zip(values, indices)):
                    for sim, target in zip(node_values[1:], node_indices[1:]):  # Skip self
                        if sim > self.metadata_threshold:
                            edges.append((idx, target, sim * weight))
        
        return edges
    
    def build_graph(
        self,
        processed_data: Dict[str, Union[np.ndarray, Dict]]
    ) -> nx.Graph:
        """
        Build the knowledge graph using processed features.
        
        Args:
            processed_data: Dictionary containing processed features
            
        Returns:
            NetworkX graph
        """
        self.logger.info("Starting graph construction...")
        
        # Initialize graph
        G = nx.Graph()
        
        # Add nodes with features
        for idx in range(len(processed_data['query_embeddings'])):
            G.add_node(
                idx,
                query_embedding=processed_data['query_embeddings'][idx],
                answer_embedding=processed_data['answer_embeddings'][idx]
            )
        
        # Compute edges based on text similarity
        text_edges = self.compute_text_similarity_edges(
            processed_data['query_embeddings']
        )
        
        # Compute edges based on metadata
        metadata_edges = self.compute_metadata_similarity(
            processed_data['categorical_features'],
            processed_data['temporal_features']
        )
        
        # Add all edges to graph
        self.logger.info("Adding edges to graph...")
        G.add_weighted_edges_from(text_edges)
        G.add_weighted_edges_from(metadata_edges)
        
        # Basic graph statistics
        self.logger.info(f"Graph constructed with {G.number_of_nodes()} nodes and "
                        f"{G.number_of_edges()} edges")
        
        self.graph = G
        return G
    
    def get_node_neighbors(
        self,
        node_idx: int,
        k: int = 5
    ) -> List[Tuple[int, float]]:
        """
        Get k-nearest neighbors for a node.
        
        Args:
            node_idx: Index of the node
            k: Number of neighbors to return
            
        Returns:
            List of (neighbor_idx, similarity) tuples
        """
        if self.graph is None:
            raise ValueError("Graph not built yet. Call build_graph first.")
            
        neighbors = []
        for neighbor in self.graph.neighbors(node_idx):
            similarity = self.graph[node_idx][neighbor]['weight']
            neighbors.append((neighbor, similarity))
        
        # Sort by similarity and return top k
        neighbors.sort(key=lambda x: x[1], reverse=True)
        return neighbors[:k]

    def visualize_subgraph(
        self,
        center_node: int,
        radius: int = 2
    ) -> nx.Graph:
        """
        Extract a subgraph centered around a node for visualization.
        
        Args:
            center_node: Index of the central node
            radius: Number of hops to include
            
        Returns:
            NetworkX subgraph
        """
        if self.graph is None:
            raise ValueError("Graph not built yet. Call build_graph first.")
            
        # Extract ego network
        subgraph = nx.ego_graph(self.graph, center_node, radius=radius)
        return subgraph


In [112]:
import numpy as np
import torch
import networkx as nx
from typing import Dict, List, Tuple, Union
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
import logging

class GraphBuilder:
    def __init__(
        self,
        semantic_threshold: float = 0.85,
        metadata_threshold: float = 0.95,
        max_semantic_edges: int = 10,
        max_metadata_edges: int = 5,
        device: str = 'cuda'
    ):
        """
        Initialize the GraphBuilder.
        
        Args:
            similarity_threshold: Minimum similarity score to create an edge
            max_edges_per_node: Maximum number of edges per node
            device: 'cuda' or 'cpu'
        """
        # Set up logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        
        self.semantic_threshold = semantic_threshold
        self.metadata_threshold = metadata_threshold
        self.max_semantic_edges = max_semantic_edges
        self.max_metadata_edges = max_metadata_edges
        self.device = device
        self.graph = None
        
        # Feature weights for combining similarities
        self.weights = {
            'semantic': 0.7,    # Higher weight for semantic similarity
            'crop': 0.1,       # Weight for same crop
            'query_type': 0.1,  # Weight for same query type
            'location': 0.05,   # Weight for same location
            'season': 0.05      # Weight for same season
        }
        
    def compute_text_similarity_edges(
        self,
        embeddings: np.ndarray,
        batch_size: int = 128
    ) -> List[Tuple[int, int, float]]:
        """
        Compute edges based on text embedding similarity.
        Uses batched processing to handle large matrices.
        
        Args:
            embeddings: numpy array of text embeddings
            batch_size: size of batches for similarity computation
            
        Returns:
            List of (source, target, weight) tuples
        """
        num_samples = len(embeddings)
        edges = []
        
        # Convert to torch tensors
        embeddings_tensor = torch.tensor(embeddings).to(self.device)
        
        # Normalize embeddings for cosine similarity
        embeddings_tensor = torch.nn.functional.normalize(embeddings_tensor, p=2, dim=1)
        
        self.logger.info("Computing text similarity edges...")
        for i in tqdm(range(0, num_samples, batch_size)):
            batch_end = min(i + batch_size, num_samples)
            batch = embeddings_tensor[i:batch_end]
            
            # Compute similarity for the batch
            similarities = torch.mm(batch, embeddings_tensor.t())
            
            # Get top k similar nodes for each node in batch
            values, indices = torch.topk(similarities, k=self.max_semantic_edges + 1)
            
            # Convert to CPU and numpy for processing
            values = values.cpu().numpy()
            indices = indices.cpu().numpy()
            
            # Create edges for nodes in batch
            for idx, (node_values, node_indices) in enumerate(zip(values, indices)):
                node_idx = i + idx
                
                # Skip self-loops and low similarity edges
                for sim, target in zip(node_values, node_indices):
                    if (sim > self.semantic_threshold and 
                        node_idx != target):
                        edges.append((node_idx, target.item(), sim.item()))
        
        return edges
    
    def compute_metadata_similarity(
        self,
        categorical_features: Dict[str, np.ndarray],
        temporal_features: np.ndarray,
        processed_df: pd.DataFrame
    ) -> List[Tuple[int, int, float]]:
        """
        Compute edges based on weighted metadata similarity.
        """
        """
        Compute edges based on metadata similarity.
        
        Args:
            categorical_features: Dict of categorical feature matrices
            temporal_features: Matrix of temporal features
            
        Returns:
            List of (source, target, weight) tuples
        """
        edges = []
        num_samples = len(temporal_features)
        
        self.logger.info("Computing metadata similarity edges...")
        for feature_name, feature_matrix in tqdm(categorical_features.items(), 
                                               desc="Processing categorical features"):
            # Convert to torch tensors
            feature_tensor = torch.tensor(feature_matrix).float().to(self.device)
            
            # Compute similarity
            similarities = torch.mm(feature_tensor, feature_tensor.t())
            
            # Calculate weighted similarity
            if feature_name in ['Crop', 'QueryType']:
                weight = self.weights.get(feature_name.lower(), 0.05)
                
                # Get top matches for each node
                values, indices = torch.topk(similarities, k=self.max_metadata_edges + 1)
                values = values.cpu().numpy()
                indices = indices.cpu().numpy()
                
                # Add weighted edges
                for idx, (node_values, node_indices) in enumerate(zip(values, indices)):
                    for sim, target in zip(node_values[1:], node_indices[1:]):  # Skip self
                        if sim > self.metadata_threshold:
                            edges.append((idx, target, sim * weight))
        
        return edges
    
    def build_graph(
        self,
        processed_data: Dict[str, Union[np.ndarray, Dict]],
        processed_df: Union[pd.DataFrame, pl.DataFrame] = None
    ) -> nx.Graph:
        """
        Build the knowledge graph using processed features.
        
        Args:
            processed_data: Dictionary containing processed features
            processed_df: Original processed dataframe with metadata
            
        Returns:
            NetworkX graph
        """
        """
        Build the knowledge graph using processed features.
        
        Args:
            processed_data: Dictionary containing processed features
            
        Returns:
            NetworkX graph
        """
        self.logger.info("Starting graph construction...")
        
        # Initialize graph
        G = nx.Graph()
        
        # Add nodes with features
        for idx in range(len(processed_data['query_embeddings'])):
            G.add_node(
                idx,
                query_embedding=processed_data['query_embeddings'][idx],
                answer_embedding=processed_data['answer_embeddings'][idx]
            )
        
        # Compute edges based on text similarity
        text_edges = self.compute_text_similarity_edges(
            processed_data['query_embeddings']
        )
        
        # Compute edges based on metadata
        metadata_edges = self.compute_metadata_similarity(
            processed_data['categorical_features'],
            processed_data['temporal_features'],
            processed_df if processed_df is not None else processed_data.get('processed_df')
        )
        
        # Add all edges to graph
        self.logger.info("Adding edges to graph...")
        G.add_weighted_edges_from(text_edges)
        G.add_weighted_edges_from(metadata_edges)
        
        # Basic graph statistics
        self.logger.info(f"Graph constructed with {G.number_of_nodes()} nodes and "
                        f"{G.number_of_edges()} edges")
        
        self.graph = G
        return G
    
    def get_node_neighbors(
        self,
        node_idx: int,
        k: int = 5
    ) -> List[Tuple[int, float]]:
        """
        Get k-nearest neighbors for a node.
        
        Args:
            node_idx: Index of the node
            k: Number of neighbors to return
            
        Returns:
            List of (neighbor_idx, similarity) tuples
        """
        if self.graph is None:
            raise ValueError("Graph not built yet. Call build_graph first.")
            
        neighbors = []
        for neighbor in self.graph.neighbors(node_idx):
            similarity = self.graph[node_idx][neighbor]['weight']
            neighbors.append((neighbor, similarity))
        
        # Sort by similarity and return top k
        neighbors.sort(key=lambda x: x[1], reverse=True)
        return neighbors[:k]

    def visualize_subgraph(
        self,
        center_node: int,
        radius: int = 2
    ) -> nx.Graph:
        """
        Extract a subgraph centered around a node for visualization.
        
        Args:
            center_node: Index of the central node
            radius: Number of hops to include
            
        Returns:
            NetworkX subgraph
        """
        if self.graph is None:
            raise ValueError("Graph not built yet. Call build_graph first.")
            
        # Extract ego network
        subgraph = nx.ego_graph(self.graph, center_node, radius=radius)
        return subgraph

In [113]:
# import polars as pl
# import pandas as pd
# from data_processor import DataProcessor
# from graph_builder import GraphBuilder

# Take first 10000 rows
df = filtered_df.head(10000)

# Step 1: Process the data
processor = DataProcessor(
    model_name='BAAI/bge-large-en-v1.5',
    device='cuda',
    batch_size=128
)
processed_data = processor.process_all(df)

# Step 2: Build the graph with stricter parameters
builder = GraphBuilder(
    semantic_threshold=0.85,    # High threshold for semantic similarity
    metadata_threshold=0.95,    # Very high threshold for metadata matching
    max_semantic_edges=10,      # Limit semantic edges
    max_metadata_edges=5,       # Limit metadata edges
    device='cuda'
)

# Create the knowledge graph - now passing the processed dataframe
G = builder.build_graph(
    processed_data=processed_data,
    processed_df=processed_data['processed_df']
)

# Print statistics
print(f"\nGraph Statistics:")
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")
print(f"Average degree: {2*G.number_of_edges()/G.number_of_nodes():.2f}")

# Look at sample connections
sample_node = 0
neighbors = builder.get_node_neighbors(sample_node, k=3)
print(f"\nExample Query and Similar Questions:")
print(f"\nOriginal Question: {df['QueryText'][sample_node]}")
print("\nSimilar Questions:")
for neighbor_idx, similarity in neighbors:
    print(f"Similarity: {similarity:.3f}")
    print(f"Question: {df['QueryText'][neighbor_idx]}")
    print(f"Answer: {df['KccAns'][neighbor_idx]}")
    print()

INFO:__main__:Loading text encoder model: BAAI/bge-large-en-v1.5
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-large-en-v1.5
INFO:__main__:Starting complete data processing pipeline
INFO:__main__:Starting dataframe preprocessing
INFO:__main__:Cleaning text fields...


Processing text columns:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:__main__:Cleaning categorical fields...


Processing categorical columns:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:__main__:Date column already in correct format
INFO:__main__:Dataframe preprocessing completed successfully
INFO:__main__:Generating embeddings for 10000 texts
INFO:__main__:Generating new embeddings for 10000 texts


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

INFO:__main__:Generating embeddings for 10000 texts
INFO:__main__:Generating new embeddings for 9977 texts


Batches:   0%|          | 0/78 [00:00<?, ?it/s]

INFO:__main__:Creating categorical features
INFO:__main__:Creating one-hot encodings for categorical features...


Processing categorical features:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:__main__:Categorical features created successfully
INFO:__main__:Creating temporal features
INFO:__main__:Temporal features created successfully
INFO:__main__:Complete processing pipeline finished successfully
INFO:__main__:Starting graph construction...
INFO:__main__:Computing text similarity edges...


  0%|          | 0/79 [00:00<?, ?it/s]

INFO:__main__:Computing metadata similarity edges...


Processing categorical features:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:__main__:Adding edges to graph...
INFO:__main__:Graph constructed with 10000 nodes and 158368 edges



Graph Statistics:
Number of nodes: 10000
Number of edges: 158368
Average degree: 31.67

Example Query and Similar Questions:

Original Question: FERTILIZER DOSES OF COCONUT

Similar Questions:
Similarity: 0.100
Question: COCONUT PRICE
Answer: GIVEN

Similarity: 0.100
Question: COCONUT PRICE
Answer: PRICE DETAILS GIVEN

Similarity: 0.100
Question: COCONUT MITES
Answer: 2 ML MANOLT

