In [4]:
%pip install transformers

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading huggingface_hub-0.29.3-py3-none-any.whl (468 kB)
Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (471 kB)
Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K  

In [None]:
import os
import pandas as pd
import json
import re
import torch
import numpy as np
from datetime import datetime, timedelta
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
from torch.nn import functional as F

# Load CSV data
def load_csv_data(data_dir='.'):
    """Load the soil, crop and site-constant tables used by the assistant.

    Args:
        data_dir: Directory that contains ``soil_properties.csv``,
            ``crop_properties.csv`` and ``krishnan_kovil_constants.csv``.
            Defaults to the current working directory, which is where the
            files were always read from before this parameter existed.

    Returns:
        Tuple ``(soil_df, crop_df, constants_df)`` of pandas DataFrames, in
        that order.

    Raises:
        FileNotFoundError: If one of the three CSV files does not exist.
    """
    file_names = (
        'soil_properties.csv',
        'crop_properties.csv',
        'krishnan_kovil_constants.csv',
    )
    soil_df, crop_df, constants_df = (
        pd.read_csv(os.path.join(data_dir, file_name)) for file_name in file_names
    )

    print(f"Loaded {len(soil_df)} soil types")
    print(f"Loaded {len(crop_df)} crop types")

    return soil_df, crop_df, constants_df

# Tamil language mappings.
# Each table maps a Tamil phrase (as it may appear in a farmer's message) to
# the English label used as the soil/crop value elsewhere in this file.
# FarmerConversationLLM looks these up in both directions: Tamil -> English
# when extracting entities, English -> Tamil when phrasing the confirmation.
tamil_soil_mappings = {
    'சிவப்பு மண்': 'Red Soil',
    'கருப்பு களிமண்': 'Black Clayey Soil',
    'பழுப்பு மண்': 'Brown Soil',
    'வண்டல் மண்': 'Alluvial Soil'
}

# Tamil crop name -> English crop label.
tamil_crop_mappings = {
    'நெல்': 'Rice',
    'கரும்பு': 'Sugarcane',
    'நிலக்கடலை': 'Groundnut',
    'பருத்தி': 'Cotton',
    'வாழை': 'Banana'
}

# Hardcoded weather data (to be replaced with API later).
# Attached unchanged, under the 'weather' key, to the JSON payload that
# generate_response builds once the farmer confirms their details.
hardcoded_weather = {
    'temperature': 32.0,  # °C
    'humidity': 75.0,     # %
    'wind_speed': 2.5,    # m/s
    'rainfall': 3.0       # mm
}

class FarmerConversationLLM:
    def __init__(self):
        """Load the datasets and the MuRIL model and start an empty conversation.

        Side effects: reads the three CSV files via ``load_csv_data``,
        loads ``google/muril-base-cased`` (tokenizer and masked-LM weights),
        prints progress messages, and encodes the built-in example
        conversations.
        """
        # Load datasets
        self.soil_df, self.crop_df, self.constants_df = load_csv_data()

        # Initialize MuRIL model and tokenizer
        print("Loading MuRIL model...")
        self.tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
        self.model = AutoModelForMaskedLM.from_pretrained("google/muril-base-cased")

        # Choose the device and move the model BEFORE building the pipeline.
        # The pipeline records its device at construction time; moving the
        # model afterwards leaves the pipeline sending inputs to a different
        # device than the one the weights live on.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Ensure model is in evaluation mode
        self.model.eval()

        # Initialize fill-mask pipeline for text completion, pinned to the
        # same device as the model.
        self.fill_mask = pipeline(
            "fill-mask",
            model=self.model,
            tokenizer=self.tokenizer,
            device=self.device
        )

        print(f"Model loaded and running on {self.device}")

        # Initialize conversation state (single definition shared with reset)
        self.reset_conversation()

        # Define prompts for different conversation stages
        self.prompts = {
            'english': {
                'greeting': "Hello! I'm here to help calculate water requirements for your crops. Can you tell me about your farm?",
                'ask_soil': "What type of soil do you have in your farm?",
                'ask_crop': "What crop are you growing?",
                'ask_planting': "When did you plant your crop?",
                'confirm': "Thank you for providing the information. Let me confirm: you have {soil_type} soil, growing {crop_type}, and your crop is in the {growth_stage} stage. Is this correct?",
                'complete': "Great! Here's the information I'll send to calculate your water requirements:\n{data_json}"
            },
            'tamil': {
                'greeting': "வணக்கம்! உங்கள் பயிர்களுக்கான நீர் தேவைகளை கணக்கிட நான் உதவுகிறேன். உங்கள் பண்ணையைப் பற்றி சொல்லுங்கள்?",
                'ask_soil': "உங்கள் பண்ணையில் எந்த வகை மண் உள்ளது?",
                'ask_crop': "நீங்கள் என்ன பயிர் வளர்க்கிறீர்கள்?",
                'ask_planting': "எப்போது உங்கள் பயிரை நட்டீர்கள்?",
                'confirm': "தகவல் வழங்கியதற்கு நன்றி. உறுதிப்படுத்துகிறேன்: உங்களிடம் {soil_type} மண் உள்ளது, {crop_type} வளர்க்கிறீர்கள், மற்றும் உங்கள் பயிர் {growth_stage} நிலையில் உள்ளது. இது சரியா?",
                'complete': "அருமை! உங்கள் நீர் தேவைகளை கணக்கிட நான் அனுப்பும் தகவல்கள்:\n{data_json}"
            }
        }

        # Load example conversations for similarity matching
        self.example_conversations = self.load_example_conversations()
    
    def load_example_conversations(self):
        """Load example conversations for similarity matching"""
        # These would ideally come from a database or file
        examples = [
            {
                'english': "I have red soil and I'm growing rice. I planted it 30 days ago.",
                'tamil': "என்னிடம் சிவப்பு மண் உள்ளது, நான் நெல் பயிரிடுகிறேன். 30 நாட்களுக்கு முன் நட்டேன்.",
                'soil_type': 'Red Soil',
                'crop_type': 'Rice',
                'days_ago': 30
            },
            {
                'english': "My farm has black clayey soil and I'm growing sugarcane for 2 months now.",
                'tamil': "என் பண்ணையில் கருப்பு களிமண் உள்ளது, நான் 2 மாதங்களாக கரும்பு வளர்க்கிறேன்.",
                'soil_type': 'Black Clayey Soil',
                'crop_type': 'Sugarcane',
                'days_ago': 60
            },
            {
                'english': "I'm a groundnut farmer. My soil is brown soil. Planted about 45 days back.",
                'tamil': "நான் ஒரு நிலக்கடலை விவசாயி. என் மண் பழுப்பு மண். சுமார் 45 நாட்களுக்கு முன் நட்டேன்.",
                'soil_type': 'Brown Soil',
                'crop_type': 'Groundnut',
                'days_ago': 45
            }
        ]
        
        # Encode examples for faster similarity matching
        for example in examples:
            example['english_encoding'] = self.encode_text(example['english'])
            example['tamil_encoding'] = self.encode_text(example['tamil'])
        
        return examples
    
    def detect_language(self, text):
        """Detect if text is in Tamil or English"""
        tamil_chars = [c for c in text if '\u0B80' <= c <= '\u0BFF']
        return 'tamil' if tamil_chars else 'english'
    
    def encode_text(self, text):
        """Encode text using MuRIL model"""
        # Tokenize text
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Get hidden states
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)
            # Use last hidden state of [CLS] token as sentence embedding
            hidden_states = outputs.hidden_states[-1]
            cls_embedding = hidden_states[:, 0, :].cpu().numpy()
        
        return cls_embedding
    
    def find_most_similar_example(self, text):
        """Find the most similar example to the input text"""
        text_encoding = self.encode_text(text)
        language = self.detect_language(text)
        
        best_similarity = -1
        best_example = None
        
        for example in self.example_conversations:
            encoding_key = 'tamil_encoding' if language == 'tamil' else 'english_encoding'
            example_encoding = example[encoding_key]
            
            # Calculate cosine similarity
            similarity = np.dot(text_encoding.flatten(), example_encoding.flatten()) / (
                np.linalg.norm(text_encoding) * np.linalg.norm(example_encoding)
            )
            
            if similarity > best_similarity:
                best_similarity = similarity
                best_example = example
        
        # Only return if similarity is above threshold
        if best_similarity > 0.7:
            return best_example
        return None
    
    def extract_entities(self, text):
        """Extract soil type, crop type, and planting information from text"""
        results = {
            'soil_type': None,
            'crop_type': None,
            'planting_info': None,
            'growth_stage': None
        }
        
        # Detect language
        language = self.detect_language(text)
        text_lower = text.lower()
        
        # Try to find similar example first for fast-path extraction
        similar_example = self.find_most_similar_example(text)
        if similar_example:
            results['soil_type'] = similar_example['soil_type']
            results['crop_type'] = similar_example['crop_type']
            days_ago = similar_example['days_ago']
            results['planting_info'] = {'type': 'days_ago', 'value': days_ago}
            
            # Determine growth stage using the crop information from similar example
            if results['crop_type']:
                results['growth_stage'] = self.determine_growth_stage(days_ago, results['crop_type'])
            
            return results
        
        # Extract soil type
        # First check for Tamil soil names
        if language == 'tamil':
            for tamil_soil, english_soil in tamil_soil_mappings.items():
                if tamil_soil in text:
                    results['soil_type'] = english_soil
                    break
        
        # Then check for English soil names
        if not results['soil_type']:
            for _, row in self.soil_df.iterrows():
                soil_type = row['soil_type']
                if soil_type.lower() in text_lower:
                    results['soil_type'] = soil_type
                    break
            
            # Check for alternative soil names
            if not results['soil_type']:
                soil_alternatives = {
                    'red': 'Red Soil',
                    'black': 'Black Clayey Soil',
                    'brown': 'Brown Soil',
                    'alluvial': 'Alluvial Soil',
                    'clay': 'Black Clayey Soil'
                }
                
                for alt, soil in soil_alternatives.items():
                    if alt in text_lower:
                        results['soil_type'] = soil
                        break
        
        # Extract crop type
        # First check for Tamil crop names
        if language == 'tamil':
            for tamil_crop, english_crop in tamil_crop_mappings.items():
                if tamil_crop in text:
                    results['crop_type'] = english_crop
                    break
        
        # Then check for English crop names
        if not results['crop_type']:
            for _, row in self.crop_df.iterrows():
                crop_type = row['crop']
                if crop_type.lower() in text_lower:
                    results['crop_type'] = crop_type
                    break
            
            # Check for alternative crop names
            if not results['crop_type']:
                crop_alternatives = {
                    'paddy': 'Rice',
                    'groundnuts': 'Groundnut',
                    'peanut': 'Groundnut',
                    'sugarcanes': 'Sugarcane',
                    'bananas': 'Banana'
                }
                
                for alt, crop in crop_alternatives.items():
                    if alt in text_lower:
                        results['crop_type'] = crop
                        break
        
        # Extract planting information (days since planting)
        days_pattern = r'(\d+)\s+days?\s+ago'
        months_pattern = r'(\d+)\s+months?\s+ago'
        tamil_days_pattern = r'(\d+)\s+நாட்களுக்கு\s+முன்'
        
        # Check for days ago pattern
        days_match = re.search(days_pattern, text_lower)
        if days_match:
            days = int(days_match.group(1))
            results['planting_info'] = {'type': 'days_ago', 'value': days}
        
        # Check for months ago pattern
        if not results['planting_info']:
            months_match = re.search(months_pattern, text_lower)
            if months_match:
                months = int(months_match.group(1))
                days = months * 30  # Approximate
                results['planting_info'] = {'type': 'days_ago', 'value': days}
        
        # Check for Tamil days pattern
        if not results['planting_info'] and language == 'tamil':
            tamil_days_match = re.search(tamil_days_pattern, text)
            if tamil_days_match:
                days = int(tamil_days_match.group(1))
                results['planting_info'] = {'type': 'days_ago', 'value': days}
        
        # If we have planting info and crop type, determine growth stage
        if results['planting_info'] and results['planting_info']['type'] == 'days_ago' and results['crop_type']:
            days_ago = results['planting_info']['value']
            results['growth_stage'] = self.determine_growth_stage(days_ago, results['crop_type'])
        
        return results
    
    def determine_growth_stage(self, days_ago, crop_type):
        """Determine growth stage based on days since planting and crop type"""
        # Find the crop in our dataset
        crop_row = self.crop_df[self.crop_df['crop'] == crop_type]
        
        if crop_row.empty:
            return None
        
        # Get growth stage durations
        initial_days = crop_row['stage_initial_days'].values[0]
        development_days = crop_row['stage_development_days'].values[0]
        mid_season_days = crop_row['stage_mid_season_days'].values[0]
        
        # Calculate cumulative days
        initial_end = initial_days
        development_end = initial_end + development_days
        mid_season_end = development_end + mid_season_days
        
        # Determine stage
        if days_ago <= initial_end:
            return 'initial'
        elif days_ago <= development_end:
            return 'development'
        elif days_ago <= mid_season_end:
            return 'mid_season'
        else:
            return 'late_season'
    
    def get_next_required_info(self):
        """Determine what information we still need to ask for"""
        if not self.state['soil_type']:
            return 'soil_type'
        elif not self.state['crop_type']:
            return 'crop_type'
        elif not self.state['growth_stage'] and not self.state['planting_info']:
            return 'planting_info'
        return None
    
    def get_completion_with_muril(self, prompt, max_length=50):
        """Use MuRIL to generate a conversational completion"""
        # This is a workaround using a masked language model for completion
        # Not ideal but can provide some variability in responses
        
        prompt = prompt.strip()
        # Add mask token to end of prompt
        completion_prompt = f"{prompt} {self.tokenizer.mask_token}"
        
        generated_text = prompt
        
        # Generate one token at a time
        for _ in range(max_length):
            # Get model prediction for next token
            fill_mask_results = self.fill_mask(completion_prompt)
            next_token = fill_mask_results[0]["token_str"]
            
            # Break if end of sentence is reached
            if next_token in ['.', '?', '!']:
                generated_text += next_token
                break
            
            # Add token to generated text
            generated_text += " " + next_token
            
            # Update completion prompt
            completion_prompt = f"{generated_text} {self.tokenizer.mask_token}"
        
        return generated_text
    
    def generate_response(self, user_input):
        """Generate response using the MuRIL model and conversation state"""
        # Add user input to conversation history
        self.state['conversation_history'].append({'role': 'user', 'content': user_input})
        
        # Detect language
        language = self.detect_language(user_input)
        self.state['language'] = language
        lang_key = 'tamil' if language == 'tamil' else 'english'
        
        # Extract entities from user input
        extracted_data = self.extract_entities(user_input)
        
        # Update conversation state with extracted data
        if extracted_data['soil_type']:
            self.state['soil_type'] = extracted_data['soil_type']
        
        if extracted_data['crop_type']:
            self.state['crop_type'] = extracted_data['crop_type']
        
        if extracted_data['growth_stage']:
            self.state['growth_stage'] = extracted_data['growth_stage']
        
        if extracted_data['planting_info']:
            self.state['planting_info'] = extracted_data['planting_info']
        
        # Determine what to ask next
        next_required = self.get_next_required_info()
        
        # Generate response based on conversation state
        if not self.state['conversation_history'][:-1]:
            # First message - greeting
            response = self.prompts[lang_key]['greeting']
        elif next_required == 'soil_type':
            response = self.prompts[lang_key]['ask_soil']
        elif next_required == 'crop_type':
            response = self.prompts[lang_key]['ask_crop']
        elif next_required == 'planting_info':
            response = self.prompts[lang_key]['ask_planting']
        else:
            # We have all the information we need
            if language == 'tamil':
                tamil_soil = next((k for k, v in tamil_soil_mappings.items() if v == self.state['soil_type']), 
                                  self.state['soil_type'])
                tamil_crop = next((k for k, v in tamil_crop_mappings.items() if v == self.state['crop_type']), 
                                 self.state['crop_type'])
                
                # Map growth stage to Tamil
                growth_stage_tamil = {
                    'initial': 'ஆரம்ப நிலை',
                    'development': 'வளர்ச்சி நிலை',
                    'mid_season': 'நடு பருவம்',
                    'late_season': 'இறுதி பருவம்'
                }
                tamil_stage = growth_stage_tamil.get(self.state['growth_stage'], self.state['growth_stage'])
                
                # Format confirmation message
                response = self.prompts[lang_key]['confirm'].format(
                    soil_type=tamil_soil,
                    crop_type=tamil_crop,
                    growth_stage=tamil_stage
                )
            else:
                # Format confirmation message in English
                response = self.prompts[lang_key]['confirm'].format(
                    soil_type=self.state['soil_type'],
                    crop_type=self.state['crop_type'],
                    growth_stage=self.state['growth_stage']
                )
            
            # Check if user confirms the information
            if 'yes' in user_input.lower() or 'correct' in user_input.lower() or 'சரி' in user_input:
                # Prepare data for backend
                backend_data = {
                    'soil_type': self.state['soil_type'],
                    'crop_type': self.state['crop_type'],
                    'growth_stage': self.state['growth_stage'],
                    'planting_info': self.state['planting_info'],
                    'weather': hardcoded_weather
                }
                
                # Format the data as JSON
                data_json = json.dumps(backend_data, indent=2)
                
                # Return completion message with data
                response = self.prompts[lang_key]['complete'].format(data_json=data_json)
        
        # Add response to conversation history
        self.state['conversation_history'].append({'role': 'assistant', 'content': response})
        
        return response
    
    def reset_conversation(self):
        """Reset the conversation state"""
        self.state = {
            'soil_type': None,
            'crop_type': None,
            'growth_stage': None,
            'planting_info': None,
            'language': 'english',
            'conversation_history': []
        }

# Main function to run the chatbot
def main():
    """Run the interactive farmer chat loop on standard input/output.

    Builds a ``FarmerConversationLLM`` and then reads one message per line.
    ``quit`` or ``exit`` ends the session, ``reset`` clears the conversation
    state, and any other non-blank line is answered by the assistant.
    End-of-input or a keyboard/kernel interrupt also ends the session
    cleanly instead of raising out of the loop.
    """
    # Initialize the conversation LLM
    print("Initializing MuRIL-based Farmer Conversation LLM...")
    conversation_llm = FarmerConversationLLM()

    print("\nFarmer Conversation System using MuRIL LLM")
    print("==========================================")
    print("Type 'quit' or 'exit' to leave, or 'reset' to start over.")
    print("\nStart chatting below:\n")

    while True:
        try:
            user_input = input("> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel: stop quietly.
            print()
            break

        # Nothing to answer on a blank line; just prompt again.
        if not user_input:
            continue

        command = user_input.lower()

        if command in ['quit', 'exit']:
            break

        if command == 'reset':
            conversation_llm.reset_conversation()
            print("Conversation reset. Let's start over.")
            continue

        try:
            response = conversation_llm.generate_response(user_input)
            print(f"\n{response}\n")
        except Exception as e:
            # Keep the session alive on a failed turn, but show what failed.
            print(f"Error: {e}")
            print("Sorry, I encountered an error. Let's try again.")

# Start the interactive chat when this code runs as the main module; the
# captured output below shows the loop starting when the notebook cell ran.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
2025-03-13 16:53:10.482497: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741864990.659649  746519 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741864990.708260  746519 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-13 16:53:11.154462: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Initializing MuRIL-based Farmer Conversation LLM...
Loaded 4 soil types
Loaded 5 crop types
Loading MuRIL model...


Some weights of the model checkpoint at google/muril-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Model loaded and running on cpu

Farmer Conversation System using MuRIL LLM
Type 'quit', 'exit', or 'reset' to start over.

Start chatting below:


Hello! I'm here to help calculate water requirements for your crops. Can you tell me about your farm?


Thank you for providing the information. Let me confirm: you have Red Soil soil, growing Rice, and your crop is in the initial stage. Is this correct?


Thank you for providing the information. Let me confirm: you have Red Soil soil, growing Rice, and your crop is in the initial stage. Is this correct?


Great! Here's the information I'll send to calculate your water requirements:
{
  "soil_type": "Black Clayey Soil",
  "crop_type": "Sugarcane",
  "growth_stage": "development",
  "planting_info": {
    "type": "days_ago",
    "value": 60
  },
  "weather": {
    "temperature": 32.0,
    "humidity": 75.0,
    "wind_speed": 2.5,
    "rainfall": 3.0
  }
}


Thank you for providing the information. Let me confirm: you have Black Clayey Soil so