In [29]:
import pandas as pd
import numpy as np
import os
import re
import json
import time
from pathlib import Path
from typing import Dict, List, Optional

# PDF processing libraries
import pdfplumber
import PyPDF2

# Vertex AI imports (instead of google.generativeai)
import vertexai
from vertexai.generative_models import GenerativeModel, SafetySetting, HarmCategory, HarmBlockThreshold

# Configure pandas display options for printed DataFrames (e.g. the dataset
# preview): no limit on the number of columns shown and a display width of 1000.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

class VertexAIGeotechnicalExtractor:
    """
    Advanced Geotechnical Data Extractor using Vertex AI Gemini
    Uses service account credentials for authentication

    Workflow: ``identify_file_types`` scans ``data_directory`` for PDFs,
    ``process_project`` extracts their text and asks Gemini for the parameters
    listed in ``extraction_schema``, and ``create_dataset`` / ``save_dataset``
    collect the validated answers into a DataFrame / CSV file.
    """

    # Vertex AI region and Gemini model used by setup_vertex_ai().
    DEFAULT_LOCATION = "us-central1"
    DEFAULT_MODEL_NAME = "gemini-1.5-pro"
    # Project used when the credentials file has no 'project_id' entry.
    FALLBACK_PROJECT_ID = "aiml-365220"
    # Maximum number of report characters embedded in one prompt.
    MAX_REPORT_CHARS = 15000
    # Number of generate/parse attempts per document.
    MAX_RETRIES = 3
    # Pause (seconds) between projects to respect API rate limits.
    REQUEST_DELAY_SECONDS = 2

    def __init__(self, data_directory: str, credentials_file: str):
        """Set up the model connection and the extraction schema.

        Args:
            data_directory: Directory that contains the PDF reports.
            credentials_file: Path to the service-account JSON key file.
        """
        self.data_dir = Path(data_directory)
        self.setup_vertex_ai(credentials_file)
        self.complete_reports = []
        self.text_plots = []

        # Define comprehensive feature schema
        self.extraction_schema = {
            "bearing_capacity": {
                "description": "Allowable bearing capacity in T/ft² or Tonne/ft²",
                "type": "float",
                "range": [0.1, 50.0]
            },
            "foundation_type": {
                "description": "Type of foundation recommended",
                "type": "string",
                "options": ["Strip", "Raft", "Pile", "Isolated", "Unknown"]
            },
            "liquid_limit_ll": {
                "description": "Liquid limit percentage from Atterberg limits",
                "type": "float",
                "range": [15.0, 100.0]
            },
            "plastic_limit_pl": {
                "description": "Plastic limit percentage from Atterberg limits",
                "type": "float",
                "range": [10.0, 50.0]
            },
            "plasticity_index": {
                "description": "Plasticity index (LL - PL)",
                "type": "float",
                "range": [0.0, 50.0]
            },
            "moisture_content_pct": {
                "description": "Natural moisture content percentage",
                "type": "float",
                "range": [0.0, 60.0]
            },
            "sand_pct": {
                "description": "Sand percentage from grain size analysis",
                "type": "float",
                "range": [0.0, 100.0]
            },
            "gravel_pct": {
                "description": "Gravel percentage from grain size analysis",
                "type": "float",
                "range": [0.0, 100.0]
            },
            "fines_pct": {
                "description": "Fines percentage (passing #200 sieve)",
                "type": "float",
                "range": [0.0, 100.0]
            },
            "spt_n_value": {
                "description": "Standard Penetration Test N-values",
                "type": "float",
                "range": [0.0, 100.0]
            },
            "uscs_classification": {
                "description": "Unified Soil Classification System",
                "type": "string",
                "options": ["CL", "CH", "ML", "MH", "SM", "SC", "SW", "SP", "GW", "GP", "GM", "GC"]
            },
            "bulk_density": {
                "description": "Bulk/dry density in g/cm³",
                "type": "float",
                "range": [1.0, 2.5]
            },
            "specific_gravity": {
                "description": "Specific gravity of soil solids",
                "type": "float",
                "range": [2.4, 2.8]
            },
            "consistency": {
                "description": "Soil consistency description",
                "type": "string",
                "options": ["soft", "firm", "stiff", "hard", "very soft", "very stiff", "loose", "dense"]
            },
            "soil_color": {
                "description": "Visual color of soil",
                "type": "string",
                "options": ["brown", "gray", "grey", "red", "yellow", "black", "white", "orange"]
            },
            "moisture_condition": {
                "description": "Moisture state of soil",
                "type": "string",
                "options": ["dry", "moist", "wet", "saturated", "damp"]
            }
        }

    def setup_vertex_ai(self, credentials_file: str):
        """Initialize Vertex AI using service account credentials.

        On success ``self.model`` holds the Gemini model; on any failure the
        error is printed and ``self.model`` is set to ``None`` (callers check
        this instead of catching an exception).

        Args:
            credentials_file: Path to the service-account JSON key file.
        """
        try:
            # Set the environment variable for authentication
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_file

            # Load project ID from credentials file
            with open(credentials_file, 'r', encoding='utf-8') as f:
                credentials = json.load(f)

            project_id = credentials.get('project_id', self.FALLBACK_PROJECT_ID)

            # Initialize Vertex AI
            vertexai.init(project=project_id, location=self.DEFAULT_LOCATION)

            # Initialize the Gemini model
            self.model = GenerativeModel(self.DEFAULT_MODEL_NAME)

            # Test the connection; only success/failure matters, the reply
            # itself is discarded.
            self.model.generate_content("Hello, test connection")
            print("✅ Vertex AI Gemini configured successfully with service account")

        except Exception as e:
            print(f"❌ Error setting up Vertex AI: {e}")
            print("💡 Make sure you have the correct service account permissions")
            self.model = None

    def identify_file_types(self):
        """Identify and categorize PDF files in the data directory.

        Files whose (lower-cased) name contains a ``NNNN-NN`` project ID are
        registered as a complete report or a text plot depending on whether
        the name contains "complete report" or "text plot". Other files are
        skipped.

        The two lists are rebuilt from scratch on every call, so calling this
        method (or ``create_dataset``) repeatedly does not duplicate entries.

        Returns:
            Tuple ``(complete_reports, text_plots)`` of lists of dicts with
            the keys ``project_id``, ``file_path`` and ``type``.
        """
        # Start clean so repeated scans stay idempotent.
        self.complete_reports = []
        self.text_plots = []

        if self.data_dir.is_dir():
            # Compare the suffix case-insensitively (".PDF" counts too) and
            # sort for a deterministic processing order.
            all_files = sorted(
                path for path in self.data_dir.iterdir()
                if path.is_file() and path.suffix.lower() == ".pdf"
            )
        else:
            all_files = []

        print(f"🔍 Found {len(all_files)} PDF files")

        for file in all_files:
            filename = file.name.lower()

            # Extract project ID pattern
            project_id_match = re.search(r'(\d{4}\s*-\s*\d{2})', filename)
            if not project_id_match:
                continue

            # "7155 - 25" and "7155-25" must map to the same project.
            clean_id = re.sub(r'\s+', '', project_id_match.group(1))

            if "complete report" in filename:
                self.complete_reports.append({
                    'project_id': clean_id,
                    'file_path': file,
                    'type': 'complete_report'
                })
                print(f"  ✅ Complete Report: {clean_id}")

            elif "text plot" in filename:
                self.text_plots.append({
                    'project_id': clean_id,
                    'file_path': file,
                    'type': 'text_plot'
                })
                print(f"  ✅ Text Plot: {clean_id}")

        print(f"\n📊 Summary: {len(self.complete_reports)} Complete Reports, {len(self.text_plots)} Text Plots")
        return self.complete_reports, self.text_plots

    def extract_pdf_text_comprehensive(self, pdf_file) -> str:
        """Extract page text and tables from a PDF as one string.

        Each page's text is preceded by a ``--- PAGE n ---`` marker and each
        table by a ``--- TABLE t ON PAGE n ---`` marker; table rows are
        rendered with `` | `` between cells.

        Args:
            pdf_file: Path (or file object) passed to ``pdfplumber.open``.

        Returns:
            The combined text, or ``""`` if anything goes wrong (the error is
            printed, not raised).
        """
        # Collect fragments and join once instead of repeated string +=.
        parts: List[str] = []

        try:
            with pdfplumber.open(pdf_file) as pdf:
                for page_num, page in enumerate(pdf.pages, start=1):
                    page_text = page.extract_text()
                    if page_text:
                        parts.append(f"\n--- PAGE {page_num} ---\n")
                        parts.append(page_text + "\n")

                    # Extract tables separately
                    tables = page.extract_tables() or []
                    for table_num, table in enumerate(tables, start=1):
                        parts.append(f"\n--- TABLE {table_num} ON PAGE {page_num} ---\n")
                        for row in table:
                            if row:
                                parts.append(" | ".join(str(cell) if cell else "" for cell in row) + "\n")

            full_text = "".join(parts)
            print(f"  📄 Extracted {len(full_text)} characters from PDF")
            return full_text

        except Exception as e:
            print(f"❌ Error extracting PDF text: {e}")
            return ""

    def create_extraction_prompt(self, pdf_text: str, project_id: str,
                                 max_chars: Optional[int] = None) -> str:
        """Create comprehensive extraction prompt.

        The parameter list and the JSON response template are both generated
        from ``self.extraction_schema`` so they cannot drift apart.

        Args:
            pdf_text: Text extracted from the report.
            project_id: Project identifier mentioned in the prompt.
            max_chars: Maximum number of characters of ``pdf_text`` to embed;
                defaults to ``MAX_REPORT_CHARS``.

        Returns:
            The complete prompt string.
        """
        limit = self.MAX_REPORT_CHARS if max_chars is None else max_chars

        feature_descriptions = []
        response_lines = []
        for feature, config in self.extraction_schema.items():
            desc = f"- **{feature}**: {config['description']}"
            if config['type'] == 'float':
                desc += f" (Range: {config['range'][0]}-{config['range'][1]})"
            elif 'options' in config:
                desc += f" (Options: {', '.join(config['options'])})"
            feature_descriptions.append(desc)
            response_lines.append(f'    "{feature}": {config["type"]} or null')

        feature_block = "\n".join(feature_descriptions)
        response_format = "{\n" + ",\n".join(response_lines) + "\n}"
        report_excerpt = pdf_text[:limit]

        prompt = f"""
You are an expert geotechnical engineer analyzing a soil investigation report for Project {project_id}.

EXTRACT THE FOLLOWING GEOTECHNICAL PARAMETERS:

{feature_block}

EXTRACTION RULES:
1. **ACCURACY**: Only extract values you are confident about
2. **VALIDATION**: Ensure values are within realistic ranges
3. **FOUNDATION TYPE PRIORITY**: If multiple types mentioned, prioritize recommendations
4. **BEARING CAPACITY**: Look for allowable/safe bearing capacity, not ultimate
5. **GRAIN SIZE**: Extract from sieve analysis tables
6. **ATTERBERG LIMITS**: Extract LL, PL, and calculate PI = LL - PL if not given
7. **SPT VALUES**: Extract Standard Penetration Test N-values from borehole logs
8. **MULTIPLE VALUES**: If multiple values exist, provide the most representative one

RESPONSE FORMAT:
Return ONLY a valid JSON object with these exact keys (use null for missing data):

{response_format}

GEOTECHNICAL REPORT TEXT:
{report_excerpt}

Extract the data now:
"""
        return prompt

    @staticmethod
    def _parse_json_response(response_text: str) -> Dict:
        """Parse the model reply into a dict, tolerating fences and chatter.

        Tries the stripped reply as-is first; if that fails, parses the span
        from the first ``{`` to the last ``}``, which covers replies wrapped
        in Markdown code fences or surrounded by explanatory prose.

        Raises:
            json.JSONDecodeError: If no JSON object can be parsed.
        """
        text = response_text.strip()
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            start, end = text.find("{"), text.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(text[start:end + 1])

        if not isinstance(parsed, dict):
            raise json.JSONDecodeError("Expected a JSON object", text, 0)
        return parsed

    def extract_with_vertex_ai(self, pdf_text: str, project_id: str) -> Optional[Dict]:
        """Extract geotechnical data using Vertex AI Gemini.

        Sends the extraction prompt up to ``MAX_RETRIES`` times, pausing
        between failed attempts, and validates the first reply that parses.

        Args:
            pdf_text: Text extracted from the report.
            project_id: Project identifier mentioned in the prompt.

        Returns:
            The validated parameter dict, or ``None`` if the model is
            unavailable or every attempt failed.
        """
        if not self.model:
            print("❌ Vertex AI model not available")
            return None

        try:
            print("  🤖 Analyzing with Vertex AI Gemini...")

            # Make the otherwise silent truncation visible to the operator.
            if len(pdf_text) > self.MAX_REPORT_CHARS:
                print(f"  ⚠️ Report text truncated to {self.MAX_REPORT_CHARS} of {len(pdf_text)} characters for the prompt")

            # Create extraction prompt
            prompt = self.create_extraction_prompt(pdf_text, project_id)

            # Generate response with retry logic
            for attempt in range(self.MAX_RETRIES):
                is_last_attempt = attempt == self.MAX_RETRIES - 1
                try:
                    response = self.model.generate_content(prompt)
                    extracted_data = self._parse_json_response(response.text)

                    # Validate extracted data
                    validated_data = self.validate_extraction(extracted_data)

                    print("  ✅ Vertex AI extraction successful")
                    return validated_data

                except json.JSONDecodeError as e:
                    print(f"  ⚠️ JSON parsing error (attempt {attempt + 1}): {e}")
                    if not is_last_attempt:
                        time.sleep(1)

                except Exception as e:
                    print(f"  ⚠️ Vertex AI error (attempt {attempt + 1}): {e}")
                    if not is_last_attempt:
                        time.sleep(2)

            return None

        except Exception as e:
            print(f"❌ Vertex AI extraction error: {e}")
            return None

    def _validate_numeric(self, feature: str, value, config: Dict) -> Optional[float]:
        """Return ``value`` as a float if it lies within the schema range, else None."""
        try:
            # bool is an int subclass; True/False is never a measurement.
            if isinstance(value, bool):
                raise TypeError("boolean is not a numeric measurement")
            float_val = float(value)
        except (ValueError, TypeError):
            print(f"    ⚠️ {feature}: Invalid numeric value {value}")
            return None

        min_val, max_val = config['range']
        if min_val <= float_val <= max_val:
            print(f"    ✅ {feature}: {float_val}")
            return float_val

        print(f"    ⚠️ {feature}: {float_val} outside range {min_val}-{max_val}")
        return None

    def _validate_string(self, feature: str, value, config: Dict) -> Optional[str]:
        """Return the canonical option matching ``value`` (case-insensitive), else None."""
        str_val = str(value).strip()
        options = config.get('options')
        if options is None:
            print(f"    ✅ {feature}: {str_val}")
            return str_val

        # Match ignoring case but store the spelling used in the schema, so
        # "Stiff" -> "stiff" and "raft" -> "Raft".
        canonical = {option.casefold(): option for option in options}
        matched = canonical.get(str_val.casefold())
        if matched is None:
            print(f"    ⚠️ {feature}: {str_val} not in valid options")
            return None

        print(f"    ✅ {feature}: {matched}")
        return matched

    def validate_extraction(self, data: Dict) -> Dict:
        """Validate and clean extracted data.

        Keys not present in ``extraction_schema`` are dropped. Numeric values
        must convert to float and fall inside the schema range; string values
        must match one of the schema options (case-insensitively). Anything
        that fails validation is stored as ``None``. If the liquid and
        plastic limits are valid but the plasticity index is missing, it is
        computed as ``LL - PL`` when the result lies in the schema range.

        Args:
            data: Raw mapping parsed from the model reply.

        Returns:
            Mapping of schema keys found in ``data`` to validated values.
        """
        validated = {}

        for feature, value in data.items():
            config = self.extraction_schema.get(feature)
            if config is None:
                continue

            if value is None:
                validated[feature] = None
            elif config['type'] == 'float':
                validated[feature] = self._validate_numeric(feature, value, config)
            elif config['type'] == 'string':
                validated[feature] = self._validate_string(feature, value, config)

        # Calculate plasticity index if LL and PL are available
        liquid_limit = validated.get('liquid_limit_ll')
        plastic_limit = validated.get('plastic_limit_pl')
        if (liquid_limit is not None and
                plastic_limit is not None and
                validated.get('plasticity_index') is None):

            # Round away binary floating-point noise (e.g. 25.100000000000001).
            pi = round(liquid_limit - plastic_limit, 2)
            min_pi, max_pi = self.extraction_schema['plasticity_index']['range']
            if min_pi <= pi <= max_pi:
                validated['plasticity_index'] = pi
                print(f"    ✅ plasticity_index: {pi} (calculated)")

        return validated

    def process_project(self, project_id: str) -> Dict:
        """Process a single project.

        Bearing capacity and foundation type are taken from the project's
        text plot when available; every other non-null value (and those two,
        if the text plot did not provide them) comes from the complete report.

        Args:
            project_id: Identifier as produced by ``identify_file_types``.

        Returns:
            Dict with ``project_id`` plus every extracted non-null feature.
        """
        print(f"\n🤖 Processing Project {project_id} with Vertex AI")

        # Find files for this project
        complete_report = next((cr for cr in self.complete_reports if cr['project_id'] == project_id), None)
        text_plot = next((tp for tp in self.text_plots if tp['project_id'] == project_id), None)

        project_data = {'project_id': project_id}

        # Process Text Plot (for bearing capacity and foundation type)
        if text_plot:
            print("  📋 Analyzing Text Plot...")
            text_plot_content = self.extract_pdf_text_comprehensive(text_plot['file_path'])
            if text_plot_content:
                text_plot_data = self.extract_with_vertex_ai(text_plot_content, project_id)
                if text_plot_data:
                    # Prioritize bearing capacity and foundation type from text plot
                    for key in ('bearing_capacity', 'foundation_type'):
                        if text_plot_data.get(key) is not None:
                            project_data[key] = text_plot_data[key]

        # Process Complete Report (for lab data and soil properties)
        if complete_report:
            print("  📊 Analyzing Complete Report...")
            complete_report_content = self.extract_pdf_text_comprehensive(complete_report['file_path'])
            if complete_report_content:
                complete_report_data = self.extract_with_vertex_ai(complete_report_content, project_id)
                if complete_report_data:
                    # Add all lab data from complete report without
                    # overwriting what the text plot already supplied.
                    for key, value in complete_report_data.items():
                        if value is not None and key not in project_data:
                            project_data[key] = value

        return project_data

    def create_dataset(self) -> pd.DataFrame:
        """Create dataset using Vertex AI extraction.

        Scans the data directory, processes every project found (in sorted
        ID order) and returns one row per project. A project whose processing
        raises still gets a row containing only its ``project_id``.

        Returns:
            DataFrame of projects, or an empty DataFrame if no matching PDF
            files were found.
        """
        print("\n🚀 Creating Vertex AI Geotechnical Dataset")
        print("=" * 60)

        # Identify files
        self.identify_file_types()

        if not self.complete_reports and not self.text_plots:
            print("❌ No PDF files found!")
            return pd.DataFrame()

        # Get unique project IDs
        project_ids = {
            entry['project_id']
            for entry in self.complete_reports + self.text_plots
        }

        print(f"\n🔄 Processing {len(project_ids)} projects with Vertex AI...")

        # Process each project
        all_projects = []
        for project_id in sorted(project_ids):
            try:
                project_data = self.process_project(project_id)
                all_projects.append(project_data)

                # Add delay to respect API rate limits
                time.sleep(self.REQUEST_DELAY_SECONDS)

            except Exception as e:
                print(f"❌ Error processing project {project_id}: {e}")
                all_projects.append({'project_id': project_id})

        # Create DataFrame
        dataset = pd.DataFrame(all_projects)

        print(f"\n✅ Vertex AI dataset created: {dataset.shape[0]} projects × {dataset.shape[1]} features")

        return dataset

    def save_dataset(self, df: pd.DataFrame, filename: str = 'geotechnical_dataset_vertex_ai.csv'):
        """Save the dataset as CSV and print a per-feature coverage summary.

        Args:
            df: Dataset to write.
            filename: Destination CSV path.

        Returns:
            The same DataFrame that was passed in.
        """
        df.to_csv(filename, index=False)
        print(f"\n💾 Dataset saved as: {filename}")

        # Generate summary
        print("\n📊 VERTEX AI EXTRACTION SUMMARY:")
        print(f"   Projects processed: {len(df)}")
        print(f"   Features extracted: {len(df.columns) - 1}")

        # Feature completeness
        total = len(df)
        for col in df.columns:
            if col == 'project_id':
                continue
            non_null = df[col].notna().sum()
            # A frame with columns but no rows has 0% coverage, not NaN.
            coverage = (non_null / total) * 100 if total else 0.0
            status = "✅" if coverage >= 75 else "⚠️" if coverage >= 25 else "❌"
            print(f"   {status} {col:<25}: {non_null}/{total} ({coverage:.0f}%)")

        return df

def main():
    """Run the full extraction pipeline once.

    Builds the extractor for the ``Data`` directory, creates the dataset,
    saves it to CSV and prints a preview.

    Returns:
        The extracted DataFrame, or ``None`` when the model is unavailable,
        nothing was extracted, or any step raised.
    """
    banner = "🚀 VERTEX AI GEOTECHNICAL DATA EXTRACTION SYSTEM"
    print(banner)
    print("=" * 70)

    try:
        # Initialize extractor
        pipeline = VertexAIGeotechnicalExtractor(
            data_directory="Data",
            credentials_file="aiml-365220-a4deab52698f.json",
        )
        if not pipeline.model:
            print("❌ Vertex AI model not available. Exiting.")
            return None

        # Create enhanced dataset
        frame = pipeline.create_dataset()
        if frame.empty:
            print("❌ No data extracted!")
            return None

        # Save results
        pipeline.save_dataset(frame)

        # Display results
        print("\n📋 DATASET PREVIEW:")
        print(frame.head())
        print("\n🎯 READY FOR AI/ML MODELING!")
    except Exception as err:
        # Top-level boundary: report the failure and signal it with None.
        print(f"❌ Extraction failed: {err}")
        return None

    return frame

if __name__ == "__main__":
    # Script entry point: run the pipeline and report the outcome.
    print("🤖 EXECUTING VERTEX AI EXTRACTION PIPELINE")
    print("=" * 60)

    results = main()

    # main() returns None on any failure, otherwise the dataset.
    if results is None:
        print("❌ Extraction failed")
    else:
        print("\n🎉 SUCCESS! Vertex AI extraction completed!")
        print(f"📊 Dataset shape: {results.shape}")

    print("\n🏁 PIPELINE COMPLETE!")

🤖 EXECUTING VERTEX AI EXTRACTION PIPELINE
🚀 VERTEX AI GEOTECHNICAL DATA EXTRACTION SYSTEM
✅ Vertex AI Gemini configured successfully with service account

🚀 Creating Vertex AI Geotechnical Dataset
🔍 Found 6 PDF files
  ✅ Complete Report: 7144-25
  ✅ Complete Report: 7145-25
  ✅ Complete Report: 7155-25
  ✅ Text Plot: 7155-25
  ✅ Complete Report: 7157-25
  ✅ Text Plot: 7157-25

📊 Summary: 4 Complete Reports, 2 Text Plots

🔄 Processing 4 projects with Vertex AI...

🤖 Processing Project 7144-25 with Vertex AI
  📊 Analyzing Complete Report...
  📄 Extracted 53844 characters from PDF
  🤖 Analyzing with Vertex AI Gemini...
    ✅ bearing_capacity: 0.7
    ✅ foundation_type: Strip
    ✅ spt_n_value: 13.0
    ✅ uscs_classification: GM
    ✅ consistency: stiff
    ✅ soil_color: brown
    ✅ moisture_condition: moist
  ✅ Vertex AI extraction successful

🤖 Processing Project 7145-25 with Vertex AI
  📊 Analyzing Complete Report...
  📄 Extracted 52504 characters from PDF
  🤖 Analyzing with Vertex AI G