In [47]:
# =============================================================================
# CELL 1: IMPORTS AND SETUP
# =============================================================================

import pandas as pd
import numpy as np
import os
import re
from pathlib import Path
import PyPDF2
import pdfplumber
import seaborn as sns

# Configure pandas display options: show every column (no column limit) and
# allow up to 1000 characters of console width before output is wrapped.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Status messages only; they are printed once the statements above have run.
print("✅ All libraries imported successfully")
print("📊 Ready for geotechnical data extraction")

✅ All libraries imported successfully
📊 Ready for geotechnical data extraction


In [48]:
# =============================================================================
# CELL 2: FINAL COMPREHENSIVE GEOTECHNICAL DATA EXTRACTOR
# =============================================================================

class GeotechnicalDataExtractor:
    """
    FINAL Comprehensive Geotechnical Data Extraction System
    Combines strict validation with comprehensive feature extraction
    """
    
    def __init__(self, data_directory):
        self.data_dir = Path(data_directory)
        self.complete_reports = []
        self.text_plots = []
        
    def identify_file_types(self):
        """Identify and categorize PDF files"""
        all_files = list(self.data_dir.glob("*.pdf"))
        
        print(f"🔍 Found {len(all_files)} PDF files")
        
        for file in all_files:
            filename = file.name.lower()
            
            # Extract project ID pattern
            project_id_match = re.search(r'(\d{4}\s*-\s*\d{2})', filename)
            if not project_id_match:
                continue
                
            clean_id = project_id_match.group(1).replace(' ', '')
            
            if "complete report" in filename:
                self.complete_reports.append({
                    'project_id': clean_id,
                    'file_path': file,
                    'type': 'complete_report'
                })
                print(f"  ✅ Complete Report: {clean_id}")
                
            elif "text plot" in filename:
                self.text_plots.append({
                    'project_id': clean_id,
                    'file_path': file,
                    'type': 'text_plot'
                })
                print(f"  ✅ Text Plot: {clean_id}")
        
        print(f"\n📊 Summary: {len(self.complete_reports)} Complete Reports, {len(self.text_plots)} Text Plots")
        return self.complete_reports, self.text_plots

    def extract_target_variables(self, text_plot_file):
        """Extract bearing capacity and foundation type from Text Plot - ENHANCED"""
        try:
            with pdfplumber.open(text_plot_file) as pdf:
                full_text = ""
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        full_text += page_text + "\n"
            
            print(f"  🔍 Analyzing Text Plot content...")
            print(f"  📝 Text preview: {full_text[:200]}...")  # Debug output
            
            # COMPREHENSIVE bearing capacity patterns
            bearing_patterns = [
                r'(\d+\.?\d*)\s*(?:Tonne|T|ton)/ft[²2]',
                r'(\d+\.?\d*)\s*T/ft²',
                r'(\d+\.?\d*)\s*ton/ft²',
                r'(?:Net\s*)?(?:Allowable\s*)?(?:Safe\s*)?Bearing\s*(?:Capacity|Pressure)\s*[=:]\s*(\d+\.?\d*)',
                r'qa\s*=\s*(\d+\.?\d*)',
                r'q\s*all\s*=\s*(\d+\.?\d*)',
                r'Bearing\s*Capacity.*?(\d+\.?\d*)\s*(?:T|ton|Tonne)',
                r'BC\s*=\s*(\d+\.?\d*)',
                r'Allowable\s*Bearing\s*Pressure.*?(\d+\.?\d*)',
                r'Safe\s*Bearing\s*Capacity.*?(\d+\.?\d*)',
                # Additional patterns for different formats
                r'(\d+\.?\d*)\s*(?:T|ton|Tonne)\s*/\s*(?:sq\.?\s*)?ft',
                r'(\d+\.?\d*)\s*(?:T|ton|Tonne)\s*per\s*(?:sq\.?\s*)?ft'
            ]
            
            bearing_capacities = []
            for pattern in bearing_patterns:
                matches = re.findall(pattern, full_text, re.IGNORECASE)
                for match in matches:
                    try:
                        value = float(match)
                        # STRICT validation for bearing capacity
                        if 0.1 <= value <= 50:  # Reasonable range
                            bearing_capacities.append(value)
                            print(f"    ✅ Found bearing capacity: {value} T/ft² using pattern: {pattern[:30]}...")
                    except ValueError:
                        continue
            
            # COMPREHENSIVE foundation type detection
            foundation_type = "Unknown"
            text_upper = full_text.upper()
            
            foundation_patterns = {
                "Raft": [
                    "RAFT FOUNDATION", "MAT FOUNDATION", "RAFT FOOTING", "MAT FOOTING",
                    "COMBINED FOOTING", "CONTINUOUS FOOTING"
                ],
                "Pile": [
                    "PILE FOUNDATION", "DEEP FOUNDATION", "PILED FOUNDATION", "PILE FOOTING",
                    "DRIVEN PILE", "BORED PILE", "CAST IN SITU PILE"
                ],
                "Shallow": [
                    "SHALLOW FOUNDATION", "SPREAD FOOTING", "STRIP FOOTING", "SHALLOW FOOTING",
                    "SURFACE FOUNDATION"
                ],
                "Isolated": [
                    "ISOLATED FOOTING", "PAD FOOTING", "INDIVIDUAL FOOTING", "SQUARE FOOTING",
                    "RECTANGULAR FOOTING", "COLUMN FOOTING"
                ]
            }
            
            # Check each foundation type
            for ftype, patterns in foundation_patterns.items():
                for pattern in patterns:
                    if pattern in text_upper:
                        foundation_type = ftype
                        print(f"    ✅ Found foundation type: {foundation_type} (matched: {pattern})")
                        break
                if foundation_type != "Unknown":
                    break
            
            # If still unknown, try more flexible patterns
            if foundation_type == "Unknown":
                flexible_patterns = [
                    (r'(?:raft|mat)', "Raft"),
                    (r'(?:pile|deep)', "Pile"),
                    (r'(?:shallow|spread|strip)', "Shallow"),
                    (r'(?:isolated|pad|individual)', "Isolated")
                ]
                
                for pattern, ftype in flexible_patterns:
                    if re.search(pattern, full_text, re.IGNORECASE):
                        foundation_type = ftype
                        print(f"    ✅ Found foundation type: {foundation_type} (flexible pattern)")
                        break
            
            if foundation_type == "Unknown":
                print(f"    ⚠️ Foundation type not found in text")
            
            return {
                'bearing_capacities': bearing_capacities,
                'foundation_type': foundation_type
            }
            
        except Exception as e:
            print(f"❌ Error extracting targets: {e}")
            return None

    def extract_comprehensive_lab_data(self, complete_report_file):
        """Extract ALL laboratory parameters with validation"""
        try:
            with pdfplumber.open(complete_report_file) as pdf:
                extracted_data = []
                
                for page_num, page in enumerate(pdf.pages):
                    text = page.extract_text()
                    tables = page.extract_tables()
                    
                    if text:
                        text_data = self._extract_from_text_patterns_comprehensive(text)
                        extracted_data.extend(text_data)
                    
                    if tables:
                        for table in tables:
                            table_data = self._extract_from_table_comprehensive(table)
                            extracted_data.extend(table_data)
                
                return extracted_data
                
        except Exception as e:
            print(f"❌ Error extracting lab data: {e}")
            return []

    def _extract_from_text_patterns_comprehensive(self, text):
        """Comprehensive extraction with ALL parameters and strict validation"""
        extracted = []
        
        patterns = {
            'moisture_content_pct': [
                r'(?:moisture|water)\s*content\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                r'w\s*=\s*(\d{1,2}\.?\d*)\s*%',
                r'M\.C\.\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                r'moisture.*?(\d{1,2}\.?\d*)\s*%'
            ],
            'liquid_limit_ll': [
                r'liquid\s*limit\s*[=:]\s*(\d{1,2}\.?\d*)\s*%?',
                r'LL\s*=\s*(\d{1,2}\.?\d*)',
                r'L\.L\.\s*[=:]\s*(\d{1,2}\.?\d*)',
                r'Liquid\s*Limit\s*[=:]\s*(\d{1,2}\.?\d*)'
            ],
            'plastic_limit_pl': [
                r'plastic\s*limit\s*[=:]\s*(\d{1,2}\.?\d*)\s*%?',
                r'PL\s*=\s*(\d{1,2}\.?\d*)',
                r'P\.L\.\s*[=:]\s*(\d{1,2}\.?\d*)',
                r'Plastic\s*Limit\s*[=:]\s*(\d{1,2}\.?\d*)'
            ],
            'sand_pct': [
                # Basic sand patterns
                r'sand\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                r'(\d{1,2}\.?\d*)\s*%\s*sand',
                r'SAND\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                r'Sand\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                
                # Sand fraction patterns
                r'(?:fine|medium|coarse)\s*sand\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                r'(\d{1,2}\.?\d*)\s*%\s*(?:fine|medium|coarse)\s*sand',
                r'sand\s*fraction\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                r'total\s*sand\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                
                # Sieve patterns (sand range)
                r'retained\s*(?:on\s*)?(?:#\s*)?(?:4|8|16|30|50|100)\s*.*?(\d{1,2}\.?\d*)\s*%',
                r'#\s*(?:4|8|16|30|50|100)\s*.*?(\d{1,2}\.?\d*)\s*%'
            ],
            'gravel_pct': [
                # Basic gravel patterns
                r'gravel\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                r'(\d{1,2}\.?\d*)\s*%\s*gravel',
                r'GRAVEL\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                r'Gravel\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                
                # Gravel fraction patterns
                r'(?:fine|coarse)\s*gravel\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                r'(\d{1,2}\.?\d*)\s*%\s*(?:fine|coarse)\s*gravel',
                r'gravel\s*fraction\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                r'total\s*gravel\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                
                # Sieve patterns (gravel range)
                r'retained\s*(?:on\s*)?(?:#\s*)?(?:4|3/8|1/2|3/4)\s*.*?(\d{1,2}\.?\d*)\s*%'
            ],
            'fines_pct': [
                # Basic fines patterns
                r'fines?\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                r'(\d{1,2}\.?\d*)\s*%\s*fines?',
                r'FINES?\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                
                # Sieve #200 patterns (standard for fines)
                r'passing\s*(?:#\s*)?200\s*.*?(\d{1,2}\.?\d*)\s*%',
                r'#\s*200\s*.*?(\d{1,2}\.?\d*)\s*%',
                r'(?:sieve\s*)?200\s*.*?(\d{1,2}\.?\d*)\s*%',
                
                # Combined silt+clay
                r'silt\s*[+&]\s*clay\s*[=:]\s*(\d{1,2}\.?\d*)\s*%',
                r'fine\s*fraction\s*[=:]\s*(\d{1,2}\.?\d*)\s*%'
            ],
            'plasticity_index': [
                r'plasticity\s*index\s*[=:]\s*(\d{1,2}\.?\d*)',
                r'PI\s*=\s*(\d{1,2}\.?\d*)',
                r'P\.I\.\s*[=:]\s*(\d{1,2}\.?\d*)'
            ],
            'bulk_density': [
                r'bulk\s*density\s*[=:]\s*(\d\.?\d*)',
                r'dry\s*density\s*[=:]\s*(\d\.?\d*)',
                r'γd\s*[=:]\s*(\d\.?\d*)',
                r'unit\s*weight\s*[=:]\s*(\d\.?\d*)'
            ],
            'specific_gravity': [
                r'specific\s*gravity\s*[=:]\s*(\d\.?\d*)',
                r'Gs\s*=\s*(\d\.?\d*)',
                r'G\s*=\s*(\d\.?\d*)'
            ]
        }
        
        # STRICT validation ranges
        validation_ranges = {
            'moisture_content_pct': (0, 60),      
            'liquid_limit_ll': (15, 100),         # Stricter range
            'plastic_limit_pl': (10, 50),         
            'sand_pct': (0, 100),
            'gravel_pct': (0, 100),
            'fines_pct': (0, 100),
            'plasticity_index': (0, 50),
            'bulk_density': (1.0, 2.5),
            'specific_gravity': (2.4, 2.8)        # Typical soil range
        }
        
        for param, pattern_list in patterns.items():
            valid_values = []
            
            for pattern in pattern_list:
                matches = re.findall(pattern, text, re.IGNORECASE)
                for match in matches:
                    try:
                        value = float(match)
                        
                        # Apply STRICT validation
                        if param in validation_ranges:
                            min_val, max_val = validation_ranges[param]
                            if min_val <= value <= max_val:
                                valid_values.append(value)
                                
                                # Debug output for grain size parameters
                                if param in ['sand_pct', 'gravel_pct', 'fines_pct'] and value > 0:
                                    print(f"    ✅ Found {param}: {value}%")
                            else:
                                print(f"    ⚠️ Rejected {param}: {value} (outside range {min_val}-{max_val})")
                        else:
                            valid_values.append(value)
                            
                    except ValueError:
                        continue
            
            if valid_values:
                # Use median to avoid outliers
                final_value = sorted(valid_values)[len(valid_values)//2]
                extracted.append({
                    'parameter': param,
                    'value': final_value,
                    'source': 'comprehensive_text'
                })
        
        return extracted

    def _extract_from_table_comprehensive(self, table):
        """Comprehensive table extraction with enhanced mapping"""
        if not table or len(table) < 2:
            return []
        
        extracted = []
        headers = table[0] if table[0] else []
        
        # COMPREHENSIVE header mapping
        header_mapping = {
            'depth': ['depth', 'dep', 'd', 'elevation', 'elev'],
            'moisture_content_pct': [
                'moisture', 'water content', 'w%', 'mc', 'w.c.', 'moisture content',
                'water', 'w', 'moisture %'
            ],
            'liquid_limit_ll': [
                'liquid limit', 'll', 'l.l', 'liquid', 'l limit'
            ],
            'plastic_limit_pl': [
                'plastic limit', 'pl', 'p.l', 'plastic', 'p limit'
            ],
            'sand_pct': [
                'sand', 'sand%', 'sand %', '% sand', 'total sand',
                'fine sand', 'medium sand', 'coarse sand', 'sand fraction',
                'sand content'
            ],
            'gravel_pct': [
                'gravel', 'gravel%', 'gravel %', '% gravel', 'total gravel',
                'fine gravel', 'coarse gravel', 'gravel fraction',
                'gravel content'
            ],
            'fines_pct': [
                'fines', 'fine', 'fines%', 'fines %', '% fines',
                'passing 200', '#200', 'sieve 200', '200 mesh',
                'silt + clay', 'silt+clay', 'clay+silt', 'fine fraction',
                'fines content'
            ],
            'plasticity_index': [
                'plasticity index', 'pi', 'p.i', 'plasticity', 'p index'
            ],
            'bulk_density': [
                'bulk density', 'density', 'γd', 'unit weight', 'dry density',
                'bulk', 'γ', 'gamma'
            ],
            'specific_gravity': [
                'specific gravity', 'gs', 'g', 'sp gravity', 'sp gr'
            ],
            'uscs_classification': [
                'uscs', 'classification', 'class', 'soil type', 'soil class'
            ],
            'spt_n_value': [
                'n value', 'n-value', 'spt', 'n', 'blows', 'n val'
            ],
            'borehole_no': [
                'borehole', 'bh', 'bore', 'hole', 'borehole no', 'bh no'
            ]
        }
        
        # Map headers to parameters
        column_mapping = {}
        for i, header in enumerate(headers):
            if header:
                header_lower = str(header).lower().strip()
                
                for param, keywords in header_mapping.items():
                    if any(keyword in header_lower for keyword in keywords):
                        column_mapping[i] = param
                        print(f"  📋 Mapped column {i} '{header}' to {param}")
                        break
        
        # Extract data from rows with strict validation
        for row_idx, row in enumerate(table[1:]):
            if not row:
                continue
            
            for col_idx, cell_value in enumerate(row):
                if col_idx in column_mapping and cell_value:
                    param = column_mapping[col_idx]
                    
                    if param == 'uscs_classification':
                        uscs_match = re.search(r'\b(CL|CH|ML|MH|SM|SC|SW|SP|GW|GP|GM|GC)\b', 
                                             str(cell_value).upper())
                        if uscs_match:
                            extracted.append({
                                'parameter': param,
                                'value': uscs_match.group(1),
                                'source': 'comprehensive_table'
                            })
                    else:
                        # Extract numeric values with STRICT validation
                        numeric_value = self._extract_safe_numeric_comprehensive(cell_value, param)
                        if numeric_value is not None:
                            extracted.append({
                                'parameter': param,
                                'value': numeric_value,
                                'source': 'comprehensive_table'
                            })
                            
                            # Debug output for grain size
                            if param in ['sand_pct', 'gravel_pct', 'fines_pct'] and numeric_value > 0:
                                print(f"  🎯 Table extracted {param}: {numeric_value}%")
        
        return extracted

    def _extract_safe_numeric_comprehensive(self, cell_value, param):
        """Safely extract numeric values with comprehensive validation"""
        try:
            if isinstance(cell_value, str):
                # Extract only reasonable numbers
                number_match = re.search(r'\b(\d{1,2}\.?\d{0,2})\b', str(cell_value))
                if number_match:
                    value = float(number_match.group(1))
                else:
                    return None
            elif isinstance(cell_value, (int, float)):
                value = float(cell_value)
            else:
                return None
            
            # Parameter-specific STRICT validation
            validation_ranges = {
                'depth': (0, 100),                   
                'moisture_content_pct': (0, 60),
                'liquid_limit_ll': (15, 100),       # Stricter
                'plastic_limit_pl': (10, 50),       
                'sand_pct': (0, 100),
                'gravel_pct': (0, 100),
                'fines_pct': (0, 100),
                'plasticity_index': (0, 50),
                'spt_n_value': (0, 100),            
                'borehole_no': (1, 10),             # Stricter range
                'bulk_density': (1.0, 2.5),
                'specific_gravity': (2.4, 2.8)
            }
            
            if param in validation_ranges:
                min_val, max_val = validation_ranges[param]
                if min_val <= value <= max_val:
                    return value
                else:
                    print(f"    ⚠️ Rejected {param}: {value} (outside range {min_val}-{max_val})")
                    return None
            
            return value
            
        except (ValueError, TypeError):
            return None

    def extract_soil_descriptions(self, complete_report_file):
        """Extract soil descriptions with enhanced parsing"""
        try:
            with pdfplumber.open(complete_report_file) as pdf:
                descriptions = []
                
                for page in pdf.pages:
                    text = page.extract_text()
                    if not text:
                        continue
                    
                    # Enhanced soil description patterns
                    patterns = [
                        r'(\d{1,2}\.?\d*)\s*[-\']\s*(\d{1,2}\.?\d*)\s*[\'"]?\s*:?\s*([^\n]+(?:clay|sand|silt|gravel)[^\n]*)',
                        r'(CL|CH|ML|MH|SM|SC|SW|SP|GW|GP|GM|GC)\s*[:-]?\s*([^\n]+)',
                        r'(brown|gray|grey|black|white|yellow|red|orange)\s*([^\n]*(?:clay|sand|silt|gravel)[^\n]*)'
                    ]
                    
                    for pattern in patterns:
                        matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
                        for match in matches:
                            parsed = self._parse_soil_description_comprehensive(match)
                            if parsed:
                                descriptions.append(parsed)
                
                return descriptions
                
        except Exception as e:
            print(f"❌ Error extracting soil descriptions: {e}")
            return []

    def _parse_soil_description_comprehensive(self, match):
        """Parse soil description with comprehensive feature extraction"""
        if isinstance(match, tuple):
            text = ' '.join(str(m) for m in match)
        else:
            text = str(match)
        
        text_lower = text.lower()
        parsed = {}
        
        # Comprehensive feature extraction
        features = {
            'soil_color': ['brown', 'gray', 'grey', 'black', 'white', 'yellow', 'red', 'orange'],
            'consistency': ['soft', 'firm', 'stiff', 'hard', 'loose', 'dense', 'very soft', 'very stiff'],
            'moisture': ['dry', 'moist', 'wet', 'saturated', 'damp'],
            'primary_soil_type': ['clay', 'sand', 'silt', 'gravel'],
            'uscs_classification': ['CL', 'CH', 'ML', 'MH', 'SM', 'SC', 'SW', 'SP', 'GW', 'GP', 'GM', 'GC']
        }
        
        for feature, values in features.items():
            for value in values:
                if value.lower() in text_lower:
                    parsed[feature] = value
                    break
        
        # Extract depths with validation
        depth_match = re.search(r'(\d{1,2}\.?\d*)\s*[-\']\s*(\d{1,2}\.?\d*)', text)
        if depth_match:
            try:
                start_depth = float(depth_match.group(1))
                end_depth = float(depth_match.group(2))
                # Validate depths
                if 0 <= start_depth <= 100 and 0 <= end_depth <= 100 and start_depth < end_depth:
                    parsed['depth_start'] = start_depth
                    parsed['depth_end'] = end_depth
            except ValueError:
                pass
        
        # Extract SPT values with validation
        spt_match = re.search(r'N\s*=\s*(\d{1,2})', text, re.IGNORECASE)
        if spt_match:
            try:
                spt_value = int(spt_match.group(1))
                if 0 <= spt_value <= 100:
                    parsed['spt_n_value'] = spt_value
            except ValueError:
                pass
        
        return parsed if len(parsed) > 0 else None

    def extract_spt_data(self, complete_report_file):
        """Extract SPT N-values with comprehensive patterns"""
        try:
            with pdfplumber.open(complete_report_file) as pdf:
                spt_values = []
                
                patterns = [
                    r'N\s*=\s*(\d{1,2})',
                    r'SPT\s*[=:]\s*(\d{1,2})',
                    r'N-value\s*[=:]\s*(\d{1,2})',
                    r'N\s*val\s*[=:]\s*(\d{1,2})',
                    r'blow.*?(\d{1,2})'
                ]
                
                for page in pdf.pages:
                    text = page.extract_text()
                    if not text:
                        continue
                    
                    for pattern in patterns:
                        matches = re.findall(pattern, text, re.IGNORECASE)
                        for match in matches:
                            try:
                                value = int(match)
                                # Strict SPT validation
                                if 0 <= value <= 100:
                                    spt_values.append(value)
                            except ValueError:
                                continue
                
                return spt_values
                
        except Exception as e:
            print(f"❌ Error extracting SPT data: {e}")
            return []

    def create_comprehensive_dataset(self):
        """Create comprehensive dataset with all features"""
        print(f"\n🔄 Processing {len(self.complete_reports)} projects...")
        all_data = []
        
        for complete_report in self.complete_reports:
            project_id = complete_report['project_id']
            complete_file = complete_report['file_path']
            
            # Find corresponding text plot
            text_plot = next((tp for tp in self.text_plots if tp['project_id'] == project_id), None)
            
            print(f"\n📁 Processing Project {project_id}")
            
            # Extract all data types
            target_data = None
            if text_plot:
                target_data = self.extract_target_variables(text_plot['file_path'])
                print(f"  ✅ Target variables extracted")
            else:
                print(f"  ⚠️ No text plot found - no target variables")
            
            lab_data = self.extract_comprehensive_lab_data(complete_file)
            soil_data = self.extract_soil_descriptions(complete_file)
            spt_data = self.extract_spt_data(complete_file)
            
            print(f"  📊 Extracted: {len(lab_data)} lab params, {len(soil_data)} soil features, {len(spt_data)} SPT values")
            
            all_data.append({
                'project_id': project_id,
                'target_data': target_data,
                'lab_data': lab_data,
                'soil_data': soil_data,
                'spt_data': spt_data
            })
        
        # Structure into DataFrame
        dataset = self._structure_dataset_comprehensive(all_data)
        print(f"\n✅ Dataset created: {dataset.shape[0]} rows × {dataset.shape[1]} columns")
        
        return dataset

    def _structure_dataset_comprehensive(self, all_data):
        """Structure dataset with comprehensive feature handling"""
        rows = []
        
        # Get list of projects that actually have Text Plot files
        text_plot_projects = {tp['project_id'] for tp in self.text_plots}
        
        for project in all_data:
            project_id = project['project_id']
            row = {'project_id': project_id}
            
            # Handle target variables with proper validation
            if project_id in text_plot_projects and project['target_data']:
                if project['target_data']['bearing_capacities']:
                    row['bearing_capacity'] = float(project['target_data']['bearing_capacities'][0])
                else:
                    row['bearing_capacity'] = None
                
                # Set foundation type (don't convert "Unknown" to None)
                row['foundation_type'] = project['target_data']['foundation_type']
            else:
                # No Text Plot = No bearing capacity data
                row['bearing_capacity'] = None
                row['foundation_type'] = None
            
            # Process lab data with averaging for multiple values
            if project['lab_data']:
                lab_params = {}
                for entry in project['lab_data']:
                    param = entry['parameter']
                    value = entry['value']
                    
                    if param not in lab_params:
                        lab_params[param] = []
                    lab_params[param].append(value)
                
                # Average multiple values for numeric parameters
                for param, values in lab_params.items():
                    if values and isinstance(values[0], (int, float)):
                        row[param] = sum(values) / len(values)
                    elif values:
                        row[param] = values[0]  # Take first for categorical
            
            # Process soil data with proper handling
            if project['soil_data']:
                soil_features = {}
                for desc in project['soil_data']:
                    for key, value in desc.items():
                        if isinstance(value, (int, float)):
                            if key not in soil_features:
                                soil_features[key] = []
                            soil_features[key].append(value)
                        elif isinstance(value, str) and key not in soil_features:
                            soil_features[key] = value
                
                # Process collected soil features
                for feature, values in soil_features.items():
                    if isinstance(values, list) and values:
                        if isinstance(values[0], (int, float)):
                            row[feature] = sum(values) / len(values)
                        else:
                            row[feature] = values[0]
                    elif isinstance(values, str):
                        row[feature] = values
            
            # Process SPT data
            if project['spt_data']:
                row['avg_n_value'] = sum(project['spt_data']) / len(project['spt_data'])
                row['max_n_value'] = max(project['spt_data'])
                row['min_n_value'] = min(project['spt_data'])
                row['n_value_count'] = len(project['spt_data'])
            
            rows.append(row)
        
        return pd.DataFrame(rows)

    def clean_dataset(self, df):
        """Clean dataset without losing information"""
        print(f"\n🧹 Cleaning dataset...")
        print(f"Initial shape: {df.shape}")
        
        # Keep NaN values - don't fill them automatically
        # Only clean obvious errors
        
        print(f"✅ Cleaned dataset shape: {df.shape}")
        return df

    def generate_summary(self, df):
        """Generate comprehensive dataset summary"""
        print("\n" + "="*60)
        print("📊 FINAL COMPREHENSIVE DATASET SUMMARY")
        print("="*60)
        
        print(f"\n📈 Basic Statistics:")
        print(f"   Projects: {len(df)}")
        print(f"   Features: {len(df.columns)}")
        print(f"   Complete projects (with bearing capacity): {df['bearing_capacity'].notna().sum() if 'bearing_capacity' in df.columns else 0}")
        
        print(f"\n🎯 Target Variables:")
        if 'bearing_capacity' in df.columns and df['bearing_capacity'].notna().any():
            bc_stats = df['bearing_capacity'].describe()
            print(f"   Bearing Capacity Mean: {bc_stats['mean']:.2f} T/ft²")
            print(f"   Bearing Capacity Range: {bc_stats['min']:.2f} - {bc_stats['max']:.2f} T/ft²")
        
        print(f"\n🏗️ Foundation Types:")
        if 'foundation_type' in df.columns:
            foundation_counts = df['foundation_type'].value_counts(dropna=False)
            for ftype, count in foundation_counts.items():
                print(f"   {ftype}: {count} projects")
        
        print(f"\n🔍 Grain Size Analysis:")
        grain_cols = ['sand_pct', 'gravel_pct', 'fines_pct']
        for col in grain_cols:
            if col in df.columns:
                non_null = df[col].notna().sum()
                if non_null > 0:
                    values = df[col].dropna()
                    print(f"   ✅ {col:<12}: {non_null}/4 projects, Range: {values.min():.1f}% - {values.max():.1f}%")
                else:
                    print(f"   ❌ {col:<12}: No data extracted")
        
        print(f"\n📋 Feature Completeness:")
        for col in df.columns:
            non_null = df[col].notna().sum()
            total = len(df)
            coverage = (non_null/total)*100
            status = "✅" if coverage == 100 else "⚠️" if coverage >= 50 else "❌"
            print(f"   {status} {col:<25}: {non_null}/{total} ({coverage:.0f}%)")
        
        return df.describe()

    def save_dataset(self, df, filename='geotechnical_dataset_final_comprehensive.csv'):
        """Save comprehensive dataset to CSV"""
        df.to_csv(filename, index=False)
        print(f"\n💾 Dataset saved as: {filename}")
        print(f"   Shape: {df.shape}")
        print(f"   Columns: {list(df.columns)}")

# Status message printed after the class statement above has been executed.
print("✅ FINAL Comprehensive GeotechnicalDataExtractor class defined")

✅ FINAL Comprehensive GeotechnicalDataExtractor class defined


In [49]:
# =============================================================================
# CELL 3: MAIN EXECUTION FUNCTION
# =============================================================================

def main(data_directory="Data", output_filename='geotechnical_dataset_final_comprehensive.csv'):
    """Run the extraction pipeline: identify, extract, clean, summarise, save.

    Parameters
    ----------
    data_directory : str or path-like, default "Data"
        Folder handed to ``GeotechnicalDataExtractor``. A relative path is
        resolved against the current working directory.
    output_filename : str, default 'geotechnical_dataset_final_comprehensive.csv'
        Destination passed to ``extractor.save_dataset``.

    Returns
    -------
    tuple
        ``(cleaned_dataset, summary_stats)`` on success, or ``(None, None)``
        when no complete reports are identified or the extracted dataset is
        empty. Callers should check the first element for ``None``.
    """

    print("🚀 Starting FINAL COMPREHENSIVE Geotechnical Data Extraction System")
    print("="*75)

    # Initialize extractor
    extractor = GeotechnicalDataExtractor(data_directory)

    # Step 1: Identify files
    print("\n📁 Step 1: File Identification")
    # Only the complete reports decide whether the pipeline can continue;
    # the second returned list is not needed in this function.
    complete_reports, _text_plots = extractor.identify_file_types()

    if not complete_reports:
        print("❌ No complete reports found!")
        return None, None

    # Step 2: Extract comprehensive dataset
    print("\n🔍 Step 2: FINAL Comprehensive Data Extraction")
    dataset = extractor.create_comprehensive_dataset()

    if dataset.empty:
        print("❌ No data extracted!")
        return None, None

    # Step 3: Clean dataset
    print("\n🧹 Step 3: Data Cleaning")
    cleaned_dataset = extractor.clean_dataset(dataset)

    # Step 4: Generate summary
    print("\n📊 Step 4: Analysis & Summary")
    summary_stats = extractor.generate_summary(cleaned_dataset)

    # Step 5: Save results
    print("\n💾 Step 5: Save Results")
    extractor.save_dataset(cleaned_dataset, output_filename)

    n_projects, n_features = cleaned_dataset.shape
    print("\n✅ FINAL Comprehensive Extraction Complete!")
    print(f"Final dataset: {n_projects} projects × {n_features} features")

    return cleaned_dataset, summary_stats

# Confirmation banner: printed each time this cell is executed, after main()
# above has been defined (main() itself is not called here).
print("✅ FINAL comprehensive main execution function defined")

✅ FINAL comprehensive main execution function defined


In [50]:
# =============================================================================
# CELL 4: EXECUTE THE FINAL COMPREHENSIVE PIPELINE
# =============================================================================

# Execute the final comprehensive geotechnical data extraction pipeline.
# main() returns (None, None) when nothing could be extracted, so the whole
# validation report below is guarded by the None check.
print("🚀 EXECUTING FINAL COMPREHENSIVE GEOTECHNICAL DATA EXTRACTION PIPELINE")
print("="*80)

final_dataset, summary_stats = main()

if final_dataset is not None:
    # Denominator for every "x/N projects" line. Derived from the data so the
    # report stays correct when the number of processed projects changes.
    total = len(final_dataset)

    # Bounds used by the sanity check further down; values outside them are
    # only flagged in the printed report, never modified or dropped.
    MAX_REALISTIC_VALUE = 100000
    MIN_REALISTIC_VALUE = -100

    print("\n📋 FINAL COMPREHENSIVE DATASET PREVIEW:")
    print(final_dataset.head())

    print("\n🔍 FINAL VALIDATION CHECK:")

    # Check bearing capacity
    if 'bearing_capacity' in final_dataset.columns:
        bc_data = final_dataset[['project_id', 'bearing_capacity']].dropna(subset=['bearing_capacity'])
        print(f"✅ Bearing Capacity: {len(bc_data)} projects have valid values")
        for _, row in bc_data.iterrows():
            print(f"   {row['project_id']}: {row['bearing_capacity']:.3f} T/ft²")

    # Check foundation types
    if 'foundation_type' in final_dataset.columns:
        ft_data = final_dataset[['project_id', 'foundation_type']].dropna(subset=['foundation_type'])
        print(f"\n🏗️ Foundation Types: {len(ft_data)} projects have types")
        for _, row in ft_data.iterrows():
            print(f"   {row['project_id']}: {row['foundation_type']}")

    # Check grain size data
    print("\n🔍 GRAIN SIZE EXTRACTION RESULTS:")
    grain_cols = ['sand_pct', 'gravel_pct', 'fines_pct']
    for col in grain_cols:
        if col in final_dataset.columns:
            non_null = final_dataset[col].notna().sum()
            if non_null > 0:
                values = final_dataset[col].dropna()
                # Use the real project count instead of a hard-coded "4"
                print(f"   ✅ {col:<12}: {non_null}/{total} projects extracted")
                print(f"       Values: {list(values.round(1))}")
            else:
                print(f"   ❌ {col:<12}: No data found")

    # Validate realistic ranges
    print("\n🔍 REALISTIC VALUE CHECK:")
    problematic_found = False
    for col in final_dataset.columns:
        column = final_dataset[col]
        # Accept every integer/float dtype (int32, float32, nullable Int64, ...),
        # not just the literal 'int64'/'float64'; booleans and objects are skipped.
        is_numeric = pd.api.types.is_integer_dtype(column) or pd.api.types.is_float_dtype(column)
        if not is_numeric:
            continue

        values = column.dropna()
        if len(values) == 0:
            continue

        max_val = values.max()
        min_val = values.min()

        # Check for unrealistic values
        if max_val > MAX_REALISTIC_VALUE or min_val < MIN_REALISTIC_VALUE:
            print(f"   ⚠️ {col}: Range {min_val:.1f} to {max_val:.1f} (may be unrealistic)")
            problematic_found = True

    if not problematic_found:
        print("   ✅ All values appear realistic!")

    print("\n🎯 READY FOR AI/ML MODELING!")
    print("Use 'geotechnical_dataset_final_comprehensive.csv' for your geotechnical AI system.")

    print("\n📊 FINAL FEATURES EXTRACTED:")
    feature_count = 0
    for col in final_dataset.columns:
        if col != 'project_id':
            feature_count += 1
            non_null = final_dataset[col].notna().sum()
            # An empty frame would otherwise yield a NaN percentage
            coverage = (non_null/total)*100 if total else 0.0
            status = "✅" if coverage == 100 else "⚠️" if coverage >= 50 else "❌"
            print(f"   {feature_count:2d}. {status} {col:<25}: {non_null}/{total} ({coverage:.0f}%)")

else:
    print("❌ Final comprehensive pipeline execution failed")

print("\n🎉 FINAL COMPREHENSIVE PIPELINE COMPLETE!")

🚀 EXECUTING FINAL COMPREHENSIVE GEOTECHNICAL DATA EXTRACTION PIPELINE
🚀 Starting FINAL COMPREHENSIVE Geotechnical Data Extraction System

📁 Step 1: File Identification
🔍 Found 6 PDF files
  ✅ Complete Report: 7144-25
  ✅ Complete Report: 7145-25
  ✅ Complete Report: 7155-25
  ✅ Text Plot: 7155-25
  ✅ Complete Report: 7157-25
  ✅ Text Plot: 7157-25

📊 Summary: 4 Complete Reports, 2 Text Plots

🔍 Step 2: FINAL Comprehensive Data Extraction

🔄 Processing 4 projects...

📁 Processing Project 7144-25
  ⚠️ No text plot found - no target variables
  📋 Mapped column 1 'AAAUUUGGG’’’ 222000222555
ation Report
OOOVVVEEERRRSSSEEEAAASSS
BBBLLLOOOCCCKKK' to liquid_limit_ll
  📋 Mapped column 0 'Depth
(ft)' to depth
  📋 Mapped column 1 'BH – 1' to borehole_no
  📋 Mapped column 3 'BH - 2' to borehole_no
    ⚠️ Rejected borehole_no: 14.0 (outside range 1-10)
    ⚠️ Rejected borehole_no: 20.0 (outside range 1-10)
    ⚠️ Rejected borehole_no: 20.0 (outside range 1-10)
  📋 Mapped column 0 'Depth' to depth
 