In [18]:
# =============================================================================
# CELL 1: IMPORTS AND SETUP
# =============================================================================

import pandas as pd
import numpy as np
import os
import re
from pathlib import Path
import PyPDF2
import pdfplumber
import matplotlib.pyplot as plt
import seaborn as sns

# Configure pandas display: never elide columns, and allow wide console output
pd.options.display.max_columns = None
pd.options.display.width = 1000

# Cell-end confirmation messages
print(
    "✅ All libraries imported successfully",
    "📊 Ready for geotechnical data extraction",
    sep="\n",
)

✅ All libraries imported successfully
📊 Ready for geotechnical data extraction


In [19]:
# =============================================================================
# CELL 2: ENHANCED GEOTECHNICAL DATA EXTRACTOR CLASS
# =============================================================================

class GeotechnicalDataExtractor:
    """
    Enhanced Geotechnical Data Extraction System
    Fixed version with improved grain size extraction patterns
    """
    
    def __init__(self, data_directory):
        self.data_dir = Path(data_directory)
        self.complete_reports = []
        self.text_plots = []
        
    def identify_file_types(self):
        """Identify and categorize PDF files"""
        all_files = list(self.data_dir.glob("*.pdf"))
        
        print(f"🔍 Found {len(all_files)} PDF files")
        
        for file in all_files:
            filename = file.name.lower()
            
            # Extract project ID pattern
            project_id_match = re.search(r'(\d{4}\s*-\s*\d{2})', filename)
            if not project_id_match:
                continue
                
            clean_id = project_id_match.group(1).replace(' ', '')
            
            if "complete report" in filename:
                self.complete_reports.append({
                    'project_id': clean_id,
                    'file_path': file,
                    'type': 'complete_report'
                })
                print(f"  ✅ Complete Report: {clean_id}")
                
            elif "text plot" in filename:
                self.text_plots.append({
                    'project_id': clean_id,
                    'file_path': file,
                    'type': 'text_plot'
                })
                print(f"  ✅ Text Plot: {clean_id}")
        
        print(f"\n📊 Summary: {len(self.complete_reports)} Complete Reports, {len(self.text_plots)} Text Plots")
        return self.complete_reports, self.text_plots

    def extract_target_variables(self, text_plot_file):
        """Extract bearing capacity and foundation type from Text Plot"""
        try:
            with pdfplumber.open(text_plot_file) as pdf:
                full_text = ""
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        full_text += page_text + "\n"
            
            # Enhanced bearing capacity patterns
            bearing_patterns = [
                r'(\d+\.?\d*)\s*Tonne/ft[²2]',
                r'(\d+\.?\d*)\s*T/ft[²2]', 
                r'(\d+\.?\d*)\s*ton/ft[²2]',
                r'Net\s*Allowable\s*Bearing\s*Capacity.*?(\d+\.?\d*)',
                r'Safe\s*Bearing\s*Capacity.*?(\d+\.?\d*)',
                r'Bearing\s*Capacity.*?(\d+\.?\d*)',
                r'Allowable\s*Bearing\s*Pressure.*?(\d+\.?\d*)',
                r'qa\s*=\s*(\d+\.?\d*)'
            ]
            
            bearing_capacities = []
            for pattern in bearing_patterns:
                matches = re.findall(pattern, full_text, re.IGNORECASE)
                for match in matches:
                    try:
                        value = float(match)
                        # Validate bearing capacity (reasonable range: 0.1 - 50 T/ft²)
                        if 0.1 <= value <= 50:
                            bearing_capacities.append(value)
                    except ValueError:
                        continue
            
            # Foundation type detection
            foundation_type = "Unknown"
            text_upper = full_text.upper()
            if "RAFT FOUNDATION" in text_upper or "MAT FOUNDATION" in text_upper:
                foundation_type = "Raft"
            elif "PILE FOUNDATION" in text_upper or "DEEP FOUNDATION" in text_upper:
                foundation_type = "Pile"
            elif "SHALLOW FOUNDATION" in text_upper or "SPREAD FOOTING" in text_upper:
                foundation_type = "Shallow"
            elif "ISOLATED FOOTING" in text_upper:
                foundation_type = "Isolated"
            
            return {
                'bearing_capacities': bearing_capacities,
                'foundation_type': foundation_type
            }
            
        except Exception as e:
            print(f"❌ Error extracting targets: {e}")
            return None

    def extract_comprehensive_lab_data(self, complete_report_file):
        """Extract all possible laboratory parameters using text patterns and tables"""
        try:
            with pdfplumber.open(complete_report_file) as pdf:
                extracted_data = []
                
                for page_num, page in enumerate(pdf.pages):
                    text = page.extract_text()
                    tables = page.extract_tables()
                    
                    if text:
                        # Text-based extraction with enhanced patterns
                        text_data = self._extract_from_text_patterns_enhanced(text)
                        extracted_data.extend(text_data)
                    
                    if tables:
                        # Table-based extraction with enhanced mapping
                        for table in tables:
                            table_data = self._extract_from_table_enhanced(table)
                            extracted_data.extend(table_data)
                
                return extracted_data
                
        except Exception as e:
            print(f"❌ Error extracting lab data: {e}")
            return []

    def _extract_from_text_patterns_enhanced(self, text):
        """Enhanced extraction with comprehensive grain size patterns"""
        extracted = []
        
        # Comprehensive parameter patterns
        patterns = {
            'moisture_content_pct': [
                r'(?:moisture|water)\s*content.*?(\d+\.?\d*)\s*%',
                r'w\s*=\s*(\d+\.?\d*)\s*%',
                r'M\.C\..*?(\d+\.?\d*)\s*%',
                r'moisture.*?(\d+\.?\d*)\s*%'
            ],
            'liquid_limit_ll': [
                r'liquid\s*limit.*?(\d+\.?\d*)\s*%?',
                r'LL\s*=\s*(\d+\.?\d*)',
                r'L\.L\..*?(\d+\.?\d*)',
                r'Liquid\s*Limit\s*[=:]\s*(\d+\.?\d*)'
            ],
            'plastic_limit_pl': [
                r'plastic\s*limit.*?(\d+\.?\d*)\s*%?',
                r'PL\s*=\s*(\d+\.?\d*)',
                r'P\.L\..*?(\d+\.?\d*)',
                r'Plastic\s*Limit\s*[=:]\s*(\d+\.?\d*)'
            ],
            'sand_pct': [
                # Basic sand patterns
                r'sand\s*[=:]\s*(\d+\.?\d*)\s*%',
                r'(\d+\.?\d*)\s*%\s*sand',
                r'SAND\s*[=:]\s*(\d+\.?\d*)\s*%',
                r'Sand\s*[=:]\s*(\d+\.?\d*)\s*%',
                
                # Sand fraction patterns
                r'(?:fine|medium|coarse)\s*sand\s*[=:]\s*(\d+\.?\d*)\s*%',
                r'(\d+\.?\d*)\s*%\s*(?:fine|medium|coarse)\s*sand',
                r'sand\s*fraction\s*[=:]\s*(\d+\.?\d*)\s*%',
                r'total\s*sand\s*[=:]\s*(\d+\.?\d*)\s*%',
                
                # Sieve-based patterns for sand sizes (0.075mm to 4.75mm)
                r'retained\s*(?:on\s*)?(?:#\s*)?(?:4|8|16|30|50|100)\s*[^\d]*(\d+\.?\d*)\s*%',
                r'passing\s*(?:#\s*)?(?:4|8|16|30|50)\s*[^\d]*(\d+\.?\d*)\s*%',
                r'#\s*(?:4|8|16|30|50|100)\s*[^\d]*(\d+\.?\d*)\s*%',
                
                # Size range patterns
                r'0\.075.*?4\.75.*?(\d+\.?\d*)\s*%',
                r'(?:0\.15|0\.3|0\.6|1\.18|2\.36)\s*mm.*?(\d+\.?\d*)\s*%'
            ],
            'gravel_pct': [
                # Basic gravel patterns
                r'gravel\s*[=:]\s*(\d+\.?\d*)\s*%',
                r'(\d+\.?\d*)\s*%\s*gravel',
                r'GRAVEL\s*[=:]\s*(\d+\.?\d*)\s*%',
                r'Gravel\s*[=:]\s*(\d+\.?\d*)\s*%',
                
                # Gravel fraction patterns
                r'(?:fine|coarse)\s*gravel\s*[=:]\s*(\d+\.?\d*)\s*%',
                r'(\d+\.?\d*)\s*%\s*(?:fine|coarse)\s*gravel',
                r'gravel\s*fraction\s*[=:]\s*(\d+\.?\d*)\s*%',
                r'total\s*gravel\s*[=:]\s*(\d+\.?\d*)\s*%',
                
                # Sieve-based patterns for gravel sizes (>4.75mm)
                r'retained\s*(?:on\s*)?(?:#\s*)?(?:4|3/8|1/2|3/4|1)\s*[^\d]*(\d+\.?\d*)\s*%',
                r'#\s*4\s*[^\d]*(\d+\.?\d*)\s*%',
                
                # Size range patterns for gravel
                r'(?:4\.75|9\.5|12\.5|19|25)\s*mm.*?(\d+\.?\d*)\s*%',
                r'(?:3/16|3/8|1/2|3/4|1)[\"\s]*.*?(\d+\.?\d*)\s*%'
            ],
            'fines_pct': [
                # Basic fines patterns
                r'fines?\s*[=:]\s*(\d+\.?\d*)\s*%',
                r'(\d+\.?\d*)\s*%\s*fines?',
                r'FINES?\s*[=:]\s*(\d+\.?\d*)\s*%',
                
                # Sieve #200 patterns (standard for fines)
                r'passing\s*(?:#\s*)?200\s*[^\d]*(\d+\.?\d*)\s*%',
                r'#\s*200\s*[^\d]*(\d+\.?\d*)\s*%',
                r'(?:sieve\s*)?200\s*[^\d]*(\d+\.?\d*)\s*%',
                r'200\s*mesh.*?(\d+\.?\d*)\s*%',
                
                # Combined silt+clay
                r'silt\s*[+&]\s*clay\s*[=:]\s*(\d+\.?\d*)\s*%',
                r'(?:silt|clay)\s*fraction\s*[=:]\s*(\d+\.?\d*)\s*%',
                r'fine\s*fraction\s*[=:]\s*(\d+\.?\d*)\s*%',
                
                # Size-based (fines are <0.075mm)
                r'<\s*0\.075\s*mm.*?(\d+\.?\d*)\s*%',
                r'0\.075\s*mm\s*passing.*?(\d+\.?\d*)\s*%'
            ],
            'bulk_density': [
                r'bulk\s*density.*?(\d+\.?\d*)',
                r'dry\s*density.*?(\d+\.?\d*)',
                r'γd.*?(\d+\.?\d*)',
                r'unit\s*weight.*?(\d+\.?\d*)'
            ]
        }
        
        # Validation ranges for each parameter
        validation_ranges = {
            'moisture_content_pct': (0, 100),
            'liquid_limit_ll': (0, 200),
            'plastic_limit_pl': (0, 100),
            'sand_pct': (0, 100),
            'gravel_pct': (0, 100),
            'fines_pct': (0, 100),
            'bulk_density': (0.5, 3.0)
        }
        
        for param, pattern_list in patterns.items():
            valid_values = []
            
            for pattern in pattern_list:
                matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
                for match in matches:
                    try:
                        value = float(match)
                        
                        # Apply validation
                        if param in validation_ranges:
                            min_val, max_val = validation_ranges[param]
                            if min_val <= value <= max_val:
                                valid_values.append(value)
                                
                                # Debug output for grain size parameters
                                if param in ['sand_pct', 'gravel_pct'] and value > 0:
                                    print(f"  🎯 Found {param}: {value}% using pattern")
                        else:
                            valid_values.append(value)
                            
                    except ValueError:
                        continue
            
            if valid_values:
                # Use median to avoid outliers
                final_value = sorted(valid_values)[len(valid_values)//2]
                extracted.append({
                    'parameter': param,
                    'value': final_value,
                    'source': 'enhanced_text'
                })
        
        return extracted

    def _extract_from_table_enhanced(self, table):
        """Enhanced table extraction with better grain size mapping"""
        if not table or len(table) < 2:
            return []
        
        extracted = []
        headers = table[0] if table[0] else []
        
        # Enhanced header mapping for grain size analysis
        header_mapping = {
            'depth': ['depth', 'dep', 'd', 'elevation'],
            'moisture_content_pct': ['moisture', 'water content', 'w%', 'mc', 'w.c.'],
            'liquid_limit_ll': ['liquid limit', 'll', 'l.l', 'liquid'],
            'plastic_limit_pl': ['plastic limit', 'pl', 'p.l', 'plastic'],
            'sand_pct': [
                'sand', 'sand%', 'sand %', '% sand', 'total sand',
                'fine sand', 'medium sand', 'coarse sand', 'sand fraction',
                '#4', '#8', '#16', '#30', '#50', '#100', 'retained #4'
            ],
            'gravel_pct': [
                'gravel', 'gravel%', 'gravel %', '% gravel', 'total gravel',
                'fine gravel', 'coarse gravel', 'gravel fraction',
                'retained #4', '3/8', '1/2', '3/4', '1 inch'
            ],
            'fines_pct': [
                'fines', 'fine', 'fines%', 'fines %', '% fines',
                'passing 200', '#200', 'sieve 200', '200 mesh',
                'silt + clay', 'silt+clay', 'clay+silt', 'fine fraction'
            ],
            'bulk_density': ['bulk density', 'density', 'γd', 'unit weight', 'dry density'],
            'uscs_classification': ['uscs', 'classification', 'class', 'soil type'],
            'spt_n_value': ['n value', 'n-value', 'spt', 'n', 'blows'],
            'borehole_no': ['borehole', 'bh', 'bore', 'hole']
        }
        
        # Map headers to parameters
        column_mapping = {}
        for i, header in enumerate(headers):
            if header:
                header_lower = str(header).lower().strip()
                
                for param, keywords in header_mapping.items():
                    if any(keyword in header_lower for keyword in keywords):
                        column_mapping[i] = param
                        print(f"  📋 Mapped column {i} '{header}' to {param}")
                        break
        
        # Extract data from rows
        for row_idx, row in enumerate(table[1:]):
            if not row:
                continue
            
            for col_idx, cell_value in enumerate(row):
                if col_idx in column_mapping and cell_value:
                    param = column_mapping[col_idx]
                    
                    if param == 'uscs_classification':
                        uscs_match = re.search(r'\b(CL|CH|ML|MH|SM|SC|SW|SP|GW|GP|GM|GC)\b', 
                                             str(cell_value).upper())
                        if uscs_match:
                            extracted.append({
                                'parameter': param,
                                'value': uscs_match.group(1),
                                'source': 'enhanced_table'
                            })
                    else:
                        # Extract numeric values with validation
                        numeric_value = None
                        
                        if isinstance(cell_value, str):
                            # Clean the string and extract number
                            clean_cell = re.sub(r'[^\d\.]', '', str(cell_value))
                            if clean_cell:
                                try:
                                    numeric_value = float(clean_cell)
                                except ValueError:
                                    continue
                        elif isinstance(cell_value, (int, float)):
                            numeric_value = float(cell_value)
                        
                        if numeric_value is not None:
                            # Apply validation for percentages
                            if 'pct' in param and not (0 <= numeric_value <= 100):
                                continue
                            
                            extracted.append({
                                'parameter': param,
                                'value': numeric_value,
                                'source': 'enhanced_table'
                            })
                            
                            # Debug output for grain size
                            if param in ['sand_pct', 'gravel_pct'] and numeric_value > 0:
                                print(f"  🎯 Table extracted {param}: {numeric_value}%")
        
        return extracted

    def extract_soil_descriptions(self, complete_report_file):
        """Extract and parse soil descriptions"""
        try:
            with pdfplumber.open(complete_report_file) as pdf:
                descriptions = []
                
                for page in pdf.pages:
                    text = page.extract_text()
                    if not text:
                        continue
                    
                    # Enhanced soil description patterns
                    patterns = [
                        r'(\d+\.?\d*)\s*[-\']\s*(\d+\.?\d*)\s*[\'"]?\s*:?\s*([^\n]+(?:clay|sand|silt|gravel)[^\n]*)',
                        r'(CL|CH|ML|MH|SM|SC|SW|SP|GW|GP|GM|GC)\s*[:-]?\s*([^\n]+)',
                        r'(brown|gray|grey|black|white|yellow|red|orange)\s*([^\n]*(?:clay|sand|silt|gravel)[^\n]*)'
                    ]
                    
                    for pattern in patterns:
                        matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
                        for match in matches:
                            parsed = self._parse_soil_description(match)
                            if parsed:
                                descriptions.append(parsed)
                
                return descriptions
                
        except Exception as e:
            print(f"❌ Error extracting soil descriptions: {e}")
            return []

    def _parse_soil_description(self, match):
        """Parse soil description into structured features"""
        if isinstance(match, tuple):
            text = ' '.join(str(m) for m in match)
        else:
            text = str(match)
        
        text_lower = text.lower()
        parsed = {}
        
        # Extract features
        features = {
            'soil_color': ['brown', 'gray', 'grey', 'black', 'white', 'yellow', 'red', 'orange'],
            'consistency': ['soft', 'firm', 'stiff', 'hard', 'loose', 'dense'],
            'moisture': ['dry', 'moist', 'wet', 'saturated'],
            'primary_soil_type': ['clay', 'sand', 'silt', 'gravel'],
            'uscs_classification': ['CL', 'CH', 'ML', 'MH', 'SM', 'SC', 'SW', 'SP', 'GW', 'GP', 'GM', 'GC']
        }
        
        for feature, values in features.items():
            for value in values:
                if value.lower() in text_lower:
                    parsed[feature] = value
                    break
        
        # Extract depths
        depth_match = re.search(r'(\d+\.?\d*)\s*[-\']\s*(\d+\.?\d*)', text)
        if depth_match:
            parsed['depth_start'] = float(depth_match.group(1))
            parsed['depth_end'] = float(depth_match.group(2))
        
        # Extract SPT values
        spt_match = re.search(r'N\s*=\s*(\d+)', text, re.IGNORECASE)
        if spt_match:
            parsed['spt_n_value'] = int(spt_match.group(1))
        
        return parsed if len(parsed) > 0 else None

    def extract_spt_data(self, complete_report_file):
        """Extract SPT N-values"""
        try:
            with pdfplumber.open(complete_report_file) as pdf:
                spt_values = []
                
                patterns = [
                    r'N\s*=\s*(\d+)',
                    r'SPT\s*(\d+)',
                    r'N-value\s*(\d+)',
                    r'blow.*?(\d+)'
                ]
                
                for page in pdf.pages:
                    text = page.extract_text()
                    if not text:
                        continue
                    
                    for pattern in patterns:
                        matches = re.findall(pattern, text, re.IGNORECASE)
                        for match in matches:
                            try:
                                value = int(match)
                                # Validate SPT values (reasonable range: 0-100)
                                if 0 <= value <= 100:
                                    spt_values.append(value)
                            except ValueError:
                                continue
                
                return spt_values
                
        except Exception as e:
            print(f"❌ Error extracting SPT data: {e}")
            return []

    def create_comprehensive_dataset(self):
        """Create comprehensive dataset with all extracted features"""
        print(f"\n🔄 Processing {len(self.complete_reports)} projects...")
        all_data = []
        
        for complete_report in self.complete_reports:
            project_id = complete_report['project_id']
            complete_file = complete_report['file_path']
            
            # Find corresponding text plot
            text_plot = next((tp for tp in self.text_plots if tp['project_id'] == project_id), None)
            
            print(f"\n📁 Processing Project {project_id}")
            
            # Extract all data types
            target_data = None
            if text_plot:
                target_data = self.extract_target_variables(text_plot['file_path'])
                print(f"  ✅ Target variables extracted")
            else:
                print(f"  ⚠️  No text plot found - no target variables")
            
            lab_data = self.extract_comprehensive_lab_data(complete_file)
            soil_data = self.extract_soil_descriptions(complete_file)
            spt_data = self.extract_spt_data(complete_file)
            
            print(f"  📊 Extracted: {len(lab_data)} lab params, {len(soil_data)} soil features, {len(spt_data)} SPT values")
            
            all_data.append({
                'project_id': project_id,
                'target_data': target_data,
                'lab_data': lab_data,
                'soil_data': soil_data,
                'spt_data': spt_data
            })
        
        # Structure into DataFrame
        dataset = self._structure_dataset(all_data)
        print(f"\n✅ Dataset created: {dataset.shape[0]} rows × {dataset.shape[1]} columns")
        
        return dataset

    def _structure_dataset(self, all_data):
        """Structure dataset with proper bearing capacity validation"""
        rows = []
        
        # Get list of projects that actually have Text Plot files
        text_plot_projects = {tp['project_id'] for tp in self.text_plots}
        
        for project in all_data:
            project_id = project['project_id']
            row = {'project_id': project_id}
            
            # STRICT bearing capacity validation - only from Text Plots
            if project_id in text_plot_projects and project['target_data']:
                if project['target_data']['bearing_capacities']:
                    row['bearing_capacity'] = float(project['target_data']['bearing_capacities'][0])
                else:
                    row['bearing_capacity'] = None
                row['foundation_type'] = project['target_data']['foundation_type']
            else:
                # No Text Plot = No bearing capacity
                row['bearing_capacity'] = None
                row['foundation_type'] = None
            
            # Process lab data
            if project['lab_data']:
                lab_params = {}
                for entry in project['lab_data']:
                    param = entry['parameter']
                    value = entry['value']
                    
                    if param not in lab_params:
                        lab_params[param] = []
                    lab_params[param].append(value)
                
                # Average multiple values for numeric parameters
                for param, values in lab_params.items():
                    if values and isinstance(values[0], (int, float)):
                        row[param] = sum(values) / len(values)
                    elif values:
                        row[param] = values[0]
            
            # Process soil data
            if project['soil_data']:
                soil_features = {}
                for desc in project['soil_data']:
                    for key, value in desc.items():
                        if isinstance(value, (int, float)):
                            if key not in soil_features:
                                soil_features[key] = []
                            soil_features[key].append(value)
                        elif isinstance(value, str) and key not in soil_features:
                            soil_features[key] = value
                
                for feature, values in soil_features.items():
                    if isinstance(values, list) and values:
                        if isinstance(values[0], (int, float)):
                            row[feature] = sum(values) / len(values)
                        else:
                            row[feature] = values[0]
                    elif isinstance(values, str):
                        row[feature] = values
            
            # Process SPT data
            if project['spt_data']:
                row['avg_n_value'] = sum(project['spt_data']) / len(project['spt_data'])
                row['max_n_value'] = max(project['spt_data'])
                row['min_n_value'] = min(project['spt_data'])
                row['n_value_count'] = len(project['spt_data'])
            
            rows.append(row)
        
        return pd.DataFrame(rows)

    def clean_dataset(self, df):
        """Clean and validate dataset"""
        print(f"\n🧹 Cleaning dataset...")
        print(f"Initial shape: {df.shape}")
        
        # Fill missing numeric values with median
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            if df[col].isnull().any():
                median_val = df[col].median()
                df[col].fillna(median_val, inplace=True)
                print(f"  📝 Filled {col} missing values with median: {median_val:.2f}")
        
        # Fill missing categorical values
        categorical_cols = df.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            if df[col].isnull().any():
                mode_val = df[col].mode().iloc[0] if not df[col].mode().empty else 'Unknown'
                df[col].fillna(mode_val, inplace=True)
                print(f"  📝 Filled {col} missing values with: {mode_val}")
        
        print(f"✅ Cleaned dataset shape: {df.shape}")
        return df

    def generate_summary(self, df):
        """Generate comprehensive dataset summary"""
        print("\n" + "="*60)
        print("📊 DATASET SUMMARY")
        print("="*60)
        
        print(f"\n📈 Basic Statistics:")
        print(f"   Projects: {len(df)}")
        print(f"   Features: {len(df.columns)}")
        print(f"   Complete projects (with bearing capacity): {df['bearing_capacity'].notna().sum() if 'bearing_capacity' in df.columns else 0}")
        
        print(f"\n🎯 Target Variable (Bearing Capacity):")
        if 'bearing_capacity' in df.columns and df['bearing_capacity'].notna().any():
            bc_stats = df['bearing_capacity'].describe()
            print(f"   Mean: {bc_stats['mean']:.2f} T/ft²")
            print(f"   Range: {bc_stats['min']:.2f} - {bc_stats['max']:.2f} T/ft²")
            print(f"   Std Dev: {bc_stats['std']:.2f}")
        
        print(f"\n🔍 Grain Size Analysis:")
        grain_cols = ['sand_pct', 'gravel_pct', 'fines_pct']
        for col in grain_cols:
            if col in df.columns:
                non_null = df[col].notna().sum()
                if non_null > 0:
                    values = df[col].dropna()
                    print(f"   ✅ {col:<12}: {non_null}/4 projects, Range: {values.min():.1f}% - {values.max():.1f}%")
                else:
                    print(f"   ❌ {col:<12}: No data extracted")
        
        print(f"\n📋 Feature Completeness:")
        for col in df.columns:
            non_null = df[col].notna().sum()
            total = len(df)
            coverage = (non_null/total)*100
            status = "✅" if coverage == 100 else "⚠️" if coverage >= 50 else "❌"
            print(f"   {status} {col:<25}: {non_null}/{total} ({coverage:.0f}%)")
        
        print(f"\n🏗️ Foundation Types:")
        if 'foundation_type' in df.columns:
            foundation_counts = df['foundation_type'].value_counts()
            for ftype, count in foundation_counts.items():
                print(f"   {ftype}: {count}")
        
        return df.describe()

    def save_dataset(self, df, filename='geotechnical_dataset_final.csv'):
        """Save dataset to CSV"""
        df.to_csv(filename, index=False)
        print(f"\n💾 Dataset saved as: {filename}")
        print(f"   Shape: {df.shape}")
        print(f"   Columns: {list(df.columns)}")

# Cell-end confirmation: reaching this line means the class definition above ran
print("✅ Enhanced GeotechnicalDataExtractor class defined with improved grain size extraction")

✅ Enhanced GeotechnicalDataExtractor class defined with improved grain size extraction


In [20]:
# =============================================================================
# CELL 3: MAIN EXECUTION FUNCTION
# =============================================================================

def main(data_directory="Data", output_filename='geotechnical_dataset_enhanced_final.csv'):
    """Run the extraction pipeline: identify files, extract, clean, summarize, save.

    Args:
        data_directory: Folder handed to ``GeotechnicalDataExtractor``; it is
            scanned for the PDF inputs. Defaults to ``"Data"``.
        output_filename: Path of the CSV written in the final step.

    Returns:
        tuple: ``(cleaned_dataset, summary_stats)`` on success, where
        ``summary_stats`` is whatever ``generate_summary`` returns. Returns
        ``(None, None)`` when no complete report is found or when the
        extraction yields no data.
    """

    print("🚀 Starting Enhanced Geotechnical Data Extraction System")
    print("="*65)

    # Initialize extractor
    extractor = GeotechnicalDataExtractor(data_directory)

    # Step 1: Identify files
    print("\n📁 Step 1: File Identification")
    # Only the complete reports gate the run; the text-plot list is unused here.
    complete_reports, _ = extractor.identify_file_types()

    if not complete_reports:
        print("❌ No complete reports found!")
        return None, None

    # Step 2: Extract comprehensive dataset
    print("\n🔍 Step 2: Enhanced Data Extraction")
    dataset = extractor.create_comprehensive_dataset()

    # Treat a missing result the same as an empty one instead of crashing
    # on ``None.empty``.
    if dataset is None or dataset.empty:
        print("❌ No data extracted!")
        return None, None

    # Step 3: Clean dataset
    print("\n🧹 Step 3: Data Cleaning")
    cleaned_dataset = extractor.clean_dataset(dataset)

    # Step 4: Generate summary
    print("\n📊 Step 4: Analysis & Summary")
    summary_stats = extractor.generate_summary(cleaned_dataset)

    # Step 5: Save results
    print("\n💾 Step 5: Save Results")
    extractor.save_dataset(cleaned_dataset, output_filename)

    print("\n✅ Enhanced Extraction Complete!")
    n_projects, n_features = cleaned_dataset.shape
    print(f"Final dataset: {n_projects} projects × {n_features} features")

    return cleaned_dataset, summary_stats

# Cell-completion message: this cell only defines main(); it does not run the pipeline.
print("✅ Enhanced main execution function defined")

✅ Enhanced main execution function defined


In [21]:
# =============================================================================
# CELL 4: EXECUTE THE ENHANCED PIPELINE
# =============================================================================

# Execute the enhanced geotechnical data extraction pipeline
print("🚀 EXECUTING ENHANCED GEOTECHNICAL DATA EXTRACTION PIPELINE")
print("="*75)

# main() returns (None, None) when the pipeline cannot produce a dataset.
final_dataset, summary_stats = main()

if final_dataset is not None:
    # Number of projects actually extracted. Used as the denominator in the
    # reports below instead of a hard-coded project count.
    total = len(final_dataset)

    print("\n📋 ENHANCED DATASET PREVIEW:")
    print(final_dataset.head())

    print("\n🔍 BEARING CAPACITY VALIDATION:")
    bc_data = final_dataset[['project_id', 'bearing_capacity']].copy()
    text_plot_projects = {'7155-25', '7157-25'}  # Known projects with Text Plots

    for _idx, row in bc_data.iterrows():
        project_id = row['project_id']
        bc_value = row['bearing_capacity']
        should_have_bc = project_id in text_plot_projects
        has_bc = pd.notna(bc_value)

        # A row is consistent when a bearing capacity is present exactly when
        # the project is one of the known Text Plot projects.
        status = "✅ CORRECT" if should_have_bc == has_bc else "❌ ERROR"

        print(f"   {project_id}: Expected BC={should_have_bc}, Has BC={has_bc} ({bc_value}) → {status}")

    print("\n🎯 GRAIN SIZE EXTRACTION RESULTS:")
    grain_size_cols = ['sand_pct', 'gravel_pct', 'fines_pct']
    for col in grain_size_cols:
        if col not in final_dataset.columns:
            print(f"   ❌ {col:<12}: Column missing")
            continue

        values = final_dataset[col].dropna()
        non_null = len(values)
        if non_null > 0:
            print(f"   ✅ {col:<12}: {non_null}/{total} projects extracted")
            print(f"       Values: {list(values)}")
        else:
            print(f"   ❌ {col:<12}: No data found")

    print("\n🎯 READY FOR AI/ML MODELING!")
    print("Use 'geotechnical_dataset_enhanced_final.csv' for your geotechnical AI system.")

    print(f"\n📊 ENHANCED FEATURES EXTRACTED:")
    for i, col in enumerate(final_dataset.columns, 1):
        non_null = final_dataset[col].notna().sum()
        # Guard the division so an empty (but non-None) dataset cannot raise.
        coverage = (non_null/total)*100 if total else 0.0
        status = "✅" if coverage == 100 else "⚠️" if coverage >= 50 else "❌"
        print(f"   {i:2d}. {status} {col:<25}: {non_null}/{total} ({coverage:.0f}%)")

else:
    print("❌ Enhanced pipeline execution failed - check your PDF files and data directory")

print("\n🎉 ENHANCED PIPELINE COMPLETE!")



🚀 EXECUTING ENHANCED GEOTECHNICAL DATA EXTRACTION PIPELINE
🚀 Starting Enhanced Geotechnical Data Extraction System

📁 Step 1: File Identification
🔍 Found 6 PDF files
  ✅ Complete Report: 7144-25
  ✅ Complete Report: 7145-25
  ✅ Complete Report: 7155-25
  ✅ Text Plot: 7155-25
  ✅ Complete Report: 7157-25
  ✅ Text Plot: 7157-25

📊 Summary: 4 Complete Reports, 2 Text Plots

🔍 Step 2: Enhanced Data Extraction

🔄 Processing 4 projects...

📁 Processing Project 7144-25
  ⚠️  No text plot found - no target variables
  📋 Mapped column 1 'AAAUUUGGG’’’ 222000222555
ation Report
OOOVVVEEERRRSSSEEEAAASSS
BBBLLLOOOCCCKKK' to liquid_limit_ll
  📋 Mapped column 0 'Depth
(ft)' to depth
  📋 Mapped column 1 'BH – 1' to borehole_no
  📋 Mapped column 3 'BH - 2' to borehole_no
  📋 Mapped column 0 'Depth' to depth
  🎯 Found gravel_pct: 5.0% using pattern
  📋 Mapped column 1 'Depth of Footing from' to depth
  📋 Mapped column 3 'Reduced Level' to depth
  📋 Mapped column 4 'Net Allowable
Bearing Capacity' to liq

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always beha

In [22]:
# =============================================================================
# CELL 5: DEBUG GRAIN SIZE EXTRACTION (OPTIONAL)
# =============================================================================

def debug_grain_size_extraction(data_directory="Data", max_pages=3, max_lines=10):
    """Print grain-size-related text and table headers from the first complete report.

    Only the first complete report found by ``identify_file_types`` is
    inspected. Pages with no extractable text are skipped entirely, including
    their tables.

    Args:
        data_directory: Folder handed to ``GeotechnicalDataExtractor``.
        max_pages: Number of leading pages of the PDF to inspect.
        max_lines: Maximum number of matching text lines printed per page.

    Returns:
        None. All findings are printed.
    """
    # Substrings that flag a line as potentially grain-size related.
    grain_keywords = ('sand', 'gravel', 'silt', 'clay', 'sieve', '%', 'grain', 'particle')

    print("🔍 DEBUGGING GRAIN SIZE EXTRACTION")
    print("="*50)

    extractor = GeotechnicalDataExtractor(data_directory)
    extractor.identify_file_types()

    if not extractor.complete_reports:
        print("❌ No complete reports found for debugging")
        return

    # Check first PDF only for debugging
    report = extractor.complete_reports[0]
    project_id = report['project_id']
    file_path = report['file_path']

    print(f"\n📁 Analyzing Project {project_id}")
    print(f"   File: {file_path.name}")

    try:
        with pdfplumber.open(file_path) as pdf:
            # The slice already bounds the loop, so no extra break is needed.
            for page_number, page in enumerate(pdf.pages[:max_pages], start=1):
                text = page.extract_text()
                if not text:
                    continue

                print(f"\n📄 Page {page_number}:")

                # Look for grain size related text
                grain_lines = []
                for line in text.split('\n'):
                    lowered = line.lower()  # lowercase once per line, not once per keyword
                    if any(keyword in lowered for keyword in grain_keywords):
                        grain_lines.append(line.strip())

                if grain_lines:
                    print("   🎯 Found grain size related lines:")
                    for line in grain_lines[:max_lines]:
                        print(f"     {line}")

                # Check tables
                tables = page.extract_tables()
                if tables:
                    print(f"   📋 Found {len(tables)} tables")
                    for table_number, table in enumerate(tables, start=1):
                        if not table:
                            continue
                        headers = table[0] or []
                        print(f"     Table {table_number} headers: {headers}")

                        # Show sample data
                        if len(table) > 1:
                            print(f"     Sample row: {table[1]}")

    except Exception as e:
        # Debug helper: report any failure and carry on rather than abort the notebook.
        print(f"❌ Error analyzing {project_id}: {e}")

# The debug pass is opt-in: the call is left commented out so that running this
# cell only defines the function. Uncomment the next line to run debugging.
# debug_grain_size_extraction()

# Cell-completion message confirming the function is available.
print("✅ Debug function defined (uncomment to run)")


✅ Debug function defined (uncomment to run)
