# A1: Data Sources Setup

---

### Overview - A Notebooks download and clean data, B notebooks create timelines. C notebooks analyze. D notebooks publish the City of Berkeley Daily Update: the Permit Pipeline, Inspection Pipeline, and Construction Pipeline.

## This set of notebooks complement and support the City of Berkeley Annual Update, to be submitted in March, 2026, to CA HCD per legal requirement.

This notebook connects to the Berkeley Open Data Portal API to downloads permit and inspection data.

**Inputs:** None (fetches from API)

**Outputs:**
- `zoning_permits.csv`
- `building_permits.csv`
- `planning_records.csv`

**Dependencies:** sodapy, pandas

---

In [1]:
# ============================================================================
# COLAB ENVIRONMENT SETUP (Run this first in Colab!)
# ============================================================================

import os
import sys
from pathlib import Path

print('üîß SETTING UP ENVIRONMENT')
print('='*70)

# Detect environment
try:
    import google.colab
    IN_COLAB = True
    print('üåê Running in Google Colab')
except ImportError:
    IN_COLAB = False
    print('üíª Running locally')

if IN_COLAB:
    # Clone repository
    repo_path = Path('/content/berkeley-housing-analysis')
    
    if not repo_path.exists():
        print('\nüì• Cloning repository...')
        !git clone https://github.com/blockXblock/berkeley-housing-analysis.git
        print('‚úÖ Repository cloned')
    else:
        print('\n‚úÖ Repository already exists')
        # Pull latest changes
        !cd /content/berkeley-housing-analysis && git pull
    
    # Change to repo directory
    os.chdir(repo_path)
    
    # Add to Python path for module imports
    if str(repo_path) not in sys.path:
        sys.path.insert(0, str(repo_path))
    
    # Create directories
    (repo_path / 'inputs').mkdir(exist_ok=True)
    (repo_path / 'outputs').mkdir(exist_ok=True)
    (repo_path / 'temp').mkdir(exist_ok=True)
    
    # Set DATA_DIR for compatibility with notebook
    DATA_DIR = repo_path
    
    print(f'\nüìÇ Working directory: {os.getcwd()}')
    print(f'üì¶ Python path updated for module imports')
    
    # Verify modules
    modules_path = repo_path / 'modules'
    if modules_path.exists():
        py_files = [f.name for f in modules_path.glob('*.py') if f.name != '__pycache__']
        print(f'‚úÖ Found {len(py_files)} module files:')
        for name in sorted(py_files):
            print(f'   ‚Ä¢ {name}')
    
    print('\n‚ö†Ô∏è  Note: API token not available in Colab')
    print('   Berkeley API is blocked anyway (403 errors)')
    print('   Will use manual data download method')

else:
    # Local environment
    if 'workflows' in os.getcwd():
        # Navigate up to berkeley-data root
        while 'berkeley-data' not in os.path.basename(os.getcwd()) and os.getcwd() != '/':
            os.chdir('..')
            if os.path.basename(os.getcwd()) == 'berkeley-data':
                break
    
    DATA_DIR = Path.cwd()
    print(f'\nüìÇ Working directory: {os.getcwd()}')

print('\n' + '='*70)
print('üéâ SETUP COMPLETE! Ready to run notebook.')
print('='*70)


üîß SETTING UP ENVIRONMENT
üíª Running locally

üìÇ Working directory: /Users/johngage/berkeley-data

üéâ SETUP COMPLETE! Ready to run notebook.


## 1. Setup & Imports

In [2]:
# 1
# Install sodapy
!pip install -q sodapy


# Portable config loading
import json
from pathlib import Path
import shutil

# Try different paths
for p in [Path('config/berkeley_config.json'), Path('../../config/berkeley_config.json')]:
    if p.exists():
        config_path = p
        break
else:
    # Create from template
    t = Path('config/berkeley_config.json.template')
    c = Path('config/berkeley_config.json')
    if t.exists():
        shutil.copy(t, c)
        config_path = c

with open(config_path) as f:
    CONFIG = json.load(f)

from modules.data_loader import get_socrata_client, load_permits_from_api, DATASETS
import pandas as pd

DATA_DIR = Path('.')
print(f'Config: {config_path}')
print(f'Datasets: {list(DATASETS.keys())}')


Config: config/berkeley_config.json
Datasets: ['business_licenses', 'building_permits', 'zoning_permits', 'planning_records', 'crime_incidents', 'restaurant_inspections']


In [6]:
# ============================================================================
# WORKING SOCRATA CLIENT (Using App Token)
# ============================================================================

from sodapy import Socrata
import pandas as pd

# Berkeley Open Data Portal
BERKELEY_DOMAIN = 'data.cityofberkeley.info'

# Your app token (provides higher rate limits)
APP_TOKEN = '8PDke1Hu50Wk65wSM0QPxmH1w'

# Dataset IDs (verified working)
WORKING_DATASETS = {
    'business_licenses': 'rwnf-bu3w',
    'crime_incidents': 'k2nh-s5h5',
    'restaurant_inspections': 'b47j-kakm',
    'building_permits': 'ydr8-5enu',
}

# Initialize client
client = Socrata(BERKELEY_DOMAIN, APP_TOKEN)

print('‚úÖ Connected to Berkeley Open Data Portal')
print(f'   Domain: {BERKELEY_DOMAIN}')
print(f'   Using app token: {APP_TOKEN[:8]}...')
print(f'   Available datasets: {list(WORKING_DATASETS.keys())}')
print('\n‚ö†Ô∏è  Note: Some datasets may still return 403 due to WAF')


‚úÖ Connected to Berkeley Open Data Portal
   Domain: data.cityofberkeley.info
   Using app token: 8PDke1Hu...
   Available datasets: ['business_licenses', 'crime_incidents', 'restaurant_inspections', 'building_permits']

‚ö†Ô∏è  Note: Some datasets may still return 403 due to WAF


In [9]:
# ============================================================================
# DATA FETCHING FUNCTION
# ============================================================================

def fetch_berkeley_data(dataset_name, limit=10000, filters=None):
    """
    Fetch data from Berkeley Open Data Portal
    
    Parameters:
    -----------
    dataset_name : str
        Name of dataset from WORKING_DATASETS dict
    limit : int
        Maximum number of records to fetch
    filters : dict
        Optional filters (e.g., {'city': 'Berkeley'})
    
    Returns:
    --------
    pandas.DataFrame or None
    """
    try:
        dataset_id = WORKING_DATASETS.get(dataset_name)
        if not dataset_id:
            raise ValueError(f'Unknown dataset: {dataset_name}')
        
        print(f'üì• Fetching {dataset_name}...')
        print(f'   Dataset ID: {dataset_id}')
        print(f'   Limit: {limit:,} records')
        print(f'   APP_TOKEN: {APP_TOKEN}')

        # Build query parameters
        params = {'$limit': limit}
        
        if filters:
            # Convert filters to SoQL WHERE clause
            where_clauses = [f"{k}='{v}'" for k, v in filters.items()]
            params['$where'] = ' AND '.join(where_clauses)
            print(f'   Filters: {filters}')
        
        # Fetch data
        results = client.get(dataset_id, **params)
        
        # Convert to DataFrame
        df = pd.DataFrame.from_records(results)
        
        print(f'‚úÖ Success! Fetched {len(df):,} records')
        print(f'   Columns: {list(df.columns)[:5]}...')
        
        return df
        
    except Exception as e:
        print(f'‚ùå Error: {str(e)}')
        
        if '403' in str(e) or 'Forbidden' in str(e):
            print('\n   This dataset is blocked by WAF.')
            print('   Use manual download instead:')
            print(f'   https://data.cityofberkeley.info/d/{dataset_id}')
        
        return None

print('‚úÖ fetch_berkeley_data() function ready')


‚úÖ fetch_berkeley_data() function ready


In [10]:
# ============================================================================
# TEST: Fetch Business Licenses
# ============================================================================

print('üß™ TESTING API ACCESS')
print('='*70)

# Try to fetch business licenses (most likely to work)
df_business = fetch_berkeley_data('business_licenses', limit=100)

if df_business is not None:
    print('\nüìä SAMPLE DATA:')
    print(df_business.head())
    
    print(f'\nüìà SUMMARY:')
    print(f'   Total records: {len(df_business):,}')
    print(f'   Columns: {len(df_business.columns)}')
    
    # Save to CSV
    output_file = DATA_DIR / 'business_licenses.csv'
    df_business.to_csv(output_file, index=False)
    print(f'\nüíæ Saved to: {output_file}')
else:
    print('\n‚ö†Ô∏è  API access blocked - use manual download')

print('='*70)


üß™ TESTING API ACCESS
üì• Fetching business_licenses...
   Dataset ID: rwnf-bu3w
   Limit: 100 records
   APP_TOKEN: 8PDke1Hu50Wk65wSM0QPxmH1w
‚úÖ Success! Fetched 100 records
   Columns: ['apn', 'recordid', 'busdesc', 'b1_per_sub_type', 'dba']...

üìä SAMPLE DATA:
             apn   recordid               busdesc  \
0  054 177800101  BL-015520          ART CLOTHING   
1  ZZZZZZZZZZZZZ  BL-012113          CONSTRUCTION   
2  ZZZZZZZZZZZZZ  BL-022260       ENERGY ASSESSOR   
3  057 203401000  BL-050611  INTERNET PUBLICATION   
4  ZZZZZZZZZZZZZ  BL-014917   COMMERCIAL FLOORING   

                 b1_per_sub_type                     dba  \
0                   Retail Trade    GISELLE SHEPATIN INC   
1     Construction or Contractor          KONSTRUCTO INC   
2  Professional SemiProfessional                   TAPER   
3   Business Personal Repair Svs             STYLE WYLDE   
4     Construction or Contractor  FLOORING SOLUTIONS INC   

                                               nai

## 2. API Configuration

Get your free API token from:
https://data.cityofberkeley.info/profile/edit/developer_settings

In [13]:
# Load environment variables
try:
    from dotenv import load_dotenv
    load_dotenv(DATA_DIR / '.env')
    print("Loaded .env file")
except:
    print("Note: python-dotenv not installed (optional)")

# Check for API token
APP_TOKEN = os.environ.get('BERKELEY_APP_TOKEN')

# TODO: If no environment variable, set your token here:
APP_TOKEN = "z1ZX3Y2jwZ_BCAoo_iIe1h14HMMAzjpPOV_M"

if APP_TOKEN:
    print(f"API token loaded: {APP_TOKEN[:8]}...")
else:
    print("WARNING: No API token found!")
    print("Get your free token at: https://data.cityofberkeley.info/profile/edit/developer_settings")

Loaded .env file
API token loaded: z1ZX3Y2j...


## 3. Available Datasets

Berkeley Open Data Portal datasets relevant to housing:

In [14]:
# Display available datasets
print("Berkeley Open Data - Housing Related Datasets:\n")
print("="*60)

for name, dataset_id in DATASETS.items():
    info = CONFIG['api']['datasets'].get(name, {})
    desc = info.get('description', 'No description')
    print(f"{name}")
    print(f"  ID: {dataset_id}")
    print(f"  Description: {desc}")
    print()

Berkeley Open Data - Housing Related Datasets:

business_licenses
  ID: rwnf-bu3w
  Description: Active business licenses

building_permits
  ID: ydr8-5enu
  Description: Building permits

zoning_permits
  ID: vkhm-tsvp
  Description: Zoning permits

planning_records
  ID: rk4r-58ys
  Description: Planning records

crime_incidents
  ID: k2nh-s5h5
  Description: No description

restaurant_inspections
  ID: b47j-kakm
  Description: No description



## 4. Fetch Zoning Permits

Zoning permits are the first step in the housing development pipeline.

In [15]:
# Fetch zoning permits
print("Fetching Zoning Permits...")
print("="*60)

df_zoning = load_permits_from_api(
    'zoning_permits',
    limit=50000,
    app_token=APP_TOKEN
)

if df_zoning is not None:
    print(f"\nShape: {df_zoning.shape}")
    print(f"\nColumns:")
    for col in df_zoning.columns:
        print(f"  - {col}")
    
    print(f"\nSample records:")
    display(df_zoning.head(3))

Fetching Zoning Permits...
Using app token: z1ZX3Y2j...
Fetching zoning_permits from Berkeley Open Data...
Error fetching data: 403 Client Error: Forbidden


## 5. Fetch Building Permits

Building permits are issued after zoning approval.

In [None]:
# Fetch building permits
print("Fetching Building Permits...")
print("="*60)

df_building = load_permits_from_api(
    'building_permits',
    limit=50000,
    app_token=APP_TOKEN
)

if df_building is not None:
    print(f"\nShape: {df_building.shape}")
    print(f"\nColumns:")
    for col in df_building.columns:
        print(f"  - {col}")
    
    print(f"\nSample records:")
    display(df_building.head(3))

## 6. Document Data Schemas

Examine and document the schema for each dataset.

In [None]:
def document_schema(df, name):
    """Document dataframe schema"""
    print(f"\n{'='*60}")
    print(f"SCHEMA: {name}")
    print(f"{'='*60}")
    print(f"Records: {len(df):,}")
    print(f"Columns: {len(df.columns)}")
    print()
    
    for col in df.columns:
        dtype = df[col].dtype
        non_null = df[col].notna().sum()
        pct = 100 * non_null / len(df)
        sample = df[col].dropna().iloc[0] if non_null > 0 else 'N/A'
        if isinstance(sample, str) and len(sample) > 40:
            sample = sample[:40] + '...'
        print(f"{col}")
        print(f"  Type: {dtype}, Non-null: {pct:.0f}%")
        print(f"  Sample: {sample}")
        print()

# Document schemas
if df_zoning is not None:
    document_schema(df_zoning, 'Zoning Permits')

if df_building is not None:
    document_schema(df_building, 'Building Permits')

## 7. Export Data

Save fetched data to CSV files.

In [None]:
from datetime import datetime
timestamp = datetime.now().strftime('%Y%m%d')

if df_zoning is not None:
    print('‚úÖ Zoning data available')
else:
    print('‚ö†Ô∏è No zoning data - API blocked (403)')
    print('Manual download: https://data.cityofberkeley.info/d/vkhm-tsvp')

if df_building is not None:
    print('‚úÖ Building data available')
else:
    print('‚ö†Ô∏è No building data - API blocked (403)')
    print('Manual download: https://data.cityofberkeley.info/d/ydr8-5enu')


## 8. Load to Database (Optional)

Load data into SQLite for Datasette.

In [None]:
# Save to database
DB_PATH = CONFIG['paths']['database']

if df_zoning is not None:
    save_to_database(df_zoning, 'zoning_permits', DB_PATH)

if df_building is not None:
    save_to_database(df_building, 'building_permits', DB_PATH)

print(f"\nData loaded to: {DB_PATH}")

---

## Summary

This notebook:
- Connected to Berkeley Open Data Portal
- Downloaded zoning and building permits
- Documented data schemas
- Exported to CSV and SQLite

**Next:** Run `A2_address_standardization.ipynb` to standardize addresses.