# Notebook Title

## Setup Python and R environment
you can ignore this section

In [5]:
%load_ext rpy2.ipython
%load_ext autoreload
%autoreload 2

%matplotlib inline  
from matplotlib import rcParams
rcParams['figure.figsize'] = (16, 100)

import warnings
from rpy2.rinterface import RRuntimeWarning
warnings.filterwarnings("ignore") # Ignore all warnings
# warnings.filterwarnings("ignore", category=RRuntimeWarning) # Show some warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [6]:
%%javascript
// Disable auto-scrolling
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [7]:
%%R

# My commonly used R imports

require('tidyverse')

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


Loading required package: tidyverse


## 👉 download your data

You can write code here to download your dataset. Or if you already have it, just leave the URL in the comments and just load it into a pandas or R (or both) dataframe.

In [3]:
import pandas as pd
df=pd.read_csv('privateschools.csv')
df


Unnamed: 0,X,Y,OBJECTID,PPIN,NAME,STREET,CITY,STATE,ZIP,STFIP,...,NMCBSA,CBSATYPE,CSA,NMCSA,NECTA,NMNECTA,CD,SLDL,SLDU,SCHOOLYEAR
0,-86.541877,32.472090,1,2722,AUTAUGA ACADEMY,497 GOLSON RD,PRATTVILLE,AL,36067,1,...,"Montgomery, AL",1,388,"Montgomery-Selma-Alexander City, AL",N,N,102,1069,1030,2021-2022
1,-86.458475,32.469350,2,A0100060,CAMELLIA BAPTIST WEM,201 WOODVALE RD,PRATTVILLE,AL,36067,1,...,"Montgomery, AL",1,388,"Montgomery-Selma-Alexander City, AL",N,N,102,1088,1030,2021-2022
2,-86.472061,32.460800,3,A0900032,FIRST PRESBYTERIAN KINDERGARTEN,211 S CHESTNUT ST,PRATTVILLE,AL,36067,1,...,"Montgomery, AL",1,388,"Montgomery-Selma-Alexander City, AL",N,N,102,1088,1030,2021-2022
3,-86.466175,32.455425,4,A1592005,NEW HOPE ACADEMY,301 WATER ST,PRATTVILLE,AL,36067,1,...,"Montgomery, AL",1,388,"Montgomery-Selma-Alexander City, AL",N,N,102,1088,1030,2021-2022
4,-86.470441,32.461710,5,K9300030,FIRST BAPTIST PRESCHOOL AND KINDERGARTEN,138 S WASHINGTON ST,PRATTVILLE,AL,36067,1,...,"Montgomery, AL",1,388,"Montgomery-Selma-Alexander City, AL",N,N,102,1088,1030,2021-2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22340,-110.810241,43.491060,22341,A0303404,MOUNTAIN ACADEMY OF TETON SCIENCE SCHOOLS,700 COYOTE CANYON RD,JACKSON,WY,83001,56,...,"Jackson, WY-ID",2,N,N,N,N,5600,56023,56017,2021-2022
22341,-110.799526,43.458650,22342,A0903887,JACKSON HOLE COMMUNITY SCHOOL,"1715 HIGH SCHOOL ROAD, #240",JACKSON,WY,83001,56,...,"Jackson, WY-ID",2,N,N,N,N,5600,56016,56017,2021-2022
22342,-110.844286,43.367002,22343,A1102418,RED TOP MEADOWS,7905 FALL CREEK RD,WILSON,WY,83014,56,...,"Jackson, WY-ID",2,N,N,N,N,5600,56022,56016,2021-2022
22343,-110.810901,43.441910,22344,A2000010,JACKSON HOLE CLASSICAL ACADEMY,2500 S PARK LOOP RD,JACKSON,WY,83001,56,...,"Jackson, WY-ID",2,N,N,N,N,5600,56023,56017,2021-2022


## 👉 convert addresses --> lat/long 

See the [census-examples](https://github.com/data4news/census-examples) repository for examples. If you need help, try asking in the class slack channel. Chances are someone in the class is struggling with the same problem as you are so we might as well all learn together in the same slack channel! 

In [None]:
#My dataset has lat and long data. 

## 👉 convert lat/long to census geography codes 

(like 'GEOID', 'STATE', 'COUNTY', 'TRACT', 'BLOCK', etc...)

Same note as above, see [census-examples](https://github.com/data4news/census-examples) repository for examples or ask in the class slack channel if stuck.

In [11]:
!pip install requests-cache

Collecting requests-cache
  Using cached requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Using cached requests_cache-1.2.1-py3-none-any.whl (61 kB)
Installing collected packages: requests-cache
Successfully installed requests-cache-1.2.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import pandas as pd
import censusgeocode as cg
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
import requests_cache
import csv  
cache = requests_cache.CachedSession("geocode_cache", backend="filesystem")

df = pd.read_csv('privateschools.csv')

def geocode(lat, lng):
    try:
        url = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"
        params = {
            "x": lng,
            "y": lat,
            "benchmark": "Public_AR_Census2020",
            "vintage": "Census2020_Census2020",
            "format": "json"
        }
        response = cache.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        
        # Check if the required key exists before accessing it
        if 'result' in data and 'geographies' in data['result'] and 'Census Blocks' in data['result']['geographies']:
            return data['result']['geographies']['Census Blocks'][0]
        else:
            print(f"Warning: No census data found for ({lat}, {lng})")
            return None
    except Exception as e:
        print(f"Error geocoding ({lat}, {lng}): {e}")
        return None

with open('censusgeos.csv', 'w', newline='') as f:
    header = ['SUFFIX', 'POP100', 'GEOID', 'CENTLAT', 'BLOCK', 'AREAWATER', 'STATE', 'BASENAME', 'OID', 'LSADC', 'INTPTLAT', 'FUNCSTAT', 'NAME', 'OBJECTID', 'TRACT', 'CENTLON', 'BLKGRP', 'AREALAND', 'HU100', 'INTPTLON', 'MTFCC', 'LWBLKTYP', 'UR', 'COUNTY']
    writer = csv.DictWriter(f, fieldnames=header)
    
    writer.writeheader()
    
    for _, row in tqdm(df.iterrows(), total=len(df)):
        json_data = geocode(row['LAT'], row['LON'])
        if json_data:  # Check if geocode() returned a valid dictionary
            writer.writerow(json_data)
        else:
            print(f"Skipping row with coordinates ({row['LAT']}, {row['LON']}) due to missing data.")


  0%|          | 0/22345 [00:00<?, ?it/s]

Error geocoding (37.56244, -122.380821): Expecting value: line 1 column 1 (char 0)
Skipping row with coordinates (37.56244, -122.380821) due to missing data.
Error geocoding (39.29272, -80.346121): ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Skipping row with coordinates (39.29272, -80.346121) due to missing data.


## 👉 Output Data

Output your dataframe containing your data and the Census connector codes (like tract, block, etc...).

In [2]:
import pandas as pd
df=pd.read_csv("privateschools_with_geocodes.csv")
df

Unnamed: 0,X,Y,OBJECTID,PPIN,NAME,STREET,CITY,STATE,ZIP,STFIP,...,CSA,NMCSA,NECTA,NMNECTA,CD,SLDL,SLDU,SCHOOLYEAR,Census_Tract,County
0,-86.541877,32.472090,1,2722,AUTAUGA ACADEMY,497 GOLSON RD,PRATTVILLE,AL,36067,1,...,388,"Montgomery-Selma-Alexander City, AL",N,N,102,1069,1030,2021-2022,1.001020e+09,Autauga County
1,-86.458475,32.469350,2,A0100060,CAMELLIA BAPTIST WEM,201 WOODVALE RD,PRATTVILLE,AL,36067,1,...,388,"Montgomery-Selma-Alexander City, AL",N,N,102,1088,1030,2021-2022,1.001021e+09,Autauga County
2,-86.472061,32.460800,3,A0900032,FIRST PRESBYTERIAN KINDERGARTEN,211 S CHESTNUT ST,PRATTVILLE,AL,36067,1,...,388,"Montgomery-Selma-Alexander City, AL",N,N,102,1088,1030,2021-2022,1.003011e+09,Baldwin County
3,-86.466175,32.455425,4,A1592005,NEW HOPE ACADEMY,301 WATER ST,PRATTVILLE,AL,36067,1,...,388,"Montgomery-Selma-Alexander City, AL",N,N,102,1088,1030,2021-2022,1.001020e+09,Autauga County
4,-86.470441,32.461710,5,K9300030,FIRST BAPTIST PRESCHOOL AND KINDERGARTEN,138 S WASHINGTON ST,PRATTVILLE,AL,36067,1,...,388,"Montgomery-Selma-Alexander City, AL",N,N,102,1088,1030,2021-2022,1.003011e+09,Baldwin County
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22340,-110.810241,43.491060,22341,A0303404,MOUNTAIN ACADEMY OF TETON SCIENCE SCHOOLS,700 COYOTE CANYON RD,JACKSON,WY,83001,56,...,N,N,N,N,5600,56023,56017,2021-2022,5.603997e+10,Teton County
22341,-110.799526,43.458650,22342,A0903887,JACKSON HOLE COMMUNITY SCHOOL,"1715 HIGH SCHOOL ROAD, #240",JACKSON,WY,83001,56,...,N,N,N,N,5600,56016,56017,2021-2022,5.603997e+10,Teton County
22342,-110.844286,43.367002,22343,A1102418,RED TOP MEADOWS,7905 FALL CREEK RD,WILSON,WY,83014,56,...,N,N,N,N,5600,56022,56016,2021-2022,5.603997e+10,Teton County
22343,-110.810901,43.441910,22344,A2000010,JACKSON HOLE CLASSICAL ACADEMY,2500 S PARK LOOP RD,JACKSON,WY,83001,56,...,N,N,N,N,5600,56023,56017,2021-2022,5.603997e+10,Teton County
