# NYC Apartment Search

_[Project prompt](https://docs.google.com/document/d/1BYVyFBDcTywdUlanH0ysfOrNWPgl7UkqXA7NeewTzxA/edit#heading=h.bpxu7uvknnbk)_

_This scaffolding notebook may be used to help setup your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add code as you wish._

_**All code below should be consider "pseudo-code" - not functional by itself, and only an idea of a possible approach.**_

## Setup

In [1]:
# All import statements needed for the project, for example:

import json
import pathlib
import urllib.parse

import geoalchemy2 as gdb
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import requests
import shapely
import sqlalchemy as db

from sqlalchemy.orm import declarative_base

In [83]:
# Any constants you might need; some have been added for you

# Where data files will be read from/written to - this should already exist
DATA_DIR = pathlib.Path("data")
ZIPCODE_DATA_FILE = DATA_DIR / "zipcodes" / "nyc_zipcodes.shp"
ZILLOW_DATA_FILE = DATA_DIR / "zillow_rent_data.csv"

NYC_DATA_APP_TOKEN = "FILL_ME_IN"
BASE_NYC_DATA_URL = "https://data.cityofnewyork.us/"
NYC_DATA_311 = "erm2-nwe9.geojson"
NYC_DATA_TREES = "5rq2-4hqu.geojson"

# directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

In [84]:
# Make sure the QUERY_DIRECTORY exists
if not QUERY_DIR.exists():
    QUERY_DIR.mkdir()

## Part 1: Data Preprocessing

In [246]:
def load_and_clean_zipcodes(zipcode_datafile):
    # load data
    df = gpd.read_file(zipcode_datafile)
     
    # Set up the headers to pass the application token
    clean_columns = ['ZIPCODE','geometry']
    df = df[clean_columns]
    
    # Remove invalid data points by rows
    df = df.dropna()
    
    # Normalize column names to lowercase
    df.columns = df.columns.str.lower()
    df.rename(columns={'zipcode': 'zip_code'}, inplace=True)
    
    # remove duplicate zip codes
    df.drop_duplicates(subset='zip_code', keep='first', inplace=True)
    
    # Normalize SRID
    df['geometry'] = df['geometry'].to_crs("epsg:4326")
    
    return df


In [18]:
from datetime import datetime

In [47]:
def download_and_clean_311_data():
    
    #### Download the data
    # Set up the url obtained the data from
    url = "https://data.cityofnewyork.us/resource/erm2-nwe9.geojson"
    
    # Set up the headers to pass the application token
    header = {"X-App-Token": "oOu0LSU0TAyxOY44hvVXEv2hT"}

    # Set up the parameter to limit the number of rows and date range
    start_date = datetime.strptime('10/01/2022', '%m/%d/%Y').strftime('%Y-%m-%dT%H:%M:%S')
    end_date = datetime.strptime('09/30/2023', '%m/%d/%Y').strftime('%Y-%m-%dT%H:%M:%S')
    
    param = {"$limit": 5000
            } # Need to adjust later

    # Make the API request and download the geojson file to local
    response = requests.get(url, headers=header, params=param)
    
    with open("data/nyc_311.geojson", "wb") as file:
        file.write(response.content)
    
    df = gpd.read_file("data/nyc_311.geojson")
                       
    
    #### Clean the data
    # Select necessary columns and Remove the others
    clean_columns = ['unique_key','created_date','complaint_type','incident_zip','geometry']
    df = df[clean_columns]
    
    # Remove invalid data by rows
    df.dropna(inplace=True)
    
    # Normalize column names and types if needed
    df.rename(columns={'incident_zip': 'zip_code'}, inplace=True)
    df['unique_key'] = df['unique_key'].astype('Int64')
    df['zip_code'] = df['zip_code'].astype("string")
                       
    return df


In [48]:
test_311 = download_and_clean_311_data()
test_311

Unnamed: 0,unique_key,created_date,complaint_type,zip_code,geometry
0,59682706,2023-12-09 12:00:00,Derelict Vehicles,11412,POINT (-73.75719 40.69898)
1,59683999,2023-12-09 12:00:00,Derelict Vehicles,11357,POINT (-73.82518 40.77956)
2,59681385,2023-12-09 12:00:00,Derelict Vehicles,11222,POINT (-73.94549 40.71914)
3,59681790,2023-12-09 02:41:46,Graffiti,10032,POINT (-73.94337 40.83670)
4,59684401,2023-12-09 02:06:35,Graffiti,11211,POINT (-73.95151 40.71341)
...,...,...,...,...,...
4995,59684672,2023-12-08 13:36:48,Noise - Street/Sidewalk,11374,POINT (-73.86006 40.73133)
4996,59680022,2023-12-08 13:36:47,Street Sign - Dangling,11693,POINT (-73.82164 40.61487)
4997,59683351,2023-12-08 13:36:42,Noise - Residential,10466,POINT (-73.84512 40.88620)
4998,59684241,2023-12-08 13:36:25,Traffic,10011,POINT (-74.00470 40.74471)


In [23]:
def download_and_clean_tree_data():
    
    url = "https://data.cityofnewyork.us/resource/5rq2-4hqu.geojson"

    # Set up the headers to pass the application token
    header = {"X-App-Token": "oOu0LSU0TAyxOY44hvVXEv2hT"}

    # Set up the parameter to limit the number of rows
    param = {"$limit": 5000} #confirmed limits around 683788~ 684000 to used

    # Make the API request and converting the JSON response to obtain a dataframe
    response = requests.get(url,headers=header, params=param)
    
    with open("data/tree.geojson", "wb") as file:
        file.write(response.content)
        
    df = gpd.read_file("data/tree.geojson")
    
    #### Clean the data
    # Select necessary columns and Remove the others
    clean_columns = ['tree_id','created_at','zipcode','spc_common','health','status','geometry']
    df = df[clean_columns]

    
    # Remove invalid data by rows
    df.dropna(inplace=True)
    
    # Normalize column names and types if needed
    df.rename(columns={'zipcode': 'zip_code'}, inplace=True)
    df.rename(columns={'spc_common': 'species'}, inplace=True)
    
    df['geometry'] = df['geometry'].to_crs("epsg:4326")
    
    
    return df

In [86]:
def load_and_clean_zillow_data():
    # load the csv file
    df = gpd.read_file(ZILLOW_DATA_FILE)
    
    #### Clean the data
    # Filter out non-NYC cities
    df = df[df['City']=='New York']

    # Select necessary columns and Remove the others
    clean_columns = ['RegionID','RegionName',r'2023/08/31',r'2023/01/31']
    df = df[clean_columns]
    
    # Normalize column names and types if needed
    df.columns = df.columns.str.lower()
    df.rename(columns={'regionid': 'region_id'}, inplace=True)
    df.rename(columns={'regionname': 'zip_code'}, inplace=True)
    # df.rename(columns={r'2023/08/31': 'august2023'}, inplace=True)
    # df.rename(columns={r'2023/01/31': 'january2023'}, inplace=True)
    
    # df['august2023'] = pd.to_numeric(df['august2023'])
    # df['january2023'] = pd.to_numeric(df['january2023'])
    
    # Remove invalid data by rows
    df.dropna(inplace=True)
    
    return df

In [None]:
def load_all_data():
    geodf_zipcode_data = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)
    geodf_311_data = download_and_clean_311_data()
    geodf_tree_data = download_and_clean_tree_data()
    df_zillow_data = load_and_clean_zillow_data()
    return (
        geodf_zipcode_data,
        geodf_311_data,
        geodf_tree_data,
        df_zillow_data
    )

In [44]:
geodf_zipcode_data, geodf_311_data, geodf_tree_data, df_zillow_data = load_all_data()

In [15]:
# Show basic info about each dataframe
geodf_zipcode_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   zip_code  263 non-null    object  
 1   geometry  263 non-null    geometry
dtypes: geometry(1), object(1)
memory usage: 4.2+ KB


In [16]:
# Show first 5 entries about each dataframe
geodf_zipcode_data.geometry

0      POLYGON ((-73.80585 40.68291, -73.80569 40.682...
1      POLYGON ((-73.93740 40.67973, -73.93487 40.679...
2      POLYGON ((-73.90294 40.67084, -73.90223 40.668...
3      POLYGON ((-73.95797 40.67066, -73.95576 40.670...
4      POLYGON ((-73.97208 40.65060, -73.97192 40.650...
                             ...                        
258    POLYGON ((-74.12065 40.64104, -74.12057 40.641...
259    POLYGON ((-73.84076 40.62536, -73.84306 40.627...
260    POLYGON ((-73.95805 40.72442, -73.95772 40.724...
261    POLYGON ((-73.95133 40.76931, -73.95165 40.769...
262    POLYGON ((-73.99354 40.75145, -73.99320 40.751...
Name: geometry, Length: 263, dtype: geometry

In [17]:
geodf_311_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 4937 entries, 0 to 4999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   unique_key    4937 non-null   Int64         
 1   created_date  4937 non-null   datetime64[ns]
 2   zip_code      4937 non-null   string        
 3   geometry      4937 non-null   geometry      
dtypes: Int64(1), datetime64[ns](1), geometry(1), string(1)
memory usage: 197.7 KB


In [18]:
geodf_311_data.head()

Unnamed: 0,unique_key,created_date,zip_code,geometry
0,59674686,2023-12-08 12:00:00,10033,POINT (-73.92843 40.85199)
1,59670275,2023-12-08 12:00:00,11369,POINT (-73.87725 40.75788)
2,59667305,2023-12-08 12:00:00,10033,POINT (-73.93274 40.84632)
4,59667303,2023-12-08 12:00:00,10019,POINT (-73.99209 40.76428)
5,59676143,2023-12-08 12:00:00,11231,POINT (-74.01050 40.67327)


In [19]:
geodf_tree_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 4810 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   tree_id     4810 non-null   object  
 1   created_at  4810 non-null   object  
 2   zip_code    4810 non-null   object  
 3   species     4810 non-null   object  
 4   health      4810 non-null   object  
 5   status      4810 non-null   object  
 6   geometry    4810 non-null   geometry
dtypes: geometry(1), object(6)
memory usage: 300.6+ KB


In [20]:
geodf_tree_data.head()

Unnamed: 0,tree_id,created_at,zip_code,species,health,status,geometry
0,180683,08/27/2015,11375,red maple,Fair,Alive,POINT (-73.84422 40.72309)
1,200540,09/03/2015,11357,pin oak,Fair,Alive,POINT (-73.81868 40.79411)
2,204026,09/05/2015,11211,honeylocust,Good,Alive,POINT (-73.93661 40.71758)
3,204337,09/05/2015,11211,honeylocust,Good,Alive,POINT (-73.93446 40.71354)
4,189565,08/30/2015,11215,American linden,Good,Alive,POINT (-73.97598 40.66678)


In [21]:
df_zillow_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 121 entries, 4 to 6721
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   region_id    121 non-null    object 
 1   zip_code     121 non-null    object 
 2   august2023   121 non-null    float64
 3   january2023  121 non-null    float64
dtypes: float64(2), object(2)
memory usage: 4.7+ KB


In [22]:
df_zillow_data.head()

Unnamed: 0,region_id,zip_code,august2023,january2023
4,62093,11385,3064.476503,2895.699421
6,62019,11208,2737.54747,2588.030194
13,61807,10467,2353.686402,2155.617718
14,62085,11373,2302.557354,2255.604528
15,62037,11226,2785.320137,2680.6837


## Part 2: Storing Data

In [23]:
DB_NAME = "Final_Project"
DB_USER = "postgres"
DB_URL = f"postgresql+psycopg2://{DB_USER}@localhost/{DB_NAME}"
DB_SCHEMA_FILE = "schema.sql"

### Creating Database

In [24]:
!createdb Final_Project


createdb: error: database creation failed: ERROR:  database "Final_Project" already exists


In [25]:
!psql --dbname Final_Project -c 'CREATE EXTENSION postgis;'

ERROR:  extension "postgis" already exists


In [26]:
#pip install psycopg2-binary

In [27]:
import psycopg2

In [28]:
def setup_new_postgis_database(username, db_name):
    conn = psycopg2.connect(dbname=db_name, user=username)
    return conn

In [29]:
setup_new_postgis_database(DB_USER, DB_NAME)

<connection object at 0x2812538b0; dsn: 'dbname=Final_Project user=postgres', closed: 0>

### Creating Tables


These are just a couple of options to creating your tables; you can use one or the other, a different method, or a combination.

In [30]:
engine = db.create_engine(DB_URL)

In [40]:
# create that required schema.sql file
with open(DB_SCHEMA_FILE, "w") as f:
    f.write(ZIPCODE_SCHEMA + ";\n")
    f.write(NYC_311_SCHEMA + ";\n")
    f.write(NYC_TREE_SCHEMA + ";\n")
    f.write(ZILLOW_SCHEMA + ";\n")

In [41]:
# If using SQL (as opposed to SQLAlchemy), execute the schema files to create tables
with engine.connect() as connection:
    connection.execute(open(DB_SCHEMA_FILE, "r").read())

### Add Data to Database

These are just a couple of options to write data to your tables; you can use one or the other, a different method, or a combination.

## Part 3: Understanding the Data

In [47]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    with open(outfile, 'w') as file:
        file.write(query)
    

### Query 1

In [49]:
with engine.connect() as conn:
    result = conn.execute(db.text(QUERY_1))
    for row in result:
        print(row)

In [50]:
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

In [51]:
# Able to delete later
with engine.connect() as connection:
    df = pd.read_sql_query(QUERY_1, connection)
print(df)

Empty DataFrame
Columns: [zip_code, num_complaints]
Index: []


### Query 2

In [72]:
with engine.connect() as conn:
    result = conn.execute(db.text(QUERY_2))
    for row in result:
        print(row)
    

('10306', 180)
('11105', 176)
('11375', 157)
('10457', 136)
('11215', 127)
('10024', 123)
('10458', 120)
('11230', 115)
('11205', 112)
('10312', 107)


In [73]:
write_query_to_file(QUERY_2, QUERY_2_FILENAME)

In [74]:
# Able to delete later
with engine.connect() as connection:
    df = pd.read_sql_query(QUERY_2, connection)
print(df)

  zip_code  num_trees
0    10306        180
1    11105        176
2    11375        157
3    10457        136
4    11215        127
5    10024        123
6    10458        120
7    11230        115
8    11205        112
9    10312        107


### Query 3

In [76]:
with engine.connect() as conn:
    result = conn.execute(db.text(QUERY_3))
    for row in result:
        print(row)

('10306', Decimal('2331.54'))
('11105', Decimal('2852.73'))
('11375', Decimal('2743.40'))
('10457', Decimal('2183.97'))
('11215', Decimal('3575.65'))
('10024', Decimal('3797.94'))
('10458', Decimal('1991.63'))
('11230', Decimal('2657.04'))
('11205', Decimal('3497.47'))


### Query 4

In [79]:
with engine.connect() as conn:
    result = conn.execute(db.text(QUERY_4))
    for row in result:
        print(row)

('10309', Decimal('1380.51'), 16, 7)
('10462', Decimal('1801.89'), 1, 30)
('10453', Decimal('1820.23'), 18, 60)
('11357', Decimal('1829.66'), 28, 35)
('10458', Decimal('1883.08'), 120, 65)
('10011', Decimal('4741.87'), 15, 29)
('10069', Decimal('4959.67'), 2, 1)
('10013', Decimal('5480.11'), 3, 19)
('10282', Decimal('7143.35'), 0, 1)
('10007', Decimal('7270.24'), 0, 2)


### Query 5

In [82]:
with engine.connect() as conn:
    result = conn.execute(db.text(QUERY_5))
    for row in result:
        print(row)

('10306', 180)
('11105', 176)
('11375', 157)
('10457', 136)
('11215', 127)
('10024', 123)
('10458', 120)
('11230', 115)
('11205', 112)
('10312', 107)


### Query 6

In [None]:
with engine.connect() as conn:
    result = conn.execute(db.text(QUERY_4))
    for row in result:
        print(row)

## Part 4: Visualizing the Data

### Visualization 1

In [142]:
file_path = "vis1.csv" 
df = pd.read_csv(file_path)

# 显示 DataFrame 的前几行数据
print(df)


                  type        date  count
0  Noise - Residential  2023-12-09    172
1       HEAT/HOT WATER  2023-12-08   1097
2  Noise - Residential  2023-12-07    613
3      Illegal Parking  2023-12-08   1408
4  Noise - Residential  2023-12-08    756
5       HEAT/HOT WATER  2023-12-07   1874
6      Illegal Parking  2023-12-09     74
7      Illegal Parking  2023-12-07   1406


In [None]:
def get_data_for_visual_1():
    """ 
    Write a query to get data from database for Visualization 1 and return to a dataframe 
    
    """

## Visualization 2

## Visualization 3

## Visualization 4

## Visualization 5

## Visualization 6