# Final Project - NYC Apartment Search - Group 2

## Setup

In [1]:
import json
import pathlib
import urllib.parse

import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import requests
import shapely
import sqlalchemy as db
from datetime import datetime
from pathlib import Path

from sqlalchemy.orm import declarative_base
from sqlalchemy import create_engine, text
import psycopg2
from psycopg2 import sql

from shapely.geometry import Point
from shapely.ops import transform
from functools import partial
import pyproj

In [2]:
# Local input files.
DATA_DIR = pathlib.Path("data")
ZIPCODE_DATA_FILE = DATA_DIR / "zipcodes" / "nyc_zipcodes.shp"
ZILLOW_DATA_FILE = DATA_DIR / "zillow_rent_data.csv"

# NYC Open Data endpoints; each resource path is appended to the base URL.
# NOTE(review): the app token is committed in plain text -- consider loading
# it from the environment instead.
NYC_DATA_APP_TOKEN = "nzMxHq7vh4aj5sl0jx9gggdlP"
BASE_NYC_DATA_URL = "https://data.cityofnewyork.us/"
NYC_DATA_311 = "resource/erm2-nwe9.geojson"
NYC_DATA_TREES = "resource/5rq2-4hqu.geojson"

# PostgreSQL connection settings.
DB_NAME = "NYC-Data"
DB_USER = "postgres"
DB_PASSWORD = "dddd"  # Replace with your actual PostgreSQL password
# DB_PORT is defined before DB_URL so the URL is built from it; previously the
# port was repeated as a literal and the two values could drift apart.
DB_PORT = 5433
DB_URL = f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@localhost:{DB_PORT}/{DB_NAME}"
DB_SCHEMA_FILE = "schema.sql"

# directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

In [3]:
# Make sure QUERY_DIR exists. exist_ok=True replaces the separate
# exists() check (no check-then-create race), and parents=True also creates
# any missing intermediate directories. If the path exists but is not a
# directory, mkdir raises FileExistsError instead of silently continuing.
QUERY_DIR.mkdir(parents=True, exist_ok=True)

## Part 1: Data Preprocessing

+ The following code defines functions to download the data, clean and filter it for the relevant fields, fill in missing data, and generate samples of these datasets.

### 1.1 Data downloading and cleaning

In [1]:
# Root directory of the download cache (same value as in the Setup cell).
DATA_DIR = pathlib.Path("data")


def download_nyc_geojson_data(url, force=False):
    """Download a GeoJSON resource into DATA_DIR, reusing a cached copy.

    The cache file name is derived from the URL *path* only; the query
    string is ignored. Two URLs that differ only in their query parameters
    therefore share one cache file, and ``force=True`` is needed to refresh
    it after changing the query.

    Parameters:
        url (str): Full URL of the GeoJSON resource.
        force (bool): When True, download even if the cache file exists.

    Returns:
        pathlib.Path: Location of the (possibly freshly written) cache file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    parsed_url = urllib.parse.urlparse(url)
    url_path = parsed_url.path.strip("/")

    filename = DATA_DIR / url_path

    if force or not filename.exists():
        print(f"Downloading {url} to {filename}...")

        response = requests.get(url)
        # Fail loudly on HTTP errors so that an error payload is never
        # written to disk and later mistaken for cached data.
        response.raise_for_status()
        geojson_data = response.json()

        # The URL path can contain sub-directories (e.g. "resource/...");
        # create them so the open() below works on a fresh checkout.
        filename.parent.mkdir(parents=True, exist_ok=True)
        with open(filename, "w") as f:
            json.dump(geojson_data, f, default=str)
        print(f"Done downloading {url}.")

    else:
        print(f"Reading from {filename}...")

    return filename

NameError: name 'pathlib' is not defined

In [None]:
def load_and_clean_zipcodes(zipcode_datafile):
    """
    Read the zipcode boundary file and return a cleaned version of it.

    Parameters:
        zipcode_datafile: Path of the zipcode data file, handed to
            ``gpd.read_file``.

    Returns:
        The loaded frame with incomplete rows removed, lowercase column
        names, an integer 'population' column, one row per 'zipcode', and
        coordinates reprojected to EPSG 4326.
    """
    zipcodes = gpd.read_file(zipcode_datafile)

    # Every column is kept; rows with any missing value are discarded.
    zipcodes = zipcodes.dropna()

    # Normalise the header to lowercase before referring to columns by name.
    zipcodes.columns = [name.lower() for name in zipcodes.columns]
    zipcodes['population'] = zipcodes['population'].astype(int)

    # Keep the first row of each zipcode, then reproject to WGS 84.
    return zipcodes.drop_duplicates(subset='zipcode').to_crs(epsg=4326)

In [2]:
def download_and_clean_311_data():
    """Download 311 service requests and return a cleaned GeoDataFrame.

    Requests records created between 2015-01-01 and 2023-09-30 (at most
    1,000,000 rows) through ``download_nyc_geojson_data``, keeps only the
    columns used later, drops rows with a missing value in any of those
    columns, and reprojects the frame to EPSG 4326.

    Returns:
        The cleaned frame with columns 'unique_key', 'created_date',
        'city', 'incident_zip', 'complaint_type' and 'geometry'.
    """
    start_date = "2015-01-01T00:00:00.000"
    end_date = "2023-09-30T23:59:59.000"

    url = (
        f"{BASE_NYC_DATA_URL}{NYC_DATA_311}?"
        f"$$app_token={NYC_DATA_APP_TOKEN}&"
        f"$where=created_date between '{start_date}' and '{end_date}'&"
        f"$limit=1000000"
    )

    filename = download_nyc_geojson_data(url)
    df = gpd.read_file(filename)

    # To keep the necessary columns
    columns_to_keep = [
        'unique_key',
        'created_date',
        'city',
        'incident_zip',
        'complaint_type',
        'geometry',
    ]

    # Drop rows with missing values. This must happen AFTER the column
    # selection: running dropna() on the full frame removed every row.
    # The explicit copy makes the result independent of `df`.
    df_selected = df[columns_to_keep].dropna().copy()

    # Convert column names to lowercase
    df_selected.columns = [col.lower() for col in df_selected.columns]

    # Change the SRID to a specific value (EPSG 4326 - WGS 84). Reproject the
    # whole frame, as the other loaders do, so the frame-level CRS and the
    # geometry column stay in sync (previously only the column was
    # reprojected and assigned back).
    df_selected = df_selected.to_crs(epsg=4326)

    return df_selected

In [3]:
def download_and_clean_tree_data():
    """Download the tree dataset and return a cleaned frame.

    Fetches up to 700,000 records through ``download_nyc_geojson_data``,
    keeps a fixed set of columns, drops rows with a missing value in any of
    them, lowercases the column names, and reprojects to EPSG 4326.

    Returns:
        The cleaned frame restricted to the columns listed in
        ``kept_columns`` below.
    """
    query_url = (
        f"{BASE_NYC_DATA_URL}{NYC_DATA_TREES}"
        f"?$$app_token={NYC_DATA_APP_TOKEN}&$limit=700000"
    )
    trees = gpd.read_file(download_nyc_geojson_data(query_url))

    kept_columns = [
        'tree_id',
        'zipcode',
        'address',
        'health',
        'zip_city',
        'spc_common',
        'status',
        'sidewalk',
        'borocode',
        'block_id',
        'geometry',
    ]

    # Restrict to the kept columns first, then discard incomplete rows.
    trees = trees[kept_columns].dropna()
    trees.columns = [name.lower() for name in trees.columns]

    # Reproject to WGS 84 (EPSG 4326).
    return trees.to_crs(epsg=4326)

In [4]:
def load_and_clean_zillow_data(file_path=None):
    """
    Load and clean Zillow data from a CSV file.

    Parameters:
        file_path (str or path-like, optional): The file path to the Zillow
            data CSV file. Defaults to ZILLOW_DATA_FILE when omitted.

    Returns:
        pd.DataFrame: The rows whose 'State' is 'NY' and that have no
        missing value in any column. All columns are kept.
    """
    # Resolve the default at call time so the module-level constant is only
    # needed when no explicit path is given.
    if file_path is None:
        file_path = ZILLOW_DATA_FILE

    # Load the data from the CSV file
    zillow_data = pd.read_csv(file_path)

    # only keep data in NY
    zillow_selected = zillow_data[zillow_data['State'] == 'NY']

    # Drop rows with a missing value in any column.
    zillow_selected = zillow_selected.dropna()

    return zillow_selected

In [5]:
def load_all_data():
    """Run the four loaders and return their results as one tuple.

    Returns:
        tuple: (zipcode data, 311 data, tree data, Zillow data), in that
        order.
    """
    # A tuple display evaluates its items left to right, so the loaders run
    # in the same order as before: zipcodes, 311, trees, Zillow.
    return (
        load_and_clean_zipcodes(ZIPCODE_DATA_FILE),
        download_and_clean_311_data(),
        download_and_clean_tree_data(),
        load_and_clean_zillow_data(),
    )

In [None]:
# Load every dataset once and bind the results to module-level names.
(
    geodf_zipcode_data,
    geodf_311_data,
    geodf_tree_data,
    df_zillow_data,
) = load_all_data()