# Noise Complaints Associated With Areas That Have Liquor Licenses

This notebook analyzes the relationship between liquor licenses and noise complaints in NYC.

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# scikit-learn is optional: remember whether BallTree could be imported so
# later cells can choose between the tree-based and the plain NumPy path.
try:
    from sklearn.neighbors import BallTree
except Exception:
    SKLEARN_AVAILABLE = False
else:
    SKLEARN_AVAILABLE = True

# Pandas display limits: show every column, cap printed rows at 100
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

# Plot defaults, then silence FutureWarning messages
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})
warnings.simplefilter('ignore', FutureWarning)

## 2. Load Data

In [None]:
# Locations of the raw input files
data_dir = Path('../data/raw')
liquor_license_file = data_dir.joinpath('sla_active.csv')
noise_complaints_file = data_dir.joinpath('311_noise.csv')

# The liquor license extract is read unconditionally
print("Loading liquor license data...")
liquor_df = pd.read_csv(liquor_license_file)
print(f"Loaded {len(liquor_df)} liquor license records")

# The 311 extract is optional; noise_df is None when the file is absent
if not noise_complaints_file.exists():
    print("\nNote: 311_noise.csv not found. Please download NYC 311 noise complaint data.")
    print("Data source: https://data.cityofnewyork.us/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9")
    noise_df = None
else:
    print("\nLoading noise complaint data...")
    noise_df = pd.read_csv(noise_complaints_file)
    print(f"Loaded {len(noise_df)} noise complaint records")

## 3. Explore Liquor License Data

In [None]:
# Display basic information
print("Liquor License Data Structure:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
liquor_df.info()
print("\nFirst few records:")
# Last expression of the cell, so the notebook renders it as the cell output
liquor_df.head()

In [None]:
# Per-column null counts, listing only the columns that have at least one null
print("Missing Values:")
missing = liquor_df.isna().sum()
print(missing.loc[missing.gt(0)])

In [None]:
# Summary statistics
# describe() is called with its default arguments; the result is the last
# expression of the cell, so it is rendered as the cell output rather than
# printed.
print("Summary Statistics:")
liquor_df.describe()

## 4. Data Preprocessing

In [None]:
# Parse georeference data to extract latitude and longitude
def extract_coordinates(georeference):
    """Extract longitude and latitude from a georeference value.

    Two textual layouts are recognised:

    * WKT points, ``'POINT (lon lat)'`` or ``'POINT(lon lat)'`` (the space
      before the parenthesis is optional).
    * Comma-separated pairs, optionally parenthesised, read as ``(lon, lat)``.

    NOTE(review): ``extract_noise_coords`` in this notebook reads tuple-like
    strings as ``(lat, lon)``; confirm which order the liquor license data
    uses for the comma-separated layout.

    Parameters
    ----------
    georeference : object
        Raw cell value. Anything that is not a string (NaN, None, lists, ...)
        is treated as missing.

    Returns
    -------
    pandas.Series
        Series with keys ``'longitude'`` and ``'latitude'``; both values are
        ``None`` when the input is missing or cannot be parsed.
    """
    def _missing():
        return pd.Series({'longitude': None, 'latitude': None})

    # Non-string input (NaN, None, containers) carries no parseable text.
    if not isinstance(georeference, str):
        return _missing()

    try:
        if 'POINT' in georeference:
            # Drop the keyword and turn parentheses into whitespace so that
            # both 'POINT (x y)' and 'POINT(x y)' split into two tokens.
            inner = georeference.replace('POINT', '').replace('(', ' ').replace(')', ' ')
            lon_text, lat_text = inner.split()[:2]
        elif ',' in georeference:
            stripped = georeference.replace('(', '').replace(')', '')
            lon_text, lat_text = stripped.split(',')[:2]
        else:
            return _missing()
        return pd.Series({'longitude': float(lon_text), 'latitude': float(lat_text)})
    except ValueError:
        # Raised by float() on non-numeric text and by unpacking when fewer
        # than two tokens are present.
        return _missing()

# Extract coordinates
if 'Georeference' in liquor_df.columns:
    liquor_df[['longitude', 'latitude']] = liquor_df['Georeference'].apply(extract_coordinates)
else:
    # No georeference column: copy the first recognised coordinate column
    for lon_col in ['Longitude', 'longitude', 'LONGITUDE', 'X']:
        if lon_col in liquor_df.columns:
            liquor_df['longitude'] = liquor_df[lon_col]
            break
    for lat_col in ['Latitude', 'latitude', 'LATITUDE', 'Y']:
        if lat_col in liquor_df.columns:
            liquor_df['latitude'] = liquor_df[lat_col]
            break

# Later cells index liquor_df[['longitude', 'latitude']] unconditionally, so
# make sure both columns exist (all-NaN when no source was found) and hold
# numeric values; unparseable entries become NaN.
for coord_col in ('longitude', 'latitude'):
    if coord_col not in liquor_df.columns:
        liquor_df[coord_col] = np.nan
    liquor_df[coord_col] = pd.to_numeric(liquor_df[coord_col], errors='coerce')

# Convert date columns to datetime (unparseable values become NaT)
date_columns = ['Original Issue Date', 'Last Issue Date', 'Effective Date', 'Expiration Date']
for col in date_columns:
    if col in liquor_df.columns:
        liquor_df[col] = pd.to_datetime(liquor_df[col], errors='coerce')

print("Preprocessed liquor license data:")
print(f"Records with valid coordinates: {liquor_df[['longitude', 'latitude']].notna().all(axis=1).sum()}")

## 5. Analyze Liquor License Distribution

In [None]:
# License counts per county, printed and drawn as a bar chart
print("Liquor Licenses by County:")
if 'Premises County' not in liquor_df.columns:
    print("Premises County column not found in liquor license data.")
else:
    county_counts = liquor_df['Premises County'].value_counts()
    print(county_counts)
    # Draw on a dedicated 10x6 figure and label it through the current axes
    plt.figure(figsize=(10, 6))
    county_counts.plot(kind='bar')
    plt.gca().set(
        title='Distribution of Liquor Licenses by County',
        xlabel='County',
        ylabel='Number of Licenses',
    )
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# License counts per license type, printed in full and charted for the top 10
print("\nLiquor Licenses by Type:")
if 'Description' not in liquor_df.columns:
    print("Description column not found in liquor license data.")
else:
    type_counts = liquor_df['Description'].value_counts()
    print(type_counts)
    # Horizontal bars for the ten most frequent types
    plt.figure(figsize=(12, 6))
    top_types = type_counts.head(10)
    top_types.plot(kind='barh')
    plt.gca().set(
        title='Top 10 License Types',
        xlabel='Number of Licenses',
        ylabel='License Type',
    )
    plt.tight_layout()
    plt.show()

In [None]:
# Geographic distribution (for Richmond County - Staten Island)
if 'Premises County' in liquor_df.columns:
    # Compare case-insensitively and ignore surrounding whitespace so that
    # 'RICHMOND' or 'Richmond ' are matched as well as 'Richmond'.
    county_name = liquor_df['Premises County'].astype(str).str.strip().str.casefold()
    richmond_df = liquor_df[county_name == 'richmond'].copy()
    # Keep only rows where both coordinates are present
    richmond_valid = richmond_df[richmond_df[['longitude', 'latitude']].notna().all(axis=1)]
    if len(richmond_valid) > 0:
        plt.figure(figsize=(12, 10))
        plt.scatter(richmond_valid['longitude'], richmond_valid['latitude'],
                    alpha=0.5, s=20, c='red')
        plt.title('Geographic Distribution of Liquor Licenses in Richmond County (Staten Island)')
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

        print(f"\nRichmond County: {len(richmond_valid)} licenses with valid coordinates")
    else:
        # Say why no map was drawn instead of finishing silently
        print("No Richmond County licenses with valid coordinates; skipping county map.")
else:
    print("Premises County column not found; skipping county map.")

## 6. Analyze Noise Complaint Data (if available)

In [None]:
def _parse_location_text(value):
    """Return ``(latitude, longitude)`` parsed from a location string, or a pair of NaNs."""
    text = str(value)
    try:
        if 'POINT' in text:
            # WKT order is 'POINT (lon lat)'; the space before '(' is optional.
            tokens = text.replace('POINT', '').replace('(', ' ').replace(')', ' ').split()
            return (float(tokens[1]), float(tokens[0]))
        if ',' in text and '(' in text:
            # Tuple-like strings are read as '(lat, lon)'.
            parts = text.replace('(', '').replace(')', '').split(',')
            return (float(parts[0]), float(parts[1]))
    except (ValueError, IndexError):
        pass
    return (np.nan, np.nan)


def extract_noise_coords(df):
    """Normalize coordinates of the noise dataframe into 'latitude'/'longitude' columns.

    Lookup order:

    1. A pair of plain coordinate columns (the first present name from each
       candidate list); values are coerced to numeric, unparseable ones
       become NaN.
    2. A combined location column containing ``'POINT (lon lat)'`` or
       ``'(lat, lon)'`` text. A column is accepted only if at least one row
       yields a complete coordinate pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Noise complaint records. The input is never modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with its index preserved that always contains
        ``'latitude'`` and ``'longitude'`` columns. When no coordinate source
        is found, missing columns are added filled with NaN so callers can
        index them safely.
    """
    possible_lon = ['Longitude', 'longitude', 'LONGITUDE', 'X', 'Long', 'lon']
    possible_lat = ['Latitude', 'latitude', 'LATITUDE', 'Y', 'Lat', 'lat']
    lon_col = next((c for c in possible_lon if c in df.columns), None)
    lat_col = next((c for c in possible_lat if c in df.columns), None)
    if lon_col is not None and lat_col is not None:
        out = df.copy()
        out['longitude'] = pd.to_numeric(df[lon_col], errors='coerce')
        out['latitude'] = pd.to_numeric(df[lat_col], errors='coerce')
        return out

    # Try combined location columns that may hold '(lat, lon)' or 'POINT (lon lat)'
    for col in ['Location 1', 'Location', 'the_geom', 'geom', 'Point', 'location']:
        if col not in df.columns:
            continue
        pairs = [_parse_location_text(v) for v in df[col].astype(str)]
        # reshape keeps a (0, 2) array for empty input
        coords = np.array(pairs, dtype=float).reshape(-1, 2)
        # Accept the column only if some row produced both coordinates
        if (~np.isnan(coords).any(axis=1)).any():
            out = df.copy()
            # Positional assignment keeps the original index untouched, so
            # results can still be joined back to the source frame by index.
            out['latitude'] = coords[:, 0]
            out['longitude'] = coords[:, 1]
            return out

    # Nothing usable found: still expose both columns so callers do not fail
    out = df.copy()
    for name in ('latitude', 'longitude'):
        if name not in out.columns:
            out[name] = np.nan
    return out

if noise_df is not None:
    print("Noise Complaint Data Structure:")
    # DataFrame.info() prints its own report and returns None; calling it
    # directly avoids a stray "None" line in the output.
    noise_df.info()
    print("\nFirst few records:")
    try:
        # display() is only defined inside IPython/Jupyter sessions
        display(noise_df.head())
    except Exception:
        print(noise_df.head())

    # Normalize coordinates
    noise_df = extract_noise_coords(noise_df)
    # The helper may find no coordinate source; make sure both columns exist
    # so the count below and the later spatial analysis can index them.
    for coord_col in ('latitude', 'longitude'):
        if coord_col not in noise_df.columns:
            noise_df[coord_col] = np.nan
    coord_count = noise_df[['latitude', 'longitude']].notna().all(axis=1).sum()
    print(f"\nNoise complaints with coordinates: {coord_count}")

    # Keep rows mentioning "noise" in the first known complaint column present
    complaint_cols = ['Complaint Type', 'complaint_type', 'Complaint Type Description', 'Descriptor', 'descriptor']
    noise_only = noise_df
    found = False
    for col in complaint_cols:
        if col in noise_df.columns:
            noise_only = noise_df[noise_df[col].astype(str).str.contains('Noise', case=False, na=False)].copy()
            found = True
            print(f"Filtered using column '{col}'. Results: {len(noise_only)} rows")
            break
    if not found:
        # None of the known complaint columns exist: fall back to keeping any
        # row whose text (object-dtype) columns mention "noise".
        text_cols = [c for c in noise_df.columns if noise_df[c].dtype == 'O']
        mask = pd.Series(False, index=noise_df.index)
        for c in text_cols:
            mask = mask | noise_df[c].astype(str).str.contains('noise', case=False, na=False)
        noise_only = noise_df[mask].copy()
        print(f"Filtered by searching text columns for 'noise'. Results: {len(noise_only)} rows")
else:
    print("Noise complaint data not available. Please download from NYC Open Data.")
    noise_only = None

## 7. Spatial Analysis (if noise data available)

In [None]:
# We'll compute distance (in meters) from each noise complaint to the nearest liquor license.
# Use BallTree with haversine (requires coordinates in radians) if available; fall back to vectorized haversine function.
from math import radians, sin, cos, sqrt, atan2
EARTH_RADIUS_M = 6371000  # Earth radius in meters; scales haversine angles to distances


def haversine_array(lat1, lon1, lat2_arr, lon2_arr):
    """Compute haversine distances in meters between a point and other points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the reference point, in degrees.
    lat2_arr, lon2_arr : float or array-like
        Latitudes and longitudes of the target points, in degrees. Arrays are
        processed element-wise through NumPy broadcasting.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance(s) in meters, with the broadcast shape of the inputs.
    """
    # All inputs are in degrees; the formula works in radians
    lat1r = np.radians(lat1)
    lon1r = np.radians(lon1)
    lat2r = np.radians(lat2_arr)
    lon2r = np.radians(lon2_arr)
    dlat = lat2r - lat1r
    dlon = lon2r - lon1r
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1r) * np.cos(lat2r) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) return NaN; clamp it first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))
    return EARTH_RADIUS_M * c

def compute_nearest_distances(noise_df, lic_df):
    """Find, for every noise complaint, the nearest liquor license.

    Rows lacking either coordinate are dropped from both inputs before the
    search. When scikit-learn is available a BallTree with the haversine
    metric is used; otherwise every complaint is compared against all
    licenses with ``haversine_array``.

    Parameters
    ----------
    noise_df : pandas.DataFrame
        Complaints with 'latitude' and 'longitude' columns (degrees).
    lic_df : pandas.DataFrame
        Licenses with 'latitude' and 'longitude' columns (degrees).

    Returns
    -------
    pandas.DataFrame or None
        Copy of the complaints that have coordinates (original index kept)
        with two added columns:

        * ``nearest_license_distance_m`` - distance to the nearest license, in meters.
        * ``nearest_license_idx`` - row *position* of that license within the
          coordinate-filtered license rows (not an index label of ``lic_df``).

        ``None`` is returned when either input has no usable coordinates or
        when the computation fails; a message is printed in both cases.
    """
    coord_cols = ['latitude', 'longitude']
    # Filter valid coordinates
    noise_valid = noise_df[noise_df[coord_cols].notna().all(axis=1)].copy()
    lic_valid = lic_df[lic_df[coord_cols].notna().all(axis=1)].copy()
    if len(noise_valid) == 0 or len(lic_valid) == 0:
        print("Not enough data for spatial analysis (missing coordinates).")
        return None
    try:
        # (n, 2) float arrays ordered [lat, lon]; the explicit dtype also
        # converts object-dtype columns that hold numeric values.
        lic_coords = lic_valid[coord_cols].to_numpy(dtype=float)
        noise_coords = noise_valid[coord_cols].to_numpy(dtype=float)
        if SKLEARN_AVAILABLE:
            # The haversine metric expects [lat, lon] in radians and returns
            # angles in radians, scaled to meters by the Earth radius.
            tree = BallTree(np.radians(lic_coords), metric='haversine')
            dist_rad, ind = tree.query(np.radians(noise_coords), k=1)
            nearest_dist = dist_rad.flatten() * EARTH_RADIUS_M
            nearest_idx = ind.flatten()
        else:
            # Fall back to comparing each complaint with every license
            # (works but is slower).
            lic_lat = lic_coords[:, 0]
            lic_lon = lic_coords[:, 1]
            nearest_dist = np.empty(len(noise_coords), dtype=float)
            nearest_idx = np.empty(len(noise_coords), dtype=int)
            for pos, (lat, lon) in enumerate(noise_coords):
                dist_arr = haversine_array(lat, lon, lic_lat, lic_lon)
                best = int(np.argmin(dist_arr))
                nearest_idx[pos] = best
                nearest_dist[pos] = dist_arr[best]
        # Both branches produce the same two columns
        noise_valid['nearest_license_distance_m'] = nearest_dist
        noise_valid['nearest_license_idx'] = nearest_idx
        return noise_valid
    except Exception as e:
        # Best-effort: report the problem and let the notebook continue
        print('Error computing distances:', e)
        return None

if noise_only is not None and len(noise_only) > 0:
    nearest = compute_nearest_distances(noise_only, liquor_df)
    if nearest is not None:
        # Share of complaints within each distance threshold
        thresholds = [200, 500]  # meters
        for t in thresholds:
            within = (nearest['nearest_license_distance_m'] <= t).sum()
            print(f"Complaints within {t} m of a liquor license: {within} ({within / len(nearest) * 100:.2f}%)")
        # Histogram of distances (only the 0-2000 m range is binned)
        plt.figure(figsize=(10, 6))
        plt.hist(nearest['nearest_license_distance_m'].dropna(), bins=50, range=(0, 2000))
        plt.title('Histogram of distances from noise complaints to nearest liquor license')
        plt.xlabel('Distance (m)')
        plt.ylabel('Number of complaints')
        plt.tight_layout()
        plt.show()

        # Temporal analysis (if a created-date column exists)
        date_cols = ['Created Date', 'created_date', 'Created', 'Created Date/Time', 'created_datetime']
        date_col = None
        for col in date_cols:
            if col in nearest.columns:
                date_col = col
                break
        if date_col is not None:
            nearest[date_col] = pd.to_datetime(nearest[date_col], errors='coerce')
            nearest['hour'] = nearest[date_col].dt.hour
            # Compare hourly patterns for complaints near licenses vs far
            nearest['near_200m'] = nearest['nearest_license_distance_m'] <= 200
            hourly = nearest.groupby(['hour', 'near_200m']).size().unstack(fill_value=0)
            # Per-group hourly shares (each column sums to 1).
            # NOTE(review): computed but not plotted; the chart below shows raw counts.
            hourly_norm = hourly.div(hourly.sum(axis=0), axis=1)
            hourly.plot(kind='bar', figsize=(14, 6))
            plt.title('Hourly noise complaints: near (<=200m) vs far (>200m)')
            plt.xlabel('Hour of day')
            plt.ylabel('Number of complaints')
            plt.tight_layout()
            plt.show()
        else:
            # f-string so the searched column names are actually shown
            print(f"No created date column found for temporal analysis. Searched columns: {date_cols}")

        # Add nearest license info back to the original noise dataframe (join
        # by index). Dropping any existing copy of the column first keeps the
        # cell re-runnable; join() raises on overlapping column names.
        noise_df = noise_df.drop(columns=['nearest_license_distance_m'], errors='ignore').join(
            nearest[['nearest_license_distance_m']], how='left'
        )
else:
    print("No noise complaints available for spatial analysis.")

## 8. Summary and Conclusions

In [None]:
print("Analysis Summary:")
print(f"\nTotal liquor licenses analyzed: {len(liquor_df)}")
if 'Premises County' in liquor_df.columns:
    print(f"Counties covered: {liquor_df['Premises County'].nunique()}")
else:
    print("Counties covered: N/A")
if 'Description' in liquor_df.columns:
    print(f"License types: {liquor_df['Description'].nunique()}")
else:
    print("License types: N/A")
# Same N/A convention as above when the coordinate columns were never created
coord_cols = ['longitude', 'latitude']
if set(coord_cols).issubset(liquor_df.columns):
    print(f"Records with valid geographic coordinates: {liquor_df[coord_cols].notna().all(axis=1).sum()}")
else:
    print("Records with valid geographic coordinates: N/A")

if noise_df is not None:
    # No blanket try/except here: a failure in these lookups should surface
    # instead of silently dropping the noise summary.
    print(f"\nNoise complaints analyzed (total): {len(noise_df)}")
    if 'nearest_license_distance_m' in noise_df.columns:
        print(f"Noise complaints with computed distance: {noise_df['nearest_license_distance_m'].notna().sum()}")
else:
    print("\nNote: Complete the analysis by adding noise complaint data.")

## Next Steps

1. Download NYC 311 noise complaint data from [NYC Open Data](https://data.cityofnewyork.us/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9)
2. Filter for noise-related complaints
3. Perform spatial join to find noise complaints near liquor licenses
4. Analyze temporal patterns (time of day, day of week)
5. Create visualizations showing correlations
6. Generate statistical reports