In [1]:
import json

import numpy as np
import pandas as pd
from money_parser import price_dec

pd.options.display.max_columns = None

In [2]:
def curr_to_float(v):
    """Convert a currency value to a float using money_parser's ``price_dec``.

    Falsy input (empty string, None) yields NaN rather than being passed
    through unchanged, so a converted column holds only floats instead of a
    mix of floats and empty strings.
    """
    if not v:
        return np.nan
    return float(price_dec(v))
def tf_to_10(v):
    """Map the flag ``'t'`` to 1 and any other value to 0."""
    return int(v == 't')
def nulls_to_0(v):
    """Convert ``v`` to float, treating null or empty-string input as 0.0."""
    is_missing = pd.isnull(v) or v == ''
    if is_missing:
        return 0.0
    return float(v)

# Each entry pairs a converter with the listing columns it is applied to.
converters_to_columns = [
    (curr_to_float, ['price', 'weekly_price', 'monthly_price']),
    (tf_to_10, ['instant_bookable', 'host_is_superhost']),
    (nulls_to_0, ['bedrooms', 'bathrooms']),
]

# Flatten the pairs into the {column: converter} mapping that read_csv takes.
converters = {}
for _converter, _columns in converters_to_columns:
    converters.update(dict.fromkeys(_columns, _converter))

# Load the listings, indexed by listing id, converting columns while parsing.
df = pd.read_csv(
    'data/listings.csv',
    index_col='id',
    converters=converters,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Load the NYPD complaint records, indexed by complaint number.
complaints_df = pd.read_csv(
    'data/NYPD_Complaint_Data_Historic.csv',
    index_col='CMPLNT_NUM',
)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


In [4]:
# Keep only complaints that have a latitude; work on an independent copy so
# columns can be added later without touching complaints_df.
has_latitude = complaints_df['Latitude'].notnull()
cleaned_complaints_df = complaints_df.loc[has_latitude].copy()

In [5]:
# Bin each coordinate into 50 equal-width intervals spanning its observed range.
lats, lons = (
    pd.cut(cleaned_complaints_df[coordinate], 50)
    for coordinate in ('Latitude', 'Longitude')
)

In [6]:
def _interval_label(interval):
    """Render an interval as the text 'left,right'."""
    return ','.join([str(interval.left), str(interval.right)])


# Replace each interval with its 'left,right' label. This rebinds lats/lons
# to the labels, so the cell expects them to still hold intervals when it runs.
lats = lats.apply(_interval_label)
lons = lons.apply(_interval_label)

In [7]:
# Combine the latitude and longitude labels element-wise into one
# comma-separated label per complaint.
boxes = lats.str.cat(others=lons, sep=',')

In [8]:
# Store each complaint's box label alongside the complaint itself.
cleaned_complaints_df.loc[:, 'boxes'] = boxes

In [9]:
def convert_to_number(val):
    """Return the numeric weight for a law category code.

    'FELONY' maps to 5 and 'MISDEMEANOR' to 1; every other value maps to 0.1.
    """
    weights = {'FELONY': 5, 'MISDEMEANOR': 1}
    return weights.get(val, 0.1)

# Weight every complaint by its law category.
law_category_weights = cleaned_complaints_df['LAW_CAT_CD'].map(convert_to_number)
cleaned_complaints_df['LAW_CAT_CD_NUMBER'] = law_category_weights

In [10]:
# Sum the complaint weights per box; the box label becomes a regular column again.
weights_by_complaint = cleaned_complaints_df[['boxes', 'LAW_CAT_CD_NUMBER']]
weight_per_box = weights_by_complaint.groupby('boxes').sum()
box_df = weight_per_box.reset_index().copy()

In [11]:
# Recover the numeric box edges from the comma-separated box label.
# The label is split once and each piece is picked by position, instead of
# re-splitting every label four times through a Python lambda.
label_parts = box_df['boxes'].str.split(',')
bound_columns = ['lower_latitude', 'upper_latitude', 'lower_longitude', 'upper_longitude']
for position, column in enumerate(bound_columns):
    box_df[column] = label_parts.str[position].astype(float)

In [12]:
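# Disabled experiment, kept for reference: move the outermost box edges outward
# (smallest lower latitude -> 0, largest upper latitude -> +1, smallest lower
# longitude -> -1, largest upper longitude -> 0), presumably so that listings
# just outside the complaint extent would still fall into a border box.
# box_df['lower_latitude'] = box_df['lower_latitude'].apply(lambda v: 0 if v == box_df['lower_latitude'].min() else v)
# box_df['upper_latitude'] = box_df['upper_latitude'].apply(lambda v: (v+1) if v == box_df['upper_latitude'].max() else v)
# box_df['lower_longitude'] = box_df['lower_longitude'].apply(lambda v: (v-1) if v == box_df['lower_longitude'].min() else v)
# box_df['upper_longitude'] = box_df['upper_longitude'].apply(lambda v: 0 if v == box_df['upper_longitude'].max() else v)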

In [12]:
# The midpoint of each box is the row-wise mean of its lower and upper edge.
lat_cols = box_df[['lower_latitude', 'upper_latitude']]
lon_cols = box_df[['lower_longitude', 'upper_longitude']]

box_df['mean_latitude'] = lat_cols.mean(axis=1)
box_df['mean_longitude'] = lon_cols.mean(axis=1)

In [13]:
# Attach to every listing the crime box that contains its coordinates.
# A listing belongs to a box when lower < value <= upper on both axes; if
# several boxes match, the first one in box_df order is used.
# Listings that fall in no box keep NaN for the box label and bounds and get
# a crime_score of 0. This is tested explicitly rather than by catching every
# exception, so genuine errors (e.g. a missing column) are no longer reported
# as "no box found".

# Loop-invariant box data, extracted once as plain arrays.
box_labels = np.asarray(box_df['boxes'], dtype=object)
box_scores = box_df['LAW_CAT_CD_NUMBER'].values
box_bounds = box_df[['lower_latitude', 'upper_latitude',
                     'lower_longitude', 'upper_longitude']].values
lower_lat, upper_lat, lower_lon, upper_lon = box_bounds.T

# Results are collected positionally and written to df once at the end.
listing_count = len(df)
matched_labels = np.full(listing_count, np.nan, dtype=object)
matched_bounds = np.full((listing_count, 4), np.nan)
matched_scores = np.zeros(listing_count)

done_count = 0
for index, lat, lon in zip(df.index, df['latitude'].values, df['longitude'].values):
    inside = (lat > lower_lat) & (lat <= upper_lat) & (lon > lower_lon) & (lon <= upper_lon)
    hits = np.flatnonzero(inside)
    if hits.size:
        first_hit = hits[0]
        matched_labels[done_count] = box_labels[first_hit]
        matched_bounds[done_count] = box_bounds[first_hit]
        matched_scores[done_count] = box_scores[first_hit]
    else:
        # Report listings that lie outside every box.
        print('index', index)
    done_count += 1
    if done_count % 2500 == 0:
        print('done ', done_count)

df['box'] = matched_labels
df['box_lower_latitude'] = matched_bounds[:, 0]
df['box_upper_latitude'] = matched_bounds[:, 1]
df['box_lower_longitude'] = matched_bounds[:, 2]
df['box_upper_longitude'] = matched_bounds[:, 3]
df['crime_score'] = matched_scores

index 1391683
done  2500
done  5000
index 5251338
done  7500
done  10000
done  12500
done  15000
done  17500
done  20000
done  22500
done  25000
done  27500
done  30000
index 26703677
index 27301638
done  32500
done  35000
done  37500
done  40000
done  42500
index 36417725
done  45000
done  47500
index 39447248
index 39811924
index 40346146
done  50000
index 40452853


In [14]:
# Mean crime score per neighbourhood, highest first.
(
    df.groupby('neighbourhood_cleansed')[['crime_score']]
    .mean()
    .sort_values('crime_score', ascending=False)
)

Unnamed: 0_level_0,crime_score
neighbourhood_cleansed,Unnamed: 1_level_1
Theater District,20595.711650
NoHo,14087.600000
Chelsea,13958.700861
Hell's Kitchen,12516.142008
Flatiron District,12484.552000
...,...
City Island,179.488889
Huguenot,177.600000
Lighthouse Hill,170.200000
Neponsit,163.100000


In [15]:
# Give the box table its export column names and fix the column order.
export_names = {'LAW_CAT_CD_NUMBER': 'crime_score', 'boxes': 'box'}
export_order = [
    'lower_latitude', 'upper_latitude', 'mean_latitude',
    'lower_longitude', 'upper_longitude', 'mean_longitude',
    'box', 'crime_score',
]
box_df = box_df.rename(columns=export_names)[export_order]

In [16]:
# Write the per-box crime scores to disk (the index is written as well).
box_scores_path = 'data/crime_scores_granular.csv'
box_df.to_csv(box_scores_path)

In [17]:
# Write only the box and crime-score columns of the listings; the listing
# index is written along with them.
listing_score_columns = [
    'box_lower_latitude', 'box_upper_latitude', 'box_lower_longitude',
    'box_upper_longitude', 'box', 'crime_score',
]
df[listing_score_columns].to_csv('data/listings_with_crime_scores_granular.csv')