In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import sklearn
from sklearn import manifold
from sklearn.manifold import TSNE
import matplotlib.cm as cm
from sklearn import cluster
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import math
import scipy
from scipy.spatial import distance_matrix
import itertools
import time

In [2]:
# Data Source: https://data.boston.gov/dataset/rentsmart
# Desc: Rentals only; Boston; 2016-present (previous years available 'through 1700s')

# "RentSmart Boston compiles data from BOS:311 and the City's Inspectional Services Division
# to give prospective tenants a more complete picture of the homes and apartments they are 
# considering renting, assisting them in understanding any previous issues with the property, 
# including: housing violations, building violations, enforcement violations, housing complaints,
# sanitation requests, and/or civic maintenance requests."

In [None]:
'''
TODO
- number of occupants, race, and income

'''

'\nTODO\n- correlations in space, time, building characteristics with complaint number and frequency\n\n'

In [84]:
# Location of the RentSmart CSV export, relative to the notebook's working
# directory. NOTE(review): the folder name suggests a 2022-03-14 download — confirm.
data_path = (
    'RentSmart/20220314_download/'
    'tmpdzll6bw0.csv'
)

In [85]:
# Load the RentSmart export; the 'date' column is converted to datetimes
# while the file is being read.
df = pd.read_csv(
    data_path,
    parse_dates=['date'],
)

In [86]:
# Quick size check of the loaded RentSmart frame: (rows, columns).
df.shape

(309756, 13)

In [87]:
# Tokenize every address on whitespace,
# e.g. '39 Millmont St, 02119' -> ['39', 'Millmont', 'St,', '02119'].
addr_splt = list(map(str.split, df.address))

In [88]:
# Peek at the first five token lists to see how the raw addresses split.
addr_splt[:5]

[['137', 'W', 'Ninth', 'St,', '02127'],
 ['141', 'W', 'Ninth', 'St,', '02127'],
 ['24-26', 'Faneuil', 'St,', '02135'],
 ['39', 'Millmont', 'St,', '02119'],
 ['19', 'Park', 'St,', '02122']]

In [89]:
def _normalize_addr_tokens(tokens):
    """Normalize one whitespace-split address into [number, NAME, SUFFIX, zip].

    Parameters
    ----------
    tokens : list of str
        One address split on whitespace,
        e.g. ['39', 'Millmont', 'St,', '02119'].

    Returns
    -------
    list of str
        A new four-element list with the street name and suffix upper-cased
        and the comma after the suffix removed. Any input that is not exactly
        four tokens is returned as four empty strings.
    """
    if len(tokens) != 4:
        # NOTE(review): this discards addresses with a directional prefix
        # ('137 W Ninth St, 02127') or a unit ('50 Florence St #3, 02131');
        # their blank parts will not line up with real building keys later on.
        return ['', '', '', '']
    st_num, st_name, st_suffix, zipcode = tokens
    # rstrip(',') instead of [:-1]: only an actual trailing comma is removed,
    # so a suffix without one keeps its last letter and re-running this cell
    # does not keep shortening the suffix ('ST' -> 'S').
    return [st_num, st_name.upper(), st_suffix.rstrip(',').upper(), zipcode]


# Build fresh lists instead of mutating in place so the cell is safe to re-run.
addr_splt = [_normalize_addr_tokens(tokens) for tokens in addr_splt]
        

In [90]:
# Re-inspect the same five rows after normalization; entries that did not
# split into exactly four tokens are now four empty strings.
addr_splt[:5]

[['', '', '', ''],
 ['', '', '', ''],
 ['24-26', 'FANEUIL', 'ST', '02135'],
 ['39', 'MILLMONT', 'ST', '02119'],
 ['19', 'PARK', 'ST', '02122']]

In [91]:
# Create the four address-part columns up front, filled with None, so they
# exist before the parsed values are written into them.
df['st_num'] = df['st_name'] = df['st_name_suf'] = df['zipcode_str'] = None

In [92]:
# Write the parsed address parts into the four address columns.
# Each inner list of addr_splt supplies, in order: street number, street name,
# street suffix and ZIP string (all '' when the address was not four tokens).
df[['st_num','st_name','st_name_suf','zipcode_str']] = addr_splt

In [93]:
# Display the frame to confirm the new address columns were populated
# (rows whose address failed the four-token check hold empty strings).
df

Unnamed: 0,date,violation_type,description,address,neighborhood,zip_code,parcel,owner,year built,year remodeled,property_type,latitude,longitude,st_num,st_name,st_name_suf,zipcode_str
0,2022-03-13 00:00:00.000,Enforcement Violations,Improper storage trash: res,"137 W Ninth St, 02127",South Boston,2127,700334000,ONE-37 W NINTH STREET,1920.0,1992.0,Condominium Main*,42.334460,-71.054240,,,,
1,2022-03-13 00:00:00.000,Enforcement Violations,Improper storage trash: res,"141 W Ninth St, 02127",South Boston,2127,700336000,LASOFF STEVEN S,1885.0,1990.0,Residential 3-family,42.334370,-71.054130,,,,
2,2022-03-12 00:00:00.000,Enforcement Violations,Improper storage trash: res,"24-26 Faneuil St, 02135",Brighton,2135,2202558000,LIU ZIYU,1930.0,2015.0,Residential 2-family,42.355008,-71.151007,24-26,FANEUIL,ST,02135
3,2022-03-12 00:00:00.000,Enforcement Violations,Improper storage trash: res,"39 Millmont St, 02119",Roxbury,2119,903678000,THREENINEMILLMONT LLC,1900.0,2015.0,Residential 3-family,42.327532,-71.090943,39,MILLMONT,ST,02119
4,2022-03-12 00:00:00.000,Enforcement Violations,Improper storage trash: res,"19 Park St, 02122",Dorchester,2122,1600041000,CORRIGAN JUSTIN,1890.0,1996.0,Residential 2-family,42.301993,-71.053850,19,PARK,ST,02122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309751,2017-04-09 14:22:00.000,Civic Maintenance Requests,Contractor Complaints,"24 Nottinghill Rd, 02135",Brighton,2135,2102699000,METCHIK IRETA Z,1925.0,,Residential 1-family,42.343680,-71.149700,24,NOTTINGHILL,RD,02135
309752,2017-04-09 14:03:00.000,Housing Complaints,Work w/out Permit,"96-98 Brookley Rd, 02130",Jamaica Plain,2130,1102723000,PAYNE THOMAS M,1910.0,,Residential 3-family,42.302863,-71.105433,96-98,BROOKLEY,RD,02130
309753,2017-04-09 13:31:48.610,Sanitation Requests,Abandoned Bicycle,"124 Chandler St, 02116",Boston,2116,400042000,ONE TWENTY FOUR CHANDLER ST,1999.0,2015.0,Condominium Main*,42.345980,-71.074040,124,CHANDLER,ST,02116
309754,2017-04-09 12:48:00.000,Sanitation Requests,Abandoned Vehicles,"50 Florence St #3, 02131",Roslindale,2131,1904207000,ROSYLN MANOR CONDO TR,1999.0,1999.0,Condominium Main*,42.285120,-71.126470,,,,


In [94]:
# Cast the RentSmart ZIP column to float so it has the same representation as
# the building inventory's float `zipcode` (e.g. 2119.0) when both are used as
# merge keys.
df['zip_code'] = df['zip_code'].astype(float)

In [6]:
# Location of the building inventory CSV, relative to the notebook's working
# directory.
data_path2 = (
    'buildings_inventory/'
    'building_inventory_021020.csv'
)

In [9]:
# Load the building inventory. low_memory=False has pandas infer each column's
# dtype from the whole file rather than chunk by chunk, avoiding mixed-type
# columns.
building_df = pd.read_csv(
    data_path2,
    low_memory=False,
)

In [83]:
# Spot-check how the inventory records addresses on one street (MILLMONT),
# to compare its key format with the parsed RentSmart address parts.
building_df.loc[
    building_df['st_name'].eq('MILLMONT'),
    ['st_num', 'st_name', 'st_name_suf', 'zipcode'],
]

Unnamed: 0,st_num,st_name,st_name_suf,zipcode
7438,5,MILLMONT,ST,2119.0
8862,29,MILLMONT,ST,2119.0
56933,39,MILLMONT,ST,2119.0
72683,15,MILLMONT,ST,2119.0
72704,23 -25,MILLMONT,ST,2119.0
74342,46,MILLMONT,ST,2119.0
86341,14,MILLMONT,ST,2119.0
93925,,MILLMONT,ST,2119.0
97951,16,MILLMONT,ST,2119.0


In [55]:
# Preview only the address-related columns of the building inventory.
building_df.loc[:, ['st_num', 'st_name', 'st_name_suf', 'unit_num', 'zipcode']]

Unnamed: 0,st_num,st_name,st_name_suf,unit_num,zipcode
0,,,,,
1,5,EVERETT,ST,,2122.0
2,43,ESSEX,ST,,2129.0
3,525,WESTERN,AV,,2135.0
4,,,,,
...,...,...,...,...,...
98925,,,,,
98926,19,AVALON,RD,,2132.0
98927,46,GERRISH,ST,,2135.0
98928,,,,,


In [36]:
# List the distinct unit_num values to see whether the inventory carries any
# usable unit information.
building_df['unit_num'].unique()

array([nan])

In [96]:
# Attach building-inventory columns to each RentSmart record by matching on
# street number, street name, street suffix and ZIP.
# how='inner' keeps only records that find a matching building row.
# NOTE(review): records whose address parts were blanked to '' presumably
# cannot match any building and are dropped here — verify.
# NOTE(review): this rebinds `df`, so the cell is not safe to run twice in a row.
df = pd.merge(
    df,
    building_df,
    how='inner',
    left_on=['st_num', 'st_name', 'st_name_suf', 'zip_code'],
    right_on=['st_num', 'st_name', 'st_name_suf', 'zipcode'],
)

In [97]:
# Persist the merged frame. The row index is written as the first column
# (index=True is the pandas default, stated explicitly here).
df.to_csv('rentsmart_withoccdata.csv', index=True)

In [98]:
# Size of the merged frame: (rows, columns).
df.shape

(127232, 121)

In [None]:
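# TODO: the inner merge above keeps only records with a matching building;
# consider an outer (or left) merge so unmatched RentSmart records are retained.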