In [1]:
import warnings

import numpy as np
import pandas as pd
from scipy import stats

In [2]:
'''
numpy is going to complain about not being able to determine a dtype
when aggregating, but it can be safely ignored
'''
np.warnings.filterwarnings('ignore')

In [3]:
# One named path per input file, rather than a single reused variable.
problems_csv = './dryden_alex_ODM_final/raw_data/HDP_Complaint_Problems/Complaint_Problems.csv'
complaints_csv = './dryden_alex_ODM_final/raw_data/HDP_Complaints/Housing_Maintenance_Code_Complaints.csv'

problems_df = pd.read_csv(problems_csv)
complaints_df = pd.read_csv(complaints_csv)

Complaints and the problems that issue from them are stored in two separate data files. The foreign key is the complaint ID (`ComplaintID`), so we set it as the index on both frames and join them.

In [4]:
# ComplaintID is the key shared by both tables; index each frame on it so the
# join in the following cell can align rows by index.
problems_df = problems_df.set_index('ComplaintID')
complaints_df = complaints_df.set_index('ComplaintID')



In [5]:
# Left join on the shared index: every complaint row is kept, and columns that
# exist in both frames are disambiguated with the suffixes below.
com_prob = complaints_df.join(
    problems_df,
    how='left',
    lsuffix='_complaint',
    rsuffix='_problem',
)

In [6]:

# Left-pad block (5 characters) and lot (4 characters) with zeros so that,
# concatenated after the borough id, they form the standard 10 digit BBL.
pad_block = '{0:0>5}'.format
pad_lot = '{0:0>4}'.format
com_prob['Block'] = com_prob['Block'].map(pad_block)
com_prob['Lot'] = com_prob['Lot'].map(pad_lot)

# BBL = borough id + padded block + padded lot (block and lot are already
# strings after the padding step above).
borough_text = com_prob['BoroughID'].map(str)
com_prob['BBL'] = borough_text + com_prob['Block'] + com_prob['Lot']



In [7]:
# Parse the date columns with an explicit format. Declaring the format means
# any value that does not match raises instead of being guessed at, which helps
# catch data integrity problems (or confirms the initial cleaning found them).
date_format = "%m/%d/%Y"
for date_column in ('ReceivedDate', 'StatusDate_problem'):
    com_prob[date_column] = pd.to_datetime(com_prob[date_column], format=date_format)

In [8]:
# Intentionally no aggregation in this cell. It used to hold a placeholder,
# `com_prob.groupby('BBL').agg({})`, whose result was assigned to `f1` and then
# immediately overwritten by the real aggregation in the next cell. The
# placeholder grouped the full table for nothing, and an empty aggregation spec
# is not a meaningful request (NOTE(review): newer pandas versions may reject
# it outright - confirm before reinstating). `f1` is created in the next cell.

In [9]:
def date_mean(date):
    """Return the mean of a collection of dates as a ``numpy.datetime64[s]``.

    NumPy does not average datetimes directly, so the values are converted to
    whole seconds (8-byte integers), averaged, and converted back.

    Parameters
    ----------
    date : array-like
        Anything ``np.array(..., dtype='datetime64[s]')`` accepts, for example
        a pandas Series of timestamps. Values are converted to second
        resolution before averaging.

    Returns
    -------
    numpy.datetime64
        The mean at second resolution (any fractional second is truncated), or
        ``NaT`` when the input holds no non-missing values.
    """
    stamps = np.array(date, dtype='datetime64[s]')
    # NaT is stored as the smallest int64; left in place it would be averaged
    # as a huge negative number and silently corrupt the result.
    stamps = stamps[~np.isnat(stamps)]
    if stamps.size == 0:
        return np.datetime64('NaT', 's')
    mean_seconds = stamps.view('i8').mean()
    # int() truncates toward zero, matching a float -> datetime64[s] cast.
    return np.datetime64(int(mean_seconds), 's')
    
# Mean date on which complaints were received, one row per BBL.
rows_by_bbl = com_prob.groupby('BBL')
f1 = rows_by_bbl.agg({'ReceivedDate': date_mean})


In [10]:
# f1 holds a single aggregated column at this point; relabel it to say that it
# is a mean rather than a raw received date.
f1.columns = pd.Index(['MeanReceivedDate'])

In [11]:
# The UnitType values were flagged during the initial cleaning, so tabulate
# them to check whether they differ from what is listed in the source's data
# documentation.
see_values = com_prob.groupby(by='UnitType').size()
see_values

UnitType
APARTMENT     2270142
BUILDING          301
BUILDING-W     567184
PUBLIC ARE     204792
PUBLIC PAR        171
dtype: int64

In [12]:

def _rows_per_bbl(column, values):
    """Count the rows of com_prob per BBL whose `column` is one of `values`."""
    selected = com_prob[column].isin(values)
    return com_prob.loc[selected].groupby('BBL').size()


# count of apartment problems per building
f2 = _rows_per_bbl('UnitType', ['APARTMENT'])

# count of the non-apartment ones
f3 = _rows_per_bbl('UnitType', ['BUILDING', 'BUILDING-W', 'PUBLIC ARE', 'PUBLIC PAR'])

# every row, regardless of unit type
f4 = com_prob.groupby('BBL').size()

# one count per value of the Type column
f5 = _rows_per_bbl('Type', ['EMERGENCY'])
f6 = _rows_per_bbl('Type', ['HAZARDOUS'])
f7 = _rows_per_bbl('Type', ['IMMEDIATE EMERGENCY'])
f8 = _rows_per_bbl('Type', ['NON EMERGENCY'])

In [13]:
# Attach each per-BBL count to the summary frame. Assignment aligns on the BBL
# index, so a BBL absent from a count series gets a missing value in that column.
count_columns = {
    'ApartmentComplaints': f2,
    'BuildingComplaints': f3,
    'EMERGENCY': f5,
    'HAZARDOUS': f6,
    'IMMEDIATE_EMERGENCY': f7,
    'NON_EMERGENCY': f8,
    'TotalComplaints': f4,
}
for column_name, counts in count_columns.items():
    f1[column_name] = counts


In [14]:
# Replace every missing value in the summary frame with zero.
f1 = f1.fillna(0)


In [15]:
#save file
# `index` is an on/off switch for writing the row labels; the header text for
# that column belongs in `index_label`. The previous call passed the string
# 'BBL' as `index`, which only worked because a non-empty string is truthy.
f1.to_csv(
    r'./dryden_alex_ODM_final/intermediate_data/intermediate_csv/HDPComplaintProblemsClean.csv',
    index=True,
    index_label='BBL',
    header=True,
)

In [16]:
# Preview the first five rows of the summary frame.
f1.head(n=5)

Unnamed: 0_level_0,MeanReceivedDate,ApartmentComplaints,BuildingComplaints,EMERGENCY,HAZARDOUS,IMMEDIATE_EMERGENCY,NON_EMERGENCY,TotalComplaints
BBL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1000047501,2014-11-25 00:00:00,2.0,0.0,2.0,0.0,0.0,0.0,2
1000077501,2017-10-24 12:00:00,9.0,5.0,6.0,0.0,1.0,7.0,14
1000090014,2014-12-22 00:00:00,1.0,0.0,1.0,0.0,0.0,0.0,1
1000100014,2017-07-26 00:00:00,7.0,0.0,3.0,0.0,0.0,4.0,7
1000100032,2015-11-09 00:00:00,1.0,0.0,1.0,0.0,0.0,0.0,1
