# Getting and cleaning complaint problems data

First, data from the NYC Housing Complaint problems dataset are read into a pandas data frame.

In [None]:
import pandas as pd
import numpy as np
import re

### 

def get_problem_data(url="https://data.cityofnewyork.us/api/views/a2nx-4u46/rows.csv?accessType=DOWNLOAD"):
    """Read the complaint problems table into a pandas DataFrame.

    Parameters
    ----------
    url : str or file-like, optional
        Anything ``pd.read_csv`` accepts (URL, local path or open buffer).
        Defaults to the CSV download of view ``a2nx-4u46`` on
        data.cityofnewyork.us, so calling the function with no arguments
        behaves exactly as before. Pass a local path to work from a saved
        copy instead of re-downloading the file.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, with no cleaning applied.
    """
    return pd.read_csv(url)
            
# Load the full dataset once; the cells below rebind `complaint_problems`
# as columns and rows are dropped.
complaint_problems = get_problem_data()
# Preview the first three rows.
complaint_problems.head(3)

Next, the dataset is cleaned as follows:
* The following features are dropped:
    * UnitType, SpaceType, Type, MajorCategory, MinorCategory, Code, Status

In [2]:
# Shape before and after, to confirm that exactly seven columns are removed.
print(complaint_problems.shape)
# Name the axis explicitly: passing it positionally (a bare `1`) is no longer
# accepted by recent pandas releases.
complaint_problems = complaint_problems.drop(
    ['UnitType', 'SpaceType', 'Type', 'MajorCategory', 'MinorCategory', 'Code', 'Status'],
    axis=1
)
print(complaint_problems.shape)

(852936, 18)
(852936, 11)


Then rows are dropped if:

* StatusID is not 2 (i.e. the record is not closed; StatusID == 1 marks an open record)
* They are incomplete (i.e. only complete records are included in the analytic sample)

In [3]:
# Keep only the rows whose StatusID is 2.
complaint_problems = complaint_problems.loc[complaint_problems['StatusID'] == 2]
print(complaint_problems.shape)
# Discard every row that has a missing value in any column.
complaint_problems = complaint_problems.dropna(axis=0, how='any')
print(complaint_problems.shape)

(830437, 11)
(827515, 11)


Next, rows are dropped if:

* Values in the fields UnitTypeID, SpaceTypeID, TypeID, MajorCategoryID, MinorCategoryID, and CodeID are not in the codebook (implying the value is missing or mis-entered).

Sets of allowed code values are read in from txt files, which were produced from the HPD-produced codebook 'HPD_Complaints_Open_Data.pdf' as follows:

* 'HPD_Complaints_Open_Data.pdf' was converted to an html file using the pdfminer package.
* All text from each section of the codebook was copied from this html page into a text file.
* This text file was manually cleaned and everything except allowed code values was deleted.

In [32]:
def get_allowed_codes_from_txt(feature_name, codebook_dir='codebook_for_complaint_problems'):
    """Load the allowed integer codes for one codebook feature.

    Parameters
    ----------
    feature_name : str
        Base name of the text file to read, without the ``.txt`` extension
        (e.g. ``'UnitType'``).
    codebook_dir : str, optional
        Directory holding the per-feature text files. Defaults to the
        directory the notebook has always used, so existing one-argument
        calls are unaffected.

    Returns
    -------
    numpy.ndarray
        One-dimensional integer array holding every value found in the file.
    """
    file_path = '{0}/{1}.txt'.format(codebook_dir, feature_name)
    # header=None: the files hold bare code values, with no header row.
    code_table = pd.read_csv(file_path, header=None)
    return code_table.values.ravel().astype(int)

# Read the six codebook files in one pass; the i-th file name feeds the i-th
# variable. Note that the allowed TypeID values come from 'ProblemType.txt'.
(allowed_UnitTypeID,
 allowed_TypeID,
 allowed_SpaceTypeID,
 allowed_MajorCategoryID,
 allowed_MinorCategoryID,
 allowed_CodeID) = map(
    get_allowed_codes_from_txt,
    ('UnitType', 'ProblemType', 'SpaceType', 'MajorCategory', 'MinorCategory', 'Code')
)

# Spot-check one of the loaded code sets.
print(allowed_UnitTypeID)

[91 92 93]


In [5]:
## List the column names: the SpaceTypeID header carries a trailing space
## ('SpaceTypeID '), which has to be fixed before filtering on that column.
print(complaint_problems.columns)

Index([u'ProblemID', u'ComplaintID', u'UnitTypeID', u'SpaceTypeID ', u'TypeID',
       u'MajorCategoryID', u'MinorCategoryID', u'CodeID', u'StatusID',
       u'StatusDate', u'StatusDescription'],
      dtype='object')


In [6]:
## Strip stray whitespace from every header (the raw file has 'SpaceTypeID ').
## This avoids re-typing all eleven names by position, which would silently
## mislabel columns if the upstream column order or count ever changed.
complaint_problems = complaint_problems.rename(columns=lambda name: name.strip())

In [33]:
print(complaint_problems.shape)

# (column, allowed codes) pairs, applied in this order. Each step keeps only
# the rows whose value appears in the codebook, and the shape is printed after
# every step so the number of rows each filter removes stays visible.
codebook_filters = [
    ('SpaceTypeID', allowed_SpaceTypeID),
    ('UnitTypeID', allowed_UnitTypeID),
    ('TypeID', allowed_TypeID),
    ('MajorCategoryID', allowed_MajorCategoryID),
    ('MinorCategoryID', allowed_MinorCategoryID),
    ('CodeID', allowed_CodeID),
]

complaint_problems_proc = complaint_problems
for column_name, allowed_codes in codebook_filters:
    is_allowed = complaint_problems_proc[column_name].isin(allowed_codes)
    complaint_problems_proc = complaint_problems_proc[is_allowed]
    print(complaint_problems_proc.shape)

(827515, 11)
(787767, 11)
(787767, 11)
(787767, 11)
(787767, 11)
(787251, 11)
(777240, 11)


Then, StatusDescriptions are processed and coded using the scheme outlined below:

In [34]:
### Classes assigned to StatusDescription (the first matching phrase wins):
##
## 1. "not able to gain access"
## 1. "unable to access"
## 2. "inspected the following conditions. No violations were issued"
## 2. "Heat was not required at the time of the inspection. No violations were issued"
## 3. ". Violations were issued"
## 4. ". Violations were previously issued"
## 5. "conditions were corrected"
## 5. "advised by a tenant" [that heat or hot water was restored]
## 6. "conditions are still open"
## 7. "inspection to test the paint for lead"
## 8. ". A Section 8 Failure was issued."
## 0. Not one of the above.

# Ordered (compiled pattern, code) rules; the order is significant because
# matching stops at the first hit. Compiled once here rather than on each call.
_STATUS_DESCRIPTION_RULES = tuple(
    (re.compile(pattern), code)
    for pattern, code in (
        (r'not\sable\sto\sgain\saccess', 1),
        (r'unable\sto\saccess', 1),
        (r'inspected\sthe\sfollowing\sconditions\.\sNo\sviolations\swere\sissued', 2),
        (r'Heat\swas\snot\srequired\sat\sthe\stime\sof\sthe\sinspection\.\sNo\sviolations\swere\sissued', 2),
        (r'\.\sViolations\swere\sissued', 3),
        (r'\.\sViolations\swere\spreviously\sissued', 4),
        (r'conditions\swere\scorrected', 5),
        (r'advised\sby\sa\stenant', 5),
        (r'conditions\sare\sstill\sopen', 6),
        (r'inspection\sto\stest\sthe\spaint\sfor\slead', 7),
        (r'\.\sA\sSection\s8\sFailure\swas\sissued\.', 8),
    )
)


def infer_complaint_status(input_string):
    """Map a free-text status description to its integer class code.

    Parameters
    ----------
    input_string : object
        The status description. Non-string values are converted with
        ``str()`` before matching.

    Returns
    -------
    int
        The code (1-8) of the first rule whose pattern occurs in the text,
        or 0 when no rule matches.
    """
    try:
        text = str(input_string)
    except UnicodeError:
        # On Python 2, str() cannot encode non-ASCII unicode text; match
        # against the value unchanged instead of hiding every other error.
        text = input_string
    for pattern, code in _STATUS_DESCRIPTION_RULES:
        if pattern.search(text):
            return code
    return 0

# Code every free-text StatusDescription with infer_complaint_status;
# a resulting value of 0 means that none of the patterns matched.
complaint_problems_proc['StatusDescriptionID'] = complaint_problems_proc['StatusDescription'].map(infer_complaint_status)

In [35]:
# Count the rows whose description matched none of the patterns (code 0).
print((complaint_problems_proc['StatusDescriptionID'] == 0).sum())

0


Since there are no records with StatusDescriptionID == 0, we know we've successfully handled every status description. As such, we proceed by dropping StatusID (since they're all 2), and StatusDescription.

In [36]:
# StatusID is constant by now (only StatusID == 2 rows were kept) and
# StatusDescription is summarized by StatusDescriptionID, so drop both.
complaint_problems_proc = complaint_problems_proc.drop(['StatusID','StatusDescription'],axis=1)
# Preview the first three rows of the processed frame.
complaint_problems_proc.head(3)

Unnamed: 0,ProblemID,ComplaintID,UnitTypeID,SpaceTypeID,TypeID,MajorCategoryID,MinorCategoryID,CodeID,StatusDate,StatusDescriptionID
0,14603248,6991025,91,541,1,9,65,624,09/13/2014,3
1,14643878,7007545,91,543,1,59,348,2834,09/03/2014,5
2,14630215,7000457,91,541,3,65,381,2831,10/24/2014,1


Finally, we convert StatusDate to a datetime feature.

In [38]:
# Parse the StatusDate strings into datetimes, then confirm the new dtype.
complaint_problems_proc['StatusDate'] = pd.to_datetime(complaint_problems_proc['StatusDate'])
print(complaint_problems_proc['StatusDate'].dtype)

datetime64[ns]


To recap: we now have a processed complaint problems dataset including all closed records with valid UnitTypeID, SpaceTypeID, TypeID, MajorCategoryID, MinorCategoryID, and CodeID entries. All text features have been dropped (with the information retained in the ID codes), and a new StatusDescriptionID has been constructed that summarizes the endpoint for each complaint problem.