# Getting and cleaning complaint problems data

First, data from the NYC Housing Complaint problems dataset are read into a pandas data frame.

In [48]:
import pandas as pd
import numpy as np
import re

### 

def get_problem_data(source="https://data.cityofnewyork.us/api/views/a2nx-4u46/rows.csv?accessType=DOWNLOAD"):
    """Read the NYC Housing Complaint problems dataset into a DataFrame.

    Parameters
    ----------
    source : str or file-like, optional
        Anything ``pandas.read_csv`` accepts (URL, local path or open buffer).
        Defaults to the NYC Open Data CSV export, so calling the function with
        no arguments downloads the full dataset as before. Passing a local copy
        avoids re-downloading on every kernel restart.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, with column names exactly as they appear in the file.
    """
    return pd.read_csv(source)
            
# Fetch the raw dataset and preview its first three rows.
complaint_problems = get_problem_data()
complaint_problems.head(n=3)

Unnamed: 0,ProblemID,ComplaintID,UnitTypeID,UnitType,SpaceTypeID,SpaceType,TypeID,Type,MajorCategoryID,MajorCategory,MinorCategoryID,MinorCategory,CodeID,Code,StatusID,Status,StatusDate,StatusDescription
0,14603248,6991025,91,APARTMENT,541,BATHROOM,1,EMERGENCY,9,PLUMBING,65,BASIN/SINK,624,DRAIN PIPE BLOCKED OR BROKEN,2,CLOSE,09/13/2014,The Department of Housing Preservation and Dev...
1,14643878,7007545,91,APARTMENT,543,ENTIRE APARTMENT,1,EMERGENCY,59,HEAT/HOT WATER,348,APARTMENT ONLY,2834,NO HOT WATER,2,CLOSE,09/03/2014,The Department of Housing Preservation and Dev...
2,14630215,7000457,91,APARTMENT,541,BATHROOM,3,NON EMERGENCY,65,WATER LEAK,381,SLOW LEAK,2831,AT WALL OR CEILING,2,CLOSE,10/24/2014,The Department of Housing Preservation and Dev...


Next, the dataset is cleaned as follows:
* The following text features are dropped, since their information is retained in the corresponding ID columns: UnitType, SpaceType, Type, MajorCategory, MinorCategory, Code, and Status.

In [49]:
# Free-text label columns; each has an ID counterpart that is kept.
text_columns = ['UnitType', 'SpaceType', 'Type', 'MajorCategory', 'MinorCategory', 'Code', 'Status']

print(complaint_problems.shape)
# Pass the axis by keyword: the positional form is deprecated in newer pandas,
# and the keyword form matches the other drop() calls in this notebook.
complaint_problems = complaint_problems.drop(text_columns, axis=1)
print(complaint_problems.shape)

(852936, 18)
(852936, 11)


Then rows are dropped if:

* StatusID != 2 (i.e. the record is not closed; only records with StatusID == 2, "CLOSE", are kept)
* They are incomplete (i.e. only complete records are included in the analytic sample)

In [50]:
# Keep closed records only (StatusID == 2).
is_closed = complaint_problems['StatusID'] == 2
complaint_problems = complaint_problems[is_closed]
print(complaint_problems.shape)

# Keep only rows in which every field is populated.
is_complete = complaint_problems.notnull().all(axis=1)
complaint_problems = complaint_problems[is_complete]
print(complaint_problems.shape)

(830437, 11)
(827515, 11)


Next, rows are dropped if:

* Values in the ID fields UnitTypeID, SpaceTypeID, TypeID, MajorCategoryID, MinorCategoryID, and CodeID are not in the codebook (implying the value is missing or mis-entered).

Sets of allowed code values are read in from txt files, which were produced from the HPD-produced codebook 'HPD_Complaints_Open_Data.pdf' as follows:

* 'HPD_Complaints_Open_Data.pdf' was converted to an html file using the pdfminer package.
* All text from each section of the codebook was copied from this html page into a text file.
* This text file was manually cleaned and everything except allowed code values was deleted.

In [51]:
def get_allowed_codes_from_txt(feature_name, codebook_dir='codebook_for_complaint_problems'):
    """Load the allowed integer codes for one codebook feature.

    Parameters
    ----------
    feature_name : str
        Base name of the codebook file (without the ``.txt`` extension),
        e.g. ``'UnitType'``.
    codebook_dir : str, optional
        Directory holding the per-feature ``.txt`` files. Defaults to the
        directory used throughout this notebook, so existing calls behave
        exactly as before.

    Returns
    -------
    numpy.ndarray
        One-dimensional integer array containing every value found in the
        file (all rows and columns, flattened in row-major order).
    """
    file_path = '{}/{}.txt'.format(codebook_dir, feature_name)
    # The files carry no header row; every cell is an allowed code value.
    code_table = pd.read_csv(file_path, header=None)
    return code_table.values.ravel().astype(int)

# Load the allowed code values for each ID column from its codebook file.
# Note that the TypeID codes live in 'ProblemType.txt'.
(allowed_UnitTypeID,
 allowed_TypeID,
 allowed_SpaceTypeID,
 allowed_MajorCategoryID,
 allowed_MinorCategoryID,
 allowed_CodeID) = [
    get_allowed_codes_from_txt(codebook_name)
    for codebook_name in ('UnitType', 'ProblemType', 'SpaceType',
                          'MajorCategory', 'MinorCategory', 'Code')
]

Note that the `SpaceTypeID ` column name contains an extra trailing whitespace character:

In [52]:
# Show the raw column labels so stray whitespace in a name is visible.
print(complaint_problems.columns)

Index([u'ProblemID', u'ComplaintID', u'UnitTypeID', u'SpaceTypeID ', u'TypeID',
       u'MajorCategoryID', u'MinorCategoryID', u'CodeID', u'StatusID',
       u'StatusDate', u'StatusDescription'],
      dtype='object')


So we need to first rename the columns.

In [53]:
# Strip stray whitespace from every column label (e.g. 'SpaceTypeID ' -> 'SpaceTypeID').
# Deriving the new labels from the existing ones avoids overwriting all names
# positionally, which would silently mislabel columns if the source file's
# column order or count ever changed. Re-running this cell is harmless.
complaint_problems.columns = [label.strip() for label in complaint_problems.columns]

In [113]:
# (ID column, allowed codebook values) pairs, applied in this order so the
# printed shapes line up with each successive filter.
codebook_filters = [
    ('SpaceTypeID', allowed_SpaceTypeID),
    ('UnitTypeID', allowed_UnitTypeID),
    ('TypeID', allowed_TypeID),
    ('MajorCategoryID', allowed_MajorCategoryID),
    ('MinorCategoryID', allowed_MinorCategoryID),
    ('CodeID', allowed_CodeID),
]

print(complaint_problems.shape)
complaint_problems_proc = complaint_problems
for id_column, allowed_values in codebook_filters:
    # Keep only rows whose code for this column appears in the codebook.
    in_codebook = complaint_problems_proc[id_column].isin(allowed_values)
    complaint_problems_proc = complaint_problems_proc[in_codebook]
    print(complaint_problems_proc.shape)

(827515, 11)
(787767, 11)
(787767, 11)
(787767, 11)
(787767, 11)
(787251, 11)
(777240, 11)


Then, StatusDescriptions are processed and coded using the scheme outlined below:

In [114]:
###Proposed classes for StatusDescription:

## 1. "not able to gain access" = r'not\sable\sto\sgain\saccess'
##1. "unable to access" = r'unable\sto\saccess'
## 2. "inspected the following conditions. No violations were issued." = r'inspected\sthe\sfollowing\sconditions\.\sNo\sviolations\swere\sissued'
##2. "Heat was not required at the time of the inspection. No violations were issued" = r'Heat\swas\snot\srequired\sat\sthe\stime\sof\sthe\sinspection\.\sNo\sviolations\swere\sissued'
## 3. ". Violations were issued" = r'\.\sViolations\swere\sissued'
##4. ". Violations were previously issued" = r'\.\sViolations\swere\spreviously\sissued'
##5. "conditions were corrected" = r'conditions\swere\scorrected'
##5. "advised by a tenant" [that heat or hot water was restored] = r'advised\sby\sa\stenant'
##6. "conditions are still open" = r'conditions\sare\sstill\sopen'
##7. "inspection to test the paint for lead" = r'inspection\sto\stest\sthe\spaint\sfor\slead'
##8. ". A Section 8 Failure was issued." = r'\.\sA\sSection\s8\sFailure\swas\sissued\.'
##0. Not one of the above.

# Ordered (compiled pattern, class code) rules. The first pattern found in a
# description determines its code, so the order is part of the coding scheme.
_STATUS_DESCRIPTION_RULES = [
    (re.compile(r'not\sable\sto\sgain\saccess'), 1),
    (re.compile(r'unable\sto\saccess'), 1),
    (re.compile(r'inspected\sthe\sfollowing\sconditions\.\sNo\sviolations\swere\sissued'), 2),
    (re.compile(r'Heat\swas\snot\srequired\sat\sthe\stime\sof\sthe\sinspection\.\sNo\sviolations\swere\sissued'), 2),
    (re.compile(r'\.\sViolations\swere\sissued'), 3),
    (re.compile(r'\.\sViolations\swere\spreviously\sissued'), 4),
    (re.compile(r'conditions\swere\scorrected'), 5),
    (re.compile(r'advised\sby\sa\stenant'), 5),
    (re.compile(r'conditions\sare\sstill\sopen'), 6),
    (re.compile(r'inspection\sto\stest\sthe\spaint\sfor\slead'), 7),
    (re.compile(r'\.\sA\sSection\s8\sFailure\swas\sissued\.'), 8),
]


def infer_complaint_status(input_string):
    """Map a free-text StatusDescription to an integer class code.

    Codes:
        1 - inspector could not gain access
        2 - inspected, no violations were issued (incl. heat not required)
        3 - violations were issued
        4 - violations were previously issued
        5 - conditions were corrected / tenant advised of restoration
        6 - conditions are still open
        7 - inspection to test the paint for lead
        8 - a Section 8 failure was issued
        0 - none of the above

    Parameters
    ----------
    input_string : object
        The status description. Non-string values (e.g. None or NaN) are
        converted with ``str()`` and therefore normally map to 0.

    Returns
    -------
    int
        The code of the first matching rule, or 0 if no rule matches.
    """
    try:
        text = str(input_string)
    except UnicodeError:
        # Python 2 only: non-ASCII unicode cannot be coerced to a byte string.
        # The patterns can search the unicode value directly, so keep it as is.
        text = input_string

    for pattern, code in _STATUS_DESCRIPTION_RULES:
        if pattern.search(text):
            return code
    return 0

# Code every status description. assign() returns a new frame, so the column is
# not written into a frame obtained by boolean filtering (which can trigger
# pandas' SettingWithCopyWarning).
complaint_problems_proc = complaint_problems_proc.assign(
    StatusDescriptionID=complaint_problems_proc['StatusDescription'].map(infer_complaint_status)
)

In [115]:
# Number of descriptions that matched none of the coding rules (code 0).
uncoded_rows = complaint_problems_proc[complaint_problems_proc['StatusDescriptionID'] == 0]
print(len(uncoded_rows))

0


Since there are no records with StatusDescriptionID == 0, we know we've successfully handled every status description. As such, we proceed by dropping StatusID (since they're all 2), and StatusDescription.

In [116]:
# StatusID is constant (2) and StatusDescription is now encoded in StatusDescriptionID.
redundant_columns = ['StatusID', 'StatusDescription']
complaint_problems_proc = complaint_problems_proc.drop(redundant_columns, axis=1)
complaint_problems_proc.head(n=3)

Unnamed: 0,ProblemID,ComplaintID,UnitTypeID,SpaceTypeID,TypeID,MajorCategoryID,MinorCategoryID,CodeID,StatusDate,StatusDescriptionID
0,14603248,6991025,91,541,1,9,65,624,09/13/2014,3
1,14643878,7007545,91,543,1,59,348,2834,09/03/2014,5
2,14630215,7000457,91,541,3,65,381,2831,10/24/2014,1


Next we need to determine the number of problems associated with a complaint.

In [117]:
# Count how many problem rows share each ComplaintID; the Series name becomes
# the column label when it is joined back onto the frame.
prob_in_complaint = complaint_problems_proc['ComplaintID'].value_counts()
prob_in_complaint.name = 'Probs_in_complaint'

In [118]:
# Index by ComplaintID (keeping the column) so the per-complaint counts can be
# joined on the index, then report the resulting shape.
complaint_problems_proc = (
    complaint_problems_proc
    .set_index('ComplaintID', drop=False)
    .join(prob_in_complaint, how='inner')
)
complaint_problems_proc.shape

(777240, 11)

In [119]:
# Discard the ComplaintID index (the column copy is kept) in favour of a plain 0..n-1 index.
complaint_problems_proc = complaint_problems_proc.reset_index(drop=True)
complaint_problems_proc.head(n=6)

Unnamed: 0,ProblemID,ComplaintID,UnitTypeID,SpaceTypeID,TypeID,MajorCategoryID,MinorCategoryID,CodeID,StatusDate,StatusDescriptionID,Probs_in_complaint
0,14248328,6834336,91,542,1,63,375,2817,04/24/2014,1,4
1,14248329,6834336,91,543,3,63,376,2821,03/18/2015,2,4
2,14248330,6834336,91,543,3,63,376,2823,03/18/2015,2,4
3,14248331,6834336,91,541,1,65,380,2828,04/24/2014,1,4
4,14248462,6834403,91,546,3,65,381,2831,08/25/2014,1,2
5,14248463,6834403,91,546,3,58,343,2686,08/25/2014,1,2


Now we can drop ProblemID, so we have only informative variables in the DataFrame. We also drop StatusDate, since it is not needed (given we will be using the ComplaintDate from the complaint dataset to determine the time a complaint was made).

In [120]:
# Neither the row identifier nor the status date is used downstream.
unneeded_columns = ['ProblemID', 'StatusDate']
complaint_problems_proc = complaint_problems_proc.drop(unneeded_columns, axis=1)

In [121]:
# Preview the remaining columns.
complaint_problems_proc.head(n=6)

Unnamed: 0,ComplaintID,UnitTypeID,SpaceTypeID,TypeID,MajorCategoryID,MinorCategoryID,CodeID,StatusDescriptionID,Probs_in_complaint
0,6834336,91,542,1,63,375,2817,1,4
1,6834336,91,543,3,63,376,2821,2,4
2,6834336,91,543,3,63,376,2823,2,4
3,6834336,91,541,1,65,380,2828,1,4
4,6834403,91,546,3,65,381,2831,1,2
5,6834403,91,546,3,58,343,2686,1,2


Finally, we convert StatusDescriptionID to a binary variable, ViolationIssued (1 if StatusDescriptionID is 3 or 8, i.e. violations or a Section 8 failure were issued; 0 otherwise). This is justified since we're interested in predicting which complaints result in a violation. Note that we first drop records with StatusDescriptionID == 4 (violations were previously issued), since these are effectively duplicate records in our dataset.

In [122]:
# Number of records coded 4 ("violations were previously issued").
previously_issued_rows = complaint_problems_proc[complaint_problems_proc['StatusDescriptionID'] == 4]
len(previously_issued_rows)

19039

In [123]:
# Drop "violations were previously issued" records (code 4).
not_previously_issued = complaint_problems_proc[complaint_problems_proc['StatusDescriptionID'] != 4]

# A violation counts as issued for code 3 (violations were issued) or
# code 8 (a Section 8 failure was issued); everything else is 0.
violation_issued = not_previously_issued['StatusDescriptionID'].isin([3, 8]).astype(int)

# assign() builds a new frame instead of writing a column into the filtered
# frame, which can trigger pandas' SettingWithCopyWarning.
complaint_problems_proc = (
    not_previously_issued
    .assign(ViolationIssued=violation_issued)
    .drop('StatusDescriptionID', axis=1)
)
complaint_problems_proc.head(6)

Unnamed: 0,ComplaintID,UnitTypeID,SpaceTypeID,TypeID,MajorCategoryID,MinorCategoryID,CodeID,Probs_in_complaint,ViolationIssued
0,6834336,91,542,1,63,375,2817,4,0
1,6834336,91,543,3,63,376,2821,4,0
2,6834336,91,543,3,63,376,2823,4,0
3,6834336,91,541,1,65,380,2828,4,0
4,6834403,91,546,3,65,381,2831,2,0
5,6834403,91,546,3,58,343,2686,2,0


In [124]:
# Class balance of the binary target.
complaint_problems_proc['ViolationIssued'].value_counts()

0    607273
1    150928
dtype: int64

To recap: we now have a processed complaint problems dataset including all closed records with valid UnitTypeID, SpaceTypeID, TypeID, MajorCategoryID, MinorCategoryID, and CodeID entries. All text features have been dropped (with the information retained in the ID codes), and a new StatusDescriptionID was constructed that summarizes the endpoint for each complaint problem; it was then collapsed into the binary ViolationIssued indicator.