In [None]:
datafile = "nyc_311_data_subset.csv"

In [None]:
import pandas as pd
import numpy as np

<h4><b>read_csv</b>: A pandas function that reads a comma separated file</h4>
read_csv will try to format the data so that it is the correct type and will report any typing problems<br>
It will also look for a header row. 
<br>http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html

In [None]:
data = pd.read_csv(datafile)
data

<h4>Let's examine our data</h4>

In [None]:
data.info()

<h4>Looks like Unique Key really is a unique key and can serve as an index</h4>

In [None]:
data = pd.read_csv(datafile,index_col='Unique Key')

In [None]:
data.iloc[1:10]

<h4>Column 4 has mixed types</h4>

<h4>Column 4 is incident zip</h4>
Let's examine it<br>
The unique() function returns unique values in a column

In [None]:
data['Incident Zip'].unique()

<h4>Some issues</h4>
<li>Sometimes zip is a float, other times it is a str
<li>Zipcodes that are represented as floats and start with 0 are missing the first digit
<li>Some zipcodes have the 4 digit extension added. Comparison becomes tough
<li>What the heck is zip 0?
<li>What about the missing (nan) values? The ? (question mark)? "UNKNOWN"?

<h2>The first step in data cleaning is to:</h2>
<h3>Decide what to do with "bad" data ("JFK", "UNKNOWN", etc.). Convert to NaN or delete the record.</h3>
<h3>Make sure all data in a column is in the correct format (convert floats to strings, get rid of the 4 digit extension)</h3>
<h3>Decide what to do with missing values (NaNs)</h3>

<h3>for "Incident Zip"</h3>
<h4>we'll drop rows with NaN or bad data</h4>
<h4>get rid of the 4 digit extension</h4>
<h4>remove zips less than 10000 or greater than 19999</h4>
<h3>Let's write a function that fixes zips</h3>

In [None]:
def fix_zip(input_zip):
    """Normalise one raw 'Incident Zip' value to a 5-digit zip string.

    Parameters
    ----------
    input_zip : str, float, int or None
        Raw cell value, e.g. 11211.0, '11211.00', '11211-1234', 'UNKNOWN', NaN.

    Returns
    -------
    str or float
        The zip as a string (e.g. '11211') when it parses to an integer in
        the range 10000-19999 inclusive; otherwise np.nan.
    """
    try:
        # Handles ints, floats and numeric strings such as '11211.00'.
        zip_number = int(float(input_zip))
    except (TypeError, ValueError, OverflowError):
        # Not a plain number (or NaN/inf): try the ZIP+4 form '11211-1234'
        # and keep only the part before the hyphen.
        try:
            zip_number = int(input_zip.split('-')[0])
        except (AttributeError, TypeError, ValueError):
            # np.nan rather than np.NaN: the capitalised alias was removed
            # in NumPy 2.0.
            return np.nan
    if zip_number < 10000 or zip_number > 19999:
        return np.nan
    return str(zip_number)
        

<h4>And test it</h4>

In [None]:
fix_zip('11211.00')

<h3>Next, we'll apply this function to every element of the Incident Zip column to get a revised column</h3>
<h4>The pandas function "apply" applies a function to a dataframe column</h4>
<li>fix_zip will be applied to each element of the Incident Zip column and we replace the existing column with the modified one

In [None]:
data['Incident Zip'] = data['Incident Zip'].apply(fix_zip)

In [None]:
data['Incident Zip'].unique()

<h3>Finally, we'll get rid of all rows that have zip == NaN</h3>
<li>We don't have to, that's just a choice we're making</li>

In [None]:
# Keep only the rows whose cleaned zip code is present.
data = data.dropna(subset=['Incident Zip'])

In [None]:
data

<h3>Let's take a look at the columns again</h3>

In [None]:
data.info()

<h3>Closed Date, Latitude and Longitude all have missing values</h3>
<h3>Let's get rid of them</h3>

In [None]:
# Keep only complaints that have both coordinates and a closing timestamp.
required_columns = ['Latitude', 'Longitude', 'Closed Date']
data = data.dropna(subset=required_columns)

In [None]:
data.info()

<h4>Let's take a look at Borough data</h4>

In [None]:
data['Borough'].unique()

<h4>Let's look at 'Unspecified'</h4>

In [None]:
data[data['Borough']=='Unspecified'][['Agency','Incident Zip']]

<h4>Looks like a lot of these are NYPD related</h4>
<h4>Let's take a closer look</h4>

In [None]:
data[data['Borough']=='Unspecified'].groupby('Agency').count()

<h4>Unspecified appears to have a systematic bias toward NYPD</h4>
<h4>Though only a small proportion of NYPD complaints (see below)</h4>
<h4>We have to decide whether to keep them or lose them!</h4>

In [None]:
# Share of NYPD complaints whose borough is 'Unspecified', as a percentage.
is_nypd = data['Agency'] == 'NYPD'
is_unspecified = data['Borough'] == 'Unspecified'

nypd_complaints_total = data.loc[is_nypd, 'Borough'].count()
nypd_unspecified = data.loc[is_unspecified & is_nypd, 'Borough'].count()

percentage = 100 * nypd_unspecified / nypd_complaints_total
print("%1.2f" % percentage)

<h3>For now, we'll get rid of them. Unspecified will be hard to explain!</h3>

In [None]:
data = data[data['Borough'] != 'Unspecified']

<h4>Dealing with time</h4>
<li>Dates and times are best converted to datetime
<li>That way they will be useful for analysis because we can compute timedelta objects

In [None]:
import datetime  # kept: later cells compare against datetime.timedelta

# `data` was produced by boolean filtering; work on an explicit copy so the
# column assignment below targets an independent frame (avoids pandas'
# SettingWithCopyWarning on versions that emit it).
data = data.copy()

# Parse the whole column at once. Unlike a row-wise strptime, pd.to_datetime
# also accepts an already-converted column, so this cell can be re-run.
data['Created Date'] = pd.to_datetime(data['Created Date'], format='%m/%d/%Y %I:%M:%S %p')

In [None]:
data['Created Date'][0:20]

In [None]:
# Parse the whole column at once. Unlike a row-wise strptime, pd.to_datetime
# also accepts an already-converted column, so this cell can be re-run.
data['Closed Date'] = pd.to_datetime(data['Closed Date'], format='%m/%d/%Y %I:%M:%S %p')


In [None]:
data


<h3>We can create a new column that tracks the time it takes to close a complaint</h3>

In [None]:
data['processing_time'] =  data['Closed Date'] - data['Created Date']

In [None]:
#And look at summary statistics
data['processing_time'].describe()


<h4>There is some odd stuff here</h4>
<li>Negative processing time?
<li>Since our data is for two months, a max of 148 days is worth checking out

<h3>Let's examine the negative processing time data</h3>

In [None]:
data[data['processing_time']<datetime.timedelta(0,0,0)]

<h3>And the large processing times as well</h3>

In [None]:
data[data['processing_time']>datetime.timedelta(148,0,0)]

<h3>Looks like the upper end makes sense but the negative times don't</h3>
<h3>Though we need to explore this more, we'll get rid of negative times for now</h3>

In [None]:
data = data[data['processing_time']>=datetime.timedelta(0,0,0)]

<h2>Finally, let's write a function that incorporates all our changes</h2>

In [None]:
def read_311_data(datafile):
    """Read the 311 complaints CSV and return a cleaned DataFrame.

    Steps, in order:
      1. Read the CSV with 'Unique Key' as the index.
      2. Normalise 'Incident Zip' to a 5-digit string (NaN when unusable).
      3. Drop every row that has a missing value in any column.
      4. Drop rows whose 'Borough' is 'Unspecified'.
      5. Parse 'Created Date' and 'Closed Date' as datetimes and add a
         'processing_time' column (closed minus created).
      6. Drop rows with a negative processing time.

    Parameters
    ----------
    datafile : str, path or file-like
        Anything accepted by pandas.read_csv.

    Returns
    -------
    pandas.DataFrame
        The cleaned data, indexed by 'Unique Key'.
    """
    # Imports are local so the function can be copied elsewhere on its own.
    import pandas as pd
    import numpy as np

    date_format = '%m/%d/%Y %I:%M:%S %p'

    def fix_zip(input_zip):
        """Return the zip as a 5-digit string in 10000-19999, else np.nan."""
        try:
            # Handles ints, floats and numeric strings such as '11211.00'.
            zip_number = int(float(input_zip))
        except (TypeError, ValueError, OverflowError):
            # Not a plain number (or NaN/inf): try the ZIP+4 form
            # '11211-1234' and keep only the part before the hyphen.
            try:
                zip_number = int(input_zip.split('-')[0])
            except (AttributeError, TypeError, ValueError):
                # np.nan rather than np.NaN: the capitalised alias was
                # removed in NumPy 2.0.
                return np.nan
        if zip_number < 10000 or zip_number > 19999:
            return np.nan
        return str(zip_number)

    # Read the file
    df = pd.read_csv(datafile, index_col='Unique Key')

    # Fix the zip
    df['Incident Zip'] = df['Incident Zip'].apply(fix_zip)

    # Drop all rows that have a NaN in any column
    df = df.dropna(how='any')

    # Get rid of unspecified boroughs. The explicit copy makes the column
    # assignments below write to an independent frame instead of a slice.
    df = df[df['Borough'] != 'Unspecified'].copy()

    # Convert times to datetime and create a processing time column
    df['Created Date'] = pd.to_datetime(df['Created Date'], format=date_format)
    df['Closed Date'] = pd.to_datetime(df['Closed Date'], format=date_format)
    df['processing_time'] = df['Closed Date'] - df['Created Date']

    # Finally, get rid of negative processing times and return the frame
    df = df[df['processing_time'] >= pd.Timedelta(0)]

    return df
    

In [None]:
# Reuse the path defined in the first cell so the notebook has a single
# place to change the input file.
df = read_311_data(datafile)
df.info()