# Data Cleaning intro

Working with the New York 311 dataset of incident records by time and location

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Set pandas defaults
# Show at most 10 rows: head(5) ... tail(5)
# The fully qualified option name is used because the bare 'max_rows' pattern
# can match more than one registered option in recent pandas versions, which
# makes set_option raise an OptionError.
pd.set_option('display.max_rows', 10)

In [None]:
import warnings
# Ignore every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by libraries, so
# consider removing it while debugging.
warnings.filterwarnings("ignore")

### read_csv
- tries to format the data in correct type
- looks for a header row

#### Sanity check path

In [None]:
# !ls

In [None]:
datafile = '../data/nyc_311_data_subset-2.csv'

In [None]:
df = pd.read_csv(datafile)
df

## Feature exploration
- for efficiency set priority in which you want to clean/engineer the data, e.g.:
  - get rid of singletons
  - get rid of unique identifiers
  - get rid of irrelevant features
  - order priority on non-null values:
    - clean/engineer object types (multi types)
    - clean/engineer date types
    - clean/engineer float types
    - clean/engineer integer types

In [None]:
df.info()

In [None]:
df.describe()

### Unique Key

In [None]:
df['Unique Key'].unique()

##### 'Unique Key' is not so unique after all

In [None]:
del df['Unique Key']

### Incident zip

In [None]:
df['Incident Zip'].unique();

#### Types within Zipcodes:
- 5 digit numbers
- 5 digit numbers start with 0 
- 4 digit numbers
- 5 digits-4 digits number
- 0
- missing (nan), ?, 'UNKNOWN'
- 'JFK'

#### Define valid version
- 10000 <= valid NY zipcode <= 19999
- NaN otherwise

#### Clean

In [None]:
def fix_zip(input_zip):
    """Normalise a raw 'Incident Zip' value to a zip code string.

    Accepts numbers, numeric strings and ZIP+4 strings such as
    '12345-1234' (only the part before the dash is kept).

    Returns the zip code as a string when it lies in 10000..19999
    (inclusive). Returns NaN for non-numeric text, missing values and
    numbers outside that range.
    """
    try:
        # string numbers -> float numbers -> int numbers
        # words, None, NaN and infinity raise and fall through to the next try
        zip_number = int(float(input_zip))
    except (TypeError, ValueError, OverflowError):
        try:
            # split '12345-1234' and keep the first part only
            zip_number = int(input_zip.split('-')[0])
        except (AttributeError, TypeError, ValueError):
            # words and non-string leftovers -> NaN
            return np.nan

    # incorrect zipcodes -> NaN
    if zip_number < 10000 or zip_number > 19999:
        return np.nan
    return str(zip_number)
        

#### Test suite
- Fail fast!
- Define edge cases that try to break the function

In [None]:
# Edge cases and, position by position, the value fix_zip must return for them.
test_cases = ['?', 0, '1234', '01234', '12345-1234', 'JFK', 'UNKNOWN', '12345']
correct_cases = [np.nan, np.nan, np.nan, np.nan, '12345', np.nan, np.nan, '12345']

# The loop variable is not called `zip`, so the builtin stays usable afterwards.
for raw_zip, expected in zip(test_cases, correct_cases):
    fixed = fix_zip(raw_zip)
    # assert_equal treats NaN as equal to NaN, which the == operator does not
    np.testing.assert_equal(fixed, expected, err_msg='Zipcode {} is not cleaned correctly'.format(raw_zip))
    print('zip code: {}, fixed: {}'.format(raw_zip, fixed))


#### Apply function to feature

In [None]:
df['Incident Zip'] = df['Incident Zip'].apply(fix_zip)

In [None]:
df['Incident Zip'].unique();

#### Check percentage of samples containing NaN

In [None]:
def perc_null(feature):
    """Print the percentage of rows in the global ``df`` whose ``feature`` is NaN.

    feature: name of the column of ``df`` to inspect.
    """
    # The mean of the boolean mask is nulls / all rows. Dividing by the
    # non-null count instead overstates the share and is undefined for a
    # column that is entirely NaN.
    null_percentage = df[feature].isnull().mean() * 100
    print('Percentage of NaN in {}: {:.2f}%'.format(feature, null_percentage))

In [None]:
perc_null('Incident Zip')

#### Remove samples from df containing NaN

In [None]:
# Keep only the rows whose cleaned zip code is present. Calling
# dropna(inplace=True) on the column alone does not remove the rows from df,
# so the whole frame is filtered with a boolean mask instead.
df = df[df['Incident Zip'].notnull()]
df.info()

### Latitude and Longitude

In [None]:
df.info()

#### Check percentage of samples containing NaN

In [None]:
perc_null('Latitude'), perc_null('Longitude');

In [None]:
# combined percentage of rows missing Latitude and/or Longitude
lat_lon_notnull = df['Latitude'].notnull() & df['Longitude'].notnull()
# Series.sum() counts the True values without looping over the rows in Python
(len(df) - lat_lon_notnull.sum()) / len(df) * 100

#### Remove samples from df containing NaN

In [None]:
df = df[lat_lon_notnull]

#### Sanity check

In [None]:
perc_null('Latitude'), perc_null('Longitude')
df.info()

### Closed Date

In [None]:
perc_null('Closed Date')

In [None]:
df = df[df['Closed Date'].notnull()]

#### Sanity check

In [None]:
perc_null('Closed Date')
df.info()

### Borough

In [None]:
df['Borough'].unique()

#### Explore 'Unspecified'

In [None]:
df[df['Borough']=='Unspecified']

#### Dive into relationships

In [None]:
#### Check Frequency

In [None]:
def frequency(df, feature, value, by='', n_top=5):
    """Count the most common values of one column among selected rows.

    Rows are selected where ``df[feature] == value``. The values of column
    ``by`` (``feature`` itself when ``by`` is left empty) are then counted,
    and the ``n_top`` largest counts are returned as a Series.
    """
    target_column = feature if by == '' else by
    matching_rows = df[df[feature] == value]
    counts = matching_rows[target_column].value_counts()
    ordered = counts.sort_values(ascending=False)
    return ordered.nlargest(n_top)

In [None]:
frequency(df, 'Borough', 'Unspecified', 'Incident Zip')

In [None]:
frequency(df, 'Borough', 'Unspecified', 'Latitude')

In [None]:
frequency(df, 'Borough', 'Unspecified', 'Agency')

#### Check relevance

In [None]:
frequency(df, 'Agency', 'NYPD')

In [None]:
# Per-agency counts of 'Unspecified'-borough rows divided by the NYPD row count.
# .iloc[0] takes the first count by position; plain [0] on a label-indexed
# Series relies on a deprecated positional fallback.
frequency(df, 'Borough', 'Unspecified', 'Agency') / frequency(df, 'Agency', 'NYPD').iloc[0]

#### Remove 'unspecified'

In [None]:
df = df[df['Borough'] != 'Unspecified']

#### Sanity check

In [None]:
df['Borough'].unique()

### Convert dates and time to datetime object
 - to compute and use analysis on them

In [None]:
import datetime

In [None]:
# Parse the 12-hour 'MM/DD/YYYY HH:MM:SS AM/PM' strings for the whole column
# at once instead of calling strptime row by row.
df['Created Date'] = pd.to_datetime(df['Created Date'], format='%m/%d/%Y %I:%M:%S %p')

In [None]:
df['Created Date'][0:20]

In [None]:
# Parse the 12-hour 'MM/DD/YYYY HH:MM:SS AM/PM' strings for the whole column
# at once instead of calling strptime row by row.
df['Closed Date'] = pd.to_datetime(df['Closed Date'], format='%m/%d/%Y %I:%M:%S %p')

In [None]:
df

### Create time elapsed feature

In [None]:
df['processing_time'] =  df['Closed Date'] - df['Created Date']

In [None]:
#And look at summary statistics
df['processing_time'].describe()

#### Explore
 - negative processing time
 - our data covers two months, so a max of 148 days is worth checking out

<h3>Let's examine the negative processing time data</h3>

In [None]:
df[df['processing_time'] < datetime.timedelta(0, 0, 0)]

<h3>And the large processing times as well</h3>

In [None]:
# The frame is named `df` throughout this notebook; `data` is never defined.
df[df['processing_time'] > datetime.timedelta(days=148)]

<h3>Looks like the upper end makes sense but the negative times don't</h3>
<h3>Though we need to explore this more, we'll get rid of negative times for now</h3>

In [None]:
# Keep only rows closed at or after their creation time. The frame is named
# `df` throughout this notebook; `data` is never defined.
df = df[df['processing_time'] >= datetime.timedelta(0)]

<h2>Finally, let's write a function that incorporates all our changes</h2>

In [None]:
def read_311_data(datafile):
    """Load the NYC 311 CSV and apply the cleaning steps in one call.

    Steps, in order: read the file with 'Unique Key' as the index, normalise
    'Incident Zip', drop every row containing a NaN, drop rows whose
    'Borough' is 'Unspecified', parse 'Created Date' and 'Closed Date', add
    a 'processing_time' column (closed minus created), and discard rows with
    a negative processing time.

    datafile: path (or other input accepted by ``pandas.read_csv``) of the
        311 data file.

    Returns the cleaned DataFrame.
    Raises ValueError when a date string does not match the expected
    'MM/DD/YYYY HH:MM:SS AM/PM' layout.
    """
    # Local imports keep the function self-contained.
    import pandas as pd
    import numpy as np

    def fix_zip(input_zip):
        """Return a 10000..19999 zip code as a string, or NaN when invalid."""
        try:
            zip_number = int(float(input_zip))
        except (TypeError, ValueError, OverflowError):
            try:
                # '12345-1234' -> keep the part before the dash
                zip_number = int(input_zip.split('-')[0])
            except (AttributeError, TypeError, ValueError):
                return np.nan
        if zip_number < 10000 or zip_number > 19999:
            return np.nan
        return str(zip_number)

    # Read the file
    df = pd.read_csv(datafile, index_col='Unique Key')

    # Fix the zip
    df['Incident Zip'] = df['Incident Zip'].apply(fix_zip)

    # Drop all rows that have a NaN in any column
    df = df.dropna(how='any')

    # Get rid of unspecified boroughs; .copy() so the column assignments below
    # write to an independent frame rather than to a filtered slice
    df = df[df['Borough'] != 'Unspecified'].copy()

    # Convert times to datetime and create a processing time column
    date_format = '%m/%d/%Y %I:%M:%S %p'
    for column in ('Created Date', 'Closed Date'):
        df[column] = pd.to_datetime(df[column], format=date_format)
    df['processing_time'] = df['Closed Date'] - df['Created Date']

    # Finally, get rid of negative processing times and return the final frame
    return df[df['processing_time'] >= pd.Timedelta(0)]
    

In [None]:
# Reuse the path defined at the top of the notebook rather than a second,
# bare file name that only resolves if the CSV sits next to the notebook.
df = read_311_data(datafile)
df.info()