# This Jupyter script will cleanup the data for the Austin permit dataset found here: [Austin Data](https://www.opendatanetwork.com/dataset/data.austintexas.gov/3syk-w9eu)

The following script assumes that the data sets have been downloaded and stored within the `datasets` directory.

In [53]:
import pandas as pd
import numpy as np
import os
df = pd.read_csv('../datasets/Austin_Permits.csv', low_memory = False, encoding='utf-8')

In [54]:
df.head()

Unnamed: 0,Permit Type,Permit Type Desc,Permit Num,Permit Class Mapped,Permit Class,Work Class,Condominium,Project Name,Description,TCAD ID,...,Contractor Zip,Applicant Full Name,Applicant Organization,Applicant Phone,Applicant Address 1,Applicant Address 2,Applicant City,Applicant Zip,Certificate Of Occupancy,Total Lot SQFT
0,EP,Electrical Permit,1985-024664 EP,Residential,Residential,Remodel,No,1403 City Park Road C 00000,Bldg Repairs,,...,,,,,,,,,No,
1,EP,Electrical Permit,1985-018847 EP,Commercial,Commercial,Remodel,No,2929 Bee Caves Road H B00000,Shell Bldg Bldg B,,...,,,,,,,,,No,
2,EP,Electrical Permit,1987-016991 EP,Commercial,Sign Permit,,No,5211 Mc Carty Lane A 00000,Berm Sign For Church,,...,,,,,,,,,No,
3,BP,Building Permit,1984-024176 BP,Residential,Residential,Remodel,No,13333 Morris Road R 00000,Duplex,,...,,,,,,,,,No,
4,BP,Building Permit,1987-015505 BP,Commercial,C-1000 Commercial Remodel,Remodel,No,104 6 Street East A 00100,Remodel To Create Rollin Donuts,,...,,,,,,,,,Yes,


## Define some global variables

In [55]:
CITY = 'Austin'

# DATASET COLUMNS
PERMIT_TYPE_DEF_COL = 'Permit Type Definition'
PERMIT_DESC_COL = 'Description'

## Rename columns to match other datasets

In [56]:
df = df.rename(columns={"Permit Type Desc": PERMIT_TYPE_DEF_COL})

## Check if there are any duplicate permits:

In [57]:
data_size = df.shape[0]
print(f"Number of records before removing duplicates: {data_size}")
df.drop_duplicates(subset=['Description'])
data_size_after_drop = df.shape[0]
removed = data_size - data_size_after_drop
print(f"Removed {removed} duplicates.")

Number of records before removing duplicates: 2090115
Removed 0 duplicates.


## Drop columns that will not be used:

In [58]:
keep_cols = [PERMIT_TYPE_DEF_COL, PERMIT_DESC_COL]
df = df[keep_cols]
df.head()

Unnamed: 0,Permit Type Definition,Description
0,Electrical Permit,Bldg Repairs
1,Electrical Permit,Shell Bldg Bldg B
2,Electrical Permit,Berm Sign For Church
3,Building Permit,Duplex
4,Building Permit,Remodel To Create Rollin Donuts


## Replace empty values with NaN and display rows that have NaN values

In [59]:
df = df.replace(' ', np.nan)
nan_values = df[df.isna().any(axis=1)]
df.dropna()

Unnamed: 0,Permit Type Definition,Description
0,Electrical Permit,Bldg Repairs
1,Electrical Permit,Shell Bldg Bldg B
2,Electrical Permit,Berm Sign For Church
3,Building Permit,Duplex
4,Building Permit,Remodel To Create Rollin Donuts
...,...,...
2090110,Building Permit,Dumpster compactor pad
2090111,Building Permit,New 466 Sf Swimming Pool And 414 Sf Concrete D...
2090112,Building Permit,New Detached WorkshopOffice New Bath Kitchen ...
2090113,Building Permit,New 2 Story Sf Res Att Garage Covd Patio Covd ...


## Map 'Driveway/Sideway' permit type to Building permit type

In [60]:
df.loc[(getattr(df, PERMIT_TYPE_DEF_COL) == 'Driveway / Sidewalks'), PERMIT_TYPE_DEF_COL] = 'Building Permit'

# ML Playground

## Display the count for each permit type

In [61]:
df[PERMIT_TYPE_DEF_COL].value_counts()

Electrical Permit    601126
Building Permit      569781
Plumbing Permit      468221
Mechanical Permit    450987
Name: Permit Type Definition, dtype: int64

In [62]:
df.head()

Unnamed: 0,Permit Type Definition,Description
0,Electrical Permit,Bldg Repairs
1,Electrical Permit,Shell Bldg Bldg B
2,Electrical Permit,Berm Sign For Church
3,Building Permit,Duplex
4,Building Permit,Remodel To Create Rollin Donuts


## Grab X data

In [63]:
X = df.drop(columns=[PERMIT_TYPE_DEF_COL])
X.head()

Unnamed: 0,Description
0,Bldg Repairs
1,Shell Bldg Bldg B
2,Berm Sign For Church
3,Duplex
4,Remodel To Create Rollin Donuts


## Grab y data (Labels)

In [64]:
y = getattr(df, PERMIT_TYPE_DEF_COL)
y.head()

0    Electrical Permit
1    Electrical Permit
2    Electrical Permit
3      Building Permit
4      Building Permit
Name: Permit Type Definition, dtype: object

## Export X and y data to CSV

In [65]:
df.to_csv(path_or_buf=f'../datasets/clean_{CITY}DataSet.csv', index=False)

## Uncommnet the following if you would like to separate the X and y datasets into different files

In [66]:
# X.to_csv(path_or_buf=f'../datasets/X_{CITY}DataSet.csv', index=False)

In [67]:
# y.to_csv(path_or_buf=f'../datasets/y_{CITY}DataSet.csv', index=False)