# This Jupyter script will aggregate the data for the Austin, San Francisco and Boston permit datasets

This script assumes that the three scripts which produce the files listed below have already run successfully, and that those files are stored in the `datasets` directory:
- `clean_AustinDataSet.csv`
- `clean_SanFranciscoDataSet.csv`
- `clean_BostonDataSet.csv`

In [10]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Read the three cleaned per-city CSV files (Austin, San Francisco, Boston)
# into their own DataFrames, in that order.
aus_df, sf_df, bos_df = map(
    pd.read_csv,
    (
        '../datasets/clean_AustinDataSet.csv',
        '../datasets/clean_SanFranciscoDataSet.csv',
        '../datasets/clean_BostonDataSet.csv',
    ),
)

## Define some global variables

In [11]:
# OUTPUT LOCATION
# Root directory under which the per-permit-type folders are created.
BASE_DIR = 'all_cities_dataset'

# DATASET COLUMNS
# Free-text permit description column.
PERMIT_DESC_COL = 'Description'
# Permit category column (used as the label).
PERMIT_TYPE_DEF_COL = 'Permit Type Definition'

In [12]:
# Sanity-check every per-city frame before combining them.
# Each frame must hold exactly the description column and the permit-type
# column. Checking the names as well as the count catches a frame whose two
# columns are named differently, which would otherwise pass here and then
# produce extra NaN-filled columns once the frames are stacked.
expected_columns = {PERMIT_TYPE_DEF_COL, PERMIT_DESC_COL}
for city_name, city_df in (("Austin", aus_df), ("SF", sf_df), ("Boston", bos_df)):
    if city_df.shape[1] != 2 or set(city_df.columns) != expected_columns:
        # ValueError is a subclass of Exception, so anything that caught the
        # previous generic Exception still catches this.
        raise ValueError(f"{city_name} data doesn't have right columns: {city_df.columns}")

# Append the datasets together

In [13]:
# Report the size of each per-city frame, then stack them into one frame.
print(f"Size before append of the Austin dataest: {aus_df.shape}")
print(f"Size before append of the SF dataest: {sf_df.shape}")
print(f"Size before append of the Boston dataest: {bos_df.shape}")
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat is the supported replacement; with default arguments it stacks the
# rows in the same order and keeps each frame's original row labels, exactly
# as the chained append calls did.
df = pd.concat([aus_df, sf_df, bos_df])
print(f"Size of final dataframe with all datasets: {df.shape}")

Size before append of the Austin dataest: (2090115, 2)
Size before append of the SF dataest: (1705703, 2)
Size before append of the Boston dataest: (523418, 2)
Size of final dataframe with all datasets: (4319236, 2)


## Display the count for each permit type

In [14]:
# Number of rows per permit type in the combined frame, most frequent first.
df.loc[:, PERMIT_TYPE_DEF_COL].value_counts()

Building Permit      1958111
Electrical Permit    1068872
Plumbing Permit       789348
Mechanical Permit     502905
Name: Permit Type Definition, dtype: int64

# Create directory structure for training data

In [15]:
# Create the root output directory; do nothing if it already exists.
os.makedirs(name=BASE_DIR, exist_ok=True)

In [16]:
# Permit-type label values, one per permit category.
elc_lbl = "Electrical Permit"
plb_lbl = "Plumbing Permit"
blg_lbl = "Building Permit"
mch_lbl = "Mechanical Permit"
labels = [elc_lbl, plb_lbl, blg_lbl, mch_lbl]
# Sub-directory names, listed in the same order as `labels`.
directories = ["electrical_permit", "plumbing_permit", "building_permit", "mechanical_permit"]

# Create one sub-directory per permit category under BASE_DIR.
# The path is built from BASE_DIR rather than a repeated literal so the
# folders stay under the configured root if that constant is ever changed.
for directory in directories:
    os.makedirs(os.path.join(BASE_DIR, directory), exist_ok=True)

In [17]:
# count1, count2, count3, count4 = 0, 0, 0, 0
# for i in range(len(df)):
#     if (getattr(df.iloc[i], PERMIT_TYPE_DEF_COL) == elc_lbl):
#         file = open(f"{BASE_DIR}/electrical_permit/" + str(count1) + ".txt", 'w', encoding="utf-8")
#         file.write(str(getattr(df.iloc[i], PERMIT_DESC_COL)))
#         file.close()
#         count1 += 1
#     if (getattr(df.iloc[i], PERMIT_TYPE_DEF_COL) == plb_lbl):
#         file = open(f"{BASE_DIR}/plumbing_permit/" + str(count2) + ".txt", 'w', encoding="utf-8")
#         file.write(str(getattr(df.iloc[i], PERMIT_DESC_COL)))
#         file.close()
#         count2 += 1
#     if (getattr(df.iloc[i], PERMIT_TYPE_DEF_COL) == blg_lbl):
#         file = open(f"{BASE_DIR}/building_permit/" + str(count3) + ".txt", 'w', encoding="utf-8")
#         file.write(str(getattr(df.iloc[i], PERMIT_DESC_COL)))
#         file.close()
#         count3 += 1
#     if (getattr(df.iloc[i], PERMIT_TYPE_DEF_COL) == mch_lbl):
#         file = open(f"{BASE_DIR}/mechanical_permit/" + str(count4) + ".txt", 'w', encoding="utf-8")
#         file.write(str(getattr(df.iloc[i], PERMIT_DESC_COL)))
#         file.close()
#         count4 += 1
# print(count1, count2, count3, count4)

# Export to file

## Grab X data

In [None]:
# Features: every column of the combined frame except the permit-type label.
X = df.drop(PERMIT_TYPE_DEF_COL, axis="columns")
X.head()

Unnamed: 0,Description
0,addiition to a new unit on basement fl per ord...
1,(rear of (e) lot with an (e) r-3/28 structure ...
2,(rear of (e) lot with an (e) r-3/28 structure ...
3,"to erect 4 stories, no basement, type v-b, 2 u..."
4,legalize (e) dwelling unit @ 1st fl per ordina...


## Grab y data (Labels)

In [None]:
# Labels: the permit-type column of the combined frame.
# Bracket indexing is used instead of getattr() because attribute-style
# access returns a DataFrame attribute or method whenever the column name
# happens to collide with one; brackets always return the column.
y = df[PERMIT_TYPE_DEF_COL]
y.head()

0    Building Permit
1    Building Permit
2    Building Permit
3    Building Permit
4    Building Permit
Name: Permit Type Definition, dtype: object

## Export the combined data (X and y together) to CSV

In [19]:
# Write the combined frame (description and permit-type columns) to a single
# CSV. index=False leaves the row labels out of the file. The path is a plain
# string: it has no placeholders, so the f-string prefix was unnecessary.
df.to_csv(path_or_buf='../datasets/clean_AllCitiesDataSet.csv', index=False)