### First step in reformatting utilities
Formerly utilities_reformatted_merge_clean<br>
This file reads in the raw utility xlsx files, downloaded from the reformatted data in SharePoint.<br>
It standardizes column names and creates separate Year, Month, and Day columns from the charge date.<br>
Then, it removes extra spacing from the addresses and converts each address to upper case.<br>
It drops charges where the address was recorded as NA.<br>
Then, it exports the utilities charges as one large CSV.<br>

In [6]:
import numpy as np
import pandas as pd
#IN_PATH = '/home/mirabel/Dropbox (GaTech)/CDS-2019-AlbanyHub/Raw-Data/UtilitiesReformatted/'
#OUT_PATH = '/home/mirabel/Dropbox (GaTech)/CDS-2019-AlbanyHub/Test-Replication/'
IN_PATH = '/Users/william/Dropbox (Amherst College)/CDS-2019-AlbanyHub/Raw-Data/UtilitiesReformatted/'
OUT_PATH = '/Users/william/Dropbox (Amherst College)/CDS-2019-AlbanyHub/Test-Replication/'

def fix_oct_2018(df_oct):
    """Return only the rows of the October 2018 sheet whose 'Month' is '10'.

    Parameters
    ----------
    df_oct : pandas.DataFrame
        Frame with a 'Month' column holding month numbers as strings
        (the comparison is against the string '10').

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_oct`` where ``Month == '10'``, with their original
        index labels and column dtypes.
    """
    # Filter with a boolean mask directly. The earlier approach (mask the
    # unwanted rows to NaN via isin(), then dropna(how='all')) selected the
    # same rows but turned integer columns into floats along the way.
    return df_oct[df_oct['Month'] == '10']

In [2]:
def fix_oct_2018(df_oct):
    """Keep only the rows of ``df_oct`` whose 'Month' value equals the string '10'."""
    is_october = df_oct['Month'].eq('10')
    return df_oct[is_october]

In [3]:
#Combine monthly utilities xlsx files into a list of dataframes for a particular year
def readexcel(year, in_path, months=None):
    """Read one year's monthly utility workbooks into a list of DataFrames.

    Each workbook is read from ``<in_path><year>/<year><Mon>.xlsx`` where
    ``<Mon>`` is a three-letter month abbreviation ("Jan", "Feb", ...).

    Parameters
    ----------
    year : str or int
        Year to load. It is converted to a string once, so ``2018`` and
        ``"2018"`` behave the same.
    in_path : str
        Directory prefix holding one sub-folder per year; it is concatenated
        as-is, so it must end with a path separator.
    months : list of str, optional
        Month abbreviations to read. Defaults to Jan-Feb for 2019 and all
        twelve months for every other year.

    Returns
    -------
    list of pandas.DataFrame
        One frame per month, in the order read, each with stripped column
        names, 'City'/'State' columns, and string 'Year', 'Month', 'Day'
        columns derived from 'Charge Date'.
    """
    # Normalize once: the comparisons below are against strings, so an int
    # year would otherwise skip the 2019 month list and the Oct-2018 fix.
    year = str(year)
    if months is None:
        if year == "2019":
            months = ["Jan", "Feb"]
        else:
            months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                      "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    year_list = []
    for month in months:
        #Read in file
        path = in_path + year + '/' + year + month + '.xlsx'
        df = pd.read_excel(path)
        #Drop space from column names (non-string labels are left untouched)
        df.columns = [col.strip() if isinstance(col, str) else col
                      for col in df.columns]
        #Some utilities months contain 'City'/'State' and others don't -> add city and state columns
        #Each column is checked on its own so a sheet that already has one of
        #them does not make insert() fail on a duplicate column name
        if "City" not in df.columns:
            df.insert(2, "City", "ALBANY")
        if "State" not in df.columns:
            df.insert(df.columns.get_loc("City") + 1, "State", "GA")
        #Change the type of charge date to string
        df['Charge Date'] = df['Charge Date'].astype(str)
        #Split charge date string into year, month, and day
        #reindex guarantees columns 0..2 exist even if no value contains two dashes
        parts = (df['Charge Date'].str.split("-", expand=True)
                 .reindex(columns=range(3)))
        df['Year'] = parts[0]
        df['Month'] = parts[1]
        df['Day'] = parts[2]
        #The October 2018 sheet is filtered down to rows whose Month is '10'
        if month == "Oct" and year == "2018":
            df = fix_oct_2018(df)
        year_list.append(df)
        #Progress indicator
        print(month)
    return year_list

In [4]:
#Write the dataframes from each year into one csv
def writedf(year_list, year, out_path):
    """Concatenate a year's monthly frames and write them to one CSV.

    Parameters
    ----------
    year_list : list of pandas.DataFrame
        Monthly frames, in order (as returned by ``readexcel``).
    year : str or int
        Year used in the output file name.
    out_path : str
        Directory prefix for the output; it is concatenated as-is, so it
        must end with a path separator.

    Raises
    ------
    ValueError
        If ``year_list`` is empty.

    The result is written to ``<out_path>Totals<year>.csv`` without the
    index column.
    """
    months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    if not year_list:
        raise ValueError("year_list is empty; nothing to write for " + str(year))
    # One concat over the whole list instead of growing a frame month by
    # month, which re-copied all previously merged rows on every step.
    df = pd.concat(year_list, ignore_index=True)
    # Same progress output as before (every month after the first); slicing
    # keeps this safe when more than twelve frames are passed.
    for merged_month in months[1:len(year_list)]:
        print(merged_month)
    path = out_path+'Totals'+str(year)+'.csv'
    df.to_csv(path, index=False)

### Read the raw data and write to combined csvs for each year

In [None]:
df2012 = readexcel("2012", IN_PATH)

In [11]:
df2013 = readexcel("2013", IN_PATH)

Jan
Feb
Mar
Apr
May
Jun
Jul
Aug
Sep
Oct
Nov
Dec


In [5]:
df2014 = readexcel("2014", IN_PATH)

Jan
Feb
Mar
Apr
May
Jun
Jul
Aug
Sep
Oct
Nov
Dec


In [6]:
df2015 = readexcel("2015", IN_PATH)

Jan
Feb
Mar
Apr
May
Jun
Jul
Aug
Sep
Oct
Nov
Dec


In [5]:
df2016 = readexcel("2016", IN_PATH)

Jan
Feb
Mar
Apr
May
Jun
Jul
Aug
Sep
Oct
Nov
Dec


In [6]:
df2017 = readexcel("2017", IN_PATH)

Jan
Feb
Mar
Apr
May
Jun
Jul
Aug
Sep
Oct
Nov
Dec


In [7]:
df2018 = readexcel("2018", IN_PATH)

Jan
Feb
Mar
Apr
May
Jun
Jul
Aug
Sep
Oct
Nov
Dec


In [10]:
df2019 = readexcel("2019", IN_PATH)

Jan
Feb


Write (Optional)

In [13]:
writedf(df2012, 2012, OUT_PATH)

Feb
Mar
Apr
May
Jun
Jul
Aug
Sep
Oct
Nov
Dec


In [14]:
writedf(df2013, 2013, OUT_PATH)

Feb
Mar
Apr
May
Jun
Jul
Aug
Sep
Oct
Nov
Dec


In [8]:
writedf(df2014, 2014, OUT_PATH)

Feb
Mar
Apr
May
Jun
Jul
Aug
Sep
Oct
Nov
Dec


In [9]:
writedf(df2015, 2015, OUT_PATH)

Feb
Mar
Apr
May
Jun
Jul
Aug
Sep
Oct
Nov
Dec


In [7]:
writedf(df2016, 2016, OUT_PATH)

Feb
Mar
Apr
May
Jun
Jul
Aug
Sep
Oct
Nov
Dec


In [8]:
writedf(df2017, 2017, OUT_PATH)

Feb
Mar
Apr
May
Jun
Jul
Aug
Sep
Oct
Nov
Dec


In [8]:
writedf(df2018, 2018, OUT_PATH)

Feb
Mar
Apr
May
Jun
Jul
Aug
Sep
Oct
Nov
Dec


In [12]:
writedf(df2019, 2019, OUT_PATH)

Feb


### Run cleaning script after merging together

In [13]:
#path = '/Users/william/Dropbox (Amherst College)/CDS-2019-AlbanyHub/Raw-Data/ReformattedYears/'
path=OUT_PATH

In [75]:
#Combine all year csvs into a single dataframe
years = ["2013", "2014", "2015", "2016", "2017", "2018", "2019"]
#Read every yearly file first and concatenate once; concatenating inside the
#loop re-copied all previously loaded rows on each iteration
yearly_frames = [pd.read_csv(path + "Totals2012.csv")]
for year in years:
    yearly_frames.append(pd.read_csv(path + "Totals" + year + ".csv"))
df = pd.concat(yearly_frames, ignore_index=True)
#Release the per-year frames so only the combined frame stays in memory
del yearly_frames
    

In [80]:
df.index = range(len(df))

In [81]:
#Drop extra column from index if it exists
print(df.shape)
df.drop('Unnamed: 0', inplace=True, axis=1, errors='ignore')
df.shape

(15305018, 16)


(15305018, 16)

In [82]:
s_utilities = df['Premise Address']
s_utilities.shape # number of records

(15305018,)

In [83]:
s_utilities[s_utilities.isna()].shape #There are some NA addresses

(20754,)

In [84]:
df = df.dropna(subset=['Premise Address']) #Drop NA addresses
df.shape

(15284264, 16)

In [75]:
#Send to one large csv (~2GB)
df.to_csv(path+"Total.csv", index=False)

In [3]:
#Read back in (or skip if it is already in memory)
df = pd.read_csv(path+'Total.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [85]:
#Strip extra spaces from each address
s_utilities = df['Premise Address'].str.strip()
#Uncomment if this results in memory issues
# del df 
s_utilities[s_utilities.isna()].shape

(0,)

In [86]:
s_utilities[s_utilities.isna()].shape

(0,)

In [87]:
#Strip extra space between words
#Iterate over the values directly instead of doing a positional .iloc lookup
#per row, which is much slower over millions of addresses
lis = []
for i, address in enumerate(s_utilities):
    #Print progress every million rows
    if i % 1000000 == 0:
        print(str(i))
    #split() with no argument splits on any run of whitespace, so re-joining
    #with a single space leaves exactly one space between words
    lis.append(" ".join(address.split()))
len(lis)

0
1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000


15284264

In [88]:
fixed_util = pd.Series(lis)
# del lis
# del s_utilities
# #Read back in (or skip if it is already in memory)
# df = pd.read_csv(path+'Total.csv')

In [89]:
fixed_util[fixed_util.isna()].shape

(0,)

In [90]:
fixed_util.tail()

15284259    308 ACORN ST
15284260    308 ACORN ST
15284261    308 ACORN ST
15284262    308 ACORN ST
15284263    308 ACORN ST
dtype: object

In [9]:
#Export fixed addresses to csv
#fixed_util.to_csv(path+"fixed_addr.csv", index=False)

In [55]:
fixed_util.shape

(15284264,)

In [91]:
fixed_util[fixed_util.str.upper().isna()]

Series([], dtype: object)

In [92]:
#Convert all addresses to upper case
#fixed_util was built from a plain list, so it has a fresh 0..n-1 index, while
#df may still carry its pre-dropna labels. Assign the values positionally so
#pandas does not align on index labels and misplace addresses or insert NaN.
df['Premise Address'] = fixed_util.str.upper().values

In [93]:
sum(fixed_util.str.upper().isna())

0

In [94]:
test = fixed_util.str.upper()

In [96]:
df.index = range(len(df))

In [97]:
df['Premise Address'].index

RangeIndex(start=0, stop=15284264, step=1)

In [102]:
sum(test.isna())

0

In [103]:
df['Premise Address'] = test

In [104]:
sum(df['Premise Address'].isna())

0

In [106]:
df[df['Year']==2018]['Month'].value_counts()

1.0     184673
8.0     183684
7.0     182694
3.0     182619
6.0     182361
9.0     182166
4.0     181639
2.0     181630
5.0     181134
11.0    170930
10.0    169063
12.0    149900
Name: Month, dtype: int64

In [109]:
df[df['Year']==2015]['Month'].value_counts()

8.0     192971
10.0    188676
4.0     187230
3.0     187085
9.0     186960
5.0     186670
7.0     186057
2.0     185938
11.0    185358
6.0     184930
12.0    183841
1.0     183792
Name: Month, dtype: int64

In [110]:
#Export back to original csv
df.to_csv(path+"Total.csv", index=False)

In [36]:
df['Premise Address'].value_counts()

2401 NOTTINGHAM WAY         64908
539 N WESTOVER BLVD         55105
525 DON CUTLER SR DR        52647
2415 BRIERWOOD DR           47812
4000 GILLIONVILLE RD        47811
2415 DAWSON RD              43053
1200 KINGSTOWN CT           41413
320 S JACKSON ST            41405
2507 NOTTINGHAM WAY         40435
333 S MOCK RD               39349
2724 LEDO RD                38295
1308 HOBSON ST              37700
211 PINEBLUFF RD            36674
600 SANDS DR                36432
421 S WESTOVER BLVD         33051
2010 W BROAD AVE            32075
2000 TOMPKINS AVE           29865
1601 RADIUM SPRINGS RD      28411
1005 E 4TH AVE              26794
1001 DUNES AVE              26421
500 PINSON RD               26182
1404 WHISPERING PINES RD    25715
1336 MERCANTILE DR          25132
2030 W BROAD AVE            25033
3110 GRAYSTONE LN           22518
2304 W GORDON AVE           21933
419 S WESTOVER BLVD         21295
629 16TH AVE                21021
613 16TH AVE                20635
2710 W OAKRIDG

# Step 1 Stop Here

In [10]:
dftestutilities = pd.read_csv(path+'utilities_notfound.csv')

In [11]:
dftestutilities.iloc[300:400]

Unnamed: 0,addr,loc
300,1401 WATER HYDRANT METER,1471614
301,1401 WATER HYDRANT METER,1471615
302,1401 WATER HYDRANT METER,1471616
303,1401 WATER HYDRANT METER,1471617
304,1401 WATER HYDRANT METER,1471618
305,1401 WATER HYDRANT METER,1471619
306,1401 WATER HYDRANT METER,1471620
307,1401 WATER HYDRANT METER,1471621
308,1401 WATER HYDRANT METER,1471622
309,1401 WATER HYDRANT METER,1471623


In [None]:
dftest = pd.read_csv(path+'Total.csv')

In [None]:
dftest.head()

In [None]:
dftest['Premise Address'][0]

In [None]:
df = pd.read_csv('/Users/william/Dropbox (Amherst College)/CDS-2019-AlbanyHub/Raw-Data/ReformattedYears/Total.csv')

In [None]:
df.drop(['Unnamed: 0', 'ChargeID.1'], axis=1, inplace=True)

In [None]:
dftestutilities = pd.read_csv('/Users/william/Dropbox (Amherst College)/CDS-2019-AlbanyHub/Raw-Data/ReformattedYears/utilities_notfound.csv')

In [None]:
dftestutilities['addr'].value_counts()

120 OLD BLAYLOCK LN
902 COTTON AVE

In [None]:
df_junc = pd.read_csv('/Users/william/Dropbox (Amherst College)/CDS-2019-AlbanyHub/ToDatabase/addr_junct_table.csv')

In [None]:
df_junc[df_junc['Address'] == "929 BLAKELY CT"]

In [None]:
import Levenshtein

In [None]:
def lev_list(address_list, address, i):
    """Return the candidate addresses within edit distance ``i`` of ``address``.

    Parameters
    ----------
    address_list : iterable
        Candidate addresses. Each string is upper-cased before comparison;
        entries that are not strings (e.g. NaN from a missing value) are
        skipped.
    address : str
        Address to compare against. It is used exactly as given (it is not
        upper-cased here).
    i : int
        Maximum Levenshtein distance (inclusive) for a candidate to match.

    Returns
    -------
    pandas.Series
        The upper-cased candidates whose distance to ``address`` is at most
        ``i``, in the order they appear in ``address_list``.
    """
    matches = []
    for u_address in address_list:
        # Missing values are not strings and have no .upper(); skip them
        if not isinstance(u_address, str):
            continue
        candidate = u_address.upper()
        if Levenshtein.distance(address, candidate) <= i:
            matches.append(candidate)
    return pd.Series(matches)

In [None]:
lev_list(df_junc['Address'], "231 E OGLETHORPE AVE", 1)

In [7]:
df.shape

(15637769, 16)

In [8]:
s_u = df['Premise Address']

In [None]:
s_