In [58]:
import pandas as pd
import numpy as np
import usaddress, os, re, time
import sqlite3 as lite
from nyc_geoclient import Geoclient

In [59]:
# NYC Geoclient API credentials are read from the environment rather than being
# written into the notebook, so they are not shared or committed with it.
# Set NYC_GEOCLIENT_APP_ID and NYC_GEOCLIENT_APP_KEY before starting the kernel;
# a missing variable raises a KeyError that names it.
# NOTE(review): the credentials that used to be hardcoded here should be treated
# as exposed and rotated.
g=Geoclient(os.environ['NYC_GEOCLIENT_APP_ID'], os.environ['NYC_GEOCLIENT_APP_KEY'])

In [140]:
def clean_strings(x):
    """Return ``x`` converted to a string with surrounding whitespace removed.

    If the conversion itself raises ``ValueError`` the value is replaced by
    ``np.nan`` instead (UnicodeEncodeError is a ValueError subclass, so this
    presumably targets non-ASCII text under Python 2 -- TODO confirm).
    """
    try:
        text = str(x)
    except ValueError:
        return np.nan
    return text.strip()
    
def parse_address(address):
    """Split a raw sale address into ``[street_number, street_name]``.

    The apartment part of the address is discarded first. It is recognised as
    either everything from the first ``Apt`` / ``APT`` / ``#`` marker onward,
    or a trailing ``", <digits>"`` with an optional single letter
    (for example ``", 503C"``).

    The house number is a leading run of digits, optionally hyphenated
    (``45-12``) and optionally followed by one letter (``123A``).

    Parameters
    ----------
    address : str
        Raw address text.

    Returns
    -------
    list of str
        Always two stripped items: ``[street_number, street_name]``.
        ``street_number`` is ``''`` when the address has no leading house
        number, so the street name never lands in the number position.
    """
    # Cut at the first apartment marker. Every marker is honoured (previously each
    # loop pass restarted from the raw address, so only '#' had any effect). The
    # leading \b stops 'APT' inside a word such as 'CAPTAIN' from matching.
    street = re.split(r'\b(?:Apt|APT)|#', address, maxsplit=1)[0].strip()

    # apartments can also be indicated by comma followed by number with optional
    # letter (ex: , 503C); keep only what came before it
    street = re.split(r'(,\s\d+$|,\s\d+\w{1}$)', street)[0]

    # separate address into street number and street name. The pattern is anchored
    # at the start and captured, so a hit yields exactly ['', number, name].
    parts = re.split(r'(^\d+\s|^\d+-\d+\s|^\d+\w{1}\s|^\d+-\d+\w{1}\s)', street)
    if len(parts) == 3:
        number, name = parts[1], parts[2]
    else:
        # no leading house number (or an empty address)
        number, name = '', parts[0]

    # strip trailing spaces
    return [number.strip(), name.strip()]
        
def AddressMatch(str_num,str_name,borough):
    """Geocode a street address through the module-level Geoclient ``g``.

    Calls ``g.address(str_num, str_name, borough)`` and returns
    ``[longitude, latitude, 'Address Match', message]``, where the three
    looked-up values are ``None`` when the response does not contain them.
    The result label is always 'Address Match'; callers decide from a missing
    longitude whether the lookup actually succeeded.
    """
    response = g.address(str_num, str_name, borough)
    return [
        response.get('longitude'),
        response.get('latitude'),
        'Address Match',
        response.get('message'),
    ]

def BlockMatch(borough,block,lot):
    """Geocode a borough/block/lot through the module-level Geoclient ``g``.

    Calls ``g.bbl(borough, block, lot)`` and returns
    ``[longitude, latitude, 'Block Match', message]``. The coordinates are read
    from the ``longitudeInternalLabel`` / ``latitudeInternalLabel`` keys of the
    response and are ``None`` when those keys are absent.
    """
    response = g.bbl(borough, block, lot)
    return [
        response.get('longitudeInternalLabel'),
        response.get('latitudeInternalLabel'),
        'Block Match',
        response.get('message'),
    ]

def Geocode(df):
    """Geocode the rows of ``df`` from a chosen index onward and store them in SQLite.

    The user is prompted for the index to start from. Every row whose index is
    greater than or equal to that value is geocoded with ``AddressMatch``; when
    that yields no longitude ``BlockMatch`` is tried, and when that also yields
    no longitude the record is labelled 'Unmatched'. Each row, minus its last
    two columns (the parsed address) and plus the geocoding result, is inserted
    into the table ``_yr<year>`` and committed immediately.

    On the first row that raises, the error and the row's index are printed and
    processing stops. Entering that index at the next run retries the failed
    row without re-inserting rows that were already stored.

    Relies on module-level names: ``con`` (open sqlite3 connection, closed by
    this function on exit) and ``year`` (string used in the table name).

    Parameters
    ----------
    df : pandas.DataFrame
        Sales records containing ``street_number``, ``street_name``,
        ``borough``, ``block`` and ``lot``, with the two parsed-address columns
        last. Its index is compared with the start value, so it should be a
        unique, increasing integer index.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the value typed at the prompt is not an integer.
    sqlite3.Error
        If the cursor or the table cannot be created (row-level errors are
        reported and stop the loop instead of propagating).
    """
    start=int(input('From what line should I start? Type in numercial value; type 0 (zero) for the first iteration'))

    table_name='_yr'+year
    # Columns filled by the INSERT, in the same order as the values in db_row
    # (sale_id is the INTEGER PRIMARY KEY and is assigned by SQLite).
    insert_cols=('bbl_id', 'year', 'borough', 'nbhd', 'bldg_ctgy', 'tax_cls_p', 'block', 'lot',
                 'easmnt', 'bldg_cls_p', 'address', 'apt', 'zip', 'res_unit', 'com_unit', 'tot_unit',
                 'land_sqft', 'tot_sqft', 'yr_built', 'tax_cls_s', 'bldg_cls_s', 'sale_date', 'price',
                 'usable', 'long', 'lat', 'georesult', 'message')
    placeholder=','.join('?'*len(insert_cols))
    insert_sql='INSERT INTO %s(%s) VALUES (%s)' % (table_name, ', '.join(insert_cols), placeholder)

    try:
        cur=con.cursor()
        # The table only needs to be created once per run, not once per row.
        cur.execute('''CREATE TABLE IF NOT EXISTS %s (sale_id INTEGER PRIMARY KEY, bbl_id INTEGER,
                year TEXT, borough INTEGER,nbhd TEXT, bldg_ctgy TEXT,
                tax_cls_p TEXT, block TEXT,lot TEXT,easmnt TEXT, bldg_cls_p TEXT,address TEXT,
                apt TEXT, zip TEXT, res_unit INTEGER,com_unit INTEGER, tot_unit INTEGER, land_sqft INTEGER,
                tot_sqft INTEGER, yr_built INTEGER, tax_cls_s TEXT, bldg_cls_s TEXT,sale_date TEXT, price INTEGER,
                usable TEXT, long REAL, lat REAL, georesult TEXT, message TEXT)'''% table_name)

        counter=0
        for index, row in df.iterrows():
            # geocode only from start; rows before it were stored by an earlier run
            if index<start:
                continue

            counter=counter+1
            if counter%50==0:
                # will pause for 1 second after 50 geocoded records
                time.sleep(1)
            try:
                # do Address match first
                result=AddressMatch(row['street_number'], row['street_name'], int(row['borough']))

                # if longitude is None --> invalid address, then try BlockMatch function
                if result[0] is None:
                    result=BlockMatch(int(row['borough']), int(row['block']), int(row['lot']))

                    # if BlockMatch didn't return a longitude either, mark the record as Unmatched
                    if result[0] is None:
                        result[2]='Unmatched'

                # for the db, remove last two items (parsed address) from the row and add geocoded results
                db_row=list(row.iloc[:-2])+result

                # writing geocoded record into the database; commit per row so that
                # everything stored before a failure is kept
                cur.execute(insert_sql, db_row)
                con.commit()

            except Exception as e:
                print(e)
                print('An error has occurred. File stopped at index '+str(index))
                break
    finally:
        # always release the connection, including when an unexpected error escapes
        con.close()
    print('Done')

In [61]:
# Root folder holding one sub-folder of sales workbooks per year. The location can
# be overridden with the NYC_RE_SALES_DIR environment variable so the notebook is
# not tied to one machine; the original local path remains the default.
data_path=os.environ.get('NYC_RE_SALES_DIR', '/Users/anastasiaclark/NYC_RE_Sales')
# Year to process: names the sub-folder to read and the database table ('_yr' + year).
year='2017'

In [62]:
df_list=[]
data_folder=os.path.join(data_path,year)
# Load only the per-borough Excel workbooks: the citywide file is skipped, and so is
# anything that is not a workbook (e.g. .DS_Store, or the '~$' lock files Excel leaves
# behind), which pd.read_excel cannot parse. Sorted so the row order is reproducible.
boro_sales=sorted(table for table in os.listdir(data_folder)
                  if 'citywide_sales' not in table
                  and table.lower().endswith(('.xls','.xlsx'))
                  and not table.startswith(('.','~$')))
for boro_table in boro_sales:
    # the first four rows of each workbook are skipped so the next row becomes the
    # header (presumably title/notes lines -- verify against the files)
    df=pd.read_excel(os.path.join(data_folder,boro_table),skiprows=[0,1,2,3], parse_dates=True)
    df_list.append(df)

In [84]:
# ignore_index gives the combined frame one continuous 0..n-1 index. Without it each
# borough file restarts at 0, so index labels repeat and cannot identify a row
# (Geocode resumes by comparing index values).
sales=pd.concat(df_list, ignore_index=True)
sales.columns=[c.strip() for c in sales.columns]

# DOF column headers -> short names used in the database
column_names={'BOROUGH': 'borough','NEIGHBORHOOD':'nbhd','BUILDING CLASS CATEGORY':'bldg_ctgy',
              'TAX CLASS AT PRESENT':'tax_cls_p','BLOCK':'block','LOT':'lot',
              'EASE-MENT':'easmnt','BUILDING CLASS AT PRESENT':'bldg_cls_p','ADDRESS':'address',
              'APARTMENT NUMBER':'apt','ZIP CODE':'zip','RESIDENTIAL UNITS':'res_unit',
              'COMMERCIAL UNITS':'com_unit','TOTAL UNITS':'tot_unit',
              'LAND SQUARE FEET':'land_sqft','GROSS SQUARE FEET':'tot_sqft',
              'YEAR BUILT':'yr_built','TAX CLASS AT TIME OF SALE':'tax_cls_s',
              'BUILDING CLASS AT TIME OF SALE':'bldg_cls_s',
              'SALE PRICE':'price','SALE DATE':'sale_date'}

# in 2017 DOF changed the column names; assuming that column 'BUILDING CLASS AS OF FINAL ROLL 17/18'
# will have a different ending each year, locate these columns by their fixed part. They are added
# only when present, so files that still use the '... AT PRESENT' names do not raise an IndexError.
for label, short_name in (('BUILDING CLASS AS OF FINAL ROLL','bldg_cls_p'),
                          ('TAX CLASS AS OF FINAL ROLL','tax_cls_p')):
    found_col=next((c for c in sales.columns if label in c), None)
    if found_col is not None:
        column_names[found_col]=short_name

sales=sales.rename(columns=column_names)



In [86]:
# Trim surrounding whitespace in every text (object-dtype) column.
text_cols=[name for name, kind in sales.dtypes.items() if kind=='object']
for name in text_cols:
    sales[name]=sales[name].apply(clean_strings)

# bbl_id is borough + block + lot joined as plain strings.
# NOTE(review): the parts are not zero-padded, so different block/lot pairs can
# produce the same id -- confirm this is acceptable before relying on bbl_id as a key.
boro_block_lot=sales[['borough','block','lot']].astype(str)
sales['bbl_id']=boro_block_lot['borough']+boro_block_lot['block']+boro_block_lot['lot']

# A sale is flagged usable (as the text 'True'/'False') when its price is above 10.
price_above_10=sales['price']>10
sales['usable']=np.where(price_above_10,'True','False')

sales['year']=str(year)

# SQLite doesn't support pandas datetime format, so sale dates are stored as text
sales['sale_date']=sales['sale_date'].astype(str)

# re-arrange the order of the columns to be same as in the past
cols_order=[
    'bbl_id', 'year', 'borough', 'nbhd', 'bldg_ctgy', 'tax_cls_p',
    'block', 'lot', 'easmnt', 'bldg_cls_p', 'address', 'apt',
    'zip', 'res_unit', 'com_unit', 'tot_unit', 'land_sqft', 'tot_sqft',
    'yr_built', 'tax_cls_s', 'bldg_cls_s', 'sale_date', 'price', 'usable',
]

sales=sales.loc[:, cols_order]

In [87]:
def _address_parts(row):
    """Return the parsed pieces of this row's address as a Series (one piece per column)."""
    return pd.Series(parse_address(row['address']))

# Parse every address row by row and store the pieces in two new columns.
sales[['street_number', 'street_name']]=sales.apply(_address_parts, axis=1)

In [88]:
# Preview the first rows only instead of rendering the whole sales table.
sales.head(10)

Unnamed: 0,bbl_id,year,borough,nbhd,bldg_ctgy,tax_cls_p,block,lot,easmnt,bldg_cls_p,...,land_sqft,tot_sqft,yr_built,tax_cls_s,bldg_cls_s,sale_date,price,usable,street_number,street_name
0,2302825,2017,2,BATHGATE,01 ONE FAMILY DWELLINGS,1,3028,25,,A5,...,1842,2048,1901,1,A5,2017-04-04,0,False,412,EAST 179 STREET
1,2303055,2017,2,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,55,,A5,...,1330,1460,1899,1,A5,2017-07-18,305000,True,410,EAST 182ND STREET
2,2303056,2017,2,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,56,,A1,...,1306,1440,1899,1,A1,2017-01-19,178000,True,412,EAST 182 STREET
3,2303056,2017,2,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,56,,A1,...,1306,1440,1899,1,A1,2017-07-14,449000,True,412,EAST 182 STREET
4,2303065,2017,2,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,65,,A1,...,1622,1587,1899,1,A1,2017-05-12,140000,True,4455,PARK AVENUE
5,2303070,2017,2,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,70,,A1,...,1694,1497,1899,1,A1,2017-11-06,246000,True,4445,PARK AVENUE
6,2303613,2017,2,BATHGATE,01 ONE FAMILY DWELLINGS,1,3036,13,,A1,...,3525,1764,1899,1,A1,2017-02-03,420000,True,4348,PARK AVENUE
7,2303742,2017,2,BATHGATE,01 ONE FAMILY DWELLINGS,1,3037,42,,A1,...,3525,1340,1899,1,A1,2017-09-29,380250,True,4428,PARK AVENUE
8,23037101,2017,2,BATHGATE,01 ONE FAMILY DWELLINGS,1,3037,101,,S0,...,1293,2820,1952,1,S0,2017-11-13,325000,True,443,EAST 180 STREET
9,2304355,2017,2,BATHGATE,01 ONE FAMILY DWELLINGS,1,3043,55,,A1,...,2356,2047,1901,1,A1,2017-08-22,127000,True,1948,BATHGATE AVENUE


# Testing

In [120]:
# Small sample (first 30 rows) for trying the geocoder without processing every sale.
test=sales.iloc[:30]

In [123]:
# Scratch value. Geocode() assigns its own local `start` from the prompt, so this
# module-level variable is not read by it.
start=10

In [137]:
# Scratch check of what input() hands back. The recorded run below shows an int for
# the typed 4, i.e. input() evaluated the text (Python 2 behaviour; Python 3 returns str).
z=input('Test')

Test4


In [139]:
# Show the type of the value read by the input() check above.
type(z)

int

In [None]:
# here is the data
con=lite.connect('RE_test.sqlite')
r=Geocode(sales)

From what line should I start? Type in numercial value; type 0 (zero) for the first iteration3403
