In [2]:
import warnings

import numpy as np
import pandas as pd
from scipy import stats

In [61]:
'''
numpy is going to complain about not being able to determine a dtype
when aggregating, but it can be safely ignored
'''
np.warnings.filterwarnings('ignore')

In [62]:
# Raw input: the Local Law 7-2018 qualified-transactions CSV. The path is
# relative, so it resolves against the kernel's working directory.
loc = './dryden_alex_ODM_final/raw_data/Transactions/Local_Law_7-2018_Qualified_Transactions.csv'
transactions_df = pd.read_csv(filepath_or_buffer=loc)

In [63]:

# Left-pad block to 5 characters and lot to 4 characters with zeros so that
# boro + block + lot concatenates into the standard 10-digit BBL.
transactions_df['block'] = transactions_df['block'].map('{0:0>5}'.format)
transactions_df['lot'] = transactions_df['lot'].map('{0:0>4}'.format)

# Build the BBL column as a string concatenation of the three parts.
transactions_df['BBL'] = (
    transactions_df['boro'].map(str)
    + transactions_df['block'].map(str)
    + transactions_df['lot'].map(str)
)



In [64]:
# Convert deed_date to a real datetime column. The format is declared
# explicitly (month/day/four-digit year) rather than inferred, which helps
# catch data integrity problems in the raw file.
transactions_df = transactions_df.assign(
    deed_date=lambda frame: pd.to_datetime(frame['deed_date'], format="%m/%d/%Y")
)



In [65]:
# Keep only the columns used downstream. The explicit .copy() makes f1 an
# independent frame: later cells add and overwrite columns on f1, and doing
# that on a bare selection of transactions_df is the pattern pandas flags
# with SettingWithCopyWarning.
f1 = transactions_df[
    [
        'BBL',
        'price',
        'cap_rate',
        'borough_cap_rate',
        'Latitude',
        'Longitude',
        'BIN',
        'deed_date',
    ]
].copy()


In [66]:
# Cast the numeric columns to explicit dtypes in one pass. Columns that are
# not listed (BBL, BIN, deed_date) keep whatever dtype they already have.
f1 = f1.astype(
    {
        'cap_rate': 'float64',
        'price': 'int64',
        'borough_cap_rate': 'float64',
        'Latitude': 'float64',
        'Longitude': 'float64',
    }
)

In [67]:
# Use BBL as the row index. This cell is not re-runnable on its own:
# once BBL has moved into the index there is no 'BBL' column left to set.
f1 = f1.set_index('BBL')

In [68]:
# Flag each sale by comparing its cap rate with the borough cap rate.
#   watchlist == 0.0  -> cap_rate is below borough_cap_rate
#   watchlist == 1.0  -> cap_rate is equal to or above borough_cap_rate,
#                        or either value is missing (NaN)
# This reproduces the values the previous chained `.where(...)` calls
# produced, but as a single comparison with an explicit float64 dtype, so the
# result no longer depends on how a given pandas version coerces True/False
# written into a float column.
# NOTE(review): confirm the polarity (1.0 for cap rates at or ABOVE the
# borough rate) and the NaN -> 1.0 behaviour are what the analysis intends.
f1['watchlist'] = (~f1['cap_rate'].lt(f1['borough_cap_rate'])).astype('float64')


In [70]:
# Values passed ICV in manual cleaning, but there is still some data that
# needs to be removed. Cap rates should be around .01-.05 (i.e., the annual
# income from the property is 1-5% of the cost). Any values around that are
# fine. There should be some weird ones, but anything above, say, 1 is
# clearly a mistake. Investigating one such value, it appears from the deed
# that a single apartment was sold in a building--not the whole building.
#
# The filter is a row mask rather than `drop(<index labels>)`: BBL is the
# index and is not unique (one BBL can carry several sales), so dropping by
# label would also discard valid rows that merely share a BBL with an
# outlier. Rows whose cap_rate is NaN are kept, as before.
f1 = f1.loc[~(f1['cap_rate'] > 1)].copy()

In [73]:
# Save the cleaned transactions. `index` is a boolean switch in
# DataFrame.to_csv (the old string 'BBL' only worked because it is truthy);
# the column label for the index is given through `index_label` instead.
f1.to_csv(
    r'./dryden_alex_ODM_final/intermediate_data/intermediate_csv/Transactions.csv',
    index=True,
    index_label='BBL',
    header=True,
)

In [72]:
# Preview the first five rows of the cleaned frame.
f1.head(n=5)

Unnamed: 0_level_0,price,cap_rate,borough_cap_rate,Latitude,Longitude,BIN,deed_date,watchlist
BBL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1002380040,22600000,0.02638,0.03329,40.71781,-73.996908,1003114.0,2018-09-14,0.0
1003790037,786751,0.64596,0.03329,40.724379,-73.975536,1004556.0,2018-08-07,1.0
1003790037,904764,0.56171,0.03329,40.724379,-73.975536,1004556.0,2018-08-07,1.0
1003790037,983439,0.51677,0.03329,40.724379,-73.975536,1004556.0,2018-08-07,1.0
1003790037,1258802,0.40373,0.03329,40.724379,-73.975536,1004556.0,2018-08-07,1.0
