In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def lot_conversion(lot):
    """Convert an area string such as '4,891 sqft' or '0.34 acres' to square feet.

    Parameters
    ----------
    lot : str or any
        A string of the form '<number> <unit>', where the number may contain
        thousands separators. Non-string values (e.g. NaN or an already
        numeric value) are returned unchanged.

    Returns
    -------
    float or the original value
        The numeric amount, multiplied by 43,560 when the unit is acres.
        ``np.nan`` when the string contains 'No'. The input itself when it
        is not a string.

    Raises
    ------
    ValueError
        If a string does not consist of exactly two whitespace-separated
        tokens, or the first token is not a number.
    """
    sqft_per_acre = 43560

    if not isinstance(lot, str):
        return lot
    # Strings containing 'No' are treated as missing values.
    if 'No' in lot:
        return np.nan

    # split() with no argument tolerates leading/trailing/repeated whitespace,
    # which split(' ') would turn into extra empty tokens.
    amount, unit = lot.split()
    amount = float(amount.replace(',', ''))

    # Accept singular/plural and any capitalisation; an exact match on 'acres'
    # would silently leave '1 acre' or '0.3 Acres' unconverted.
    if unit.lower() in ('acre', 'acres'):
        amount *= sqft_per_acre
    return amount

def clean_price(value):
    """Turn a dollar string such as '$30,612' into a float.

    Non-string inputs are handed back untouched. A string containing 'No'
    yields ``np.nan``. Any other string has its '$' and ',' characters
    removed and is then parsed with ``float``.
    """
    if not isinstance(value, str):
        return value
    if 'No' in value:
        return np.nan
    digits = value.replace('$', '').replace(',', '')
    return float(digits)

In [3]:
# Read both raw exports and label every row with its listing status, so the
# origin of a row is still known once the two frames are stacked.
for_sale = pd.read_csv('for_sale_raw.csv').assign(ListType='For Sale')
sold = pd.read_csv('sold_raw.csv').assign(ListType='Sold')

# Stack row-wise (concat's default axis). Each file's own row labels are
# kept, so index values can repeat across the two sources.
df = pd.concat([for_sale, sold])

In [4]:
# Columns retained for the rest of the notebook; every other column in the
# combined frame is discarded by the selection below.
keep = [
    'Price',
    'ListType',
    'Street',
    'State',
    'City',
    'Zipcode',
    'Beds',
    'Baths',
    'Area',
    'Latlong',
    'Type',
    'Year Built',
    'Heating',
    'Cooling',
    'Parking',
    'Lot',
    'Price/Sqft',
    'Full Bathrooms',
    '3/4 Bathrooms',
    '1/2 Bathrooms',
    '1/4 Bathrooms',
    'Basement',
    'Flooring',
    'Heating Features',
    'Total Interior Livable Area',
    'Fireplace',
    'Parking Features',
    'Stories',
    'Exterior Features',
    'Roof',
    'Tax Assessed Value',
]
df = df[keep]
df.shape

(1515, 31)

In [5]:
# Boolean mask selecting rows whose City is exactly 'Rochester'.
city = df['City'] == 'Rochester'

# Filter and drop in one non-mutating chain rather than calling
# drop(inplace=True) on a freshly filtered frame. City is constant after
# the filter, and State is removed along with it.
df = df[city].drop(columns=['City', 'State'])
df.shape

(1216, 29)

In [6]:
# Break each Latlong string into two pieces at ', '.
# NOTE(review): assumes every value splits into exactly two pieces
# (something like "{<key>: <lat>, <key>: <lon>}") — confirm against the raw data.
df[['Lat', 'Lon']] = df['Latlong'].str.split(', ', expand=True)

# Keep only the text after the last ': ' in each piece.
df['Lat'] = df['Lat'].str.split(': ').str[-1]
# regex=False makes the literal-character intent explicit: the default for
# `regex` differs between pandas 1.x and 2.x, and 1.x warns when a
# single-character pattern is passed without it.
df['Lon'] = df['Lon'].str.split(': ').str[-1].str.replace('}', '', regex=False)

# No errors='coerce' here: a value that is not a number raises.
df['Lat'] = pd.to_numeric(df['Lat'])
df['Lon'] = pd.to_numeric(df['Lon'])

# The source column is no longer needed once Lat/Lon exist; reassign
# instead of mutating in place.
df = df.drop(columns=['Latlong'])
df.shape

(1216, 30)

In [7]:
# Columns that are to be converted to a numeric dtype; previewed here
# before conversion.
to_numeric = [
    'Year Built',
    'Lot',
    'Price/Sqft',
    'Total Interior Livable Area',
    'Tax Assessed Value',
]
df[to_numeric].head()

Unnamed: 0,Year Built,Lot,Price/Sqft,Total Interior Livable Area,Tax Assessed Value
0,1890,"4,891 sqft",$21,"1,932 sqft","$30,612"
1,1955,0.34 acres,$130,"1,188 sqft","$115,000"
2,1900,"4,207 sqft",$25,"1,316 sqft","$23,980"
3,1940,"6,484 sqft",$94,"1,327 sqft","$78,571"
4,1950,0.3 acres,$87,"1,376 sqft","$97,200"


In [8]:
# Convert every column listed in `to_numeric` in one pass. assign()
# overwrites each existing column where it already sits, so column order
# is unchanged.
df = df.assign(**{
    # Years that cannot be parsed become NaN instead of raising.
    'Year Built': pd.to_numeric(df['Year Built'], errors='coerce'),
    # Both area columns go through the same converter.
    'Lot': df['Lot'].apply(lot_conversion),
    'Total Interior Livable Area': df['Total Interior Livable Area'].apply(lot_conversion),
    # Both dollar columns go through the same cleaner.
    'Price/Sqft': df['Price/Sqft'].apply(clean_price),
    'Tax Assessed Value': df['Tax Assessed Value'].apply(clean_price),
})

df[to_numeric].head()

Unnamed: 0,Year Built,Lot,Price/Sqft,Total Interior Livable Area,Tax Assessed Value
0,1890.0,4891.0,21.0,1932.0,30612.0
1,1955.0,14810.4,130.0,1188.0,115000.0
2,1900.0,4207.0,25.0,1316.0,23980.0
3,1940.0,6484.0,94.0,1327.0,78571.0
4,1950.0,13068.0,87.0,1376.0,97200.0
