## Part 1 - Load & Clean

In [1]:
# Import libraries
import pandas as pd
import numpy as np


# Show long text values in full instead of truncating them in rendered frames.
# The fully qualified option name is used on purpose: pandas resolves
# abbreviated names by pattern matching, which can become ambiguous (and raise)
# if a future release adds another option containing "max_colwidth".
pd.set_option('display.max_colwidth', 800)


In [2]:
# Relative location of the raw CSV export
file = "Resources/adam.csv"

# Load the file, then keep only the columns whose header does not start with
# "Unnamed" (this discards the redundant index column stored in the CSV).
df = pd.read_csv(file).loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]

In [3]:
# df.shape is (number of rows, number of columns); unpack both at once
rows, columns = df.shape

# Report the dataset size.
# (A bare `df.columns` in the middle of a cell is never rendered -- only a
# cell's last expression is -- so it is omitted here.)
print(f'The dataset is comprised of {rows} rows and {columns} columns.')

The dataset is comprised of 241 rows and 10 columns.


In [4]:
# Check datatypes: display the dtype pandas inferred for each column
df.dtypes

Address          object
Unit             object
Borough          object
Neighborhood     object
City             object
State            object
Year              int64
Price           float64
Status           object
Postalcode        int64
dtype: object

In [5]:
# Convert the integer Year column to datetime; with format='%Y' each value
# is parsed as January 1st of that year (e.g. 2019 -> 2019-01-01).
df['Year'] = pd.to_datetime(df['Year'], format='%Y')

# Relabel the listing status values in a single pass.
# Bracket assignment is used instead of `df.Status = ...`: attribute access
# only works for existing columns whose names do not clash with DataFrame
# attributes, so it is easy to misuse.
df['Status'] = df['Status'].replace({"RENTED": "Rental", "SOLD": "Sale"})


In [6]:
# Boolean mask: True for rows whose State is exactly 'New York'
filtered = df['State'].eq('New York')

# Keep only the matching rows and rename the Address column to Street
full = df.loc[filtered].rename(columns={"Address": "Street"})
full

Unnamed: 0,Street,Unit,Borough,Neighborhood,City,State,Year,Price,Status,Postalcode
0,9602 4th Avenue,6D,Brooklyn,Bay Ridge,New York,New York,2019-01-01,,Rental,11209
1,94 Degraw Street,,Brooklyn,Columbia Street Waterfront,Brooklyn,New York,2020-01-01,2350000.0,Sale,11231
2,933 Lafayette Street,1st floor,Brooklyn,Bedford-Stuyvesant,Brooklyn,New York,2020-01-01,1095000.0,Sale,11221
3,918 Manhattan Ave,#2,Brooklyn,Greenpoint,Brooklyn,New York,2021-01-01,5000.0,Rental,11222
4,906 Prospect Place,2A,Brooklyn,Crown Heights,Brooklyn,New York,2021-01-01,735000.0,Sale,11213
...,...,...,...,...,...,...,...,...,...,...
236,308 East 38th Street,4C,Manhattan,Murray Hill,New York,New York,2018-01-01,890000.0,Sale,10016
237,308 East 38th Street,8D,Manhattan,Murray Hill,New York,New York,2018-01-01,899000.0,Sale,10016
238,308 East 38th Street,16F,Manhattan,Murray Hill,New York,New York,2018-01-01,1556238.0,Sale,10016
239,308 East 38th Street,11E,Manhattan,Murray Hill,New York,New York,2018-01-01,1374723.0,Sale,10016


In [7]:
# Build a single "Street, City, State, Postalcode" string for every row.
# Each part is cast to str first so the integer Postalcode can be concatenated
# (a missing value becomes the text 'nan').
# Plain pandas string concatenation replaces the legacy np.char.array route,
# which encodes object columns to bytes and fails on non-ASCII text.
location_parts = full[['Street', 'City', 'State', 'Postalcode']].astype(str)

full['Location'] = (
    location_parts['Street'] + ', '
    + location_parts['City'] + ', '
    + location_parts['State'] + ', '
    + location_parts['Postalcode']
)
full

Unnamed: 0,Street,Unit,Borough,Neighborhood,City,State,Year,Price,Status,Postalcode,Location
0,9602 4th Avenue,6D,Brooklyn,Bay Ridge,New York,New York,2019-01-01,,Rental,11209,"9602 4th Avenue, New York, New York, 11209"
1,94 Degraw Street,,Brooklyn,Columbia Street Waterfront,Brooklyn,New York,2020-01-01,2350000.0,Sale,11231,"94 Degraw Street, Brooklyn, New York, 11231"
2,933 Lafayette Street,1st floor,Brooklyn,Bedford-Stuyvesant,Brooklyn,New York,2020-01-01,1095000.0,Sale,11221,"933 Lafayette Street, Brooklyn, New York, 11221"
3,918 Manhattan Ave,#2,Brooklyn,Greenpoint,Brooklyn,New York,2021-01-01,5000.0,Rental,11222,"918 Manhattan Ave, Brooklyn, New York, 11222"
4,906 Prospect Place,2A,Brooklyn,Crown Heights,Brooklyn,New York,2021-01-01,735000.0,Sale,11213,"906 Prospect Place, Brooklyn, New York, 11213"
...,...,...,...,...,...,...,...,...,...,...,...
236,308 East 38th Street,4C,Manhattan,Murray Hill,New York,New York,2018-01-01,890000.0,Sale,10016,"308 East 38th Street, New York, New York, 10016"
237,308 East 38th Street,8D,Manhattan,Murray Hill,New York,New York,2018-01-01,899000.0,Sale,10016,"308 East 38th Street, New York, New York, 10016"
238,308 East 38th Street,16F,Manhattan,Murray Hill,New York,New York,2018-01-01,1556238.0,Sale,10016,"308 East 38th Street, New York, New York, 10016"
239,308 East 38th Street,11E,Manhattan,Murray Hill,New York,New York,2018-01-01,1374723.0,Sale,10016,"308 East 38th Street, New York, New York, 10016"


In [8]:
# Write the cleaned DataFrame to disk as CSV.
# NOTE(review): to_csv writes the row index by default, so the file starts with
# an unnamed index column -- confirm downstream readers expect that before
# switching to index=False.
full.to_csv(path_or_buf="Resources/clean.csv")