Scope: 
Zuma Ltd is a real estate company that would like to use advanced analytics to improve its real-time property listing platform.

To do:
Get data from RapidAPI
Implement rate limiting, data consistency checks and authentication

Clean data, handle missing values and transform into a structured format
Address data quality to improve reliability

Load data into a CSV or database suitable for Zuma realtor's system
Ensure it's robust and scalable for future use


In [2]:
import json
import os

import pandas as pd
import psycopg2
import requests

# Nearby-home-values endpoint of the realtor-search API hosted on RapidAPI.
url = "https://realtor-search.p.rapidapi.com/properties/nearby-home-values"

# Latitude/longitude (sent as strings) around which home values are requested.
querystring = {"lat":"40.23184","lon":"-76.895774"}

# The API key is read from the RAPIDAPI_KEY environment variable so the secret
# is never stored in the notebook; a missing variable fails fast with KeyError.
# NOTE(review): the key that used to be embedded here has been exposed and
# should be rotated.
headers = {
    "x-rapidapi-key": os.environ["RAPIDAPI_KEY"],
    "x-rapidapi-host": "realtor-search.p.rapidapi.com"
}

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, params=querystring, timeout=30)

# Surface HTTP failures (bad key, rate limiting, server errors) here instead
# of as a confusing KeyError when the payload is indexed below.
response.raise_for_status()

# The listings live under data -> home_search -> results in the response body.
data = response.json()['data']["home_search"]["results"]
print(data)

[{'property_id': '3621748109', 'href': 'https://www.realtor.com/realestateandhomes-detail/316-Manchester-Rd_Camp-Hill_PA_17011_M36217-48109', 'description': {'beds': 4, 'baths_full': 2, 'baths': 2, 'baths_half': None, 'baths_consolidated': '2', 'sqft': 1279, 'lot_sqft': 6098, '__typename': 'SearchHomeDescription'}, 'location': {'address': {'line': '316 Manchester Rd', 'city': 'Camp Hill', 'state_code': 'PA', 'postal_code': '17011', '__typename': 'SearchHomeAddress'}, '__typename': 'SearchHomeLocation'}, 'current_estimates': [{'estimate': 259000, 'date': '2025-07-14', 'isbest_homevalue': True, '__typename': 'LatestEstimate'}, {'estimate': 264236, 'date': '2025-08-06', 'isbest_homevalue': False, '__typename': 'LatestEstimate'}, {'estimate': 254800, 'date': '2025-08-11', 'isbest_homevalue': False, '__typename': 'LatestEstimate'}], 'list_price': None, 'permalink': '316-Manchester-Rd_Camp-Hill_PA_17011_M36217-48109', 'status': 'sold', 'listing_id': None, '__typename': 'SearchHome'}, {'prope

In [3]:
## SAVE DATA TO JSON FILE
# Serialise the fetched results with 4-space indentation and write them out.
file_name = 'real_estate-2.json'
with open(file_name, 'w') as file:
    file.write(json.dumps(data, indent=4))


In [4]:
# LOAD THE DATA INTO A DATAFRAME
# NOTE(review): this reads 'real_estate.json', whereas the cell above writes
# 'real_estate-2.json' -- confirm which file is meant to feed the pipeline.
real_estate_df = pd.read_json(path_or_buf='real_estate.json')
#real_estate_df.head()

In [5]:
# TREAT N/A VALUES
# Columns whose missing entries become 0.0.
zero_filled_columns = [
    'bedrooms', 'squareFootage', 'yearBuilt', 'assessorID',
    'bathrooms', 'lotSize', 'lastSalePrice',
]
# Columns whose missing entries become the literal text 'unknown'.
unknown_filled_columns = [
    'features', 'county', 'legalDescription', 'propertyType', 'taxAssessment',
    'propertyTaxes', 'owner', 'subdivision', 'zoning', 'addressLine2',
]

# Per-column replacement values; the two remaining columns get their own defaults.
fill_values = dict.fromkeys(zero_filled_columns, 0.0)
fill_values.update(dict.fromkeys(unknown_filled_columns, 'unknown'))
fill_values['ownerOccupied'] = 0
fill_values['lastSaleDate'] = '01-01-1991'  # placeholder date for missing sale dates

# Applied in place: later cells keep working on the same real_estate_df object.
real_estate_df.fillna(fill_values, inplace=True)


In [6]:
# MODEL THE DATA ACCORDING TO YOUR ERD

# EXTRACT SALES DF
# One row per distinct (price, date) pair; the fresh 0-based row number is
# exposed as the 'sales_id' column.
sales_columns = ['lastSalePrice', 'lastSaleDate']
sales_df = (
    real_estate_df[sales_columns]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis('sales_id')
    .reset_index()
)


In [7]:
# EXTRACT FEATURES DF
# 'features' is converted to text on real_estate_df itself: it is de-duplicated
# on just below and used again as a merge key when the fact table is built.
real_estate_df['features'] = real_estate_df['features'].astype(str)

# One row per distinct feature combination; the fresh 0-based row number is
# exposed as the 'features_id' column.
feature_columns = ['bedrooms', 'squareFootage', 'features', 'bathrooms', 'lotSize']
features_df = (
    real_estate_df[feature_columns]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis('features_id')
    .reset_index()
)


In [8]:
# EXTRACT LOCATION DF
# One row per distinct address/coordinate combination; the fresh 0-based row
# number is exposed as the 'location_id' column.
location_columns = [
    'addressLine1', 'addressLine2', 'city', 'state', 'zipCode',
    'county', 'longitude', 'latitude', 'formattedAddress',
]
location_df = (
    real_estate_df[location_columns]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis('location_id')
    .reset_index()
)

In [9]:
# EXTRACT PROPERTIES FACT TABLE
# BECAUSE WE NEED ID'S OF OTHER TABLES AS FOREIGN KEYS, WE'LL NEED TO MERGE THE TABLES FIRST BEFORE EXTRACTING THE COLUMNS

# Each dimension table is left-joined back onto the source rows using the same
# columns it was de-duplicated on, which attaches its id column to every row.
dimension_joins = [
    (sales_df, ['lastSalePrice', 'lastSaleDate']),
    (features_df, ['bedrooms', 'squareFootage', 'features', 'bathrooms', 'lotSize']),
    (location_df, ['addressLine1', 'addressLine2', 'city', 'state', 'zipCode',
                   'county', 'longitude', 'latitude', 'formattedAddress']),
]
merged_df = real_estate_df
for dimension_df, join_keys in dimension_joins:
    merged_df = merged_df.merge(dimension_df, on=join_keys, how='left')

# Keep the source id, the three foreign keys and the property-level attributes;
# the row number is then exposed as the 'property_id' column.
fact_columns = [
    'id', 'location_id', 'features_id', 'sales_id', 'yearBuilt', 'assessorID',
    'legalDescription', 'ownerOccupied', 'propertyType', 'taxAssessment',
    'propertyTaxes', 'owner', 'zoning', 'subdivision',
]
properties_df = merged_df[fact_columns].rename_axis('property_id').reset_index()


In [10]:
# SAVE TO CSV
# Create the output folder first: to_csv raises OSError when the target
# directory does not exist, which would break a fresh run of the notebook.
output_dir = './cleaned data'
os.makedirs(output_dir, exist_ok=True)

# index=False: each frame already carries its id as a regular column.
sales_df.to_csv(f'{output_dir}/zuma_sales.csv', index = False)
features_df.to_csv(f'{output_dir}/zuma_features.csv', index = False)
location_df.to_csv(f'{output_dir}/zuma_location.csv', index = False)
properties_df.to_csv(f'{output_dir}/zuma_property.csv', index = False)

In [11]:
# PUSH DATA TO POSTGRES DATABASE
def connect_to_db():
    """Open and return a new psycopg2 connection to the Zuma PostgreSQL database.

    Connection settings are read from the environment variables ZUMA_DB_HOST,
    ZUMA_DB_NAME, ZUMA_DB_USER and ZUMA_DB_PASSWORD so that real credentials
    do not have to live in the notebook. Any variable that is not set falls
    back to the original local-development value.

    Returns:
        A new psycopg2 connection. The caller is responsible for closing it.
    """
    conn = psycopg2.connect(
        host=os.environ.get('ZUMA_DB_HOST', 'localhost'),
        database=os.environ.get('ZUMA_DB_NAME', 'zumaDB'),
        user=os.environ.get('ZUMA_DB_USER', 'postgres'),
        password=os.environ.get('ZUMA_DB_PASSWORD', 'sa')
    )
    return conn

# NOTE(review): the functions in the following cells each open their own
# connection, so this module-level one appears unused and is never closed --
# confirm nothing else relies on `con` before removing it.
con = connect_to_db()

In [19]:
# CREATE TABLES
def create_tables():
    """Drop and recreate the sales, features, location and property tables.

    This is destructive: existing property, sales, features and location
    tables are dropped (with their data) before being created again. The
    property table references the other three through foreign keys.

    The connection is always closed, even when the statement fails; closing
    before the commit discards the partially applied changes.

    NOTE(review): the id columns are SERIAL, but the CSV loader in this
    notebook inserts explicit id values, which does not advance the
    sequences -- confirm before relying on auto-generated ids.
    """
    queries = '''
            DROP TABLE IF EXISTS property, sales, features, location;
            
            CREATE TABLE sales (
            sales_id SERIAL PRIMARY KEY,
            lastSalePrice NUMERIC(12,2),
            lastSaleDate DATE
            );

            CREATE TABLE features (
            features_id SERIAL PRIMARY KEY,
            bedrooms NUMERIC(12,2),
            squareFootage NUMERIC(12,2),
            features TEXT,
            bathrooms NUMERIC(12,2),
            lotSize NUMERIC(12,2)
            );

            CREATE TABLE location (
                location_id SERIAL PRIMARY KEY,
                addressLine1 VARCHAR(255) NULL,
                addressLine2 VARCHAR(255),
                city VARCHAR(100),
                state VARCHAR(100),
                zipCode VARCHAR(20),
                county VARCHAR(100),
                longitude NUMERIC(9,6),
                latitude NUMERIC(9,6),
                formattedAddress TEXT
            );

            CREATE TABLE property (
                property_id SERIAL PRIMARY KEY,
                id TEXT,
                location_id INT NULL,
                features_id INT NULL,
                sales_id INT NULL,
                yearBuilt TEXT,
                assessorID VARCHAR(100),
                legalDescription TEXT,
                ownerOccupied TEXT,
                propertyType VARCHAR(100),
                taxAssessment TEXT,
                propertyTaxes TEXT,
                owner TEXT,
                zoning VARCHAR(100),
                subdivision VARCHAR(255),
                FOREIGN KEY (location_id) REFERENCES location(location_id),
                FOREIGN KEY (features_id) REFERENCES features(features_id),
                FOREIGN KEY (sales_id) REFERENCES sales(sales_id)
            );
        '''
    con = connect_to_db()
    try:
        # The cursor context manager closes the cursor on exit, error or not.
        with con.cursor() as cursor:
            cursor.execute(queries)
        con.commit()
    finally:
        # Release the connection even if the DDL above raised.
        con.close()

create_tables()

In [20]:
## LOAD DATA TO DATABASE
import csv

def clear_db_data():
    """Remove every row from the property, sales, features and location tables.

    All four tables are truncated in a single statement and the change is
    committed. The connection is always closed, even when the statement
    fails, in which case nothing is committed and the error propagates.
    The success message is printed only after a successful commit.
    """
    con = connect_to_db()
    try:
        # The cursor context manager closes the cursor on exit, error or not.
        with con.cursor() as cursor:
            cursor.execute('TRUNCATE  property, sales, features, location;')
        con.commit()
    finally:
        # Release the connection even if the TRUNCATE above raised.
        con.close()
    print('db clearing successful')

def load_data_from_csv(file_path):
    """Insert every data row of a CSV file into the table named by the file.

    The target table is the text after the last underscore in the file's base
    name (for example './cleaned data/zuma_sales.csv' -> 'sales'). The CSV
    header row supplies the column names for the INSERT.

    Empty CSV fields are inserted as NULL rather than as empty strings, so a
    missing value does not fail on numeric, integer or date columns. Blank
    lines are skipped. All rows are inserted in one transaction; if anything
    fails, nothing is committed and the connection is still closed.

    Args:
        file_path: path to a CSV file whose first row is a header.

    Raises:
        ValueError: if the file has no header row, or if the derived table
            name or any column name is not a plain identifier (they are
            interpolated into the SQL text and cannot be sent as parameters).
    """
    # newline='' is what the csv module expects (keeps quoted line breaks
    # intact); utf-8 matches the default encoding pandas uses when writing.
    with open(file_path, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)

        # DYNAMIC INSERT
        table_name = file.name.split('/')[-1].split('.')[0].split('_')[-1]
        dt_columns = next(reader, None)
        if not dt_columns:
            raise ValueError(f'{file_path} has no header row')

        # Identifiers are left unquoted on purpose: the tables were created
        # with unquoted (case-folded) names, so quoting would not match them.
        for identifier in [table_name, *dt_columns]:
            if not identifier.isidentifier():
                raise ValueError(f'unsafe SQL identifier: {identifier!r}')

        rows = [
            [field if field != '' else None for field in row]
            for row in reader
            if row
        ]

    placeholders = ', '.join(['%s'] * len(dt_columns))
    column_names = ', '.join(dt_columns)
    insert_sql = f'INSERT INTO {table_name} ({column_names}) VALUES ({placeholders});'

    con = connect_to_db()
    try:
        # The cursor context manager closes the cursor on exit, error or not.
        with con.cursor() as cursor:
            cursor.executemany(insert_sql, rows)
        con.commit()
    finally:
        # Release the connection even if an insert above raised.
        con.close()



In [21]:
# Empty the tables, then reload them with the three referenced tables first
# and 'property' (which holds the foreign keys) last.
clear_db_data();

data_files = [f'zuma_{table}.csv' for table in ('sales', 'features', 'location', 'property')]

for d in data_files:
    file_path = f'./cleaned data/{d}'
    load_data_from_csv(file_path)
    print(f'{d} successfully loaded')

db clearing successful
zuma_sales.csv successfully loaded
zuma_features.csv successfully loaded
zuma_location.csv successfully loaded
zuma_property.csv successfully loaded


In [38]:
def get_data(table_name):
    """Print the first row returned by SELECT * on the given table.

    Only one row is fetched (None is printed when the table is empty) and
    nothing is returned. The connection is always closed, even when the
    query fails.

    Args:
        table_name: table to read, optionally schema-qualified with dots.

    Raises:
        ValueError: if any dot-separated part of table_name is not a plain
            identifier (the name is interpolated into the SQL text and
            cannot be sent as a query parameter).
    """
    if not all(part.isidentifier() for part in str(table_name).split('.')):
        raise ValueError(f'unsafe table name: {table_name!r}')

    con = connect_to_db()
    try:
        # The cursor context manager closes the cursor on exit, error or not.
        with con.cursor() as cursor:
            cursor.execute(f'SELECT * FROM {table_name}')
            data = cursor.fetchone()
    finally:
        # Release the connection even if the query above raised.
        con.close()
    print(data)

get_data('location')

(0, '511 John St', 'unknown', 'Bryan', 'OH', '43506', 'Williams', Decimal('-84.570163'), Decimal('41.469595'), '511 John St, Bryan, OH 43506')
