In [1]:
import pandas as pd

In [10]:
# Load the listings CSV, using the file's first column as the row index.
data_df = pd.read_csv(
    'richmond_complete.csv',
    index_col=[0],
)

In [62]:
# Preview the first two rows to sanity-check the loaded columns.
data_df.head(n=2)

Unnamed: 0,house_type,size,maint_fee,approx_age,mls_number,levels,garage,garage_size,taxes,avg_price_sqft,...,list_date,list_price,end_date,sold_price,house_age,Difference_in_days,difference_in_days,price_difference,price_difference_abs,price_difference_pct
0,Single Family Residence,2151 sqft,,1965,R2794039,Residential,No,2.0,"$4,513",$880,...,21/02/2024,"$1,860,000",11/03/2024,"$1,780,000",59.0,19.0,19.0,-80000.0,-80000.0,-4.301075
1,Townhouse,1328 sqft,,2010,R2856270,Residential,Yes,2.0,"$2,931",$820,...,04/03/2024,"$1,090,000",10/03/2024,"$1,098,500",14.0,6.0,6.0,8500.0,8500.0,0.779817


In [59]:
from datetime import datetime

def age_of_house(yr_house_built, year=2024):
    """
    Calculate the age of the house in whole years.

    Parameters
    ----------
    yr_house_built : int, float or str
        Year the house was built; any value ``int()`` accepts
        (e.g. ``1965`` or ``"1965"``).
    year : int, optional
        Reference year the age is measured against. Defaults to 2024.

    Returns
    -------
    int or None
        ``year - int(yr_house_built)`` as an integer, or ``None`` when the
        build year cannot be converted (``None``, NaN, infinity,
        non-numeric text).
    """
    try:
        return int(year - int(yr_house_built))
    except (TypeError, ValueError, OverflowError):
        # Swallow only conversion failures: TypeError (None / unsupported
        # types), ValueError (NaN, non-numeric text), OverflowError (infinity).
        # A bare ``except`` would also hide KeyboardInterrupt/SystemExit.
        return None

def calculate_days_to_sell(list_date, end_date):
    """
    Calculate the number of days it took for a house listing to sell.

    Parameters
    ----------
    list_date : str
        Listing date formatted as ``DD/MM/YYYY``.
    end_date : str
        End (sold) date formatted as ``DD/MM/YYYY``.

    Returns
    -------
    int or None
        Whole days from ``list_date`` to ``end_date`` (negative when the end
        date precedes the listing date), or ``None`` when either value is not
        a string or does not match the expected format.
    """
    date_format = "%d/%m/%Y"
    try:
        listing_date = datetime.strptime(list_date, date_format)
        sold_date = datetime.strptime(end_date, date_format)
    except (TypeError, ValueError):
        # TypeError: non-string input such as None or NaN.
        # ValueError: text that is not a valid DD/MM/YYYY date.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

    return (sold_date - listing_date).days

def calculate_price_diff_or_pct(list_price, sold_price, operation):
    """
    Calculate the price difference or the percentage change between the list
    price and the sold price of a house listing.

    Parameters
    ----------
    list_price : str
        List price as text; ``$`` signs and ``,`` separators are stripped
        before conversion (e.g. ``"$1,860,000"``).
    sold_price : str
        Sold price as text, in the same format as ``list_price``.
    operation : str
        ``'difference'`` for ``sold - list``, or ``'percentage'`` for
        ``(sold - list) / list * 100``.

    Returns
    -------
    float or None
        The requested value, or ``None`` when either price is not a string,
        cannot be parsed as a number, ``operation`` is not recognised, or a
        percentage is requested for a list price of zero.
    """
    # Non-string inputs (e.g. None or NaN) are treated as missing values.
    if not isinstance(list_price, str) or not isinstance(sold_price, str):
        return None
    if operation not in ('difference', 'percentage'):
        return None

    try:
        # Remove the dollar sign and commas, then convert the text to floats.
        list_amount = float(list_price.replace('$', '').replace(',', ''))
        sold_amount = float(sold_price.replace('$', '').replace(',', ''))
    except ValueError:
        return None

    change = sold_amount - list_amount
    if operation == 'difference':
        return change

    # A zero list price has no meaningful percentage change; returning None
    # avoids a ZeroDivisionError that would abort a whole DataFrame.apply run.
    if list_amount == 0:
        return None
    return (change / list_amount) * 100
    
# Derive the engineered columns from the raw listing fields.
data_df['house_age'] = data_df['approx_age'].apply(age_of_house)

# The remaining helpers each need two columns, so they are applied row-wise.
data_df['difference_in_days'] = data_df.apply(
    lambda listing: calculate_days_to_sell(listing['list_date'], listing['end_date']),
    axis=1,
)
data_df['price_difference_abs'] = data_df.apply(
    lambda listing: calculate_price_diff_or_pct(
        listing['list_price'], listing['sold_price'], 'difference'
    ),
    axis=1,
)
data_df['price_difference_pct'] = data_df.apply(
    lambda listing: calculate_price_diff_or_pct(
        listing['list_price'], listing['sold_price'], 'percentage'
    ),
    axis=1,
)