# Comparing the Date-Price Pair Approach to the HISTORY Object Approach for Generating CRMLS Listing Histories
The purpose of this notebook is to generate listing histories for a sample of listings and HISTORY objects in order to compare the results of the two methods and evaluate the accuracy of the date-pair approach.

In [1]:
import pandas as pd
from tqdm.notebook import tqdm
from datetime import datetime, timedelta

## Reading in Samples

In [2]:
history_sample = pd.read_csv('data/history_sample_1017.csv')
history_sample.head()

Unnamed: 0,new_value,field_name,history_key,modification_timestamp,resource_record_key
0,H,STANDARDSTATUS,9999999,2020-02-24 10:27:40,363940030
1,270405,LISTPRICE,9999998,2020-02-24 10:15:54,363882069
2,S,STANDARDSTATUS,9999997,2020-02-24 10:16:21,363822160
3,225000.00,LISTPRICE,9999996,2020-02-24 10:25:09,363798709
4,539900,LISTPRICE,9999995,2020-02-24 10:16:36,363747921


In [3]:
print(history_sample['resource_record_key'].nunique())
# get listing keys to pull all listings that are in this sample
history_sample['resource_record_key'].drop_duplicates().to_csv('data/listing_keys.csv', index=False)

376457


In [4]:
listings_sample = pd.read_csv('data/crmls_listings_sample1.csv')

In [5]:
print(listings_sample.shape)
for i in range(2,13):
    listings_sample = pd.concat([listings_sample, pd.read_csv(f'data/crmls_listings_sample{i}.csv')])

(32560, 19)


In [6]:
listings_sample.shape

(337904, 19)

In [7]:
listings_sample.head()

Unnamed: 0,source_id,close_date,expiration_date,listing_contract_date,purchase_contract_date,withdrawn_date,list_price,current_price,original_list_price,close_price,price_change_timestamp,status,previous_list_price,previous_status,status_change_timestamp,on_market_date,off_market_date,contingent_date,original_entry_timestamp
0,10002014,2015-10-27,2016-04-28,2015-06-24,2015-07-17,,219900.0,220000.0,219900.0,220000.0,,S,,P,2015-10-27 17:23:41,2015-06-24,2015-07-17,,2015-06-24 07:56:16
1,100198211,2017-05-07,2017-06-30,2017-03-01,2017-04-27,,2350.0,2350.0,2350.0,2350.0,2017-03-22 20:27:45,S,,U,2017-05-07 19:48:00,2017-03-01,2017-05-07,,2017-03-22 20:27:45
2,10025396,,2020-06-30,2015-06-28,,2020-02-24,8750000.0,8750000.0,12500000.0,,2017-12-02 10:46:26,X,12500000.0,W,2020-07-01 00:04:17,2015-06-28,2020-02-24,,2015-06-28 11:00:14
3,10025573,,2020-06-30,2015-06-28,,2020-02-24,5300000.0,5300000.0,6200000.0,,2017-12-02 10:52:23,X,6200000.0,W,2020-07-01 01:04:19,2015-06-28,2020-02-24,,2015-06-28 12:10:16
4,10051516,2016-02-23,2016-02-29,2015-07-04,2016-01-08,,245000.0,240000.0,265000.0,240000.0,2015-08-19 18:49:13,S,249900.0,U,2016-02-25 14:40:27,2015-07-04,2016-02-23,,2015-07-04 20:25:01


In [8]:
history_sample.head()

Unnamed: 0,new_value,field_name,history_key,modification_timestamp,resource_record_key
0,H,STANDARDSTATUS,9999999,2020-02-24 10:27:40,363940030
1,270405,LISTPRICE,9999998,2020-02-24 10:15:54,363882069
2,S,STANDARDSTATUS,9999997,2020-02-24 10:16:21,363822160
3,225000.00,LISTPRICE,9999996,2020-02-24 10:25:09,363798709
4,539900,LISTPRICE,9999995,2020-02-24 10:16:36,363747921


## Housekeeping Functions/Objects

In [9]:
event_codes = { 
                'C': 50,
                'A': 100,
                'B': 200,
                'U': 240,
                'P': 240,
                'X': 300,
                'S': 500,
                'H': 640,
                'K': 620,
                'W': 600,
                'Z': 600
              }

In [10]:
def coalesce(*values):
    """Return the first non-None value or None if all values are None"""
    return next((v for v in values if pd.notna(v)), None)

In [11]:
listings_sample.columns

Index(['source_id', 'close_date', 'expiration_date', 'listing_contract_date',
       'purchase_contract_date', 'withdrawn_date', 'list_price',
       'current_price', 'original_list_price', 'close_price',
       'price_change_timestamp', 'status', 'previous_list_price',
       'previous_status', 'status_change_timestamp', 'on_market_date',
       'off_market_date', 'contingent_date', 'original_entry_timestamp'],
      dtype='object')

In [12]:
listings_sample.head()

Unnamed: 0,source_id,close_date,expiration_date,listing_contract_date,purchase_contract_date,withdrawn_date,list_price,current_price,original_list_price,close_price,price_change_timestamp,status,previous_list_price,previous_status,status_change_timestamp,on_market_date,off_market_date,contingent_date,original_entry_timestamp
0,10002014,2015-10-27,2016-04-28,2015-06-24,2015-07-17,,219900.0,220000.0,219900.0,220000.0,,S,,P,2015-10-27 17:23:41,2015-06-24,2015-07-17,,2015-06-24 07:56:16
1,100198211,2017-05-07,2017-06-30,2017-03-01,2017-04-27,,2350.0,2350.0,2350.0,2350.0,2017-03-22 20:27:45,S,,U,2017-05-07 19:48:00,2017-03-01,2017-05-07,,2017-03-22 20:27:45
2,10025396,,2020-06-30,2015-06-28,,2020-02-24,8750000.0,8750000.0,12500000.0,,2017-12-02 10:46:26,X,12500000.0,W,2020-07-01 00:04:17,2015-06-28,2020-02-24,,2015-06-28 11:00:14
3,10025573,,2020-06-30,2015-06-28,,2020-02-24,5300000.0,5300000.0,6200000.0,,2017-12-02 10:52:23,X,6200000.0,W,2020-07-01 01:04:19,2015-06-28,2020-02-24,,2015-06-28 12:10:16
4,10051516,2016-02-23,2016-02-29,2015-07-04,2016-01-08,,245000.0,240000.0,265000.0,240000.0,2015-08-19 18:49:13,S,249900.0,U,2016-02-25 14:40:27,2015-07-04,2016-02-23,,2015-07-04 20:25:01


## Creating Events from Listing
aka date-price pair approach

In [13]:
def create_pairs(close_date, expiration_date, listing_contract_date,
                 purchase_contract_date, list_price, current_price, 
                 original_list_price, close_price):
        """Based on which variables are present/not null, create events from date-price pairs

        Returns:
            list: list of events
        """
        pairs = []
        if type(listing_contract_date) == str:
            pairs.append((100, listing_contract_date, coalesce(original_list_price, list_price, current_price)))
        if type(purchase_contract_date) == str:
            pairs.append((240, purchase_contract_date, coalesce(current_price, list_price, current_price)))
        if type(expiration_date) == str:
            pairs.append((300, expiration_date, coalesce(current_price, list_price)))
        if type(close_date) == str:
            pairs.append((500, close_date, coalesce(close_price, current_price, list_price)))
        pairs.sort(key=lambda x: x[1])
        return pairs

def clean_events(events: list) -> list:
    """Remove out of order events (i.e. all status codes should be in
       ascending order)

    Args:
        events (list): list of events in order of date

    Returns:
        list: list of cleaned events
    """
    index = 1
    while index <= len(events):
        try:
            if events[index] < events[index - 1]:
                del events[index]
            else:
                index += 1
        except IndexError:
            break
    return events

def clean_expiry(events: list, status: str) -> list:
    """Remove expiry event if status is not expired

    Args:
        events (list): list of events
        status (str): status of listing

    Returns:
        list: list of events with expiry event removed if necessary
    """
    if status != 'X':
        t = None
        for i in range(len(events)):
            if 300 in events[i]:
                t = i
        if t:
            events.pop(t)
    return events

def add_withdraw_cancel(events, status, withdraw_date, expiration_date, current_price, list_price, status_change_timestamp) -> list:
    """Add withdraw, cancel, and hold events

    Returns:
        list: list of events with cancel, withdraw, and hold events added
    """
    if type(withdraw_date) == str and status != 640:
        events.append((status, withdraw_date, coalesce(current_price, list_price)))
    elif type(status_change_timestamp) == str:
        events.append((status, status_change_timestamp, coalesce(current_price, list_price)))
    else:
        events.append((status, expiration_date, coalesce(current_price, list_price)))
    return events

def create_listing_history(close_date, expiration_date, listing_contract_date, purchase_contract_date, withdrawn_date,
                           list_price,current_price, original_list_price, close_price, status, status_change_timestamp) -> list:
    """Given the dates and prices available, generate all events

    Returns:
        list: list of ordered and cleaned events that a listing went through
    """
    ec = event_codes[status]
    events = []
    if ec == 50:
        events.append(f'{ec}: {listing_contract_date}, {coalesce(original_list_price, list_price)}')
    else:
        events = create_pairs(close_date, expiration_date, listing_contract_date,
                              purchase_contract_date, list_price,current_price, 
                              original_list_price, close_price)
        if ec in (600, 620, 640):
            events = add_withdraw_cancel(events, ec, withdrawn_date, expiration_date, current_price, list_price, status_change_timestamp)
        events = clean_events(events)
        events = clean_expiry(events, status)
    return events

In [15]:
events = [create_listing_history(close_date, expiration_date, listing_contract_date, purchase_contract_date, withdrawn_date, 
                        list_price,current_price, original_list_price, close_price, status, status_change_timestamp) for close_date, expiration_date, listing_contract_date, purchase_contract_date, withdrawn_date,
                           list_price,current_price, original_list_price, close_price, status, status_change_timestamp in zip(listings_sample.close_date, listings_sample.expiration_date, listings_sample.listing_contract_date, listings_sample.purchase_contract_date, listings_sample.withdrawn_date,
                           listings_sample.list_price,listings_sample.current_price, listings_sample.original_list_price, listings_sample.close_price, listings_sample.status, listings_sample.status_change_timestamp)]
len(events)

337904

In [165]:
listing_keys = list(set(history_sample['resource_record_key']))
history_sample[history_sample['resource_record_key'] == listing_keys[3]]

Unnamed: 0,new_value,field_name,history_key,modification_timestamp,resource_record_key
253827,P,STANDARDSTATUS,9767882,2020-01-13 17:20:18,363855877
380618,2.500,BUYERAGENCYCOMPENSATION,9651862,2020-01-01 14:46:03,363855877
382506,A,STANDARDSTATUS,9650143,2020-01-01 14:46:03,363855877


In [166]:
listings_sample['events'] = events

In [167]:
listings_sample.head()

Unnamed: 0,source_id,close_date,expiration_date,listing_contract_date,purchase_contract_date,withdrawn_date,list_price,current_price,original_list_price,close_price,price_change_timestamp,status,previous_list_price,previous_status,status_change_timestamp,on_market_date,off_market_date,contingent_date,original_entry_timestamp,events
0,10002014,2015-10-27,2016-04-28,2015-06-24,2015-07-17,,219900.0,220000.0,219900.0,220000.0,,S,,P,2015-10-27 17:23:41,2015-06-24,2015-07-17,,2015-06-24 07:56:16,"[(100, 2015-06-24, 219900.0), (240, 2015-07-17..."
1,100198211,2017-05-07,2017-06-30,2017-03-01,2017-04-27,,2350.0,2350.0,2350.0,2350.0,2017-03-22 20:27:45,S,,U,2017-05-07 19:48:00,2017-03-01,2017-05-07,,2017-03-22 20:27:45,"[(100, 2017-03-01, 2350.0), (240, 2017-04-27, ..."
2,10025396,,2020-06-30,2015-06-28,,2020-02-24,8750000.0,8750000.0,12500000.0,,2017-12-02 10:46:26,X,12500000.0,W,2020-07-01 00:04:17,2015-06-28,2020-02-24,,2015-06-28 11:00:14,"[(100, 2015-06-28, 12500000.0), (300, 2020-06-..."
3,10025573,,2020-06-30,2015-06-28,,2020-02-24,5300000.0,5300000.0,6200000.0,,2017-12-02 10:52:23,X,6200000.0,W,2020-07-01 01:04:19,2015-06-28,2020-02-24,,2015-06-28 12:10:16,"[(100, 2015-06-28, 6200000.0), (300, 2020-06-3..."
4,10051516,2016-02-23,2016-02-29,2015-07-04,2016-01-08,,245000.0,240000.0,265000.0,240000.0,2015-08-19 18:49:13,S,249900.0,U,2016-02-25 14:40:27,2015-07-04,2016-02-23,,2015-07-04 20:25:01,"[(100, 2015-07-04, 265000.0), (240, 2016-01-08..."


## Creating Events from HISTORY Object

In [168]:
def get_og_event(listing_key: int) -> list:
    """Get original event of listing

    Args:
        listing_key (int): key of listing

    Returns:
        list: first event of listing history
    """
    try:
        listing = listings_sample[listings_sample['source_id'] == listing_key]
        list_date = listing['original_entry_timestamp'].values[0]
        list_price = coalesce(listing['original_list_price'].values[0], listing['list_price'].values[0])
        return [100, list_date, list_price]
    except IndexError:
        return [100, None, None]

def add_event(history_row: pd.Series, lh: dict):
    """Given a change record, create a new event for that listing and add
    it to the listing's history.

    Args:
        history_row (pd.Series): row of change history
        lh (dict): dictionary of listing histories with key as listing_key
    """
    try:
        last_event = lh[history_row['resource_record_key']][-1]
    except KeyError:
        last_event = get_og_event(history_row['resource_record_key'])
        lh[history_row['resource_record_key']] = [last_event]
        #last_event = (100, 0, 0)
    new_time = history_row['modification_timestamp']
    if 'STATUS' in history_row['field_name']:
        try:
            new_status = event_codes[history_row['new_value']]
        except KeyError:
            print(f"{history_row['new_value']} not found")
            return None
        if new_status == last_event[0]:
            if new_status == 100:
                for i in range(len(lh[history_row['resource_record_key']])):
                    lh[history_row['resource_record_key']][i][0] = 50
                last_event = lh[history_row['resource_record_key']][-1]
            elif new_status == 500:
                return None
        current_event = [new_status, new_time, last_event[2]]
        lh[history_row['resource_record_key']].append(current_event)
    elif 'LISTPRICE' == history_row['field_name']:
        current_event = [last_event[0], new_time, history_row['new_value']]
        lh[history_row['resource_record_key']].append(current_event)
    else:
        if last_event[0] == 500:
            last_event[-1] = history_row['new_value']
            lh[history_row['resource_record_key']][-1] = last_event
        else:
            current_event = [500, new_time, history_row['new_value']]
            lh[history_row['resource_record_key']].append(current_event)

def clean_coming_soon(lh: list) -> list:
    if lh[-1][0] == 50:
        lh = [lh[-1]]
    return lh


In [169]:
history_sample.groupby('resource_record_key')['history_key'].nunique().sort_values(ascending=False)#.to_clipboard()

resource_record_key
365507734    79
365807302    38
367465241    33
363654223    28
13401009     27
             ..
360135615     1
360135650     1
360135942     1
360135954     1
362925237     1
Name: history_key, Length: 376457, dtype: int64

In [210]:
listings_sample[listings_sample['source_id'] == 1061779]

Unnamed: 0,source_id,close_date,expiration_date,listing_contract_date,purchase_contract_date,withdrawn_date,list_price,current_price,original_list_price,close_price,price_change_timestamp,status,previous_list_price,previous_status,status_change_timestamp,on_market_date,off_market_date,contingent_date,original_entry_timestamp,events
20,1061779,2013-02-22,2013-06-07,2012-12-06,2012-12-18,,215000.0,217000.0,215000.0,217000.0,,S,,P,2013-02-22 08:38:47,2012-12-18,2012-12-18,,2012-12-06 21:26:10,"[(100, 2012-12-06, 215000.0), (240, 2012-12-18..."


In [171]:
history_sample[history_sample['resource_record_key'] == 1061779]

Unnamed: 0,new_value,field_name,history_key,modification_timestamp,resource_record_key
5923,217000,SELLINGPRICE,9994575,2020-01-31 16:55:00,1061779
5924,2.5,SELLINGOFFICECOMPENSATION,9994574,2020-01-31 16:55:00,1061779
9399,S,STATUS,9991390,2020-01-31 16:55:00,1061779
9401,P,STATUS,9991389,2020-01-31 16:55:00,1061779
9402,B,STATUS,9991388,2020-01-31 16:55:00,1061779
9403,A,STATUS,9991387,2020-01-31 16:55:00,1061779
9404,P,STATUS,9991386,2020-01-31 16:55:00,1061779
9405,A,STATUS,9991385,2020-01-31 16:55:00,1061779
11467,215000,LISTPRICE,9989505,2020-01-31 16:55:00,1061779


In [172]:
history_sample = history_sample[(history_sample['field_name'].str.contains('STATUS')) | (history_sample['field_name'].str.contains('PRICE'))]
history_sample.sort_values(['modification_timestamp', 'history_key'], inplace=True)
listing_history = {}
for index, row in tqdm(history_sample.iterrows(), total=len(history_sample)):
    add_event(row, listing_history)
for k in listing_history:
    listing_history[k] = clean_coming_soon(listing_history[k])

  0%|          | 0/807781 [00:00<?, ?it/s]

In [173]:
s = pd.DataFrame(columns=['listing_key', 'listing_history'])
l = []
lh = []
for k in listing_history:
    l.append(k)
    lh.append(listing_history[k])
s['listing_key'] = l
s['listing_history'] = lh
s.head()

Unnamed: 0,listing_key,listing_history
0,359956806,"[[50, 2018-09-04 10:14:32, 464900.0], [100, 20..."
1,359954635,"[[50, 2018-09-03 12:22:01, 669000.0], [100, 20..."
2,359948951,"[[50, 2018-08-31 21:14:01, 380000.0], [100, 20..."
3,359946798,"[[50, 2018-08-31 15:51:31, 664900.0], [100, 20..."
4,359944373,"[[50, 2018-08-31 11:41:00, 850000.0], [100, 20..."


In [174]:
listings_sample.merge(right=s, how='left', right_on='listing_key', left_on='source_id')[['source_id', 'events', 'listing_history']].to_csv('data/listings_sample_with_events_and_lh.csv', index=False)

## Checking out the output

In [175]:
history_sample[history_sample['resource_record_key'] == 13401009]

Unnamed: 0,new_value,field_name,history_key,modification_timestamp,resource_record_key
13934,354900,LISTPRICE,998725,2019-04-11 11:35:02,13401009
13913,A,STATUS,998727,2019-04-11 11:35:02,13401009
13835,B,STATUS,998734,2019-04-11 11:35:02,13401009
13736,P,STATUS,998743,2019-04-11 11:35:02,13401009
13685,A,STATUS,998748,2019-04-11 11:35:02,13401009
13640,359900,LISTPRICE,998752,2019-04-11 11:35:02,13401009
13630,354900,LISTPRICE,998753,2019-04-11 11:35:02,13401009
13552,P,STATUS,998760,2019-04-11 11:35:02,13401009
13434,B,STATUS,998771,2019-04-11 11:35:02,13401009
13424,A,STATUS,998772,2019-04-11 11:35:02,13401009


In [176]:
listing_history[13401009]

[[50, '2016-05-25 10:57:49', 354900.0],
 [50, '2019-04-11 11:35:02', '354900'],
 [100, '2019-04-11 11:35:02', '354900'],
 [200, '2019-04-11 11:35:02', '354900'],
 [240, '2019-04-11 11:35:02', '354900'],
 [100, '2019-04-11 11:35:02', '354900'],
 [100, '2019-04-11 11:35:02', '359900'],
 [100, '2019-04-11 11:35:02', '354900'],
 [240, '2019-04-11 11:35:02', '354900'],
 [200, '2019-04-11 11:35:02', '354900'],
 [100, '2019-04-11 11:35:02', '354900'],
 [640, '2019-04-11 11:35:02', '354900'],
 [240, '2019-04-11 11:35:02', '354900'],
 [100, '2019-04-11 11:35:02', '354900'],
 [600, '2019-04-11 11:35:02', '354900'],
 [600, '2019-04-11 11:35:02', '366000'],
 [100, '2019-04-11 11:35:02', '366000'],
 [100, '2019-04-11 11:35:02', '355000'],
 [240, '2019-04-11 11:35:02', '355000'],
 [100, '2019-04-11 11:35:02', '355000'],
 [100, '2019-04-11 11:35:02', '350000'],
 [100, '2019-04-11 11:35:02', '348900'],
 [100, '2019-04-11 11:35:02', '344900'],
 [240, '2019-04-11 11:35:02', '344900'],
 [240, '2019-04-11

In [177]:
listings_sample[listings_sample['source_id'] == 13401009]

Unnamed: 0,source_id,close_date,expiration_date,listing_contract_date,purchase_contract_date,withdrawn_date,list_price,current_price,original_list_price,close_price,price_change_timestamp,status,previous_list_price,previous_status,status_change_timestamp,on_market_date,off_market_date,contingent_date,original_entry_timestamp,events
82,13401009,2016-12-29,2017-03-31,2016-05-25,2016-11-20,,344900.0,345000.0,354900.0,345000.0,2016-11-22 10:33:35,S,348900.0,P,2016-12-30 15:11:47,2016-09-30,2016-11-20,,2016-05-25 10:57:49,"[(100, 2016-05-25, 354900.0), (240, 2016-11-20..."


In [178]:
history_sample[history_sample['resource_record_key'] == 360135615]

Unnamed: 0,new_value,field_name,history_key,modification_timestamp,resource_record_key
391642,K,STANDARDSTATUS,96414,2019-01-11 11:15:42,360135615


In [179]:
listing_history[360135615]

[[100, '2018-09-29 11:53:42', 135000.0],
 [620, '2019-01-11 11:15:42', 135000.0]]

In [180]:
listings_sample[listings_sample['source_id'] == 360135615]

Unnamed: 0,source_id,close_date,expiration_date,listing_contract_date,purchase_contract_date,withdrawn_date,list_price,current_price,original_list_price,close_price,price_change_timestamp,status,previous_list_price,previous_status,status_change_timestamp,on_market_date,off_market_date,contingent_date,original_entry_timestamp,events
676,360135615,,2019-02-28,2018-09-29,,,125000.0,125000.0,135000.0,,2018-11-06 18:43:35,K,135000.0,A,2019-01-11 11:15:42,2018-09-29,2019-01-11,,2018-09-29 11:53:42,"[(100, 2018-09-29, 135000.0), (620, 2019-01-11..."


In [181]:
history_sample[history_sample['resource_record_key'] == 350951796]

Unnamed: 0,new_value,field_name,history_key,modification_timestamp,resource_record_key
225985,A,STANDARDSTATUS,9793306,2020-01-18 00:01:44,350951796
329911,H,STANDARDSTATUS,9698422,2020-01-19 10:07:36,350951796
305151,A,STANDARDSTATUS,9721005,2020-01-20 00:02:18,350951796
45533,H,STANDARDSTATUS,9958338,2020-01-27 12:49:25,350951796


In [182]:
listing_history[350951796]

[[50, '2018-05-29 14:32:11', 435000.0],
 [100, '2020-01-18 00:01:44', 435000.0],
 [640, '2020-01-19 10:07:36', 435000.0],
 [100, '2020-01-20 00:02:18', 435000.0],
 [640, '2020-01-27 12:49:25', 435000.0]]

In [183]:
listings_sample[listings_sample['source_id'] == 350951796]['status']

294    K
Name: status, dtype: object

In [184]:
listings_sample[listings_sample['status'] == 'K']['events'].head(10)

6     [(100, 2015-08-01, 739888.0), (620, 2016-02-16...
8     [(100, 2015-08-12, 85000.0), (620, 2016-03-02 ...
17    [(100, 2015-09-25, 325000.0), (620, 2015-12-18...
24    [(100, 2015-10-21, 1200.0), (620, 2016-01-05 1...
26    [(100, 2015-10-24, 299900.0), (620, 2015-12-19...
34    [(100, 2015-12-21, 749900.0), (620, 2016-02-19...
38    [(100, 2016-01-09, 3338000.0), (620, 2020-02-2...
59    [(100, 2017-05-01, 19900.0), (240, 2019-06-11,...
74    [(100, 2017-05-30, 3290000.0), (620, 2019-11-0...
75    [(100, 2017-05-31, 468500.0), (240, 2018-05-08...
Name: events, dtype: object

In [185]:
history_sample[history_sample['resource_record_key'] == 10390789]

Unnamed: 0,new_value,field_name,history_key,modification_timestamp,resource_record_key
14741,399900,LISTPRICE,998651,2019-04-11 11:35:02,10390789
14720,A,STATUS,998653,2019-04-11 11:35:02,10390789
14709,P,STATUS,998654,2019-04-11 11:35:02,10390789
14261,399900,SELLINGPRICE,998695,2019-04-11 11:35:02,10390789
14250,S,STATUS,998696,2019-04-11 11:35:02,10390789


In [186]:
listing_history[10390789]

[[50, '2015-09-07 11:32:22', 399900.0],
 [50, '2019-04-11 11:35:02', '399900'],
 [100, '2019-04-11 11:35:02', '399900'],
 [240, '2019-04-11 11:35:02', '399900'],
 [500, '2019-04-11 11:35:02', '399900']]

In [187]:
events_sample = listings_sample.merge(right=s, how='left', right_on='listing_key', left_on='source_id')[['source_id', 'events', 'listing_history', 'status']]
events_sample.head()

Unnamed: 0,source_id,events,listing_history,status
0,10002014,"[(100, 2015-06-24, 219900.0), (240, 2015-07-17...","[[50, 2015-06-24 07:56:16, 219900.0], [50, 201...",S
1,100198211,"[(100, 2017-03-01, 2350.0), (240, 2017-04-27, ...","[[50, 2017-03-22 20:27:45, 2350.0], [100, 2019...",S
2,10025396,"[(100, 2015-06-28, 12500000.0), (300, 2020-06-...","[[100, 2015-06-28 11:00:14, 12500000.0], [600,...",X
3,10025573,"[(100, 2015-06-28, 6200000.0), (300, 2020-06-3...","[[100, 2015-06-28 12:10:16, 6200000.0], [600, ...",X
4,10051516,"[(100, 2015-07-04, 265000.0), (240, 2016-01-08...","[[50, 2015-07-04 20:25:01, 265000.0], [50, 201...",S


In [188]:
for status in set(events_sample['status']):
    events_sample[events_sample['status'] == status].to_csv(f'data/events_comparison_{status}.csv')

In [189]:
listings_sample.merge(right=s, how='left', right_on='listing_key', left_on='source_id').head()

Unnamed: 0,source_id,close_date,expiration_date,listing_contract_date,purchase_contract_date,withdrawn_date,list_price,current_price,original_list_price,close_price,...,previous_list_price,previous_status,status_change_timestamp,on_market_date,off_market_date,contingent_date,original_entry_timestamp,events,listing_key,listing_history
0,10002014,2015-10-27,2016-04-28,2015-06-24,2015-07-17,,219900.0,220000.0,219900.0,220000.0,...,,P,2015-10-27 17:23:41,2015-06-24,2015-07-17,,2015-06-24 07:56:16,"[(100, 2015-06-24, 219900.0), (240, 2015-07-17...",10002014.0,"[[50, 2015-06-24 07:56:16, 219900.0], [50, 201..."
1,100198211,2017-05-07,2017-06-30,2017-03-01,2017-04-27,,2350.0,2350.0,2350.0,2350.0,...,,U,2017-05-07 19:48:00,2017-03-01,2017-05-07,,2017-03-22 20:27:45,"[(100, 2017-03-01, 2350.0), (240, 2017-04-27, ...",100198211.0,"[[50, 2017-03-22 20:27:45, 2350.0], [100, 2019..."
2,10025396,,2020-06-30,2015-06-28,,2020-02-24,8750000.0,8750000.0,12500000.0,,...,12500000.0,W,2020-07-01 00:04:17,2015-06-28,2020-02-24,,2015-06-28 11:00:14,"[(100, 2015-06-28, 12500000.0), (300, 2020-06-...",10025396.0,"[[100, 2015-06-28 11:00:14, 12500000.0], [600,..."
3,10025573,,2020-06-30,2015-06-28,,2020-02-24,5300000.0,5300000.0,6200000.0,,...,6200000.0,W,2020-07-01 01:04:19,2015-06-28,2020-02-24,,2015-06-28 12:10:16,"[(100, 2015-06-28, 6200000.0), (300, 2020-06-3...",10025573.0,"[[100, 2015-06-28 12:10:16, 6200000.0], [600, ..."
4,10051516,2016-02-23,2016-02-29,2015-07-04,2016-01-08,,245000.0,240000.0,265000.0,240000.0,...,249900.0,U,2016-02-25 14:40:27,2015-07-04,2016-02-23,,2015-07-04 20:25:01,"[(100, 2015-07-04, 265000.0), (240, 2016-01-08...",10051516.0,"[[50, 2015-07-04 20:25:01, 265000.0], [50, 201..."


In [190]:
events_sample[events_sample['source_id'] == 10025573]

Unnamed: 0,source_id,events,listing_history,status
3,10025573,"[(100, 2015-06-28, 6200000.0), (300, 2020-06-3...","[[100, 2015-06-28 12:10:16, 6200000.0], [600, ...",X


In [191]:
listing_history[362150008]

[[100, '2019-02-19 11:38:38', 950000.0],
 [300, '2020-01-01 06:33:00', 950000.0],
 [300, '2020-01-15 11:53:37', '985000.00'],
 [100, '2020-01-15 11:53:37', '985000.00']]

In [192]:
history_sample[history_sample['resource_record_key'] == 362150008]

Unnamed: 0,new_value,field_name,history_key,modification_timestamp,resource_record_key
366501,X,STANDARDSTATUS,9664926,2020-01-01 06:33:00,362150008
346627,985000.00,LISTPRICE,9683162,2020-01-15 11:53:37,362150008
346535,A,STANDARDSTATUS,9683246,2020-01-15 11:53:37,362150008


In [193]:
events_sample.head()

Unnamed: 0,source_id,events,listing_history,status
0,10002014,"[(100, 2015-06-24, 219900.0), (240, 2015-07-17...","[[50, 2015-06-24 07:56:16, 219900.0], [50, 201...",S
1,100198211,"[(100, 2017-03-01, 2350.0), (240, 2017-04-27, ...","[[50, 2017-03-22 20:27:45, 2350.0], [100, 2019...",S
2,10025396,"[(100, 2015-06-28, 12500000.0), (300, 2020-06-...","[[100, 2015-06-28 11:00:14, 12500000.0], [600,...",X
3,10025573,"[(100, 2015-06-28, 6200000.0), (300, 2020-06-3...","[[100, 2015-06-28 12:10:16, 6200000.0], [600, ...",X
4,10051516,"[(100, 2015-07-04, 265000.0), (240, 2016-01-08...","[[50, 2015-07-04 20:25:01, 265000.0], [50, 201...",S


In [194]:
def events_time_diff(lh: list) -> list:
    """get average time in between events

    Args:
        lh (list): list of events

    Returns:
        datetime.timedelta: average difference in time between events
    """
    timediff = []
    for i in range(len(lh)-1):
        try:
            e1 = datetime.fromisoformat(lh[i][1])
            e2 = datetime.fromisoformat(lh[i+1][1])
            timediff.append(e2 - e1)
        except TypeError:
            print(lh[i], lh[i+1])
            pass
    '''if len(timediff) > 0:
        return sum(timediff, timedelta(0)) / len(timediff)
    else:
        return None'''
    return timediff

In [195]:
events_diff = pd.DataFrame({'source_id': [x for x, y in zip(events_sample['source_id'], events_sample['listing_history']) if type(y) == list], 'avg_timediff': [events_time_diff(x) for x in events_sample.listing_history if type(x) == list]})

[50, nan, 775000.0] [100, '2019-04-04 09:23:11', 775000.0]
[100, nan, 439000.0] [500, '2021-11-19 06:20:24', '439000']
[100, nan, 295000.0] [500, '2021-11-06 20:16:46', '285000']


In [196]:
events_diff.head()

Unnamed: 0,source_id,avg_timediff
0,10002014,"[1297 days, 6:03:45, 0:00:00, 0:00:00, 0:00:00..."
1,100198211,"[749 days, 18:37:17, 0:00:00, 0:00:00, 0:00:00..."
2,10025396,"[1701 days, 22:45:47]"
3,10025573,"[1701 days, 21:35:13]"
4,10051516,"[1376 days, 15:10:01, 0:00:00, 0:00:00, 0:00:0..."


In [197]:
events_diff['has_zero'] = [1 if timedelta(0) in x else 0 for x in events_diff.avg_timediff]
events_diff[events_diff['has_zero'] == 1]

Unnamed: 0,source_id,avg_timediff,has_zero
0,10002014,"[1297 days, 6:03:45, 0:00:00, 0:00:00, 0:00:00...",1
1,100198211,"[749 days, 18:37:17, 0:00:00, 0:00:00, 0:00:00...",1
4,10051516,"[1376 days, 15:10:01, 0:00:00, 0:00:00, 0:00:0...",1
5,10089929,"[1370 days, 14:45:45, 0:00:00, 0:00:00, 0:00:0...",1
6,10203662,"[1259 days, 5:26:17, 0:00:00, 0:00:00, 0:00:00...",1
...,...,...,...
336971,367461486,[0:00:00],1
336972,367461544,[0:00:00],1
336973,367461621,[0:00:00],1
336974,367461723,[0:00:00],1


In [198]:
len(events_diff[events_diff['has_zero'] == 1])/len(events_diff)

0.44618110166121633

In [199]:
events_diff.to_csv('data/avg_time_difference.csv', index=False)

In [200]:
events_sample.head()

Unnamed: 0,source_id,events,listing_history,status
0,10002014,"[(100, 2015-06-24, 219900.0), (240, 2015-07-17...","[[50, 2015-06-24 07:56:16, 219900.0], [50, 201...",S
1,100198211,"[(100, 2017-03-01, 2350.0), (240, 2017-04-27, ...","[[50, 2017-03-22 20:27:45, 2350.0], [100, 2019...",S
2,10025396,"[(100, 2015-06-28, 12500000.0), (300, 2020-06-...","[[100, 2015-06-28 11:00:14, 12500000.0], [600,...",X
3,10025573,"[(100, 2015-06-28, 6200000.0), (300, 2020-06-3...","[[100, 2015-06-28 12:10:16, 6200000.0], [600, ...",X
4,10051516,"[(100, 2015-07-04, 265000.0), (240, 2016-01-08...","[[50, 2015-07-04 20:25:01, 265000.0], [50, 201...",S


In [201]:
correct_end = 0
incorrect_end = 0
correct = []
incorrect = []
for index, row in events_sample.iterrows():
    # print(row)
    if type(row['listing_history']) == list:
        if row['listing_history'][-1][0] == event_codes[row['status']]:
            correct_end += 1
            correct.append(index)
        else:
            incorrect_end += 1
            incorrect.append(index)

print(correct_end, incorrect_end)

170326 166656


In [202]:
events_sample[events_sample.index.isin(incorrect)]

Unnamed: 0,source_id,events,listing_history,status
2,10025396,"[(100, 2015-06-28, 12500000.0), (300, 2020-06-...","[[100, 2015-06-28 11:00:14, 12500000.0], [600,...",X
3,10025573,"[(100, 2015-06-28, 6200000.0), (300, 2020-06-3...","[[100, 2015-06-28 12:10:16, 6200000.0], [600, ...",X
6,10203662,"[(100, 2015-08-01, 739888.0), (620, 2016-02-16...","[[50, 2015-08-01 08:33:44, 739888.0], [50, 201...",K
8,10265990,"[(100, 2015-08-12, 85000.0), (620, 2016-03-02 ...","[[50, 2015-08-12 21:34:40, 85000.0], [50, 2019...",K
15,10419458,"[(100, 2015-09-11, 550000.0), (240, 2020-12-07...","[[100, 2015-09-11 19:24:23, 550000.0], [100, 2...",S
...,...,...,...,...
337898,367462126,"[(100, 2021-10-21, 1700000.0), (620, 2022-04-1...","[[50, 2021-10-21 09:24:19, 1700000.0], [100, 2...",K
337899,367464545,"[(100, 2021-10-22, 269000.0), (600, 2023-06-07...","[[50, 2021-10-22 13:00:59, 269000.0], [100, 20...",W
337900,4119647,"[(100, 2009-04-01, 160000.0), (300, 2010-01-04...","[[100, 2009-04-01 22:00:00, 160000.0], [500, 2...",X
337901,79771482,"[(100, 2017-02-12, 40000.0), (300, 2022-01-19,...","[[100, 2017-02-12 16:32:59, 40000.0], [100, 20...",X


In [203]:
history_sample[history_sample['resource_record_key'] == 10025396]

Unnamed: 0,new_value,field_name,history_key,modification_timestamp,resource_record_key
2137,W,STANDARDSTATUS,9998040,2020-02-24 09:46:01,10025396


In [204]:
events_sample.listing_history[0][-1][0]

500

In [205]:
events_sample.status[0]

'S'

In [206]:
correct_end = 0
incorrect_end = 0
for index, row in events_sample.iterrows():
    # print(row)
    if type(row['listing_history']) == list:
        for event in row['listing_history']:
            if 500 in event:
                for e in row['listing_history']:    
                    if 240 in e:
                        correct_end += 1
                        break
                else:
                    incorrect_end += 1

print(correct_end, incorrect_end)

45440 68492


In [211]:
events_sample.head()

Unnamed: 0,source_id,events,listing_history,status
0,10002014,"[(100, 2015-06-24, 219900.0), (240, 2015-07-17...","[[50, 2015-06-24 07:56:16, 219900.0], [50, 201...",S
1,100198211,"[(100, 2017-03-01, 2350.0), (240, 2017-04-27, ...","[[50, 2017-03-22 20:27:45, 2350.0], [100, 2019...",S
2,10025396,"[(100, 2015-06-28, 12500000.0), (300, 2020-06-...","[[100, 2015-06-28 11:00:14, 12500000.0], [600,...",X
3,10025573,"[(100, 2015-06-28, 6200000.0), (300, 2020-06-3...","[[100, 2015-06-28 12:10:16, 6200000.0], [600, ...",X
4,10051516,"[(100, 2015-07-04, 265000.0), (240, 2016-01-08...","[[50, 2015-07-04 20:25:01, 265000.0], [50, 201...",S


In [207]:
events_sample['listing_history'][0]

[[50, '2015-06-24 07:56:16', 219900.0],
 [50, '2019-01-11 14:00:01', '219900'],
 [100, '2019-01-11 14:00:01', '219900'],
 [200, '2019-01-11 14:00:01', '219900'],
 [240, '2019-01-11 14:00:01', '219900'],
 [500, '2019-01-11 14:00:01', '220000']]

In [208]:
correct_end = 0
incorrect_end = 0
correct = []
incorrect = []
for index, row in events_diff.iterrows():
    try:
        if row['avg_timediff'][0] < timedelta(days=365):
            correct_end += 1
            correct.append(index)
        else:
            incorrect_end += 1
            incorrect.append(index)
    except:
        print(row)
print(correct_end, incorrect_end)

source_id       367383247
avg_timediff           []
has_zero                0
Name: 171386, dtype: object
source_id       367388621
avg_timediff           []
has_zero                0
Name: 171407, dtype: object
source_id       367708902
avg_timediff           []
has_zero                0
Name: 174581, dtype: object
source_id       367708906
avg_timediff           []
has_zero                0
Name: 174585, dtype: object
source_id       367708992
avg_timediff           []
has_zero                0
Name: 174613, dtype: object
source_id       367709165
avg_timediff           []
has_zero                0
Name: 174655, dtype: object
source_id       367709349
avg_timediff           []
has_zero                0
Name: 174701, dtype: object
source_id       367709696
avg_timediff           []
has_zero                0
Name: 174782, dtype: object
source_id       367709708
avg_timediff           []
has_zero                0
Name: 174791, dtype: object
source_id       367709779
avg_timediff        

In [209]:
events_diff

Unnamed: 0,source_id,avg_timediff,has_zero
0,10002014,"[1297 days, 6:03:45, 0:00:00, 0:00:00, 0:00:00...",1
1,100198211,"[749 days, 18:37:17, 0:00:00, 0:00:00, 0:00:00...",1
2,10025396,"[1701 days, 22:45:47]",0
3,10025573,"[1701 days, 21:35:13]",0
4,10051516,"[1376 days, 15:10:01, 0:00:00, 0:00:00, 0:00:0...",1
...,...,...,...
336977,367464545,[0:00:00],1
336978,4119647,"[4554 days, 21:49:58]",0
336979,79771482,"[697 days, 0:52:25]",0
336980,8060235,"[1627 days, 1:43:27]",0
