In [12]:
import pandas as pd
import numpy as np
import math

In [10]:
def date_feats(feat_df):
    """Add auction-timing features to ``feat_df`` in place and return it.

    Reads three datetime columns -- ``src_cre_date`` (offer time),
    ``auct_start_dt`` (auction post time) and ``auct_end_dt`` (auction
    expiration time) -- and writes five float columns:

    * ``duration``    -- hours from post time to closing time
    * ``passed``      -- hours from post time to the offer
    * ``remain``      -- hours from the offer to closing time
    * ``frac_passed`` -- ``passed / duration``
    * ``frac_remain`` -- ``remain / duration``

    The closing time is ``auct_end_dt`` plus 24 hours.

    Durations are obtained by dividing the timedelta arrays by one hour
    rather than casting them to integers.  The integer cast only gave hours
    when the columns had nanosecond resolution and the platform ``int`` was
    64 bits wide, and it turned missing timestamps (NaT) into a huge negative
    number; the division is resolution-independent and yields NaN for NaT.

    Parameters
    ----------
    feat_df : pandas.DataFrame
        Frame holding the three datetime columns above.  It is modified in
        place (five columns are added or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``feat_df`` object that was passed in.
    """
    one_hour = np.timedelta64(1, 'h')

    # offer time
    off_series = feat_df['src_cre_date'].values
    # auction post time
    post_series = feat_df['auct_start_dt'].values
    # auction expiration time, pushed 24 hours later
    # NOTE(review): presumably auct_end_dt is a date and the auction runs
    # through the end of that day -- confirm against the data dictionary.
    close_series = feat_df['auct_end_dt'].values + np.timedelta64(24, 'h')

    # all three spans expressed in (fractional) hours
    dur = (close_series - post_series) / one_hour
    rem = (close_series - off_series) / one_hour
    passed = (off_series - post_series) / one_hour

    # Columns are assigned in the same order as before so that the resulting
    # column layout of the frame is unchanged.  A zero duration produces
    # inf/NaN fractions, as it always did.
    feat_df['frac_remain'] = rem / dur
    feat_df['frac_passed'] = passed / dur
    feat_df['passed'] = passed
    feat_df['remain'] = rem
    feat_df['duration'] = dur

    return feat_df

In [18]:
def accept_bool(df):
    """Tell whether a thread's acceptance, if any, sits on its latest offer.

    ``df`` is sorted in place by ``src_cre_date`` (ascending) and rows whose
    ``status_id`` is 1 or 9 are counted as accepted.

    Returns
    -------
    bool
        ``True`` when no row is accepted, ``False`` when more than one row is
        accepted, and otherwise whether the chronologically last row is the
        accepted one.
    """
    # NOTE: the sort mutates the caller's frame.
    df.sort_values(by='src_cre_date', ascending=True, inplace=True)

    is_accept = df['status_id'].isin([1, 9]).values
    n_accept = np.sum(is_accept)

    # nothing accepted: the thread is trivially consistent
    if n_accept == 0:
        return True
    # several acceptances in one thread can never be valid
    if n_accept > 1:
        return False
    # exactly one acceptance: it must be the final (latest) row
    return is_accept[-1]

In [66]:
def remove_accept(df, accept_series, both_inds, val):
    """Drop threads whose acceptance of type ``val`` is duplicated or not last.

    Two kinds of thread are removed from ``df`` (in place):

    1. threads for which ``accept_series`` reports more than one row with
       ``status_id == val``;
    2. threads with exactly one such row whose ``turn_count`` differs from
       ``(number of rows in the thread) - 1``, i.e. the accepted row is not
       the final turn of the thread.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``unique_thread_id``, ``status_id`` and ``turn_count``.
        Rows are dropped in place.
    accept_series : pandas.Series
        Per-thread count of rows with ``status_id == val``, indexed by thread
        id.  Modified in place: the ``both_inds`` entries and every entry
        with a count above 1 are dropped from it.
    both_inds : array-like
        Thread ids to discard from ``accept_series`` before anything else.
        Every id must be present in ``accept_series`` (``drop`` raises
        ``KeyError`` otherwise).
    val : int
        The ``status_id`` value being checked.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    accept_series.drop(index=both_inds, inplace=True)

    # Threads that contain more than one acceptance of this type.
    big_inds = accept_series[accept_series > 1].index.values
    if big_inds.size > 0:
        print(big_inds)
        df_inds = df[df['unique_thread_id'].isin(big_inds)].index
        df.drop(index=df_inds, inplace=True)
    accept_series.drop(index=big_inds, inplace=True)

    # What is left are threads with exactly one acceptance of this type.
    remaining_threads = accept_series.index.values
    in_remaining = df['unique_thread_id'].isin(remaining_threads)
    accepted_df = df.loc[in_remaining,
                         ['status_id', 'unique_thread_id', 'turn_count']].copy()
    if len(accepted_df.index) > 0:
        # rows per thread, indexed by thread id in sorted order
        num_turns = accepted_df.groupby('unique_thread_id').size()

        # turn_count of the accepted row, indexed by thread id in row order
        accepted_df = accepted_df[accepted_df['status_id'] == val]
        loc_accepted = accepted_df.set_index('unique_thread_id')['turn_count']

        # The two Series carry the same thread ids but not necessarily in the
        # same order; comparing them directly with != raises unless the frame
        # happens to be sorted by thread id.  Line num_turns up with
        # loc_accepted first and compare the underlying arrays.
        last_turn = num_turns.reindex(loc_accepted.index) - 1
        misplaced = loc_accepted.values != last_turn.values

        thread_inds = loc_accepted.index.values[misplaced]
        print(thread_inds)
        df_inds = df[df['unique_thread_id'].isin(thread_inds)].index
        df.drop(index=df_inds, inplace=True)
    return df


def _drop_threads(df, thread_ids):
    """Drop, in place, every row of ``df`` whose ``unique_thread_id`` is in ``thread_ids``."""
    doomed = df[df['unique_thread_id'].isin(thread_ids)].index
    df.drop(index=doomed, inplace=True)


def clean_data(df):
    """Remove inconsistent negotiation threads from ``df`` in place.

    A thread is the set of rows sharing one ``unique_thread_id``.  Whole
    threads are removed when any of the following holds:

    1. some row has ``offr_price`` greater than ``start_price_usd``;
    2. some row has ``turn_count`` greater than 5;
    3. the thread contains rows with ``status_id == 1`` and rows with
       ``status_id == 9``;
    4. ``remove_accept`` rejects it for status 1 or for status 9 (more than
       one acceptance, or an acceptance that is not the final turn).

    The ids removed in steps 1-3 and summary counts are printed.

    All statistics are computed from the ``df`` argument itself, so the
    function no longer depends on a notebook-level ``feat_df`` variable, and
    a frame that contains no rows with status 1 or 9 is handled without
    error.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``unique_thread_id``, ``start_price_usd``,
        ``offr_price``, ``turn_count`` and ``status_id``.  Rows are dropped
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    org_ids = len(np.unique(df['unique_thread_id'].values))

    # 1. threads where at least one offer is greater than the start price
    larg_off = df['start_price_usd'].values < df['offr_price'].values
    larg_off_threads = np.unique(df.loc[larg_off, 'unique_thread_id'].values)
    print(larg_off_threads)
    _drop_threads(df, larg_off_threads)
    print('Removed threads where one offer is greater than the start price')

    # 2. threads where more than 6 turns have been taken (turn_count > 5)
    long_thread = (df['turn_count'] > 5).values
    long_thread_ids = np.unique(df.loc[long_thread, 'unique_thread_id'].values)
    print(long_thread_ids)
    _drop_threads(df, long_thread_ids)
    print('Removed threads where more than 6 offers have been made')

    # 3./4. threads where an offer is accepted but other offers follow it
    prev_ids = len(np.unique(df['unique_thread_id'].values))

    # Per-thread number of rows carrying each accept code.  Filtering before
    # grouping yields an empty Series (rather than a KeyError) when a code
    # does not occur at all.
    max_turns_accept = df[df['status_id'] == 1].groupby('unique_thread_id').size()
    max_turns_auto = df[df['status_id'] == 9].groupby('unique_thread_id').size()

    # Threads carrying both codes are dropped outright.
    both_inds = np.intersect1d(max_turns_auto.index.values,
                               max_turns_accept.index.values)
    if both_inds.size > 0:
        print(both_inds)
        _drop_threads(df, both_inds)

    df = remove_accept(df, max_turns_accept, both_inds, 1)
    df = remove_accept(df, max_turns_auto, both_inds, 9)

    print('Removed threads that have an accepted offer but not as the last offer')
    cut_ids = len(np.unique(df['unique_thread_id'].values))
    print('Threads had an accept offer entered in the wrong place: %d' %
          (prev_ids - cut_ids))
    print('Total Removed: %d' % (org_ids - cut_ids))
    return df

In [70]:
# Load the toy feature table.  The four timestamp columns are parsed as
# datetimes and the thread id is read as a 64-bit integer.
feat_df = pd.read_csv(
    'data/' + 'toy' + '/' + 'toy-1_feats.csv',
    dtype={'unique_thread_id': np.int64},
    parse_dates=['src_cre_date', 'auct_start_dt', 'auct_end_dt', 'response_time'],
)
# Discard columns that are not used further on.
feat_df = feat_df.drop(
    columns=['item_cndtn_id', 'meta_categ_id', 'anon_leaf_categ_id', 'src_cre_dt']
)
# The unnamed leading CSV column is used as the turn counter from here on.
feat_df = feat_df.rename(columns={'Unnamed: 0': 'turn_count'})

In [67]:
# Add the timing columns (duration, passed, remain, frac_passed, frac_remain).
# date_feats writes them onto the frame it is given and returns that same frame.
feat_df = date_feats(feat_df)

In [17]:
# Display the derived timing columns.
# NOTE(review): 'frac_passed' appears twice in the column list, so it is shown
# twice (the second copy is labelled 'frac_passed.1' in the saved output below)
# -- presumably unintentional; confirm before removing.
feat_df[['duration', 'frac_passed', 'passed', 'remain', 'frac_remain', 'frac_passed']]

Unnamed: 0,duration,frac_passed,passed,remain,frac_remain,frac_passed.1
0,48.0,0.880851,42.280833,5.719167,0.119149,0.880851
1,48.0,0.883183,42.392778,5.607222,0.116817,0.883183
2,3960.0,0.995940,3943.921389,16.078611,0.004060,0.995940
3,3960.0,0.996047,3944.347222,15.652778,0.003953,0.996047
4,3960.0,0.996627,3946.643889,13.356111,0.003373,0.996627
5,3960.0,0.996651,3946.736111,13.263889,0.003349,0.996651
6,3960.0,0.996916,3947.788056,12.211944,0.003084,0.996916
7,120.0,0.634757,76.170833,43.829167,0.365243,0.634757
8,120.0,0.634801,76.176111,43.823889,0.365199,0.634801
9,96.0,0.459850,44.145556,51.854444,0.540150,0.459850


In [71]:
# Run the thread-level cleaning.  clean_data drops rows from the frame it is
# given in place and returns it, so feat_df itself is modified by this cell.
clean_df = clean_data(feat_df)

[    163    1969    4186 ... 4026733 4029323 4031202]
Removed threads where one offer is greater than the start price
[]
Removed threads where more than 6 offers have been made
Removed threads that have an accepted offer but not as the last offer
Threads had an accept offer entered in the wrong place: 4
Total Removed: 3906
