In [1]:
import glob2
import pandas as pd

In [2]:
# Notebook display limits for pandas output.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 100  # original note: 50 is the default

In [6]:
# Select the model to analyse by choosing which glob pattern is active.
# The matches are sorted so the files are always loaded in the same order
# (glob results otherwise follow filesystem order).
filenames = sorted(glob2.glob('./output/*Mazda-Miata*.csv'))
#filenames = sorted(glob2.glob('./output/*Nissan-370Z*.csv'))
#filenames = sorted(glob2.glob('./output/*BMW-Z4*.csv'))
#filenames = sorted(glob2.glob('./output/*Toyota-4Runner*.csv'))

# Fail with a clear message instead of an IndexError when nothing matched.
if not filenames:
    raise FileNotFoundError('no CSV files matched the pattern under ./output/')

# Read every file once and concatenate in a single call; growing the frame
# with concat inside a loop re-copies the accumulated rows on every pass.
df = pd.concat(
    [pd.read_csv(filename) for filename in filenames],
    ignore_index=True,
    sort=False,
)

In [8]:
# Name the row index so it can be used as a key ('orig_index') later on.
df = df.rename_axis('orig_index')

In [9]:
#df.head(2)

In [10]:
# Per-vin summary of the 'date' column: largest value, smallest value and
# the number of non-null entries.
agg = (
    df.groupby('vin')['date']
    .agg(['max', 'min', 'count'])
    .rename(columns={'max': 'max_date', 'min': 'min_date', 'count': 'count_date'})
)

# Attach the per-vin summary to every row that shares that vin.
df_stats = df.join(agg, on='vin')



In [11]:
#df_stats.head(2)

In [12]:
def avail(row, date_max):
    """Classify one listing row relative to the most recent date in the data.

    Args:
        row: mapping-like row providing 'count_date', 'date' and 'max_date'.
        date_max: the value the row's dates are compared against.

    Returns:
        'new listing' when the row's vin has a single entry and that entry is
        dated ``date_max``; 'listing not avail' when the vin's latest date is
        earlier than ``date_max``; 'listing avail' otherwise.
    """
    seen_once = row['count_date'] == 1
    if seen_once and row['date'] == date_max:
        return 'new listing'
    return 'listing not avail' if row['max_date'] < date_max else 'listing avail'

# Largest value of the 'date' column across all loaded rows.
date_max = df['date'].max()

# Label every row via avail(); date_max is handed over as the second positional argument.
df_stats['availability'] = df_stats.apply(avail, axis=1, args=(date_max,))


In [13]:
#df_stats.head(2)

In [14]:
# Put each vin's rows next to each other in date order, so consecutive rows
# can be compared when calculating the price difference between dates.
df_stats = df_stats.sort_values(['vin', 'date'], ascending=True)


In [15]:
#df_stats.head(15)

In [16]:
# Price change relative to the previous row of the same vin (NaN for the first
# row of each vin). groupby().diff() is the built-in, vectorised equivalent of
# applying `price[1] - price[0]` over a rolling window of 2, and it keeps one
# output row per input row.
# reset_index() turns the row index into a regular column so the result can be
# merged back onto df_stats by vin and row index.
df_stats_price_diff = (
    df_stats[['vin']]
    .assign(price_diff=df_stats.groupby('vin')['vehicle_price'].diff())
    .reset_index()
)


In [17]:
#df_stats_price_diff.head(5)

In [18]:
# Sanity check: the price-diff frame must have exactly one row per df_stats row.
assert df_stats.shape[0] == df_stats_price_diff.shape[0]

In [19]:
# Attach price_diff to the listing rows.
# Matching on the original row index alone would be sufficient; 'vin' is added
# as a second key only so the result does not end up with duplicate
# vin_x / vin_y columns.
df_stats = df_stats.merge(df_stats_price_diff, how='inner', on=['vin', 'orig_index'])


In [20]:
#df_stats.head(5)

In [21]:
#df_stats.columns

In [22]:
# Reference list of the columns in df_stats (verify with df_stats.columns):
# ['year_make_model_1', 'index', 'detail_url', 'vin', 'price_rating', 'vehicle_price', 'year_make_model_2',
#  'trim', 'location', 'mileage', 'exterior_color', 'interior_color', 'mpg', 'engine', 'transmission',
#  'drive_type', 'fuel_type', 'accident_check', 'usage', 'title', 'number_of_owners', 'date', 'max_date',
#  'min_date', 'count_date', 'availability', 'price_diff']

# Columns wanted for analysis, in display order.
analysis_columns = [
    'year_make_model_1',
    'vin',
    'price_rating',
    'vehicle_price',
    'price_diff',
    'trim',
    'location',
    'mileage',
    'exterior_color',
    'interior_color',
    'transmission',
    'accident_check',
    'usage',
    'title',
    'number_of_owners',
    'date',
    'availability',
    'detail_url',
]
df_less_columns = df_stats[analysis_columns]

In [23]:
#df_less_columns.head(5)

In [24]:
# Boolean row masks; they are combined with & where the data is displayed.
filter1 = df_less_columns['accident_check'].eq(0)  # no accidents
filter2 = df_less_columns['number_of_owners'].lt(3)  # fewer than three owners
filter3 = df_less_columns['title'].eq('Clean')  # clean titles only
filter4 = df_less_columns['availability'].ne('listing not avail')  # skip listings that are no longer available


In [25]:
# Note: for very large datasets it would be better to store the filtered
# dataframe once instead of re-applying the filters each time.

# Alternative views with fewer filters applied:
#df_less_columns.loc[filter1]
#df_less_columns.loc[filter1 & filter2]
#df_less_columns.loc[filter1 & filter2 & filter3]
df_less_columns.loc[filter1 & filter2 & filter3 & filter4]

Unnamed: 0,year_make_model_1,vin,price_rating,vehicle_price,price_diff,trim,location,mileage,exterior_color,interior_color,transmission,accident_check,usage,title,number_of_owners,date,availability,detail_url
0,2010 Mazda MX-5 Miata,JM1NC2EF6A0210464,Great Price,10990,,Touring Automatic,"Augusta, GA",62100,Red,Black,Automatic,0,Personal or Rental Use,Clean,1,2020-03-10 09:21:04.388781,listing avail,https://penfed.truecar.com/used-cars-for-sale/listing/JM1NC2EF6A0210464/2010-mazda-mx-5-miata/
1,2010 Mazda MX-5 Miata,JM1NC2EF6A0210464,Great Price,10990,0.0,Touring Automatic,"Augusta, GA",62100,Red,Black,Automatic,0,Personal or Rental Use,Clean,1,2020-03-10 20:42:49.214281,listing avail,https://penfed.truecar.com/used-cars-for-sale/listing/JM1NC2EF6A0210464/2010-mazda-mx-5-miata/
2,2010 Mazda MX-5 Miata,JM1NC2EF6A0210464,Great Price,10990,0.0,Touring Automatic,"Augusta, GA",62100,Red,Black,Automatic,0,Personal or Rental Use,Clean,1,2020-03-13 08:24:30.589904,listing avail,https://penfed.truecar.com/used-cars-for-sale/listing/JM1NC2EF6A0210464/2010-mazda-mx-5-miata/
17,2010 Mazda MX-5 Miata,JM1NC2FF6A0210107,Great Price,11959,,Grand Touring PRHT Automatic,"Avenel, NJ",40696,Silver,Black,Automatic,0,Personal or Rental Use,Clean,2,2020-03-07 09:42:18.653476,listing avail,https://penfed.truecar.com/used-cars-for-sale/listing/JM1NC2FF6A0210107/2010-mazda-mx-5-miata/
18,2010 Mazda MX-5 Miata,JM1NC2FF6A0210107,Great Price,11959,0.0,Grand Touring PRHT Automatic,"Avenel, NJ",40696,Silver,Black,Automatic,0,Personal or Rental Use,Clean,2,2020-03-08 22:14:26.172861,listing avail,https://penfed.truecar.com/used-cars-for-sale/listing/JM1NC2FF6A0210107/2010-mazda-mx-5-miata/
19,2010 Mazda MX-5 Miata,JM1NC2FF6A0210107,Excellent Price,11959,0.0,Grand Touring PRHT Automatic,"Avenel, NJ",40696,Silver,Black,Automatic,0,Personal or Rental Use,Clean,2,2020-03-09 10:22:43.797047,listing avail,https://penfed.truecar.com/used-cars-for-sale/listing/JM1NC2FF6A0210107/2010-mazda-mx-5-miata/
20,2010 Mazda MX-5 Miata,JM1NC2FF6A0210107,Excellent Price,11959,0.0,Grand Touring PRHT Automatic,"Avenel, NJ",40696,Silver,Black,Automatic,0,Personal or Rental Use,Clean,2,2020-03-09 23:29:42.335789,listing avail,https://penfed.truecar.com/used-cars-for-sale/listing/JM1NC2FF6A0210107/2010-mazda-mx-5-miata/
21,2010 Mazda MX-5 Miata,JM1NC2FF6A0210107,Great Price,11959,0.0,Grand Touring PRHT Automatic,"Avenel, NJ",40696,Silver,Black,Automatic,0,Personal or Rental Use,Clean,2,2020-03-10 09:21:04.388781,listing avail,https://penfed.truecar.com/used-cars-for-sale/listing/JM1NC2FF6A0210107/2010-mazda-mx-5-miata/
22,2010 Mazda MX-5 Miata,JM1NC2FF6A0210107,Great Price,11959,0.0,Grand Touring PRHT Automatic,"Avenel, NJ",40696,Silver,Black,Automatic,0,Personal or Rental Use,Clean,2,2020-03-10 20:42:49.214281,listing avail,https://penfed.truecar.com/used-cars-for-sale/listing/JM1NC2FF6A0210107/2010-mazda-mx-5-miata/
23,2010 Mazda MX-5 Miata,JM1NC2FF6A0210107,Excellent Price,11959,0.0,Grand Touring PRHT Automatic,"Avenel, NJ",40696,Silver,Black,Automatic,0,Personal or Rental Use,Clean,2,2020-03-13 08:24:30.589904,listing avail,https://penfed.truecar.com/used-cars-for-sale/listing/JM1NC2FF6A0210107/2010-mazda-mx-5-miata/


In [26]:
# filter by index range
#df_less_columns[filter1 & filter2 & filter3 & filter4].loc[66:79] # view range of listings of interest

In [27]:
# filter by index row
#df_less_columns[filter1 & filter2 & filter3 & filter4].loc[109] # view listings details for a specific listing