After testing multiple other models (KNN, SVC, Decision Tree), I landed on a Random Forest Classifier with the parameters below as the best model.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [64]:
# Load the previously scraped CR-V data set from a single CSV, twice:
#   crv_n - plain read; used later to look up the mean price per model year
#   crv_m - first CSV column used as the index; becomes the training features
crv_csv_path = '/Users/avacheevers/Desktop/crv_rfc.csv'
crv_n = pd.read_csv(crv_csv_path)
crv_m = pd.read_csv(crv_csv_path, index_col=0).drop(columns=['str_year'])

# Split off the target column: 1 where 'sold' equals True, otherwise 0.
labels = crv_m.pop('sold').apply(lambda sold: int(sold == True))

# One-hot encode the remaining categorical feature columns.
crv_m = pd.get_dummies(crv_m)

In [65]:
# web-scrapes several pages of crv listings
#
# Result: `listing_dic` maps a running listing number (counted across all
# pages, starting at 1) to a dict of raw text fields for that listing.
# Optional fields that are absent from a card are simply left out of its dict
# and reported with a "no <field> found for # <n>" message.

FIRST_PAGE = 200          # first result page to fetch (inclusive)
LAST_PAGE = 205           # last result page to fetch (inclusive)
REQUEST_TIMEOUT = 30      # seconds; without a timeout a stalled request hangs the cell

# (dict key, HTML tag, class attribute) of the fields a card may legitimately lack
OPTIONAL_FIELDS = [
    ('trim', 'div', "font-size-1 text-truncate"),
    ('price_label', 'span', "graph-icon-title margin-left-1 vehicle-card-price-rating-label text-truncate font-weight-bold"),
    ('price', 'h4', 'heading-3 margin-y-1 font-weight-bold'),
    # the mileage text still carries 'Discount Available' etc.; cleaned later
    ('mileage', 'div', 'd-flex w-100 justify-content-between'),
    ('location', 'div', 'vehicle-card-location font-size-1 margin-top-1'),
    ('color', 'div', 'vehicle-card-location font-size-1 margin-top-1 text-truncate'),
]


def _find_text(card, tag, css_class):
    """Return the text of the first matching element in `card`, or None.

    `card` is searched with ``card.find(tag, class_=css_class)``; None is
    returned when that search finds nothing.
    """
    element = card.find(tag, class_=css_class)
    return None if element is None else element.text


listing_dic = {}       #keep scraped data here
entries_completed = 0  # running number of listing cards seen; used as dict key

for item in range(FIRST_PAGE, LAST_PAGE + 1):
    page_number = str(item)
    URL = f'https://www.truecar.com/used-cars-for-sale/listings/honda/cr-v/?page={page_number}&sort[]=best_match'
    page = requests.get(URL, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(page.content, 'html.parser')
    main = soup.find(class_ = "row row-2 margin-bottom-3")

    # Page without the expected results container: skip it, but say so.
    if main is None:
        print("no listings container found on page", page_number)
        continue

    for listing in main.find_all('div', class_ = "card-content vehicle-card-body"):
        entries_completed += 1

        # Year and make/model are required. A card lacking either is skipped
        # on its own, instead of abandoning the rest of the page.
        year = _find_text(listing, 'span', "vehicle-card-year font-size-1")
        make = _find_text(listing, 'span', "vehicle-header-make-model text-truncate")
        if year is None or make is None:
            print("no model_year/make_model found for #", entries_completed, "- listing skipped")
            continue

        internal_dic = {'model_year': year, 'make_model': make}

        for field, tag, css_class in OPTIONAL_FIELDS:
            text = _find_text(listing, tag, css_class)
            if text is None:
                print(f"no {field} found for #", entries_completed)
            else:
                internal_dic[field] = text

        listing_dic[entries_completed] = internal_dic
    # NOTE: requests are sent back to back; to throttle, add `import time`
    # to the import cell and call time.sleep(0.5) here.

# Clean the scraped listings and build two frames:
#   crv_raw - readable, cleaned listing attributes (used later for the report)
#   crv     - numeric feature matrix with exactly the columns of `crv_m`,
#             in the same order, ready to be passed to the trained model


def _apply_contains_rules(value, rules):
    """Apply ``(needle, replacement)`` rules to one value, in order.

    For each rule, the current value is replaced by ``replacement`` when
    ``needle`` occurs in ``str(value)``; later rules therefore see the result
    of earlier ones. A value matching no needle is returned unchanged.
    """
    for needle, replacement in rules:
        if needle in str(value):
            value = replacement
    return value


def _drive_from_trim(trim):
    """Return 'FWD' or 'AWD' if that token occurs in the trim text, else None."""
    text = str(trim)
    if 'FWD' in text:
        return 'FWD'
    if 'AWD' in text:
        return 'AWD'
    return None


# Order matters: rules are applied top to bottom.
# NOTE(review): every '4WD' trim is relabelled as 'FWD' here, so those cars get
# drive == 'FWD' below - presumably mirrors how the training data was prepared;
# confirm before changing.
TRIM_RULES = [
    ('Hybrid', None),
    ('Rear Entertainment', None),
    ('LX 4WD', 'LX FWD'),
    ('EX 4WD Manual', None),
    ('EX 4WD', 'EX FWD'),
    ('EX FWD', 'EX FWD'),
    ('LX FWD', 'LX FWD'),
    ('SE 4WD', 'SE FWD'),
    ('EX-L 4WD', 'EX-L FWD'),
    ('EX-L with Navigation 4WD', 'EX-L with Navigation FWD'),
    ('EX Special', 'EX Special Edition FWD'),
    ('4WD', 'EX FWD'),
]

# Substring rules that collapse a trim string to its equipment level.
SPEC_CONTAINS_RULES = [
    ('EX-L with Nav', 'EX-L with Nav'),
    ('Touring', 'Touring'),
    ('SE', 'SE'),
    ('EX Spe', 'EX Special Edition'),
]

# Exact-match rules applied after SPEC_CONTAINS_RULES ('' becomes missing).
SPEC_EXACT_MAP = {
    'EX-L FWD': 'EX-L',
    'EX-L AWD': 'EX-L',
    'EX AWD': 'EX',
    'EX FWD': 'EX',
    'LX FWD': 'LX',
    'LX AWD': 'LX',
    '': None,
}

# Substring rules that fold colour names into broader colour groups.
COLOR_RULES = [
    ('Unknown', 'Unknown'),
    ('Purple', 'Blue'),
    ('Copper', 'Brown'),
    ('Teal', 'Blue'),
    ('Beige', 'Tan'),
    ('Gold', 'Tan'),
]


def _specs_from_trim(trim):
    """Map a stripped trim string to its equipment level (e.g. 'EX-L')."""
    value = _apply_contains_rules(trim, SPEC_CONTAINS_RULES)
    return SPEC_EXACT_MAP.get(str(value), value)


crv = pd.DataFrame.from_dict(listing_dic).transpose()

# --- price / mileage -------------------------------------------------------
# regex=False everywhere: '$' has to be removed as a literal character, and
# the result must not depend on the pandas version's default for `regex`.
crv['price'] = (crv['price']
                .str.replace('$', '', regex=False)
                .str.replace(',', '', regex=False))
mileage_text = crv['mileage']
for noise in ('Discount Available', 'Upfront Price Available', 'miles'):
    mileage_text = mileage_text.str.replace(noise, '', regex=False)
crv['mileage'] = mileage_text.str.strip().str.replace(',', '', regex=False)
crv['price'] = pd.to_numeric(crv['price'])
crv['mileage'] = pd.to_numeric(crv['mileage'])

# --- location -> city / state ---------------------------------------------
# .str[...] yields NaN for a missing location or a location without a comma
# instead of raising, so one odd card cannot abort the whole cell.
location_parts = crv['location'].str.split(',')
crv['city'] = location_parts.str[0].str.strip()
crv['state'] = location_parts.str[1].str.strip()
crv = crv.drop(columns='location')

# --- colour ------------------------------------------------------------------
# The text before the first comma is the exterior colour description.
crv['exterior_color'] = (crv['color'].str.split(',').str[0]
                         .str.replace('exterior', '', regex=False)
                         .str.strip())
crv['exterior_color'] = crv['exterior_color'].apply(
    lambda color: _apply_contains_rules(color, COLOR_RULES))

# --- trim -> drive / specs -------------------------------------------------
crv['trim'] = crv['trim'].apply(lambda trim: _apply_contains_rules(trim, TRIM_RULES))
crv['drive'] = crv['trim'].apply(_drive_from_trim)
crv['specs'] = crv['trim'].str.strip().apply(_specs_from_trim)

# Readable snapshot for the final report (model_year is still text here).
crv_raw = crv[['model_year', 'price', 'mileage', 'exterior_color', 'drive', 'specs', 'city', 'state']]

# --- model features ----------------------------------------------------------
crv['ext_color'] = crv['exterior_color']
crv['common_color'] = crv['ext_color'].isin(['Gray', 'Silver', 'White', 'Black'])

# Mean historical price per model year, looked up once per distinct year
# (NaN when the year does not occur in crv_n).
year_means = {
    year: crv_n.loc[crv_n['model_year'] == int(year), 'price'].mean()
    for year in crv['model_year'].unique()
}
crv['year_mean'] = crv['model_year'].map(year_means)
crv['dev'] = (crv['price'] - crv['year_mean']) / crv['year_mean']   # relative deviation from the year's mean price
crv['ext_dev'] = crv['dev'].abs() > 0.3                             # flags a deviation of more than 30%
crv['model_year'] = crv['model_year'].apply(int)

crv = crv[['model_year', 'price', 'mileage', 'specs', 'dev', 'ext_dev', 'common_color', 'ext_color']]
crv = pd.get_dummies(crv)

# Align with the training matrix: add dummy columns the scrape did not
# produce (filled with 0), drop dummy columns the model never saw, and put
# everything in the training column order so features line up by position.
unseen_columns = crv.columns.difference(crv_m.columns)
if len(unseen_columns):
    print("dropping columns not present in training data:", list(unseen_columns))
crv = crv.reindex(columns=crv_m.columns, fill_value=0)




no price_label found for # 13
no price found for # 13
no price_label found for # 21
no price found for # 21
no price_label found for # 32
no price found for # 32
no price_label found for # 42
no price found for # 42
no price_label found for # 46
no price_label found for # 52
no price found for # 52
no price_label found for # 82
no price found for # 82
no price_label found for # 132
no price found for # 132
no price_label found for # 145
no price_label found for # 148
no price found for # 148
no price found for # 150


In [66]:
# trains model on previously scraped fast sell vs no sell CRV training data
# 80/20 split, stratified on the label so both parts keep the class balance.
train, test, train_labels, test_labels = train_test_split(
    crv_m,
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=100,
)

# Fill missing values with the TRAINING column means in both splits, so the
# held-out rows are preprocessed with statistics learned from training data
# only (filling the test split with its own means leaks test information).
train_means = train.mean()
train = train.fillna(train_means)
test = test.fillna(train_means)

model = RandomForestClassifier(
    n_estimators=150,
    min_samples_split=3,
    min_samples_leaf=1,
    random_state=100,      # fixed seed so the fitted forest is reproducible
    max_features='sqrt',
    max_depth=40,
    bootstrap=True,
    n_jobs=-1,             # use all available cores
    verbose=1,
)
model.fit(train, train_labels)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    1.6s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=40, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=-1, oob_score=False, random_state=100, verbose=1,
                       warm_start=False)

In [67]:
# predicts outcomes for just-scraped data
# Rows with any missing feature are dropped; the model is not asked to score them.
crv = crv.dropna()

# Hand the features to the model explicitly in the training column order, so
# each value lands in the position the forest was fitted on. A KeyError here
# means a training column is missing from `crv`.
predictions = model.predict_proba(crv[list(crv_m.columns)])[:, 1]   # column 1 = probability of label 1

# Work on an independent copy: pd.DataFrame(crv) may share its data with `crv`
# (pandas-version dependent), in which case the edits below would leak back
# into the feature matrix and break a re-run of this cell.
results = crv.copy()
results['model_year'] = results['model_year'].astype(str)   # text, to match crv_raw's model_year
results['pred'] = predictions

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    0.1s finished


In [68]:
# Attach the readable listing attributes from crv_raw to each scored row.
# The lookup side is de-duplicated on the join keys first: a listing scraped
# twice would otherwise match every copy of itself and multiply the rows.
# (Distinct listings sharing year, price and mileage get the attributes of
# the first one.)
merge_keys = ['model_year', 'price', 'mileage']
results = results.merge(crv_raw.drop_duplicates(subset=merge_keys), on=merge_keys)

In [69]:
# Keep only the columns shown in the final tables, in display order.
report_columns = [
    'pred', 'model_year', 'price', 'dev', 'mileage',
    'specs', 'drive', 'exterior_color', 'city', 'state',
]
results = results[report_columns]

In [79]:
# Rank today's listings by predicted probability (ascending) and keep both ends:
# the 10 lowest-scored rows and the 10 highest-scored rows.
results = results.sort_values(by='pred')
no_sell = results.head(10)
sell = results.tail(10)

In [80]:
# Display the 10 listings with the lowest predicted probability.
no_sell

Unnamed: 0,pred,model_year,price,dev,mileage,specs,drive,exterior_color,city,state
79,0.481937,2012,8990.0,-0.245809,193118,EX,FWD,Silver,Charlottesville,VA
123,0.494659,2010,8999.0,-0.053314,146010,EX-L,FWD,Blue,Raleigh,NC
14,0.506778,2007,7995.0,0.0553,95241,EX-L,FWD,Blue,La Habra,CA
90,0.516841,2015,13488.0,-0.17434,125549,EX-L,FWD,Silver,Oklahoma City,OK
122,0.525111,2013,10990.0,-0.149884,129371,EX,AWD,Maroon,Huntington Station,NY
87,0.551444,2010,7777.0,-0.181867,179592,EX-L,FWD,Blue,Albuquerque,NM
129,0.557675,2018,26933.0,0.117171,32134,EX-L,AWD,Blue,Bend,OR
99,0.557759,2017,19995.0,-0.108623,45896,LX,AWD,Black,Ewing,NJ
84,0.571667,2019,30700.0,0.159984,13079,Touring,FWD,Orange,Cartersville,GA
107,0.574905,2011,8995.0,-0.118027,149412,EX,FWD,Gray,Cincinnati,OH


In [81]:
# Display the 10 listings with the highest predicted probability.
sell

Unnamed: 0,pred,model_year,price,dev,mileage,specs,drive,exterior_color,city,state
27,0.825111,2018,23000.0,-0.045968,28251,LX,FWD,Gray,Tracy,CA
128,0.832175,2018,26127.0,0.083739,6831,EX,FWD,Silver,Pittsburg,CA
125,0.83554,2017,18415.0,-0.17906,50191,EX,FWD,Red,Pompano Beach,FL
146,0.835556,2016,14300.0,-0.19991,98997,SE,FWD,Gray,Clermont,FL
134,0.844802,2019,24999.0,-0.055425,35213,EX,AWD,Orange,Watertown,NY
102,0.853667,2020,26924.0,-0.068729,2058,EX,FWD,Black,Baytown,TX
43,0.857587,2011,10986.0,0.077194,90535,EX,FWD,Gray,Heath,OH
61,0.866778,2011,13900.0,0.362916,40085,EX-L,FWD,Unknown,Jackson,WY
37,0.872556,2019,27480.0,0.038318,3955,EX,FWD,Gray,Kernersville,NC
154,0.879087,2019,23900.0,-0.096951,11449,EX,FWD,Gray,Delray Beach,FL
