In [1]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import time
pd.set_option('display.max_columns', 1000)
import tkinter as tk
from tkinter import *
from tkinter import messagebox
import time
from pandastable import Table, TableModel
# Lookup table read from 'make-mkId.csv': keys are the make names offered in
# the GUI, values are the ids substituted into the search URL's mkId parameter.
_make_table = pd.read_csv('make-mkId.csv', index_col='Unnamed: 0')
makeid_dict = _make_table.to_dict()['0']
makes = list(makeid_dict)

In [2]:
def _scrape_listings(make_id, zip_code, max_pages=50, max_retries=50, timeout=30):
    """Download the raw listing strings for one make around one zip code.

    Returns ``(listings, pages_scraped)`` where ``listings`` is a list of
    ``(title, price, mileage, stock_type)`` tuples holding the ``.string`` of
    the matching tags (any of them may be None) and ``pages_scraped`` counts
    the result pages that were actually parsed.
    """
    url='https://www.cars.com/for-sale/searchresults.action/?mkId={}&page={}&perPage=100&rd=30&searchSource=PAGINATION&sort=relevance&stkTypId=28881&zc={}'
    listings = []
    pages_scraped = 0
    retries = 0
    page = 1
    while page <= max_pages:
        try:
            response = requests.get(url.format(make_id, page, zip_code), timeout=timeout)
            status = response.status_code
        except requests.exceptions.Timeout:
            # a stalled request is handled like a server error: retry the page
            status = 500
        if status == 500:
            retries += 1
            if retries > max_retries:
                # too many retries: notify and work with what was collected
                print('check connection')
                break
            continue
        if status == 200:
            soup = BeautifulSoup(response.content, 'lxml')
            try:
                # A page without a readable "selected" pagination link ends the scrape.
                # NOTE(review): the original also compared this value with the
                # requested page number but never used the result; that
                # comparison is still not enforced here - confirm the intent.
                int(soup.find('a', class_='selected').attrs['data-page'])
            except (AttributeError, KeyError, TypeError, ValueError):
                break
            for item in soup.find_all('div', class_='shop-srp-listings__listing-container'):
                try:
                    # read all four fields before storing anything so a
                    # listing with a missing tag cannot leave a partial row
                    raw = (item.find('h2').string,
                           item.find('span', class_='listing-row__price').string,
                           item.find('span', class_='listing-row__mileage').string,
                           item.find('div', class_='listing-row__stocktype').string)
                except AttributeError:
                    continue
                listings.append(raw)
            pages_scraped += 1
        # pages answered with any other status code are skipped
        page += 1
    return listings, pages_scraped


def _parse_listing(title, price_text, mileage_text, stock_text):
    """Convert one listing's raw strings into a
    ``(make, year, model, mileage, certified, price)`` row.

    Returns None when the year, make, model, mileage or price cannot be read,
    so the caller can drop the listing.
    """
    try:
        title_tokens = nltk.word_tokenize(title)
        year = int(title_tokens[0])
        make = title_tokens[1]
        model = title_tokens[2]  # first token of the model name
        mileage = int(nltk.word_tokenize(mileage_text)[0].replace(',', ''))  # '1,000' -> 1000
        price = int(nltk.word_tokenize(price_text)[1].replace(',', ''))  # token 0 is the currency sign
    except (AttributeError, IndexError, TypeError, ValueError):
        return None
    # certified listings use a sublabel, so the tag's .string is None for them
    certified = 1 if stock_text is None else 0
    return make, year, model, mileage, certified, price


def get_model(make0,zc0):
    """Scrape used-car listings for a make and fit a price model on them.

    Parameters
    ----------
    make0 : key of ``makeid_dict`` naming the make to search for.
    zc0 : zip code inserted into the search URL.

    Returns
    -------
    tuple of
        fitted ``xgb.XGBRegressor``,
        array of the distinct model names found,
        feature columns used for training,
        number of result pages parsed,
        R^2 on the training split,
        R^2 on the test split,
        feature frame X (after outlier filtering),
        price series y aligned with X,
        cleaned listing frame (make, year, model, mileage, certified, price).

    Raises
    ------
    KeyError
        If ``make0`` is not in ``makeid_dict``.
    ValueError
        If no usable listing was collected or too few remain to train on.
    """
    listings, total_pages = _scrape_listings(int(makeid_dict[make0]), zc0)

    parsed = (_parse_listing(*raw) for raw in listings)
    rows = [row for row in parsed if row is not None]
    if not rows:
        raise ValueError('no usable listings were collected for {!r} near {!r}'.format(make0, zc0))
    usedcarinfo = pd.DataFrame(rows, columns=['make','year','model','mileage','certified','price'])

    # model preparation: one-hot encode the model name next to the numeric columns
    columns_to_drop=['make','model','price']
    X=usedcarinfo.join(pd.get_dummies(usedcarinfo['model']))
    # drop outliers: cars older than 2000 and the rows at the extreme prices
    prices = X['price']
    X=X[(X['year']>1999) & (prices<prices.max()) & (prices>prices.min())]
    if len(X) < 2:
        raise ValueError('not enough listings left after filtering to train a model')
    y=X['price']
    X=X.drop(columns=columns_to_drop)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    model_xgb=xgb.XGBRegressor(colsample_bytree=0.5, gamma=0.1,
                                learning_rate=0.05, max_depth=3,
                                min_child_weight=1, n_estimators=800,
                                subsample=0.5, verbosity=1,
                                random_state = 2, nthread = -1)
    model_xgb.fit(X_train, y_train)
    # evaluation
    r2_train = model_xgb.score(X_train, y_train)
    r2_test = model_xgb.score(X_test, y_test)
    return model_xgb,usedcarinfo['model'].unique(),X.columns,total_pages,r2_train,r2_test,X,y,usedcarinfo

# build the single-row feature frame for one car
def inputinfo(yearin,milein,certin,modelin,features):
    """Build the one-row feature frame describing a single car.

    Parameters
    ----------
    yearin, milein, certin : values for the 'year', 'mileage' and 'certified'
        columns (only used when those columns are present in ``features``).
    modelin : name of the model column to switch on (set to 1.0).
    features : iterable of column names the frame must have, in order.

    Returns
    -------
    pandas.DataFrame with exactly the columns of ``features``, one row and
    float dtype; every column that is not set from the inputs is 0.

    A ``modelin`` that is not one of ``features`` leaves all model columns at
    0 instead of appending an extra column, so the result always lines up
    with ``features``.
    """
    columns = list(features)
    row = dict.fromkeys(columns, 0.0)
    numeric_inputs = {'year': yearin, 'mileage': milein, 'certified': certin}
    for name, value in numeric_inputs.items():
        if name in row:
            row[name] = value
    # switch on the model indicator only when that column is known
    if modelin in row and modelin not in numeric_inputs:
        row[modelin] = 1.0
    return pd.DataFrame([row], columns=columns).astype(float)

# input your budget range (lb,hb), output the recommended listings
def good_deals(model,lb,hb,num_results=10):
    """Return the listings priced furthest below the model's prediction.

    Reads the module-level ``Xglobal`` (features), ``yglobal`` (asking prices,
    same rows as ``Xglobal``) and ``usedcarinfog`` (listing frame indexed by
    the labels of ``yglobal``).

    Parameters
    ----------
    model : object whose ``predict(Xglobal)`` returns one price per row.
    lb, hb : exclusive lower / upper bound on the asking price.
    num_results : maximum number of listings to return.

    Returns
    -------
    pandas.DataFrame with the rows of ``usedcarinfog`` whose asking price is
    below the prediction and inside the budget, most under-priced first, with
    a fresh 0..n-1 index. The frame is empty (columns kept) when no listing
    qualifies.
    """
    predicted = np.asarray(model.predict(Xglobal), dtype=float)
    asking = yglobal
    # asking price minus predicted price; negative means cheaper than expected
    gap = pd.Series(np.asarray(asking, dtype=float) - predicted, index=asking.index)
    in_budget = np.asarray((asking > lb) & (asking < hb))
    bargains = gap[np.asarray(gap < 0) & in_budget]
    # mergesort is stable, so ties keep their original row order
    best = bargains.sort_values(kind='mergesort').index[:num_results]
    return usedcarinfog.loc[best].reset_index(drop=True)

In [9]:
# main application window
fm = tk.Tk()
fm.title('Used Cars')
fm.geometry("1400x1000")

# header picture shown at the top of the window
pic1 = PhotoImage(file='usedcarspic.png')
pic2 = pic1.subsample(1, 1)
_header_picture = Label(fm, image=pic2)
_header_picture.place(x=450, y=50)
# pop-up notification
def notification(top):
    """Open a 'Please wait...' pop-up window that closes itself after 5 seconds.

    ``top`` receives whatever the caller passes (the buttons bind this
    function to '<Button-1>'); its value is not used.
    """
    popup = Toplevel()
    popup.title('In progress')
    Message(popup, text='Please wait...', padx=100, pady=100, font=("Courier",20)).pack()
    # schedule the pop-up's own removal
    popup.after(5000, popup.destroy)
# ---- Frame one: describe a car and ask for a quote ----
# x0 / y0 are the base offsets added to every widget position below
# (they are also read by the callbacks and by the second frame).
x0=50
y0=50
# white container frame for the first part of the window
fr1 = Frame(fm, width=1250, height=400,bg='white')
fr1.place(x=x0,y=y0+150)
# row 1: column captions, spaced 200 px apart
lb1=Label(fr1,text='Make',bg='white')
lb1.place(x=x0+30,y=y0+20)
lb1.config(font=("Courier",20))
lb2=Label(fr1,text='Year',bg='white')
lb2.place(x=x0+230,y=y0+20)
lb2.config(font=("Courier",20))
lb3=Label(fr1,text='Mileage',bg='white')
lb3.place(x=x0+430,y=y0+20)
lb3.config(font=("Courier",20))
lb4=Label(fr1,text='Zipcode',bg='white')
lb4.place(x=x0+630,y=y0+20)
lb4.config(font=("Courier",20))
lb5=Label(fr1,text='Certified',bg='white')
lb5.place(x=x0+830,y=y0+20)
lb5.config(font=("Courier",20))
# drop list 1: make selector; en1 holds the chosen make
en1=StringVar(fm)
choices1 = makes
en1.set(choices1[0]) # set the default option
popupMenu = OptionMenu(fr1, en1,  *choices1)
popupMenu.config(width=20)
popupMenu.place(x=x0+30,y=y0+65)
# entries: year (en2), mileage (en3) and zipcode (en4), pre-filled with example values
en2=Entry(fr1,bd=2,width=25,fg='grey')
en2.insert(0,'2020')
en2.place(x=x0+230,y=y0+70)
en3=Entry(fr1,bd=2,width=25,fg='grey')
en3.insert(0,'12345')
en3.place(x=x0+430,y=y0+70)
en4=Entry(fr1,bd=2,width=25,fg='grey')
en4.insert(0,'75080')
en4.place(x=x0+630,y=y0+70)
# certified selector ('0' or '1'); en5 holds the choice.
# Note: popupMenu is rebound here, so the name no longer refers to the make menu.
en5=StringVar(fm)
en5.set('0') # set the default option
popupMenu = OptionMenu(fr1, en5,  *['0','1'])
popupMenu.config(width=20)
popupMenu.place(x=x0+830,y=y0+65)
# row 2 caption (lb4 is rebound; the 'Zipcode' label stays on screen)
lb4=Label(fr1,text='Choose a Model',bg='white')
lb4.place(x=x0+30,y=y0+120)
lb4.config(font=("Courier",20))
# drop list 2: model selector, created with a single blank choice;
# getentry() places a new menu with real model names at the same position
en40=StringVar(fm)
choices2 = ['']
en40.set(choices2[0]) 
popupMenu = OptionMenu(fr1, en40,  *choices2)
popupMenu.config(width=20)
popupMenu.place(x=x0+30,y=y0+185)
# button row 1
def getentry():
    """Callback of the 'Submit' button: scrape listings, train the model and
    fill the model drop list.

    Reads make, year, mileage, zipcode and the certified flag from the
    widgets. The three numeric fields are checked first; when one of them is
    not a whole number an error dialog is shown and nothing else happens, so
    the slow scrape is never started with unusable input and the module-level
    state is never left half-updated.

    On success the module-level ``model``, ``features``, ``yearg``,
    ``mileageg``, ``certg``, ``Xglobal``, ``yglobal`` and ``usedcarinfog`` are
    replaced, a model drop list with the scraped model names is placed, and a
    summary dialog is shown.
    """
    global model,features,yearg,mileageg,certg,Xglobal,yglobal,usedcarinfog

    make, year_text, mileage_text, zip_code, cert_text = [
        field.get() for field in (en1, en2, en3, en4, en5)]
    try:
        year, mileage, cert = int(year_text), int(mileage_text), int(cert_text)
    except ValueError:
        messagebox.showerror('Message', 'Year, mileage and certified must be whole numbers.')
        return

    # collect the listings and train the model
    model,car_models,features,tpages,r2train,r2test,Xglobal,yglobal,usedcarinfog=get_model(make,zip_code)
    yearg, mileageg, certg = year, mileage, cert

    # place a model drop list holding the scraped model names
    choices2 = car_models
    en40.set(choices2[0])
    popupMenu = OptionMenu(fr1, en40,  *choices2)
    popupMenu.config(width=20)
    popupMenu.place(x=x0+30,y=y0+185)

    messagebox.showinfo('Message', 'About {} records collected! \n The model has R^2 of ({:.3f},{:.3f}) for (training,testing)'.format(tpages*100,r2train,r2test))


def getentry2(fr1=fr1):
    """Callback of the 'Get quote' button: show the predicted price.

    Uses the module-level ``model``, ``features``, ``yearg``, ``mileageg`` and
    ``certg`` together with the model name chosen in ``en40``. When any of
    them has not been set yet, an information dialog is shown instead of
    failing with a NameError.

    The quote is written into a single label that is reused on later clicks,
    so a shorter quote never leaves parts of an older one visible.

    NOTE(review): get_deal() also replaces ``model`` and ``features``; a quote
    requested after a deal search uses that newer model - confirm this is
    acceptable.
    """
    needed = ('model', 'features', 'yearg', 'mileageg', 'certg')
    if any(name not in globals() for name in needed):
        messagebox.showinfo('Message', 'Please submit the car details first.')
        return

    quote = model.predict(inputinfo(yearg,mileageg,certg,en40.get(),features))[0]

    # reuse the label from the previous click when it still exists in this frame
    lb5 = getattr(getentry2, '_quote_label', None)
    if lb5 is None or lb5.master is not fr1 or not lb5.winfo_exists():
        lb5=Label(fr1,bg='white')
        lb5.place(x=x0+30,y=y0+220)
        lb5.config(font=("Courier",20))
        getentry2._quote_label = lb5
    lb5.config(text='The quote is ${:,.2f}'.format(quote))
    
 
# 'Submit' starts the scrape/training; the extra click binding opens the
# 'Please wait...' pop-up
bget = tk.Button(
    fr1,
    text='Submit',
    width=20,
    bg='blue',
    fg='white',
    command=getentry,
)
bget.place(x=x0+1030, y=y0+65)
bget.bind('<Button-1>', notification)

# 'Get quote' prices the car with the trained model
bget2 = tk.Button(
    fr1,
    text='Get quote',
    width=20,
    bg='blue',
    fg='white',
    command=getentry2,
)
bget2.place(x=x0+230, y=y0+185)

# ---- Frame two: search for good deals within a price range ----
fr2=Frame(fm, width=1250, height=500,bg='white')
fr2.place(x=50,y=500)

# row 1 captions
lb21=Label(fr2,text='Make',bg='white')
lb21.place(x=x0+30,y=y0+20)
lb21.config(font=("Courier",20))
lb22=Label(fr2,text='Zipcode',bg='white')
lb22.place(x=x0+230,y=y0+20)
lb22.config(font=("Courier",20))
# zipcode entry (en22), pre-filled with an example value
en22=Entry(fr2,bd=2,width=25,fg='grey')
en22.insert(0,'75080')
en22.place(x=x0+230,y=y0+70)
# make selector for the deal search; en21 holds the chosen make
# (popupMenu is rebound again, as in frame one)
en21=StringVar(fm)
choices1 = makes
en21.set(choices1[0]) # set the default option
popupMenu = OptionMenu(fr2, en21,  *choices1)
popupMenu.config(width=20)
popupMenu.place(x=x0+30,y=y0+65)

# the price-range widgets use y1 instead of y0 as their vertical base offset
y1=0

# captions of the price range (lb21 / lb22 are rebound; the earlier labels stay on screen)
lb21=Label(fr2,text='Enter your price range',bg='white')
lb21.place(x=x0+430,y=y1+70)
lb21.config(font=("Courier",20))
lb22=Label(fr2,text='to',bg='white')
lb22.place(x=x0+610,y=y1+115)
lb22.config(font=("Courier",15))
# price lower bound (en23), read by get_deal()
en23=Entry(fr2,bd=2,width=25,fg='grey')
en23.insert(0,'0')
en23.place(x=x0+430,y=y1+120)
# price higher bound (en24), read by get_deal()
en24=Entry(fr2,bd=2,width=25,fg='grey')
en24.insert(0,'100000')
en24.place(x=x0+640,y=y1+120)
def get_deal():
    """Callback of the 'Get good deals' button: list under-priced cars.

    The price bounds are read from ``en23`` / ``en24`` and checked first; when
    one of them is not a whole number an error dialog is shown and the slow
    scrape is not started.

    Otherwise the listings for the chosen make and zipcode are scraped, the
    model is trained (replacing the module-level ``model``, ``features``,
    ``Xglobal``, ``yglobal`` and ``usedcarinfog``), a summary dialog is shown
    and the best deals are displayed in a table. The results frame of an
    earlier search is destroyed before the new one is created, so repeated
    searches do not pile frames on top of each other.
    """
    global model,features,Xglobal,yglobal,usedcarinfog

    try:
        low, high = int(en23.get()), int(en24.get())
    except ValueError:
        messagebox.showerror('Message', 'The price range must be given as whole numbers.')
        return

    model,car_models,features,tpages,r2train,r2test,Xglobal,yglobal,usedcarinfog=get_model(en21.get(),en22.get())

    messagebox.showinfo('Message', 'About {} records collected! \n The model has R^2 of ({:.3f},{:.3f}) for (training,testing)'.format(tpages*100,r2train,r2test))

    # frame three for display; drop the one left over from a previous search
    previous = getattr(get_deal, '_results_frame', None)
    if previous is not None and previous.winfo_exists():
        previous.destroy()
    fr3=Frame(fm, width=1050, height=500,bg='white')
    fr3.place(x=150,y=680)
    get_deal._results_frame = fr3

    pt = Table(fr3, dataframe=good_deals(model,low,high))
    pt.place(x=x0+30,y=y0+150)
    pt.show()

# 'Get good deals' runs the deal search; the extra click binding opens the
# 'Please wait...' pop-up
bget3 = tk.Button(
    fr2,
    text='Get good deals',
    width=20,
    bg='blue',
    fg='white',
    command=get_deal,
)
bget3.place(x=x0+830, y=y0+65)
bget3.bind('<Button-1>', notification)

# close button: destroys the main window
bget4 = tk.Button(
    fm,
    text='X',
    width=5,
    bg='Red',
    fg='white',
    command=fm.destroy,
)
bget4.place(x=1220, y=y0+170)

# hand control to the Tk event loop
mainloop()