In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gzip
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

In [2]:
# Load the raw vehicles table; the path is relative to the notebook's working directory.
cars = pd.read_csv('vehicles.csv')

In [3]:
# Drop the columns that are never used as model inputs below, then keep only
# rows that have no missing value in any remaining column.
cars_naless = cars.drop(
    columns=[
        'Unnamed: 0', 'url', 'region_url', 'VIN',
        'image_url', 'lat', 'long', 'posting_date',
    ]
).dropna()

In [4]:
# Remove rows whose price is exactly 0 and renumber the remaining rows from 0.
cars_0_naless = cars_naless.loc[cars_naless['price'].ne(0)].reset_index(drop=True)

In [71]:
# Count how often each description text occurs among the rows that pass the
# price/odometer filters, and keep only the texts seen more than once.
# The filters are applied inline on cars_0_naless because `cleaned_cars` is
# only created in the following cell; reading it here raised NameError on a
# fresh Restart & Run All.
desc_counts = cars_0_naless.loc[
    (cars_0_naless['price'] > 100)
    & (cars_0_naless['price'] < 300000)
    & (cars_0_naless['odometer'] < 500000),
    'description',
].value_counts()
desc_dict = desc_counts[desc_counts > 1].to_dict()

In [73]:
# Keep rows with 100 < price < 300000 and odometer < 500000.
cleaned_cars = cars_0_naless[(cars_0_naless['price'] < 300000) & (cars_0_naless['price'] > 100)]
# .copy() makes cleaned_cars an independent frame, so the column assignment
# below writes to it directly instead of to a slice of cars_0_naless
# (which triggers pandas' SettingWithCopyWarning).
cleaned_cars = cleaned_cars[cleaned_cars['odometer'] < 500000].copy()
# Number of rows sharing the same description text; 0 when the text is not in
# desc_dict (desc_dict only holds texts that occur more than once).
cleaned_cars['duplicate_descriptions'] = cleaned_cars['description'].apply(lambda x: desc_dict.get(x, 0))

In [7]:
# Bounds of the 'year' column; they are reused when the scaled year feature is built.
year_min = cleaned_cars['year'].min()
year_max = cleaned_cars['year'].max()
# Preview of the min-max scaled year (values in [0, 1]).
cleaned_cars['year'].sub(year_min).div(year_max - year_min).tolist()

[0.5,
 0.8297872340425532,
 0.8085106382978723,
 0.9893617021276596,
 0.8829787234042553,
 0.9361702127659575,
 0.9468085106382979,
 0.9468085106382979,
 0.8297872340425532,
 0.9042553191489362,
 0.9468085106382979,
 0.925531914893617,
 0.9468085106382979,
 0.8404255319148937,
 0.8723404255319149,
 0.9574468085106383,
 0.9893617021276596,
 0.9787234042553191,
 0.9361702127659575,
 0.925531914893617,
 0.8936170212765957,
 0.9361702127659575,
 0.8617021276595744,
 0.9468085106382979,
 0.9042553191489362,
 0.9468085106382979,
 0.9042553191489362,
 0.9361702127659575,
 0.8404255319148937,
 0.851063829787234,
 0.8404255319148937,
 0.925531914893617,
 0.925531914893617,
 0.9574468085106383,
 0.8404255319148937,
 0.9468085106382979,
 0.9680851063829787,
 0.9787234042553191,
 0.9468085106382979,
 0.9893617021276596,
 0.9680851063829787,
 0.6914893617021277,
 0.8085106382978723,
 0.43617021276595747,
 0.9468085106382979,
 0.8829787234042553,
 0.9787234042553191,
 0.8829787234042553,
 0.85106382

In [78]:
# Features Transforming
# Numerical: year is min-max scaled, odometer is divided by its maximum.
year = cleaned_cars['year'].sub(year_min).div(year_max - year_min).tolist()
odom = cleaned_cars['odometer'].div(cleaned_cars['odometer'].max()).tolist()

# Categorical/One Hot Encode: the distinct values of each column, in order of
# first appearance. A value's position in its list is its one-hot column.
(manu, cond, cycl, fuel, tit, tran,
 drive, size, typ, paint, state, dupl) = (
    cleaned_cars[column].unique().tolist()
    for column in (
        'manufacturer', 'condition', 'cylinders', 'fuel',
        'title_status', 'transmission', 'drive', 'size',
        'type', 'paint_color', 'state', 'duplicate_descriptions',
    )
)

In [79]:
# One Hot vector for each feature: an all-zero matrix with one row per car and
# one column per distinct value of that feature.
(manu_One, cond_One, cycl_One, fuel_One, tit_One, tran_One,
 drive_One, size_One, typ_One, paint_One, state_One, dupl_One) = (
    np.zeros((len(cleaned_cars), len(levels)))
    for levels in (manu, cond, cycl, fuel, tit, tran,
                   drive, size, typ, paint, state, dupl)
)

In [80]:
# Transform all features
def _fill_one_hot(matrix, categories, values):
    """Set matrix[r, c] = 1 where c is the position of values[r] in categories.

    matrix:     pre-allocated 2-D numpy array, modified in place.
    categories: list of distinct category values; column c stands for categories[c].
    values:     pandas Series holding one category value per row of matrix.

    Raises ValueError if a value is missing from categories (the same error
    list.index() raised in the per-row version of this code).

    The position lookup is built once per feature and the ones are written
    with a single indexed assignment, instead of calling .iloc and
    list.index() once per row in a Python loop.
    """
    position = {category: k for k, category in enumerate(categories)}
    columns = values.map(position)
    if columns.isna().any():
        raise ValueError("value not present in categories")
    matrix[np.arange(len(values)), columns.values.astype(int)] = 1


for _matrix, _categories, _column in [
    (manu_One, manu, 'manufacturer'),
    (cond_One, cond, 'condition'),
    (cycl_One, cycl, 'cylinders'),
    (fuel_One, fuel, 'fuel'),
    (tit_One, tit, 'title_status'),
    (tran_One, tran, 'transmission'),
    (drive_One, drive, 'drive'),
    (size_One, size, 'size'),
    (typ_One, typ, 'type'),
    (paint_One, paint, 'paint_color'),
    (state_One, state, 'state'),
    (dupl_One, dupl, 'duplicate_descriptions'),
]:
    _fill_one_hot(_matrix, _categories, cleaned_cars[_column])

In [46]:
# Get model category counts and transform them for those above 100
# categoryCounts maps each 'model' value to its number of rows, in order of
# first appearance.
categoryCounts = {}
for x in cleaned_cars['model']:
    categoryCounts[x] = categoryCounts.get(x, 0) + 1
# Only models with more than 100 rows get their own one-hot column.
categories = [c for c in categoryCounts if categoryCounts[c] > 100]
catID = dict(zip(list(categories),range(len(categories))))

# One-hot encode the kept models. Rows whose model is not in catID stay
# all-zero. The column lookup is vectorised with Series.map instead of two
# .iloc lookups per row in a Python loop.
mod_One = np.zeros((len(cleaned_cars), len(catID)))
mod_codes = cleaned_cars['model'].map(catID)   # NaN where the model was not kept
kept = mod_codes.notna().values
mod_One[np.flatnonzero(kept), mod_codes.values[kept].astype(int)] = 1

In [81]:
# List of all features to append
car_columns = [year,
               odom,
               manu_One,
               mod_One,
               cond_One,
               cycl_One,
               fuel_One,
               tit_One,
               tran_One,
               drive_One,
               size_One,
               typ_One,
               paint_One,
               state_One,
               dupl_One]


def _as_2d(feature):
    """Return feature as a 2-D array; a 1-D sequence becomes a single column."""
    arr = np.asarray(feature)
    return arr.reshape(-1, 1) if arr.ndim == 1 else arr


# Stack all features side by side into one (rows x total columns) matrix.
# Every entry is normalised the same way and concatenated once. The earlier
# loop special-cased the first entry with np.array([i]).reshape(-1, 1), which
# would flatten a 2-D first feature into one long column, and it re-copied the
# growing matrix on every iteration.
clean_car_data = np.concatenate([_as_2d(feature) for feature in car_columns], axis=1)

In [82]:
# Get training data: 70% of the rows go to training, 30% are held out.
x_train, x_test, y_train, y_test = train_test_split(
    clean_car_data,
    cleaned_cars['price'],
    test_size=0.3,
    random_state=42,
)

In [83]:
# Get validation and testing data: split the held-out rows in half.
# NOTE(review): x_test / y_test are both input and output here, so re-running
# this cell on its own halves the test set again. Re-run the previous cell first.
x_valid, x_test, y_valid, y_test = train_test_split(
    x_test,
    y_test,
    test_size=0.5,
    random_state=42,
)

In [84]:
def RMSE(predictions, labels):
    """Return the root-mean-squared error between predictions and labels.

    Parameters
    ----------
    predictions, labels : array-like of numbers with the same shape
        Compared position by position. A pandas Series is read in row order;
        its index labels are ignored.

    Returns
    -------
    numpy.float64
        sqrt(mean((predictions - labels) ** 2))

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ. The zip-based version
        silently truncated to the shorter input.
    """
    # np.asarray drops any pandas index, so the subtraction is positional,
    # matching zip() over the values.
    predicted = np.asarray(predictions, dtype=float)
    actual = np.asarray(labels, dtype=float)
    if predicted.shape != actual.shape:
        raise ValueError(
            "predictions and labels must have the same shape, got "
            + str(predicted.shape) + " and " + str(actual.shape)
        )
    if predicted.size == 0:
        raise ValueError("RMSE is undefined for empty input")
    return np.sqrt(np.mean((predicted - actual) ** 2))

In [85]:
# Make and train linear regression model
# (least-squares fit with an intercept, on the 70% training split)
lin_reg_model = LinearRegression(fit_intercept=True)
lin_reg_model.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [86]:
# Prediction on train: RMSE of the fitted model on the rows it was trained on
# (same units as the 'price' target).
lin_reg_train_preds = lin_reg_model.predict(x_train)
lin_reg_train_RMSE = RMSE(lin_reg_train_preds, y_train)
lin_reg_train_RMSE

7015.781470508533

In [87]:
# Prediction on validation: RMSE on held-out rows the model was not fitted on.
lin_reg_valid_preds = lin_reg_model.predict(x_valid)
lin_reg_valid_RMSE = RMSE(lin_reg_valid_preds, y_valid)
lin_reg_valid_RMSE

7040.70049790455

### Ablation

In [88]:
def create_features(columns_list):
    """Stack feature columns side by side into one 2-D feature matrix.

    Parameters
    ----------
    columns_list : list of array-likes
        Each entry is either 1-D (one value per row, used as a single column)
        or 2-D (rows x columns, used as-is). All entries must have the same
        number of rows.

    Returns
    -------
    numpy.ndarray
        The entries concatenated along axis 1, in list order. An empty
        columns_list yields an empty 1-D array.
    """
    blocks = []
    for column in columns_list:
        data = np.array(column)

        # A flat sequence is a single feature: turn it into an (n, 1) column.
        if len(data.shape) == 1:
            data = data.reshape(-1, 1)

        blocks.append(data)

    if not blocks:
        return np.array([])

    # One concatenate over all blocks. The earlier loop used `size == 0` to
    # detect "nothing accumulated yet", so with zero-row inputs every new
    # column replaced the previous ones instead of being appended.
    return np.concatenate(blocks, axis=1)

def get_evaluations(x_train, x_valid, y_train, y_valid):
    """Fit a linear regression on the training data and score it with RMSE.

    Returns a tuple (model, train_RMSE, valid_RMSE): the fitted
    LinearRegression, its RMSE on the training data, and its RMSE on the
    validation data.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    model = LinearRegression(fit_intercept=True).fit(x_train, y_train)

    # Score on the data the model was fitted on, then on the validation data.
    train_RMSE = RMSE(model.predict(x_train), y_train)
    valid_RMSE = RMSE(model.predict(x_valid), y_valid)

    return model, train_RMSE, valid_RMSE
    
def find_ablation(columns_list, y):
    """Leave-one-out ablation over the feature groups in columns_list.

    Fits one model on all feature groups, then one model per group with that
    group removed, printing the train and validation RMSE of each fit.

    Returns a dict of fitted models: key 'all' for the full feature set and
    key i (int) for the fit without columns_list[i].
    """

    def _fit_on(feature_columns):
        # Same split for every fit: 70% train, then the held-out 30% is halved
        # and the first half is used as validation.
        features = create_features(feature_columns)
        x_fit, x_hold, y_fit, y_hold = train_test_split(features, y, test_size=0.3, random_state=42)
        x_val, _, y_val, _ = train_test_split(x_hold, y_hold, test_size=0.5, random_state=42)
        return get_evaluations(x_fit, x_val, y_fit, y_val)

    all_models = {}

    fitted, tr_RMSE, va_RMSE = _fit_on(columns_list)
    all_models['all'] = fitted
    print("With all features: Train RMSE " + str(tr_RMSE) + " | Valid RMSE " + str(va_RMSE))

    for i in range(len(columns_list)):
        # Work on a copy so the caller's list is left untouched.
        remaining = columns_list.copy()
        del remaining[i]

        fitted, tr_RMSE, va_RMSE = _fit_on(remaining)
        all_models[i] = fitted
        print("Without feature " + str(i) + ": Train RMSE " + str(tr_RMSE) + " | Valid RMSE " + str(va_RMSE))

    return all_models

def find_double_ablation(columns_list, y):
    """Leave-two-out ablation over the feature groups in columns_list.

    Fits one model on all feature groups, then one model for every unordered
    pair (i, j) with i < j, with columns_list[i] and columns_list[j] both
    removed. The train and validation RMSE of each fit are printed.

    Returns a dict of fitted models: key 'all' for the full feature set and
    key (i, j) for the fit without groups i and j. Both numbers are positions
    in columns_list and match the printed "Without features i and j" label.
    Previously the second key element was the index after the first pop
    (one less than the printed value), so keys did not name the removed pair.
    """

    def _fit_on(feature_columns):
        # Same split for every fit: 70% train, then the held-out 30% is halved
        # and the first half is used as validation.
        features = create_features(feature_columns)
        x_fit, x_hold, y_fit, y_hold = train_test_split(features, y, test_size=0.3, random_state=42)
        x_val, _, y_val, _ = train_test_split(x_hold, y_hold, test_size=0.5, random_state=42)
        return get_evaluations(x_fit, x_val, y_fit, y_val)

    all_models = {}

    fitted, tr_RMSE, va_RMSE = _fit_on(columns_list)
    all_models['all'] = fitted
    print("With all features: Train RMSE " + str(tr_RMSE) + " | Valid RMSE " + str(va_RMSE))

    n_groups = len(columns_list)
    for i in range(n_groups):
        for j in range(i + 1, n_groups):
            # Build the reduced list from original positions, so i and j mean
            # the same thing in the key, the label and the removal.
            remaining = [group for k, group in enumerate(columns_list) if k != i and k != j]

            fitted, tr_RMSE, va_RMSE = _fit_on(remaining)

            all_models[(i, j)] = fitted
            print("Without features " + str(i) + " and " + str(j) +
                  ": Train RMSE " + str(tr_RMSE) + " | Valid RMSE " + str(va_RMSE))

    return all_models

In [93]:
# Feature groups for the first ablation run (no duplicate-description feature).
# The position in this list is the "feature N" number printed by find_ablation.
first_ablation_columns = [
    year,       # 0
    odom,       # 1
    manu_One,   # 2
    mod_One,    # 3
    cond_One,   # 4
    cycl_One,   # 5
    fuel_One,   # 6
    tit_One,    # 7
    tran_One,   # 8
    drive_One,  # 9
    size_One,   # 10
    typ_One,    # 11
    paint_One,  # 12
    state_One,  # 13
]

In [96]:
# Feature groups for the second ablation run: the first run's groups plus the
# duplicate-description one-hot. The position in this list is the "feature N"
# number printed by find_ablation.
second_ablation_columns = [
    year,       # 0
    odom,       # 1
    manu_One,   # 2
    mod_One,    # 3
    cond_One,   # 4
    cycl_One,   # 5
    fuel_One,   # 6
    tit_One,    # 7
    tran_One,   # 8
    drive_One,  # 9
    size_One,   # 10
    typ_One,    # 11
    paint_One,  # 12
    state_One,  # 13
    dupl_One,   # 14
]

In [94]:
# Leave-one-out ablation over the 14 feature groups (without dupl_One); prints
# train/validation RMSE for the full set and for each removed group.
first_ablation = find_ablation(first_ablation_columns, cleaned_cars['price'])

With all features: Train RMSE 7038.197748807188 | Valid RMSE 7063.529033458708
Without feature 0: Train RMSE 7361.667136782332 | Valid RMSE 7414.56143973078
Without feature 1: Train RMSE 8061.65552326332 | Valid RMSE 8097.0450603902655
Without feature 2: Train RMSE 7196.1995019767965 | Valid RMSE 7239.606426378233
Without feature 3: Train RMSE 7177.803282577516 | Valid RMSE 7172.694704331935
Without feature 4: Train RMSE 7111.353038217762 | Valid RMSE 7121.082530615473
Without feature 5: Train RMSE 7109.989963311475 | Valid RMSE 7150.627393415714
Without feature 6: Train RMSE 7215.9388225818175 | Valid RMSE 7270.555318267805
Without feature 7: Train RMSE 7074.3815077868085 | Valid RMSE 7090.695223038541
Without feature 8: Train RMSE 7049.010335884166 | Valid RMSE 7081.163468511664
Without feature 9: Train RMSE 7109.233131143464 | Valid RMSE 7158.732899812135
Without feature 10: Train RMSE 7045.3366058558495 | Valid RMSE 7070.384487710459
Without feature 11: Train RMSE 7140.924774988746

In [97]:
# Same leave-one-out ablation with dupl_One added as a 15th feature group.
second_ablation = find_ablation(second_ablation_columns, cleaned_cars['price'])

With all features: Train RMSE 7015.781470508533 | Valid RMSE 7040.70049790455
Without feature 0: Train RMSE 7338.961260745495 | Valid RMSE 7390.689041546563
Without feature 1: Train RMSE 8036.3717049342 | Valid RMSE 8075.6634714021275
Without feature 2: Train RMSE 7171.979275116932 | Valid RMSE 7214.1495300418665
Without feature 3: Train RMSE 7149.513218913484 | Valid RMSE 7142.650908652812
Without feature 4: Train RMSE 7090.259770669415 | Valid RMSE 7100.507956881796
Without feature 5: Train RMSE 7087.509691040395 | Valid RMSE 7127.592110602163
Without feature 6: Train RMSE 7191.730753864489 | Valid RMSE 7244.407149351614
Without feature 7: Train RMSE 7052.200272589356 | Valid RMSE 7068.688837023177
Without feature 8: Train RMSE 7026.69336167199 | Valid RMSE 7058.355424357105
Without feature 9: Train RMSE 7086.9954888977345 | Valid RMSE 7134.763557904511
Without feature 10: Train RMSE 7023.597934831996 | Valid RMSE 7048.59988727383
Without feature 11: Train RMSE 7117.875332885606 | Va

In [60]:
# Leave-two-out ablation over the 14 feature groups; prints train/validation
# RMSE for the full set and for every removed pair.
first_double_ablation = find_double_ablation(first_ablation_columns, cleaned_cars['price'])

With all features: Train RMSE 7038.197748807188 | Valid RMSE 7063.529033458708
Without features 0 and 1: Train RMSE 8602.039307045397 | Valid RMSE 8674.089117351383
Without features 0 and 2: Train RMSE 7512.598036257236 | Valid RMSE 7589.941954618639
Without features 0 and 3: Train RMSE 7530.600317120363 | Valid RMSE 7547.057206575836
Without features 0 and 4: Train RMSE 7505.756023759494 | Valid RMSE 7539.260005666788
Without features 0 and 5: Train RMSE 7392.182994826418 | Valid RMSE 7457.166521293509
Without features 0 and 6: Train RMSE 7540.623222235628 | Valid RMSE 7632.2178313696395
Without features 0 and 7: Train RMSE 7396.962609406127 | Valid RMSE 7437.668877879769
Without features 0 and 8: Train RMSE 7362.019735305869 | Valid RMSE 7415.20785529813
Without features 0 and 9: Train RMSE 7455.381683165372 | Valid RMSE 7535.194052828997
Without features 0 and 10: Train RMSE 7370.086937810868 | Valid RMSE 7418.874526859306
Without features 0 and 11: Train RMSE 7455.3531264745525 | V

In [44]:
# Feature groups for the "model appears more than 50 times" experiment.
# NOTE(review): this list has the same entries as first_ablation_columns. The
# name suggests mod_One was rebuilt with a count threshold of 50 before this
# cell ran, but the only visible construction of mod_One uses `> 100`.
# Confirm, and make the threshold an explicit parameter.
limit_50_columns = [year,
               odom,
               manu_One,
               mod_One,
               cond_One,
               cycl_One,
               fuel_One,
               tit_One,
               tran_One,
               drive_One,
               size_One,
               typ_One,
               paint_One,
               state_One]

In [45]:
# Above 50 appearances for model feature
# NOTE(review): the second split below uses test_size=0.3, whereas the main
# pipeline and the ablation functions use 0.5. The validation set here is
# therefore a different subset, and this Valid RMSE is not directly comparable
# with the ablation results. Confirm whether that is intended.
limit_50_features = create_features(limit_50_columns)
        
x_tr_50, x_te_50, y_tr_50, y_te_50 = train_test_split(limit_50_features, cleaned_cars['price'], test_size=0.3, random_state=42)
x_va_50, x_te_50, y_va_50, y_te_50 = train_test_split(x_te_50, y_te_50, test_size=0.3, random_state=42)
        
model_50, train_50_RMSE, valid_50_RMSE = get_evaluations(x_tr_50, x_va_50, y_tr_50, y_va_50)

print("Train RMSE " + str(train_50_RMSE))
print("Valid RMSE " + str(valid_50_RMSE))

Train RMSE 6985.137285830318
Valid RMSE 6912.987529889483


In [39]:
# Above 25 appearances for model feature
# NOTE(review): `limit_25_columns` is not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh Restart & Run All. It
# presumably mirrored limit_50_columns with mod_One rebuilt at a threshold of
# 25. Confirm and restore the defining cell.
# NOTE(review): the second split uses test_size=0.3, unlike the 0.5 used by the
# main pipeline and the ablation functions.
limit_25_features = create_features(limit_25_columns)
        
x_tr_25, x_te_25, y_tr_25, y_te_25 = train_test_split(limit_25_features, cleaned_cars['price'], test_size=0.3, random_state=42)
x_va_25, x_te_25, y_va_25, y_te_25 = train_test_split(x_te_25, y_te_25, test_size=0.3, random_state=42)
        
model_25, train_25_RMSE, valid_25_RMSE = get_evaluations(x_tr_25, x_va_25, y_tr_25, y_va_25)

print("Train RMSE " + str(train_25_RMSE))
print("Valid RMSE " + str(valid_25_RMSE))

Train RMSE 6899.386262976084
Valid RMSE 6856.100200443149


In [29]:
# Above 5 appearances for model feature
# NOTE(review): `limit_5_columns` is not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh Restart & Run All. It
# presumably mirrored limit_50_columns with mod_One rebuilt at a threshold of
# 5. Confirm and restore the defining cell.
# NOTE(review): the second split uses test_size=0.3, unlike the 0.5 used by the
# main pipeline and the ablation functions.
limit_5_features = create_features(limit_5_columns)
        
x_tr_5, x_te_5, y_tr_5, y_te_5 = train_test_split(limit_5_features, cleaned_cars['price'], test_size=0.3, random_state=42)
x_va_5, x_te_5, y_va_5, y_te_5 = train_test_split(x_te_5, y_te_5, test_size=0.3, random_state=42)
        
model_5, train_5_RMSE, valid_5_RMSE = get_evaluations(x_tr_5, x_va_5, y_tr_5, y_va_5)

print("Train RMSE " + str(train_5_RMSE))
print("Valid RMSE " + str(valid_5_RMSE))

Train RMSE 6475.5888129185805
Valid RMSE 6634.978308415088
