# Predicting the price of a used car (SUV/Sedan) in the Greater Cleveland area


In [None]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
%matplotlib inline

In [None]:
sns.set_style("whitegrid")

In [None]:
df = pd.read_csv("vehicles.csv")
pd.set_option('display.max_columns', None)
df.head()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.describe()

In [None]:
df.info()

## Data cleaning

### ID for each vehicle must be unique. Looks like we have a few duplicates

In [None]:
df["Id"].value_counts()

 Looks like the labels also got scraped from each page!

In [None]:
df.drop(index = df[df["Id"] == "Id"].index,inplace=True)

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.dtypes

### Some of them are numerical columns while some are categorical ones. We need to change the types before we could do any feature engineering. Pandas doesn't let us change the dtype to int when there are null values in the column. Let's change all the numerical columns to float type

Numerical feature Fuel_economy_combined has MPG at the end. Let's get rid of the MPG from the column

In [None]:
df["Fuel_economy_combined"]=df["Fuel_economy_combined"].str.replace("MPG","")

In [None]:
columns_to_float = ["Fuel_economy_combined","Accidents","Year","Days_on_market","Distance","Mileage",
                    "Additional_features","Previous_owners","Seats","Price"]

In [None]:
# Cast with the builtin float: np.float was only an alias for it and was removed
# in NumPy 1.24. float can hold NaN, so missing values survive the conversion.
df[columns_to_float] = df[columns_to_float].astype(float)

In [None]:
df.dtypes

##  Missing values

In [None]:
plt.figure(figsize=(15,8))
sns.heatmap(df.isnull(),cbar=False,cmap='viridis')

In [None]:
df.drop("Price_drops",axis=1,inplace=True) 

### Let's look at each of the columns and see how we can impute the missing values

In [None]:
missing = df.isnull().sum().sort_values(ascending=False)

In [None]:
missing/len(df)*100   # In terms of percentage

###  Fuel economy

In [None]:
plt.figure(figsize=(10,6))
sns.distplot(df["Fuel_economy_combined"],fit = norm)

In [None]:
# Looks more or less normally distributed. We can impute the missing values with the mean
df["Fuel_economy_combined"].mean()

In [None]:
# Assign the result back instead of fillna(..., inplace=True) on df[...]:
# the chained inplace form acts on a temporary under pandas Copy-on-Write
# and would leave the column unchanged.
df["Fuel_economy_combined"] = df["Fuel_economy_combined"].fillna(df["Fuel_economy_combined"].mean())

### Photos

In [None]:
# Missing means "no photos". Assign back rather than using a chained inplace
# fillna, which is a no-op under pandas Copy-on-Write.
df["Photos"] = df["Photos"].fillna(0)

In [None]:
# Encode the scraped string "True" as 1. Assigned back because a chained
# inplace replace does not reach the DataFrame under pandas Copy-on-Write.
df["Photos"] = df["Photos"].replace("True", 1)

###  Drivetrain

In [None]:
df["Drivetrain"].value_counts()

Four wheel drive is same as all wheel drive. Also, Four by Two is similar to FWD or RWD and switches in between.

In [None]:
# Fold the two rare labels into the main ones in a single pass. The result is
# assigned back because replace(..., inplace=True) on df["Drivetrain"] would
# only modify a temporary under pandas Copy-on-Write.
df["Drivetrain"] = df["Drivetrain"].replace({
    "FOUR_WHEEL_DRIVE": "ALL_WHEEL_DRIVE",
    "FOUR_BY_TWO": "FRONT_WHEEL_DRIVE",
})

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x = "Drivetrain",y ="Price",data =df)

The averages and the overall distributions for FWD and RWD don't deviate much from each other.

In [None]:
# Merge RWD into FWD; assigned back instead of a chained inplace replace,
# which has no effect on df under pandas Copy-on-Write.
df["Drivetrain"] = df["Drivetrain"].replace("REAR_WHEEL_DRIVE", "FRONT_WHEEL_DRIVE")

In [None]:
df["Drivetrain"].value_counts()

In [None]:
df["Drivetrain"].isnull().sum() #check for missing values

In [None]:
df.groupby("Body_type")["Drivetrain"].value_counts() # See how does the drivetrain stack up against the bodytype

Looks like more SUVs have AWD while sedans predominantly have FWD.
So, let's impute the missing value with AWD when the bodytype is suv and FWD when sedan.

In [None]:
# Function to impute missing values in the drivetrain column based on the kind of bodytype
def impute_drive_train(columns):
    """Return the drivetrain for one row, filling a missing value from the body type.

    Parameters
    ----------
    columns : sequence of two values
        ``(body_type, drive_train)`` in that order, e.g. one row of
        ``df[["Body_type", "Drivetrain"]]`` as handed over by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The existing drivetrain when it is not null; otherwise "ALL_WHEEL_DRIVE"
    for the body type "SUV / Crossover" and "FRONT_WHEEL_DRIVE" for any other
    body type.
    """
    # Unpack by iteration instead of columns[0] / columns[1]: integer keys on a
    # label-indexed row Series are deprecated as positional lookups in pandas 2.x.
    body_type, drive_train = columns
    if not pd.isnull(drive_train):
        return drive_train
    if body_type == "SUV / Crossover":
        return "ALL_WHEEL_DRIVE"
    return "FRONT_WHEEL_DRIVE"

In [None]:
df["Drivetrain"] = df[["Body_type","Drivetrain"]].apply(impute_drive_train,axis=1)

In [None]:
df["Drivetrain"].value_counts()  # No need for this line

###  Rating

In [None]:
# Rating
df["Rating"].value_counts()

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x = "Rating",y = "Price",data =df)

In [None]:
# Collapse the six scraped ratings into two groups. Assigned back because a
# chained replace(..., inplace=True) would only touch a temporary under
# pandas Copy-on-Write.
rating_groups = {
    **dict.fromkeys(["FAIR_PRICE", "GOOD_PRICE", "GREAT_PRICE"], "Great_Price"),
    **dict.fromkeys(["POOR_PRICE", "OVERPRICED", "OUTLIER"], "Poor_Price"),
}
df["Rating"] = df["Rating"].replace(rating_groups)

In [None]:
# Unrated listings default to "Great_Price"; assigned back instead of a
# chained inplace fillna (a no-op on df under pandas Copy-on-Write).
df["Rating"] = df["Rating"].fillna("Great_Price")

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x = "Rating",y = "Price",data =df)

#### Note: Vehicles rated "Great" are priced lower than those rated "Poor".

###  Previous owners

In [None]:
sns.distplot(df["Previous_owners"])

In [None]:
df["Previous_owners"].value_counts()

In [None]:
# Missing owner counts are imputed as a single previous owner. Assigned back
# instead of a chained inplace fillna (a no-op on df under pandas Copy-on-Write).
df["Previous_owners"] = df["Previous_owners"].fillna(1.0)

## The rest of the columns have less than 2 % of missing values. Let's drop the rows with the missing values.

In [None]:
df.dropna(inplace=True)

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(df.isnull(),cbar=False,cmap='viridis') # Checking one more time for any missing value.

## Categorical features
* Engine
* Drivetrain : Done already!
* Transmission
* Trim
* Photos
* Rating : Done already!
* Color
* Make
* Model
* Bodytype

### Engine

In [None]:
df["Engine"].value_counts()

Combining similar engines into one category:
* I3, I4 and I5: base engine
* V6, I6 and V8: strong engine
* H4 and H6: horizontally-opposed (boxer) engine

In [None]:
# Map every engine code onto the representative label of its group
# (I3/I5 -> I4, V8/I6 -> V6, H6 -> H4). Assigned back because a chained
# replace(..., inplace=True) would not reach df under pandas Copy-on-Write.
engine_groups = {
    **dict.fromkeys(["I3", "I5", "I4"], "I4"),
    **dict.fromkeys(["V6", "V8", "I6"], "V6"),
    **dict.fromkeys(["H4", "H6"], "H4"),
}
df["Engine"] = df["Engine"].replace(engine_groups)

### Transmission

In [None]:
df["Transmission"].value_counts()

Lots of different names for the same kinds of transmission.
The three main categories are:
1. Automatic
2. Manual
3. Continuously Variable Transmission (CVT)

Let's create a mask for each of them

In [None]:
# One boolean mask per transmission family over the distinct transmission
# labels (in value_counts order). A label can satisfy more than one mask.
# The patterns carry no capture group: str.contains only tests for a match,
# and a group makes pandas emit a "has match groups" UserWarning.
transmission_labels = df["Transmission"].value_counts().index
mask_automatic = transmission_labels.str.contains("Automatic", case=False, regex=False)
mask_manual = transmission_labels.str.contains("Manual|Clutch", case=False, regex=True)
mask_cvt = transmission_labels.str.contains("Continuous|CVT", case=False, regex=True)

In [None]:
# Turn each mask into the list of raw transmission labels it selects.
distinct_transmissions = df["Transmission"].value_counts().index
auto_list, manual_list, cvt_list = (
    distinct_transmissions[selector].tolist()
    for selector in (mask_automatic, mask_manual, mask_cvt)
)

In [None]:
# Apply the three groupings in the original order (Automatic, Manual, CVT):
# a label listed in several groups keeps the first name it receives, since
# later passes only look for the raw labels.
# Each pass is assigned back because replace(..., inplace=True) on
# df["Transmission"] would not modify df under pandas Copy-on-Write.
for raw_labels, family in [
    (auto_list, "Automatic"),
    (manual_list, "Manual"),
    (cvt_list, "CVT"),
]:
    df["Transmission"] = df["Transmission"].replace(dict.fromkeys(raw_labels, family))

In [None]:
df["Transmission"].value_counts()

In [None]:
plt.figure(figsize=(10,6))
# x / y are passed by keyword: seaborn >= 0.12 rejects positional data vectors,
# and this matches the other boxplot cells in the notebook.
sns.boxplot(x = "Transmission", y = "Price", data = df)
plt.xticks(rotation=0)
plt.tight_layout()

In [None]:
# Fold CVT into Automatic; assigned back instead of a chained inplace replace,
# which has no effect on df under pandas Copy-on-Write.
df["Transmission"] = df["Transmission"].replace("CVT", "Automatic")

In [None]:
df["Transmission"].value_counts()

In [None]:
plt.figure(figsize=(10,6))
# x / y are passed by keyword: seaborn >= 0.12 rejects positional data vectors,
# and this matches the other boxplot cells in the notebook.
sns.boxplot(x = "Transmission", y = "Price", data = df)
plt.xticks(rotation=0)
plt.tight_layout()

### Seller type

In [None]:
df["Seller_type"].value_counts()

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x = "Seller_type",y = "Price", data = df)

The spread of prices when buying from a dealer is quite large, which is expected. However, regardless of the other factors, a vehicle bought from a private party is almost always priced lower, interesting!

### Photos
Having photos on the website could potentially be a variable when it comes to the pricing. Let's see!

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x = "Photos", y = "Price", data = df)

## Make, Body type, Model and Trim levels

### Body type

In [None]:
df['Body_type'].value_counts()

In [None]:
plt.figure(figsize=(15,8))
sns.boxplot(x = "Body_type", y = "Price", data = df)

### Make

In [None]:
df['Make'].value_counts()

In [None]:
plt.figure(figsize=(15,8))
sns.boxplot(x = "Make", y = "Price", data = df, hue = "Body_type")

### For body type, we have only two categories: sedan and suv/crossover.

### Almost all the automakers are represented in both the sedan as well as the suv segment except Jeep which only builds suvs and Volkswagen which is known for building sedans.

### For almost all the makes, the respective SUVs tend to be slightly more expensive than their sedans.

In [None]:
# Tabular output only -- no plt.figure() here, which would just render an
# empty figure underneath the table.
df.groupby("Make")["Price"].mean().sort_values(ascending = False)

Except for Jeep and Volkswagen, the mean price for the rest of the automakers is hovering in the 13k to 16k range.

Since Jeep sells only SUVs, it's no surprise that the average price of its vehicles is the highest.
Similarly, Volkswagen is at the bottom because it sells only sedans.

Each carmaker has a trim level! We need to change it to numeric values so that we can use them to build our model.

### Trim levels

In [None]:
df["Trim_copy"] = df["Trim"].copy()

In [None]:
# Look for various trims across make
for make in df["Make"].unique():
    print("The make is: {} and the trims are: {}".format(make,df[df["Make"] == make]["Trim"].unique().tolist()))
    print("-"*115)

## 3 levels of trims for each make

### Masks for each trim

In [None]:
# Mask for base trim, evaluated over the distinct trim labels.
# The pattern has no capture group: str.contains only tests for a match, and a
# group makes pandas emit a "has match groups" UserWarning.
# NOTE(review): with case=False the single-letter alternative `L` matches every
# trim that contains the letter "l" anywhere -- confirm this breadth is intended.
mask_1 = df["Trim_copy"].value_counts().index.str.contains(
    "LX|SportSi|Gx|L|LE|Sedan|Base|LS|RS|FWD|XLT|value|ES|SR|Latitude",
    case=False,regex=True)

In [None]:
# Mask for mid trim, evaluated over the distinct trim labels.
# The pattern has no capture group: str.contains only tests for a match, and a
# group makes pandas emit a "has match groups" UserWarning.
# NOTE(review): with case=False the single-letter alternative `S` matches every
# trim that contains the letter "s" anywhere -- confirm this breadth is intended.
mask_2 = df["Trim_copy"].value_counts().index.str.contains(
    "EX|Navigation|RES|SE|S|LT|Hybrid|SEL|ST|Signature|SX|SV|SL|Altitude",
    case=False,regex=True)

In [None]:
# Mask for high trim, evaluated over the distinct trim labels.
# The pattern has no capture group (avoids the pandas "has match groups"
# UserWarning) and the dot in "2.0T" is escaped so it matches a literal "."
# rather than any character.
mask_3 = df["Trim_copy"].value_counts().index.str.contains(
    r"Touring|EX-T|Elite|XLE|XSE|Limited|Edition|Premier|Titanium|Luxury|Premium|Ultimate|Grand|GT|Platinum|GLI|2\.0T|Trailhawk|Overland",
    case=False,regex=True)

In [None]:
len(df["Trim_copy"].value_counts())   #Omega  # This sample space is divided into 8 parts.

In [None]:
len(df["Trim_copy"].value_counts().index[mask_1])  # Mask1

In [None]:
len(df["Trim_copy"].value_counts().index[mask_2]) # Mask2

In [None]:
len(df["Trim_copy"].value_counts().index[mask_3]) # Mask3

### Some of the values are shared among the 3 masks

<img src="venn.png">

### Here, we're going to replace the trim levels with the respective numbers:
* Values that belong to 100: 1
* Values that belong to 010: 3
* Values that belong to 001: 5
* Values that belong to 110: 2 : Mean of 100 and 010
* Values that belong to 101: 3 : Mean of 100 and 001
* Values that belong to 011: 4 : Mean of 010 and 001
* Values that belong to 111: 3 : Mean of 100, 010 and 001
* Values that belong to 000: 5 : These are usually the models which have fancy names and are mostly higher trim.

In [None]:
# Split the distinct trim labels into the eight regions of the three-mask
# Venn diagram. The suffix reads as (mask_1, mask_2, mask_3) membership bits.
trim_labels = df["Trim_copy"].value_counts().index
in_1, in_2, in_3 = mask_1, mask_2, mask_3
list_111 = trim_labels[in_1 & in_2 & in_3].tolist()       # 111
list_110 = trim_labels[in_1 & in_2 & ~in_3].tolist()      # 110
list_101 = trim_labels[in_1 & ~in_2 & in_3].tolist()      # 101
list_011 = trim_labels[~in_1 & in_2 & in_3].tolist()      # 011
list_100 = trim_labels[in_1 & ~in_2 & ~in_3].tolist()     # 100
list_010 = trim_labels[~in_1 & in_2 & ~in_3].tolist()     # 010
list_001 = trim_labels[~in_1 & ~in_2 & in_3].tolist()     # 001
list_000 = trim_labels[~in_1 & ~in_2 & ~in_3].tolist()    # 000

In [None]:
# Label every trim with a code naming the masks it matched: 1 = mask_1 only,
# 3 = mask_2 only, 5 = mask_3 only, the digits concatenated for overlaps
# (13, 15, 35, 135) and 0 when no mask matched.
# Each pass is assigned back because replace(..., inplace=True) on
# df["Trim_copy"] would not modify df under pandas Copy-on-Write.
for trims, code in [
    (list_100, 1),
    (list_010, 3),
    (list_001, 5),
    (list_110, 13),
    (list_101, 15),
    (list_011, 35),
    (list_111, 135),
    (list_000, 0),
]:
    df["Trim_copy"] = df["Trim_copy"].replace(dict.fromkeys(trims, code))

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x = "Trim_copy", y = "Price", data = df)

In [None]:
df["Trim_copy"].value_counts()

### Adjusting for the intersecting trims..

In [None]:
df["Trim_copy"] = df["Trim"].copy()

In [None]:
# Final trim encoding: collapse the eight mask regions onto the levels 1, 3 and 5.
# Each pass is assigned back because replace(..., inplace=True) on
# df["Trim_copy"] would not modify df under pandas Copy-on-Write.
for trims, level in [
    (list_100, 1),
    (list_010, 1),
    (list_001, 5),
    (list_110, 1),
    (list_101, 5),
    (list_011, 5),
    (list_111, 3),
    (list_000, 1),
]:
    df["Trim_copy"] = df["Trim_copy"].replace(dict.fromkeys(trims, level))

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x = "Trim_copy", y = "Price", data = df)

In [None]:
df.groupby("Trim_copy")["Price"].mean().sort_values(ascending=False)

### Model
Usually there are three levels of size for each bodytype: sedan as well as suv.
And, for the same trim, the Price usually goes up with the level. Let's see!

In [None]:
df["Model_copy"] = df["Model"].copy()

In [None]:
for make in df["Make"].unique():
    print("The make is: {} and the models are: {}".format(make,df[df["Make"] == make]["Model_copy"].unique().tolist()))
    print("-"*115)

In [None]:
# Mask for compact size, evaluated over the distinct model names.
# The pattern has no capture group: str.contains only tests for a match, and a
# group makes pandas emit a "has match groups" UserWarning.
mask_compact = df["Model_copy"].value_counts().index.str.contains(
    "Sonic|Cruze|Equinox|Volt|Trailblazer|Trax|Fiesta|Edge|EcoSport|Escape|Civic|HR-V|Element|Accent|Elantra|Kona|Tucson|Forte|Soul|Rio|Niro|Sportage|Seltos|Mazda3|CX-3|CX-30|Mirage|Lancer|Eclipse|Galant|Sentra|Versa|Rogue|Xterra|Juke|Kicks|Impreza|Crosstrek|Corolla|Yaris|C-HR|Jetta|Renegade|Compass",
    case=False,regex=True)

In [None]:
# Mask for mid size, evaluated over the distinct model names.
# The pattern has no capture group: str.contains only tests for a match, and a
# group makes pandas emit a "has match groups" UserWarning.
# NOTE(review): `Rav-4` cannot match a model spelled "RAV4" (no hyphen) --
# verify against the scraped model names.
# NOTE(review): with case=False, `CC` also matches any name containing "cc"
# (e.g. "Accent", which the compact mask lists) -- confirm this is intended.
mask_mid = df["Model_copy"].value_counts().index.str.contains(
    "Malibu|Traverse|Tahoe|Fusion|Explorer|Accord|Insight|CR-V|Passport|Sonata|Santa Fe|Optima|Sorento|Mazda6|CX-5|CX-7|Tribute|Outlander|Altima|Murano|Legacy|Forester|Outback|WRX|Tribeca|Camry|Rav-4|Highlander|4Runner|Passat|CC|Wrangler|Cherokee",
    case=False,regex=True)

In [None]:
# Mask for full size, evaluated over the distinct model names.
# The pattern has no capture group: str.contains only tests for a match, and a
# group makes pandas emit a "has match groups" UserWarning.
mask_full = df["Model_copy"].value_counts().index.str.contains(
    "Impala|Suburban|Taurus|Flex|Expedition|Pilot|Palisade|Cadenza|Stinger|Telluride|CX-9|Endeavor|Montero|Maxima|Pathfinder|Armada|Ascent|Avalon|Land Cruiser|Sequoia|Cruiser|Arteon|Grand Cherokee",
    case=False,regex=True)

### We'll use the same idea of venn diagram dealing with the intersecting samples for the category "Model"
* Values that belong to 100: 1
* Values that belong to 010: 3
* Values that belong to 001: 5
* Values that belong to 110: 3 : More inclined towards 010
* Values that belong to 101: 3 : Mean of 100 and 001
* Values that belong to 011: 4 : Mean of 010 and 001
* Values that belong to 111: 3 : Mean of 100, 010 and 001
* Values that belong to 000: 2 : The distribution is more like the mix of 100 and 010

In [None]:
# Split the distinct model names into the eight regions of the three-mask
# Venn diagram. The suffix reads as (compact, mid, full) membership bits.
model_labels = df["Model_copy"].value_counts().index
is_compact, is_mid, is_full = mask_compact, mask_mid, mask_full
model_list_111 = model_labels[is_compact & is_mid & is_full].tolist()       # 111
model_list_110 = model_labels[is_compact & is_mid & ~is_full].tolist()      # 110
model_list_101 = model_labels[is_compact & ~is_mid & is_full].tolist()      # 101
model_list_011 = model_labels[~is_compact & is_mid & is_full].tolist()      # 011
model_list_100 = model_labels[is_compact & ~is_mid & ~is_full].tolist()     # 100
model_list_010 = model_labels[~is_compact & is_mid & ~is_full].tolist()     # 010
model_list_001 = model_labels[~is_compact & ~is_mid & is_full].tolist()     # 001
model_list_000 = model_labels[~is_compact & ~is_mid & ~is_full].tolist()    # 000

In [None]:
len(df["Model_copy"].value_counts().index[(~mask_mid)&((mask_compact)&(mask_full))])

In [None]:
len(df["Model_copy"].value_counts().index[(mask_compact)&(mask_mid)&(mask_full)])

In [None]:
# Label every model with a code naming the masks it matched: 1 = compact only,
# 3 = mid only, 5 = full only, the digits concatenated for overlaps
# (13, 15, 35, 135) and 0 when no mask matched.
# Each pass is assigned back because replace(..., inplace=True) on
# df["Model_copy"] would not modify df under pandas Copy-on-Write.
for models, code in [
    (model_list_100, 1),
    (model_list_010, 3),
    (model_list_001, 5),
    (model_list_110, 13),
    (model_list_101, 15),
    (model_list_011, 35),
    (model_list_111, 135),
    (model_list_000, 0),
]:
    df["Model_copy"] = df["Model_copy"].replace(dict.fromkeys(models, code))

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x="Model_copy",y = "Price", data = df)

So, no common value for all three sizes as well as 1 and 5, that's good!
### Adjusting for the values

In [None]:
df["Model_copy"] = df["Model"].copy()

In [None]:
# Final size encoding. The 101 and 111 regions keep their diagnostic codes
# (15, 135) exactly as in the original cell.
# Each pass is assigned back because replace(..., inplace=True) on
# df["Model_copy"] would not modify df under pandas Copy-on-Write.
for models, level in [
    (model_list_100, 1),
    (model_list_010, 3),
    (model_list_001, 5),
    (model_list_110, 1),
    (model_list_101, 15),
    (model_list_011, 5),
    (model_list_111, 135),
    (model_list_000, 1),
]:
    df["Model_copy"] = df["Model_copy"].replace(dict.fromkeys(models, level))

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x="Model_copy",y = "Price", data = df)

### Color

In [None]:
df["Color"].value_counts()/len(df)*100

In [None]:
plt.figure(figsize=(10,6))
# x / y are passed by keyword: seaborn >= 0.12 rejects positional data vectors,
# and this matches the other boxplot cells in the notebook.
sns.boxplot(x = "Color", y = "Price", data = df)
plt.tight_layout()

### Color doesn't seem to affect the sales much here in our case.
### Let's drop it 

In [None]:
df.drop("Color",axis=1,inplace=True)

## Now on to the numerical features
* Accidents
* Year
* Fuel economy
* Distance
* Days on market
* Mileage
* Additional features (eg: heated seats, push-button start, etc.)
* Previous owners
* Seats

We'll analyze all of these features and see which ones affect the price the most.

In [None]:
plt.figure(figsize = (15,10))
# Correlate numeric/bool columns only: df still holds string columns at this
# point, and DataFrame.corr() raises on them from pandas 2.0 onwards instead
# of silently skipping them.
sns.heatmap(df.select_dtypes(include=["number", "bool"]).corr(),annot=True,linewidths=1,cmap = "coolwarm")

In [None]:
# Numeric/bool columns only: DataFrame.corr() raises on the remaining string
# columns from pandas 2.0 onwards.
df.select_dtypes(include=["number", "bool"]).corr()["Price"].sort_values(ascending = False) # Understand each of these blocks

### Photos, Distance and Days on market don't seem to correlate much(+ve or -ve) with the target variable, Price. Let's drop these three. 

In [None]:
df.drop("Distance",axis=1,inplace=True)
df.drop("Days_on_market",axis=1,inplace=True)
df.drop("Photos",axis=1,inplace=True) 

## Price
### This is our target variable. 
It has positive strong correlation with some features while strongly negative with some. We'll drop the columns such as Distance and days on market since they don't really correlate with the Price column.

In [None]:
plt.figure(figsize=(15,8))
sns.distplot(df["Price"],fit=norm)

In [None]:
df["Price"].sort_values(ascending=False)

### Outliers

In [None]:
df.loc[df["Price"] > 30000]

In [None]:
df.drop(index = df[df["Price"] > 30000].index, inplace=True)

### Year
We expect the price to go up with the year as newer cars are going to be more expensive.

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x = "Year", y = "Price", data = df)

### Additional features
The more features a vehicle has, the more expensive it gets!

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x = "Additional_features", y = "Price", data = df)

We do see an upward trend in the Price as the number of features increase.

### Seats

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x = "Seats", y = "Price", data = df)

### Fuel economy

In [None]:
df.groupby("Body_type")["Fuel_economy_combined"].mean()

In [None]:
sns.distplot(df["Fuel_economy_combined"])

In [None]:
sns.set_style("white")
# x / y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.jointplot(x = "Fuel_economy_combined", y = "Price", data = df)
sns.set_style("whitegrid")

This is interesting! Contrary to my initial thought that vehicles with low fuel economy must be cheaper to buy,  the opposite is true! Actually, the bigger cars with additional features tend to have low fuel economy and are usually expensive.

### Accidents and Previous owners
With the number of the accidents and previous owners, the value of the vehicle depreciates quickly. Let's see!

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x = "Accidents", y = "Price", data = df)

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x = "Previous_owners", y = "Price", data = df)

### Mileage
The value of the vehicle should go down with mileage!

In [None]:
sns.set_style("white")
# x / y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.jointplot(x = "Mileage", y = "Price", data = df)
sns.set_style("whitegrid")

In [None]:
sns.distplot(df["Mileage"],fit = norm)

### The distribution of the Mileage looks skewed. Let's do a log transformation and see if it makes it better..

In [None]:
sns.distplot(np.log(df["Mileage"]),fit = norm)

In [None]:
df["Mileage"] = df["Mileage"].apply(np.log)

### The column ID is of no use in building the model. Also, I'm not so much interested in a particular brand as I am in the body type i.e. I'm interested in predicting the price of a sedan vs an suv regardless of the carmaker. 

In [None]:
df.drop("Id",axis=1,inplace=True)
df.drop("Make",axis=1,inplace=True)

In [None]:
df.drop(["Model","Trim"],axis=1,inplace=True)

### Get dummies on the categorical columns.

In [None]:
body = pd.get_dummies(df["Body_type"],drop_first=True)
rating = pd.get_dummies(df["Rating"],drop_first=True)
drivetrain = pd.get_dummies(df["Drivetrain"],drop_first=True)
engine = pd.get_dummies(df["Engine"],drop_first=True)
transmission = pd.get_dummies(df["Transmission"],drop_first=True)
seller = pd.get_dummies(df["Seller_type"],drop_first=True)

In [None]:
df.drop(["Body_type","Rating","Drivetrain","Engine","Transmission","Seller_type"],axis=1,inplace=True)

In [None]:
df = pd.concat([df,body,rating,drivetrain,engine,transmission,seller],axis=1)

In [None]:
df.head()

## Building the model

In [None]:
# Importing the libraries

from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict,cross_val_score,cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [None]:
X = df.drop("Price",axis=1)
y = df["Price"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state= 100)              # Train-test split                                                

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.30,random_state = 100)    # Train-validation split

### Support Vector Machine regressor

In [None]:
svm_reg = SVR(epsilon=0.1,kernel="poly",C = 100, degree=2)

In [None]:
svm_reg.fit(X_train,y_train)

In [None]:
svm_reg_pred = svm_reg.predict(X_val)

In [None]:
error_svm = np.sqrt(mean_squared_error(y_val,svm_reg_pred))

In [None]:
error_svm  

In [None]:
Percentage_of_error_svm = (error_svm/y_val.mean())*100

In [None]:
Percentage_of_error_svm

This model does a terrible job at predicting the prices.

### Random forest regressor

In [None]:
# random_state pins the forest's randomness so the reported error is
# reproducible on a re-run; 100 is the seed the train/validation splits use.
random_forest = RandomForestRegressor(n_estimators=800,min_samples_split=16,max_depth=6,random_state=100)

In [None]:
random_forest.fit(X_train,y_train)

In [None]:
random_forest_pred = random_forest.predict(X_val)

In [None]:
error_rf = np.sqrt(mean_squared_error(y_val,random_forest_pred))

In [None]:
error_rf

In [None]:
Percentage_of_error_rf = (error_rf/y_val.mean())*100

In [None]:
Percentage_of_error_rf

Random forest works much better than Support Vector Machine

### Gradient boosting regressor

In [None]:
# random_state pins the estimator's randomness so the reported error is
# reproducible on a re-run; 100 is the seed the train/validation splits use.
gradient_boost = GradientBoostingRegressor(n_estimators=800,min_samples_split=16,max_depth=2,learning_rate=0.1,random_state=100)

In [None]:
gradient_boost.fit(X_train,y_train)

In [None]:
# Predict with the fitted `gradient_boost` estimator; the original referenced
# `grb`, a name never defined in this notebook (NameError on a fresh kernel).
gradient_boost_pred = gradient_boost.predict(X_val)

In [None]:
error_grb = np.sqrt(mean_squared_error(y_val,gradient_boost_pred))

In [None]:
error_grb

In [None]:
Percentage_of_error_grb = (error_grb/y_val.mean())*100

In [None]:
Percentage_of_error_grb

### Gradient boost seems to be working better than the random forest

In [None]:
print("The mean of the distribution is: {}".format(y_val.mean()))
print("The standard deviation is:       {}".format(y_val.std()))
print("The root mean squared error is:  {}".format(error_grb))
print("The error percentage is:         {}".format(Percentage_of_error_grb))

### Let's look at the distribution of the original price and the predicted price 

In [None]:
plt.figure(figsize=(10,6))
# Overlay the validation prices with the gradient-boosting predictions.
# The original plotted `grb_predict`, a name never defined in this notebook.
sns.distplot(y_val)
sns.distplot(gradient_boost_pred)

## From the figure, we could see that these two overlap a lot indicating that the model we built does a really good job predicting the prices.


###  It looks like the predictions deviate a bit from the true values towards the end of the bell curve  where the cars are high priced. The model is not able to capture this part of the distribution and it could very well be due to the following reasons:
* Vehicles with some fancy additional features could be more costly since we just counted # of features.
* Prices seem to rise much more quickly as we go higher on the trim ladder.  
* Similarly, a large car from another brand could be more expensive than other brands.
* Some brands hold their value more than other brands and that could very well have been the factor behind the error.
* The model probably would've done better had we had access to more data.

## Concluding remarks: At this point, I feel like it would lead to overfitting if we tried to match the prices too closely.  