In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Default figure size (width, height) for the plots below.
# An earlier (11.7, 8.27) setting was overridden immediately by this one,
# so only the effective value is kept.
sns.set(rc={'figure.figsize': (5, 4)})

In [None]:
# Load the sampled car listings; `dataset2` keeps an untouched deep copy of
# the raw frame while `dataset` is cleaned below.
csv_path = '../input/car-pricing-prediction/cars_sampled.csv'
dataset = pd.read_csv(csv_path)
dataset2 = dataset.copy(deep=True)

### Feature Description
* 1. dateCrawled: when this ad was first crawled, all field-values are taken from this date
* 2. name: name of the car
* 3. seller: private or dealer
* 4. offerType:
* 5. price: (target variable) the price on the ad to sell the car
* 6. abtest:
* 7. vehicleType:
* 8. yearOfRegistration: the year on which car was first registered
* 9. gearbox
* 10. powerPS: power of car in PS (horsepower)
* 11. model
* 12. kilometer: how many kilometers the car has driven
* 13. monthOfRegistration: at which month the car was first registered
* 14. fuelType
* 15. brand
* 16. notRepairedDamage: if the car has a damage which is not yet repaired
* 17. dateCreated: the date for which the ad at ebay was created
* 18. nrOfPictures: number of pictures added in the ad
* 19. postalCode
* 20. lastSeen: when the crawler last saw this ad online

### Getting familiar with the data

In [None]:
# Summary of the raw frame: column names, dtypes and non-null counts.
dataset.info()

In [None]:
# Show floats with three decimals instead of scientific notation.
pd.set_option('display.float_format', '{:.3f}'.format)

# Summary statistics of the numeric columns.
dataset.describe()

### Data Cleaning & Feature Engineering

#### 1. Dropping features which will not be much helpful for the problem solving (prediction)

In [None]:

# Crawl/creation timestamps, postal code and the ad title are not used as predictors.
unused_columns = ['dateCrawled', 'dateCreated', 'postalCode', 'lastSeen', 'name']
dataset = dataset.drop(columns=unused_columns)

#### 2. Dropping duplicate records and keeping the first occurrences


In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
dataset = dataset.drop_duplicates(keep='first')

#### 3. Removing outliers 

##### 3.a: year of registration (feature)

In [None]:
# Distinct registration years, then how often each one occurs.
reg_years = dataset['yearOfRegistration']
print(np.unique(reg_years))
print("\n\n")
print(reg_years.value_counts())

##### Observations: there are some rubbish years, which need cleaning

In [None]:
# Number of listings per registration year, ordered by year.
yearwise_count = dataset['yearOfRegistration'].value_counts().sort_index()
yearwise_count

In [None]:
# Number of listings whose registration year lies in the future (after 2021).
(dataset['yearOfRegistration'] > 2021).sum()

In [None]:
# Number of listings with a registration year before 1900 (too old or mistyped).
(dataset['yearOfRegistration'] < 1900).sum()

In [None]:
# Price against registration year; with fit_reg=False regplot draws a plain
# scatter plot without a fitted regression line.
sns.regplot(data=dataset, x='yearOfRegistration', y='price',
            scatter=True, fit_reg=False)

##### Observations: because of the outliers present in yearOfRegistration, not much can be deduced from the plot

##### 3.b: price (target variable)

In [None]:
# Distribution of the raw prices: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated in seaborn; `histplot(..., kde=True, stat='density')`
# is its documented replacement for this use.
sns.histplot(x=dataset['price'], kde=True, stat='density', bins=200, color="k")

##### Observations: outliers are clearly present, which makes the distribution look highly right-skewed

In [None]:
# Number of listings at each distinct price, ordered by price.
price_count = dataset['price'].value_counts().sort_index()
price_count

##### Observations: the distribution is skewed by price=0, which has 1415 records

In [None]:
# Box plot of the raw prices to expose extreme values.
sns.boxplot(y='price', data=dataset)

In [None]:
# How candidate price bounds split the data.
# The lower bound is inclusive (>= 100), consistent with the outlier filter
# applied later, so the first two counts together cover every row; the
# previous `> 100` silently left out listings priced at exactly 100.
price_col = dataset['price']
print((price_col < 100).sum())      # below the lower bound
print((price_col >= 100).sum())     # at or above the lower bound
print((price_col > 150000).sum())   # above the upper bound

In [None]:
# Box plot restricted to prices strictly between 50000 and 150000.
in_band = (dataset['price'] > 50000) & (dataset['price'] < 150000)
sns.boxplot(y=dataset.loc[in_band, 'price'])

##### 3.c: powerPS (feature)

In [None]:
# Number of listings at each distinct powerPS value, ordered by powerPS.
powerPS_count = dataset['powerPS'].value_counts().sort_index()
powerPS_count

##### Observations: skewness would exist because of 0

In [None]:
# Distribution of the raw powerPS values: density histogram with a KDE overlay.
# Uses `histplot`, the documented replacement for the deprecated `sns.distplot`.
sns.histplot(x=dataset['powerPS'], kde=True, stat='density', bins=100)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of powerPS.
dataset['powerPS'].describe()

In [None]:
# Price against powerPS before outlier removal (scatter only, no fitted line).
sns.regplot(data=dataset, x='powerPS', y='price',
            scatter=True, fit_reg=False)

In [None]:
# Number of listings outside the candidate powerPS bounds.
power_col = dataset['powerPS']
print((power_col > 500).sum())
print((power_col < 10).sum())

##### 3.d: dropping outliers

In [None]:
# Drop implausible listings. Every bound is inclusive on both ends.
# The price ceiling is 150000, the value examined in the price exploration
# above (count of prices > 150000 and the 50000-150000 box plot). The previous
# literal 15000 matched nothing that was explored and discarded every listing
# priced above 15000.
YEAR_RANGE = (1900, 2021)
PRICE_RANGE = (100, 150000)
POWER_RANGE = (10, 500)

dataset = dataset[dataset['yearOfRegistration'].between(*YEAR_RANGE)
                  & dataset['price'].between(*PRICE_RANGE)
                  & dataset['powerPS'].between(*POWER_RANGE)]


#### 4: ageFromRegistration -- new feature creation

In [None]:
# Distinct values present in monthOfRegistration.
np.unique(dataset['monthOfRegistration'])

In [None]:
# Express the registration month as a fraction of a year, rounded to 2 decimals.
# NOTE: this overwrites the column, so re-running the cell divides by 12 again.
month_fraction = (dataset['monthOfRegistration'] / 12).round(2)
dataset['monthOfRegistration'] = month_fraction
np.unique(dataset['monthOfRegistration'])

In [None]:
# Age feature: whole years since registration plus the month fraction.
# NOTE(review): the month fraction is added to the age, not subtracted —
# confirm this is the intended definition.
current_year = 2021
years_since_registration = current_year - dataset['yearOfRegistration']
dataset['ageFromRegistration'] = years_since_registration + dataset['monthOfRegistration']
dataset['ageFromRegistration'].describe()

In [None]:
# Year and month are now represented by ageFromRegistration, so drop them.
dataset = dataset.drop(columns=['monthOfRegistration', 'yearOfRegistration'])
dataset.head()

#### 5. Univariate and bivariate analysis to select significant features

##### 5.a: ageFromRegistration and powerPS

In [None]:
## ageFromRegistration
# Density histogram with a KDE overlay. Uses `histplot`, the documented
# replacement for the deprecated `sns.distplot`.
sns.histplot(x=dataset['ageFromRegistration'], kde=True, stat='density', bins=20)

In [None]:
# Box plot of the derived age feature.
sns.boxplot(y='ageFromRegistration', data=dataset)

In [None]:
# Price distribution after outlier removal: density histogram with a KDE overlay.
# Uses `histplot`, the documented replacement for the deprecated `sns.distplot`.
sns.histplot(x=dataset['price'], kde=True, stat='density', bins=100)

In [None]:
# Box plot of price after outlier removal.
sns.boxplot(y='price', data=dataset)

In [None]:
# powerPS distribution after outlier removal: density histogram with a KDE overlay.
# Uses `histplot`, the documented replacement for the deprecated `sns.distplot`.
sns.histplot(x=dataset['powerPS'], kde=True, stat='density', bins=100)

In [None]:
# Box plot of powerPS after outlier removal.
sns.boxplot(y='powerPS', data=dataset)

In [None]:
## age VS price
# Scatter of price against age with a fitted regression line, drawn on an
# explicit Axes so limits can be adjusted if needed.
fig, ax = plt.subplots()
sns.regplot(data=dataset, x='ageFromRegistration', y='price',
            scatter=True, fit_reg=True, ax=ax)
# Optional axis limits (disabled):
# ax.set(xlim=(0,80))
# ax.set_ylim(0,30000)
plt.show()

In [None]:
## powerPS vs price
# Scatter of price against powerPS with a fitted regression line.
sns.regplot(data=dataset, x='powerPS', y='price',
            scatter=True, fit_reg=True)

##### 5.b: feature seller

In [None]:
# Number of listings per seller category.
dataset['seller'].value_counts()

In [None]:
# Share of listings per seller category (normalize=True gives fractions of the total).
pd.crosstab(dataset['seller'], columns='count', normalize=True)

In [None]:
# Bar chart of the number of listings per seller category.
sns.countplot(data=dataset, x='seller')

##### 5.c: feature offerType

In [None]:
# Number of listings per offerType category.
dataset['offerType'].value_counts()

In [None]:
# Bar chart of the number of listings per offerType category.
sns.countplot(data=dataset, x='offerType')

##### 5.d: feature abtest

In [None]:
# Share of listings per abtest category (fractions of the total).
pd.crosstab(dataset['abtest'], columns='count', normalize=True)

In [None]:
# Price distribution within each abtest category.
sns.boxplot(data=dataset, x='abtest', y='price')

##### 5.e: feature vehicleType

In [None]:
# Share of listings per vehicleType category (fractions of the total).
pd.crosstab(dataset['vehicleType'],columns='count',normalize=True)

In [None]:
# Bar chart of listings per vehicleType on a larger canvas.
plt.figure(figsize=(10, 8))
sns.countplot(data=dataset, x='vehicleType')

In [None]:
# Price distribution within each vehicleType on a larger canvas.
plt.figure(figsize=(10, 8))
sns.boxplot(data=dataset, x='vehicleType', y='price')

##### 5.f: feature gearbox

In [None]:
# Percentage of listings per gearbox category.
pd.crosstab(dataset['gearbox'], columns='count', normalize=True) * 100

In [None]:
# Price distribution within each gearbox category.
sns.boxplot(data=dataset, x='gearbox', y='price')

In [None]:
# Summary of the gearbox column. The `include` argument is ignored when
# describing a single Series, so it is omitted.
dataset['gearbox'].describe()

##### 5.g: feature model

In [None]:
# Percentage of listings per model, most common first.
model_fraction = pd.crosstab(dataset['model'], columns='count', normalize=True)
cross_tab = model_fraction * 100
cross_tab.sort_values(by='count', ascending=False)

In [None]:
# Price distribution per model; wide canvas and vertical tick labels because
# there are many categories.
plt.figure(figsize=(28, 18))
sns.boxplot(data=dataset, x='model', y='price')
plt.xticks(rotation='vertical')
plt.show()

##### 5.h: feature fuelType

In [None]:
# Percentage of listings per fuelType category.
pd.crosstab(dataset['fuelType'], columns='count', normalize=True)*100

In [None]:
# Price distribution within each fuelType category.
sns.boxplot(data=dataset, x='fuelType', y='price')

##### 5.i: feature brand

In [None]:
# Percentage of listings per brand.
pd.crosstab(dataset['brand'], columns='count', normalize=True)*100

In [None]:
# Price distribution per brand; vertical tick labels keep the names readable.
plt.figure(figsize=(18, 16))
sns.boxplot(data=dataset, x='brand', y='price')
plt.xticks(rotation='vertical')
plt.show()

##### 5.j: feature notRepairedDamage
* one more important variable is notRepairedDamage
    * yes - the car is currently damaged and the damage has not been repaired
    * no - the car was damaged, but the damage has been repaired

In [None]:
# Percentage of listings per notRepairedDamage category.
pd.crosstab(dataset['notRepairedDamage'], columns='count', normalize=True)*100

In [None]:
# Price distribution within each notRepairedDamage category.
sns.boxplot(data=dataset, x='notRepairedDamage', y='price')

##### Observations:

* [ageFromRegistration] as age increases, the price mostly decreases. SIGNIFICANT VARIABLE
* [powerPS] as powerPS increases, the price also increases. SIGNIFICANT VARIABLE
* [seller] the commercial category occupies only 1 row and is thus redundant. As seller is not categorically rich --> INSIGNIFICANT VARIABLE
* [offerType] only one category is present --> INSIGNIFICANT VARIABLE
* [abtest] for every price value there is an almost 50-50 distribution of both categories in abtest (i.e., test, control), so it does not affect price much --> INSIGNIFICANT VARIABLE
* [vehicleType] vehicleType is a SIGNIFICANT VARIABLE, as it has various categories and each of them affects the price differently.
* [gearbox] gearbox is a SIGNIFICANT VARIABLE --> although it has only 2 categories, each category affects our dependent variable price differently (automatic gearbox cars have a higher median price)
* [model] we will retain the 'model' variable as it holds various categories, golf being the dominant one
* [fuelType] fuelType clearly affects price, as the various fuelType categories have different price distributions. Hybrid seems to have the highest median price. SIGNIFICANT VARIABLE
* [brand] the boxplot makes it extremely clear that price is highly dependent on brand. Brands like Porsche have a higher median price. SIGNIFICANT VARIABLE
* [notRepairedDamage] the boxplot clearly shows that cars whose damage has been repaired (no) have a higher median price. SIGNIFICANT VARIABLE


In [None]:
# Remove the features judged insignificant in the observations above.
insignificant_columns = ['seller', 'offerType', 'abtest']
dataset = dataset.drop(columns=insignificant_columns)

### Correlations

In [None]:
# Pairwise correlation between the numeric columns only.
# The numeric columns are selected explicitly: the frame still contains text
# columns, and recent pandas versions raise on them instead of skipping them.
dataset.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlations between numeric columns.
# Text columns are excluded explicitly because recent pandas versions raise
# on them in `DataFrame.corr()` instead of skipping them.
sns.heatmap(dataset.select_dtypes(include='number').corr(), annot=True)

### Model Building
* we will use three types of models
    * Linear Regression
    * Random Forest model
    * Lasso Regression (L1 regularization)

* We will use two sets of data
    * data obtained from removing rows containing even  a single  missing value
    * data obtained by imputing the missing values

#### 1. Dataset - removing rows with missing values
* Baseline model - using test data mean values
* Our objective is to build models whose RMSE would be less than Baseline model's RMSE

In [None]:
# Complete-case data: drop every row that has at least one missing value.
dataset_omit = dataset.dropna(axis='index', how='any')

In [None]:
# Column dtypes and non-null counts after dropping incomplete rows.
dataset_omit.info()

##### 1.a Categorical variable Encoding

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each variable so the dummy columns are not perfectly collinear.
dataset_omit = pd.get_dummies(dataset_omit, drop_first=True)
dataset_omit.info()

In [None]:
# First rows of the encoded frame.
dataset_omit.head()

In [None]:
# Separate the target (price) from the predictors.
y = dataset_omit['price']
X = dataset_omit.drop(columns=['price'])

##### 1.b Feature Scaling - MinMaxScaler
* Feature Scaling was worsening the LinearRegression Model

In [None]:
# col_names = X.columns

# scaler = MinMaxScaler()
# x_scaled = scaler.fit_transform(X)
# X_final = pd.DataFrame(x_scaled, columns = col_names)
# X_final.head()

##### Target Variable transformation

In [None]:
# Compare the price distribution before and after a natural-log transform.
log_price = np.log(y)
prices = pd.DataFrame({'1. Before': y, '2. After': log_price})
prices.hist(bins=10)

In [None]:
# Model the natural log of price to compress its wide range.
# NOTE: this overwrites y, so re-running the cell applies the log again.
y = np.log(y)

In [None]:
# Hold out 35% of the rows for testing; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.35,
    random_state=1,
)


##### Baseline Model

In [None]:
# Baseline prediction: the mean of the (log) test targets.
# NOTE(review): this uses test labels; a baseline fitted on y_train would
# avoid peeking at the test set — confirm which is intended.
base_pred = y_test.mean()
print(base_pred)

In [None]:
# Expand the scalar baseline to one prediction per test row.
base_pred = np.tile(base_pred, len(y_test))
base_pred

In [None]:
# RMSE of the constant baseline on the test split.
base_mse_ommited1 = mean_squared_error(y_test, base_pred)
base_rmse_ommited1 = np.sqrt(base_mse_ommited1)
print(base_rmse_ommited1)

##### LINEAR REGRESSION WITH OMITTED DATA

In [None]:
# Ordinary least-squares linear regression fitted on the training split.
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)

In [None]:
# Predict the test split and report the RMSE (on the log-price scale).
lr_pred = lr_model.predict(x_test)
lr_mse1 = mean_squared_error(y_test, lr_pred)
lr_rmse1 = np.sqrt(lr_mse1)
lr_rmse1

In [None]:
# R-squared (coefficient of determination) on the train and test splits.
# It measures the share of the target's variance explained by the model:
# 1.0 is a perfect fit and 0 matches always predicting the mean. On unseen
# data it can also be negative, so it is not strictly bounded to [0, 1].
# Similar train and test scores suggest the model is not overfitting.
r2_linR_train1 = lr_model.score(x_train, y_train)
r2_linR_test1 = lr_model.score(x_test, y_test)
print(r2_linR_test1)
print(r2_linR_train1)

In [None]:
# Regression diagnostics for the linear model on the test split.
residuals1 = y_test - lr_pred
# Residuals against predictions.
sns.regplot(x=lr_pred, y=residuals1, scatter=True, fit_reg=True, label='residual vs pred')
# True values against predictions. The label previously read
# 'residual vs y-true', which did not describe the plotted series.
sns.regplot(x=lr_pred, y=y_test, scatter=True, fit_reg=True, label='y-true vs pred')
plt.legend(fontsize=11)
# residuals1.describe() shows the mean residual; a value near 0 means the
# predictions are unbiased on average.

##### RANDOM FOREST WITH OMITTED  DATA

In [None]:
# Random forest configuration for the complete-case data.
# max_features=1.0 considers all features at each split, which is what the
# removed 'auto' option meant for regressors; 'auto' is rejected by current
# scikit-learn releases.
rf = RandomForestRegressor(n_estimators=100, max_features=1.0,
                           max_depth=100, min_samples_split=10,
                           min_samples_leaf=4, random_state=1)

In [None]:
# Fit the forest on the training split (fit returns the estimator itself).
model_rf1 = rf.fit(x_train, y_train)

# Predict the test split and report the RMSE (on the log-price scale).
cars_predictions_rf1 = model_rf1.predict(x_test)
rf_mse1 = mean_squared_error(y_test, cars_predictions_rf1)
rf_rmse1 = np.sqrt(rf_mse1)
print(rf_rmse1)

In [None]:
# Regression diagnostics for the random forest on the test split.
residuals1 = y_test - cars_predictions_rf1
# Residuals against predictions.
sns.regplot(x=cars_predictions_rf1, y=residuals1, scatter=True, fit_reg=True, label='residual vs pred')
# True values against predictions. The label previously read
# 'residual vs y-true', which did not describe the plotted series.
sns.regplot(x=cars_predictions_rf1, y=y_test, scatter=True, fit_reg=True, label='y-true vs pred')
plt.legend(fontsize=11)
# residuals1.describe() can be used to check that the mean residual is close to 0.

##### LASSO REGRESSION (L1-Regularization) WITH OMITTED  DATA

In [None]:
# Lasso (L1-regularised) regression with scikit-learn's default settings.
lasso_r = Lasso()
lasso_r.fit(x_train, y_train)

# Predict the test split and report the RMSE (on the log-price scale).
test_data_pred = lasso_r.predict(x_test)
lasso_mse1 = mean_squared_error(y_test, test_data_pred)
lasso_rmse1 = np.sqrt(lasso_mse1)
print(lasso_rmse1)

In [None]:
# Regression diagnostics for the Lasso model on the test split.
residuals1 = y_test - test_data_pred
# Residuals against predictions.
sns.regplot(x=test_data_pred, y=residuals1, scatter=True, fit_reg=True, label='residual vs pred')
# True values against predictions. The label previously read
# 'residual vs y-true', which did not describe the plotted series.
sns.regplot(x=test_data_pred, y=y_test, scatter=True, fit_reg=True, label='y-true vs pred')
plt.legend(fontsize=11)
# residuals1.describe() can be used to check that the mean residual is close to 0.

#### 2. Dataset - imputing missing values
* we will not drop the rows holding np.nan values this time; instead we fill them with the median (numeric variables) and the mode (categorical variables)
* Baseline model - using test data mean values
* Our objective is to build models whose RMSE would be less than Baseline model's RMSE

In [None]:
def _fill_missing(column):
    """Fill NaNs with the median for float columns, else with the most frequent value."""
    if column.dtype == 'float':
        return column.fillna(column.median())
    most_frequent = column.value_counts().index[0]
    return column.fillna(most_frequent)


# Impute every column instead of dropping incomplete rows.
dataset_imputed = dataset.apply(_fill_missing)

dataset_imputed.info()

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each variable so the dummy columns are not perfectly collinear.
dataset_imputed = pd.get_dummies(dataset_imputed, drop_first=True)
dataset_imputed.info()

##### Model Building

In [None]:
# Separate the target (price) from the predictors.
y = dataset_imputed['price']
X = dataset_imputed.drop(columns=['price'])

In [None]:
# # feature scaling --- worsen the linear regression model
# col_names = X.columns

# scaler = MinMaxScaler()
# x_scaled = scaler.fit_transform(X)
# X_final = pd.DataFrame(x_scaled, columns = col_names)
# X_final.head()

In [None]:
# Compare the price distribution before and after a natural-log transform.
log_price = np.log(y)
prices = pd.DataFrame({"1. before": y, "2. After": log_price})
prices.hist()

In [None]:
# The log-transformed prices are closer to bell-shaped, so model log(price).
# NOTE: this overwrites y, so re-running the cell applies the log again.
y = np.log(y)

In [None]:
# Hold out 35% of the rows for testing; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.35,
    random_state=1,
)

In [None]:
## BASELINE MODEL FOR IMPUTED DATA
'''
The base model is being built using the test data mean value
This is to set a benchmark and to compare with out regression model later
'''

# The mean is a single scalar; expand it to one prediction per test row.
mean_log_price = np.mean(y_test)
base_pred = np.full(len(y_test), mean_log_price)

In [None]:
# RMSE of the constant baseline on the test split.
base_mse_imputed2 = mean_squared_error(y_test, base_pred)
base_rmse_imputed2 = np.sqrt(base_mse_imputed2)
print(base_rmse_imputed2)

##### LINEAR REGRESSION WITH IMPUTED DATA

In [None]:
# Linear regression with an intercept term, fitted on the training split
# (fit returns the estimator itself).
lgr2 = LinearRegression(fit_intercept=True)
model_lin2 = lgr2.fit(x_train, y_train)

# Predict the test split and report the RMSE (on the log-price scale).
cars_predictions_lin2 = model_lin2.predict(x_test)
lin_mse2 = mean_squared_error(y_test, cars_predictions_lin2)
lin_rmse2 = np.sqrt(lin_mse2)
print(lin_rmse2)

##### RANDOM FOREST WITH IMPUTED DATA

In [None]:
# Random forest configuration for the imputed data.
# max_features=1.0 considers all features at each split, which is what the
# removed 'auto' option meant for regressors; 'auto' is rejected by current
# scikit-learn releases.
rf2 = RandomForestRegressor(n_estimators=100, max_features=1.0,
                            max_depth=100, min_samples_split=10,
                            min_samples_leaf=4, random_state=1)

# Fit on the training split.
model_rf2 = rf2.fit(x_train, y_train)

# Predict the test split.
cars_predictions_rf2 = rf2.predict(x_test)

# MSE and RMSE on the log-price scale.
rf_mse2 = mean_squared_error(y_test, cars_predictions_rf2)
rf_rmse2 = np.sqrt(rf_mse2)
print(rf_rmse2)

##### LASSO WITH IMPUTED DATA

In [None]:
# Lasso (L1-regularised) regression with scikit-learn's default settings.
lasso_r2 = Lasso()
lasso_r2.fit(x_train, y_train)

# Predict the test split and report the RMSE (on the log-price scale).
test_data_pred = lasso_r2.predict(x_test)
lasso_mse2 = mean_squared_error(y_test, test_data_pred)
lasso_rmse2 = np.sqrt(lasso_mse2)
print(lasso_rmse2)

#### ERRORS

In [None]:
# Side-by-side summary of row counts and test RMSEs for both datasets.
omitted_report = [
    ("Number of total data points (train+test) in ommited dataset: {}", len(dataset_omit.index)),
    ("Ommited Data, Base Model Error: {}", base_rmse_ommited1),
    ("Ommited Data, Linear Regression Error: {}", lr_rmse1),
    ("Ommited Data, RANDOM FOREST Error: {}", rf_rmse1),
    ("Ommited Data, Lasso Regression Error: {}", lasso_rmse1),
]
imputed_report = [
    ("Number of total data points (train+test) in imputed dataset: {}", len(dataset_imputed.index)),
    ("Imputed Data, Base Model Error: {}", base_rmse_imputed2),
    ("Imputed Data, Linear Regression Error: {}", lin_rmse2),
    ("Imputed Data, RANDOM FOREST Error: {}", rf_rmse2),
    ("Imputed Data, Lasso Regression Error: {}", lasso_rmse2),
]

for template, value in omitted_report:
    print(template.format(value))

print("\n\n\n")

for template, value in imputed_report:
    print(template.format(value))

#### Observations:
* Random Forest performed the best, followed by linear regression and then lasso regression.
* Although the error values are lower on the omitted data, this may be because the omitted data contains fewer rows/records than the imputed data