# This notebook is to prepare and pre-process data for various prediction models from the Used Car Price data at : https://www.kaggle.com/jpayne/852k-used-car-listings/data, after the initial Data exploration, as given in CarPricePredictionAnalysis-mileage.ipynb
The numerical variables are scaled with StandardScaler, imputation strategy is used to replace 0 values with mean
StratifiedshuffleSplit is done based on Age of car (Curr Year - Year of Car), by creating Age category (Age / 5), and 
putting the values in different Age category buckets. The same distribution is maintained in Train and Test data.
The categorical variables (Make, Model, State) are one-hot encoded and added to the feature vector. The numerical variables considered are : Age of Car in yrs,  and Mileage (Miles driven). 

In [1]:
#Import all necessary libraries
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

Read the pickle files prepared by stratifying the Car Sales Data based on Make, Model, and State. This is necessary since cars belong to different price segments, and including all make and models is not a viable solution, as the range of price is different for the same features for different makes. The stratification details can be found in the 
Data exploration notebook, which precedes this and outputs the data into .pkl files, based on car segment/price category

In [2]:
df_ordinary=pd.read_pickle('C:/users/hackuser1/carSalesUSJapModels.pkl')

In [3]:
df_ordinary.head()

Unnamed: 0,Price,Mileage,Age,log_Price,Chevrolet,Chrysler,Ford,Honda,Hyundai,Mazda,...,TX,UT,VA,VT,Va,WA,WI,WV,WY,ga
909,17233,25076,1,9.754639,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
910,8575,47036,6,9.056723,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
911,16733,34415,1,9.725198,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
912,17233,29601,1,9.754639,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
913,17933,21580,1,9.794454,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
print(len(df_ordinary))

247396


We check the distribution of Car Sales on the basis of Age of Car, and create Age-cat and check the distribution of the Car data based on Age-cat (Age / 5). We plan to use StratifiedSampling to make sure both Test and Train data represents same distribution of cars based on Age of Car

In [5]:
df_ordinary["Age"].value_counts()
#create a field Age-cat to divide the data into 5 Age categories, based on the Age of the car
df_ordinary["Age-cat"] = np.ceil(df_ordinary["Age"] / 5)
df_ordinary["Age-cat"].where(df_ordinary["Age-cat"] < 5, 5.0, inplace=True)
#check distribution of Age Cat in the original data
df_ordinary["Age-cat"].value_counts() / len(df_ordinary)

1.0    0.799132
2.0    0.145083
3.0    0.048141
4.0    0.006455
0.0    0.000962
5.0    0.000226
Name: Age-cat, dtype: float64

We treat Make, Model, State as Categorical variables and these are already one-hot encoded as part of analysis

In [7]:
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)

for train_index, test_index in split.split(df_ordinary,df_ordinary["Age-cat"]):
    strat_train_set = df_ordinary.iloc[train_index]
    strat_test_set = df_ordinary.iloc[test_index]

In [8]:
#check distribution of Age Cat in the train data
strat_train_set["Age-cat"].value_counts() / len(strat_train_set)

1.0    0.799132
2.0    0.145082
3.0    0.048142
4.0    0.006457
0.0    0.000960
5.0    0.000227
Name: Age-cat, dtype: float64

In [9]:
#check distribution of Age Cat in the test data
strat_test_set["Age-cat"].value_counts() / len(strat_test_set)

1.0    0.799131
2.0    0.145089
3.0    0.048141
4.0    0.006447
0.0    0.000970
5.0    0.000222
Name: Age-cat, dtype: float64

Create the X and Y variables from the Feature analysis done in Exploration notebook. Repeat the same operations 
for Train and Test data.

In [10]:
carSales_X = strat_train_set.copy()
carSales_X = carSales_X.drop("Price", axis=1) # drop labels for training set
carSales_X = carSales_X.drop("log_Price", axis=1) # drop labels for training set
carSales_Y = strat_train_set["log_Price"].copy() # use Log Price as labels for training set, based on data Exploration
carSales_X = carSales_X.drop("Age-cat", axis=1)

carSales_test_X = strat_test_set.copy()
carSales_test_X = carSales_test_X.drop("Price", axis=1) # drop labels for test set
carSales_test_X = carSales_test_X.drop("log_Price", axis=1) # drop labels for test set
carSales_test_X = carSales_test_X.drop("Age-cat", axis=1)
carSales_test_Y = strat_test_set["log_Price"].copy()# use Log Price as labels for test set, based on data Exploration

In [11]:
carSales_Y = carSales_Y.values.reshape(carSales_Y.shape[0],1)
carSales_test_Y = carSales_test_Y.values.reshape(carSales_test_Y.shape[0],1)

print(carSales_Y.shape)
print(carSales_test_Y.shape)


(197916, 1)
(49480, 1)


In [12]:
carSales_X.head()

Unnamed: 0,Mileage,Age,Chevrolet,Chrysler,Ford,Honda,Hyundai,Mazda,Nissan,Pontiac,...,TX,UT,VA,VT,Va,WA,WI,WV,WY,ga
504445,99693,13,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
811108,149559,8,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
529458,24507,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
822648,40362,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
268627,13604,2,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
#Use StandardScaler on the numeric features
carSales_X_num = carSales_X.filter(['Mileage','Age'],axis=1)
carSales_test_X_num=carSales_test_X.filter(['Mileage','Age'],axis=1)
carSales_X_num.head()

Unnamed: 0,Mileage,Age
504445,99693,13
811108,149559,8
529458,24507,2
822648,40362,1
268627,13604,2


In [14]:
m=carSales_X_num.isnull().any()
print(m[m])
m=np.isfinite(carSales_X_num.select_dtypes(include=['float64'])).any()
print(m[m])
m=carSales_test_X_num.isnull().any()
print(m[m])
m=np.isfinite(carSales_test_X_num.select_dtypes(include=['float64'])).any()
print(m[m])

Series([], dtype: bool)
Series([], dtype: bool)
Series([], dtype: bool)
Series([], dtype: bool)


Wherever there are 0 values, we replace by the mean 

In [15]:
imputer = Imputer(missing_values=0,strategy="mean")
imputer.fit(carSales_X_num)
imputer.fit(carSales_test_X_num)

Imputer(axis=0, copy=True, missing_values=0, strategy='mean', verbose=0)

In [16]:
#Standardize the numerical data using sklearn StandardScaler
scaler = StandardScaler()
train_X = scaler.fit_transform(carSales_X_num)
test_X = scaler.transform(carSales_test_X_num)
print(train_X.shape)
print(test_X.shape)

(197916, 2)
(49480, 2)


In [17]:
carSales_X_cat = carSales_X.drop(['Mileage','Age'],axis=1)
carSales_test_X_cat = carSales_test_X.drop(['Mileage','Age'],axis=1)
print(carSales_X_cat.shape)
print(carSales_test_X_cat.shape)

(197916, 116)
(49480, 116)


In [18]:
#Concatenate the standardized numerical data to the categorical features
train_X =  np.concatenate((train_X,carSales_X_cat.values),axis=1)
test_X =  np.concatenate((test_X,carSales_test_X_cat.values),axis=1)
print(train_X.shape)
print(test_X.shape)

(197916, 118)
(49480, 118)


In [19]:
train_Y = pd.DataFrame(carSales_Y)
m=train_Y.isnull().any()
print(m[m])
m=np.isfinite(train_Y.select_dtypes(include=['float64'])).any()
print(m[m])

#train_Y_log = pd.DataFrame(carSales_Y_log)
#m=train_Y_log.isnull().any()
#print(m[m])
#m=np.isfinite(train_Y_log.select_dtypes(include=['float64'])).any()
#print(m[m])

test_Y = pd.DataFrame(carSales_test_Y)
m=test_Y.isnull().any()
print(m[m])
m=np.isfinite(test_Y.select_dtypes(include=['float64'])).any()
print(m[m])

#test_Y_log = pd.DataFrame(carSales_test_Y_log)
#m=test_Y_log.isnull().any()
#print(m[m])
#m=np.isfinite(test_Y_log.select_dtypes(include=['float64'])).any()
#print(m[m])



Series([], dtype: bool)
0    True
dtype: bool
Series([], dtype: bool)
0    True
dtype: bool


We now take backup of the pre-processed data, so the modeling can be done instantaneously on the pre-processed data
at any later point of time

In [None]:
train_X_mileage='C:/users/hackuser1/train_X_mileage1.pkl'
test_X_mileage='C:/users/hackuser1/test_X_mileage1.pkl'
train_Y_mileage='C:/users/hackuser1/train_Y_mileage1.pkl'
test_Y_mileage='C:/users/hackuser1/test_Y_mileage1.pkl'


with open(train_X_mileage, "wb") as f:
    w = pickle.dump(train_X,f)
with open(test_X_mileage, "wb") as f:
    w = pickle.dump(test_X,f)
with open(train_Y_mileage, "wb") as f:
    w = pickle.dump(train_Y,f)
with open(test_Y_mileage, "wb") as f:
    w = pickle.dump(test_Y,f)