# Car Sales Prediction


## Steps in Achieving this Exercise
1. Get the data ready
2. choose a model
3. split the data into features and labels (Feature Engineering)
4. fit/train the model
5. save/load the model

In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Get the data ready

In [63]:
car_sales = pd.read_csv('car-sales.csv')
car_sales

Unnamed: 0,Make,Colour,Odometer,Doors,Price
0,Toyota,White,150043.0,4.0,"$4,000"
1,Honda,Red,87899.0,4.0,"$5,000"
2,Toyota,Blue,,3.0,"$7,000"
3,BMW,Black,11179.0,5.0,"$22,000"
4,Nissan,White,213095.0,4.0,"$3,500"
5,Toyota,Green,,4.0,"$4,500"
6,Honda,,,4.0,"$7,500"
7,Honda,Blue,,4.0,
8,Toyota,White,60000.0,,
9,,White,31600.0,4.0,"$9,700"


In [64]:
# Check for missing values
car_sales.isnull().sum()

Make        1
Colour      1
Odometer    4
Doors       1
Price       2
dtype: int64

In [65]:
# fill missing values
car_sales['Make'].fillna('Toyota', inplace = True)
car_sales['Colour'].fillna('White', inplace = True)
car_sales['Odometer'].fillna(car_sales['Odometer'].mean(), inplace = True)
car_sales.dropna(subset=['Price'], inplace=True)


In [66]:
# Change the remove symbols [$ and ,] in price column
car_sales['Price'] = car_sales['Price'].str.replace('[\$\,\.]','').astype(int)

  car_sales['Price'] = car_sales['Price'].str.replace('[\$\,\.]','').astype(int)


## 2. Choose a Model

In [70]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

## 3. Split the data into Features and Label (Feature Engineering)

In [87]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

X = car_sales.drop('Price', axis=1)
y = car_sales['Price']

# Convert categorical data to numerical data
# This is because Machines do not understand strings 'Hello, etc.' so we need to convert them into numbers
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',one_hot,['Make','Colour','Doors'])], remainder='passthrough')
X_encoded = transformer.fit_transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_encoded,y, test_size=0.25)

## 4. Fit / Train the model

In [88]:
model.fit(X_train, y_train)
model.score(X_test, y_test)

-2.32272928

## 5. Save and Load the model

In [89]:
import pickle