# Machine Learning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('Used_Bikes.csv')

In [3]:
df.head()

Unnamed: 0,bike_name,price,city,kms_driven,owner,age,power,brand
0,TVS Star City Plus Dual Tone 110cc,35000.0,Ahmedabad,17654.0,First Owner,3.0,110.0,TVS
1,Royal Enfield Classic 350cc,119900.0,Delhi,11000.0,First Owner,4.0,350.0,Royal Enfield
2,Triumph Daytona 675R,600000.0,Delhi,110.0,First Owner,8.0,675.0,Triumph
3,TVS Apache RTR 180cc,65000.0,Bangalore,16329.0,First Owner,4.0,180.0,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,80000.0,Bangalore,10000.0,First Owner,3.0,150.0,Yamaha


In [4]:
df.duplicated().sum()

25324

In [5]:
df.drop_duplicates(inplace=True)  # inplce = True keeps changes in the og dataset

In [6]:
df.shape

(7324, 8)

In [7]:
df.isnull().sum()

bike_name     0
price         0
city          0
kms_driven    0
owner         0
age           0
power         0
brand         0
dtype: int64

In [8]:
df

Unnamed: 0,bike_name,price,city,kms_driven,owner,age,power,brand
0,TVS Star City Plus Dual Tone 110cc,35000.0,Ahmedabad,17654.0,First Owner,3.0,110.0,TVS
1,Royal Enfield Classic 350cc,119900.0,Delhi,11000.0,First Owner,4.0,350.0,Royal Enfield
2,Triumph Daytona 675R,600000.0,Delhi,110.0,First Owner,8.0,675.0,Triumph
3,TVS Apache RTR 180cc,65000.0,Bangalore,16329.0,First Owner,4.0,180.0,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,80000.0,Bangalore,10000.0,First Owner,3.0,150.0,Yamaha
...,...,...,...,...,...,...,...,...
9362,Hero Hunk Rear Disc 150cc,25000.0,Delhi,48587.0,First Owner,8.0,150.0,Hero
9369,Bajaj Avenger 220cc,35000.0,Bangalore,60000.0,First Owner,9.0,220.0,Bajaj
9370,Harley-Davidson Street 750 ABS,450000.0,Jodhpur,3430.0,First Owner,4.0,750.0,Harley-Davidson
9371,Bajaj Dominar 400 ABS,139000.0,Hyderabad,21300.0,First Owner,4.0,400.0,Bajaj


In [9]:
cat_col = df.select_dtypes(include='O') # filters all catagorical columns, 'O' is for object as the strings are objects in pandas

In [10]:
cat_col

Unnamed: 0,bike_name,city,owner,brand
0,TVS Star City Plus Dual Tone 110cc,Ahmedabad,First Owner,TVS
1,Royal Enfield Classic 350cc,Delhi,First Owner,Royal Enfield
2,Triumph Daytona 675R,Delhi,First Owner,Triumph
3,TVS Apache RTR 180cc,Bangalore,First Owner,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,Bangalore,First Owner,Yamaha
...,...,...,...,...
9362,Hero Hunk Rear Disc 150cc,Delhi,First Owner,Hero
9369,Bajaj Avenger 220cc,Bangalore,First Owner,Bajaj
9370,Harley-Davidson Street 750 ABS,Jodhpur,First Owner,Harley-Davidson
9371,Bajaj Dominar 400 ABS,Hyderabad,First Owner,Bajaj


In [11]:
num_col = df.select_dtypes(exclude='O') # filters all numerical columns

In [12]:
num_col

Unnamed: 0,price,kms_driven,age,power
0,35000.0,17654.0,3.0,110.0
1,119900.0,11000.0,4.0,350.0
2,600000.0,110.0,8.0,675.0
3,65000.0,16329.0,4.0,180.0
4,80000.0,10000.0,3.0,150.0
...,...,...,...,...
9362,25000.0,48587.0,8.0,150.0
9369,35000.0,60000.0,9.0,220.0
9370,450000.0,3430.0,4.0,750.0
9371,139000.0,21300.0,4.0,400.0


In [13]:
ls = ['bike_name','city' ] 
cat_col.drop(ls, axis='columns', inplace=True)  # removing 'nike_name' and 'city' as they do not have a large impact on the selling price 

In [14]:
cat_col

Unnamed: 0,owner,brand
0,First Owner,TVS
1,First Owner,Royal Enfield
2,First Owner,Triumph
3,First Owner,TVS
4,First Owner,Yamaha
...,...,...
9362,First Owner,Hero
9369,First Owner,Bajaj
9370,First Owner,Harley-Davidson
9371,First Owner,Bajaj


In [15]:
cat_col['owner'].value_counts()

owner
First Owner             6642
Second Owner             588
Third Owner               84
Fourth Owner Or More      10
Name: count, dtype: int64

In [16]:
dict = {
    'First Owner' : 1,
    'Second Owner': 2,
    'Third Owner' : 3,
    'Fourth Owner Or More' : 4
}

cat_col['owner'] = cat_col['owner'].map(dict)


In [17]:
cat_col

Unnamed: 0,owner,brand
0,1,TVS
1,1,Royal Enfield
2,1,Triumph
3,1,TVS
4,1,Yamaha
...,...,...
9362,1,Hero
9369,1,Bajaj
9370,1,Harley-Davidson
9371,1,Bajaj


In [18]:
lst = list(cat_col['brand'].value_counts().keys())
len(lst)
lst

['Bajaj',
 'Royal Enfield',
 'Hero',
 'Honda',
 'Yamaha',
 'TVS',
 'KTM',
 'Suzuki',
 'Harley-Davidson',
 'Kawasaki',
 'Hyosung',
 'Mahindra',
 'Benelli',
 'Triumph',
 'Ducati',
 'BMW',
 'Jawa',
 'Indian',
 'MV',
 'Rajdoot',
 'LML',
 'Yezdi',
 'Ideal']

In [19]:
# METHOD 1

# dt2 = {}
# for i, item in enumerate(lst):
#     dt2[item] = i

# dt2

In [20]:
# METHOD 2  by dictionary comprehension

# dt = {key:i for i,key in enumerate(lst)}

In [21]:
dt = {}
for i in range(len(lst)):
    key = lst[i]
    val = i
    dt.update({key : val})
  

In [22]:
dt

{'Bajaj': 0,
 'Royal Enfield': 1,
 'Hero': 2,
 'Honda': 3,
 'Yamaha': 4,
 'TVS': 5,
 'KTM': 6,
 'Suzuki': 7,
 'Harley-Davidson': 8,
 'Kawasaki': 9,
 'Hyosung': 10,
 'Mahindra': 11,
 'Benelli': 12,
 'Triumph': 13,
 'Ducati': 14,
 'BMW': 15,
 'Jawa': 16,
 'Indian': 17,
 'MV': 18,
 'Rajdoot': 19,
 'LML': 20,
 'Yezdi': 21,
 'Ideal': 22}

In [23]:
cat_col['brand'] = cat_col['brand'].map(dt)

In [24]:
cat_col.isnull().sum()

owner    0
brand    0
dtype: int64

In [25]:
num_col.isnull().sum()

price         0
kms_driven    0
age           0
power         0
dtype: int64

In [26]:
df['brand'] = df['brand'].map(dt)
df['owner'] = df['owner'].map(dict)

In [27]:
ls = ['bike_name', 'city']
df.drop(ls, axis='columns', inplace=True)

In [28]:
y = df[['price']]
y

Unnamed: 0,price
0,35000.0
1,119900.0
2,600000.0
3,65000.0
4,80000.0
...,...
9362,25000.0
9369,35000.0
9370,450000.0
9371,139000.0


In [29]:
x = df
x

Unnamed: 0,price,kms_driven,owner,age,power,brand
0,35000.0,17654.0,1,3.0,110.0,5
1,119900.0,11000.0,1,4.0,350.0,1
2,600000.0,110.0,1,8.0,675.0,13
3,65000.0,16329.0,1,4.0,180.0,5
4,80000.0,10000.0,1,3.0,150.0,4
...,...,...,...,...,...,...
9362,25000.0,48587.0,1,8.0,150.0,2
9369,35000.0,60000.0,1,9.0,220.0,0
9370,450000.0,3430.0,1,4.0,750.0,8
9371,139000.0,21300.0,1,4.0,400.0,0


In [30]:
from sklearn.model_selection import train_test_split

In [31]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2)  # 80% data for training and 20% data for test

In [32]:
x_train

Unnamed: 0,price,kms_driven,owner,age,power,brand
2956,360000.0,7093.0,1,3.0,310.0,15
8881,60000.0,19351.0,1,5.0,220.0,0
1460,57500.0,15000.0,1,4.0,180.0,0
769,60000.0,40997.0,1,6.0,150.0,7
947,60000.0,15252.0,1,6.0,150.0,3
...,...,...,...,...,...,...
4367,90000.0,2672.0,1,2.0,160.0,5
5876,24000.0,21700.0,1,5.0,100.0,0
6156,20990.0,82000.0,1,6.0,150.0,0
7928,29000.0,26000.0,2,9.0,150.0,0


In [33]:
y_train

Unnamed: 0,price
2956,360000.0
8881,60000.0
1460,57500.0
769,60000.0
947,60000.0
...,...
4367,90000.0
5876,24000.0
6156,20990.0
7928,29000.0


In [34]:
x_test

Unnamed: 0,price,kms_driven,owner,age,power,brand
435,75000.0,3419.0,1,3.0,200.0,5
5034,70000.0,10000.0,3,4.0,223.0,2
5990,145000.0,14307.0,1,5.0,500.0,1
8054,52000.0,40272.0,1,4.0,150.0,7
218,115000.0,7593.0,1,4.0,400.0,0
...,...,...,...,...,...,...
8595,95783.0,12975.0,1,6.0,350.0,1
7767,41000.0,10000.0,1,5.0,110.0,3
1659,70000.0,48500.0,1,11.0,350.0,1
3498,58000.0,28435.0,1,6.0,220.0,0


In [35]:
y_test

Unnamed: 0,price
435,75000.0
5034,70000.0
5990,145000.0
8054,52000.0
218,115000.0
...,...
8595,95783.0
7767,41000.0
1659,70000.0
3498,58000.0


In [36]:
from sklearn.linear_model import LinearRegression  # Linear Regression is the model we will use because our problem statement is of 'regression'

In [37]:
lr  = LinearRegression()  # object of class 'LinearRegression'

In [38]:
lr.fit(x_train, y_train)   # The time taken by this cell for the execution is the time taken by the algorithm to train

In [39]:
lr.score(x_train, y_train)   # returns the accuracy of the model for training data

1.0

In [40]:
lr.score(x_test, y_test)    # returns the accuracy of the model for test data

1.0

In [41]:
pred = lr.predict(x_test)   # return the prediction done on the x_test data

In [42]:
pred

array([[ 75000.],
       [ 70000.],
       [145000.],
       ...,
       [ 70000.],
       [ 58000.],
       [140000.]])

In [43]:
y_test

Unnamed: 0,price
435,75000.0
5034,70000.0
5990,145000.0
8054,52000.0
218,115000.0
...,...
8595,95783.0
7767,41000.0
1659,70000.0
3498,58000.0


In [44]:
y_test['prediction'] = pred

In [45]:
y_test.columns = ['Actual ', 'Prediction']

In [46]:
y_test

Unnamed: 0,Actual,Prediction
435,75000.0,75000.0
5034,70000.0,70000.0
5990,145000.0,145000.0
8054,52000.0,52000.0
218,115000.0,115000.0
...,...,...
8595,95783.0,95783.0
7767,41000.0,41000.0
1659,70000.0,70000.0
3498,58000.0,58000.0


In [47]:
# using a different algorithm
from sklearn.ensemble import RandomForestRegressor

In [48]:
rdf = RandomForestRegressor()

In [49]:
rdf.fit(x_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [50]:
rdf.score(x_train, y_train)

0.999789926815565

In [51]:
pred2 = rdf.predict(x_test)

In [52]:
y_test['Prediction-2'] = pred2

In [53]:
y_test

Unnamed: 0,Actual,Prediction,Prediction-2
435,75000.0,75000.0,75000.0
5034,70000.0,70000.0,70000.0
5990,145000.0,145000.0,145000.0
8054,52000.0,52000.0,52000.0
218,115000.0,115000.0,115000.0
...,...,...,...
8595,95783.0,95783.0,96000.0
7767,41000.0,41000.0,41000.0
1659,70000.0,70000.0,70000.0
3498,58000.0,58000.0,58000.0


In [54]:
import joblib  # saving variables in our local system

In [55]:
lst1 = [10,20,30,40,50,60]
lst1

[10, 20, 30, 40, 50, 60]

In [56]:
joblib.dump(lst1, 'myList.lb')

['myList.lb']

In [57]:
my_var = joblib.load('myList.lb')
my_var

[10, 20, 30, 40, 50, 60]

In [58]:
joblib.dump(lr, 'my_model.lb')

['my_model.lb']