<h1>Predicting Car Prices from a given Dataset</h1>

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from scipy import stats
from matplotlib import pyplot as plt
import numpy as np

<h3>Using SciKit Learn and Pandas</h3>

In [2]:
address = 'CarPrice_Assignment.csv'
df = pd.read_csv(address)
df.describe()
df.dtypes

car_ID                int64
symboling             int64
CarName              object
fueltype             object
aspiration           object
doornumber           object
carbody              object
drivewheel           object
enginelocation       object
wheelbase           float64
carlength           float64
carwidth            float64
carheight           float64
curbweight            int64
enginetype           object
cylindernumber       object
enginesize            int64
fuelsystem           object
boreratio           float64
stroke              float64
compressionratio    float64
horsepower            int64
peakrpm               int64
citympg               int64
highwaympg            int64
price               float64
dtype: object

In [3]:
df.corr(method='pearson')#finding correlation between price and other parameters. closer the value is to 1 or -1, higher is their
#interdependece

Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
car_ID,1.0,-0.151621,0.129729,0.170636,0.052387,0.25596,0.071962,-0.03393,0.260064,-0.160824,0.150276,-0.015006,-0.203789,0.01594,0.011255,-0.109093
symboling,-0.151621,1.0,-0.531954,-0.357612,-0.232919,-0.541038,-0.227691,-0.10579,-0.130051,-0.008735,-0.178515,0.070873,0.273606,-0.035823,0.034606,-0.079978
wheelbase,0.129729,-0.531954,1.0,0.874587,0.795144,0.589435,0.776386,0.569329,0.48875,0.160959,0.249786,0.353294,-0.360469,-0.470414,-0.544082,0.577816
carlength,0.170636,-0.357612,0.874587,1.0,0.841118,0.491029,0.877728,0.68336,0.606454,0.129533,0.158414,0.552623,-0.287242,-0.670909,-0.704662,0.68292
carwidth,0.052387,-0.232919,0.795144,0.841118,1.0,0.27921,0.867032,0.735433,0.55915,0.182942,0.181129,0.640732,-0.220012,-0.642704,-0.677218,0.759325
carheight,0.25596,-0.541038,0.589435,0.491029,0.27921,1.0,0.295572,0.067149,0.171071,-0.055307,0.261214,-0.108802,-0.320411,-0.04864,-0.107358,0.119336
curbweight,0.071962,-0.227691,0.776386,0.877728,0.867032,0.295572,1.0,0.850594,0.64848,0.16879,0.151362,0.750739,-0.266243,-0.757414,-0.797465,0.835305
enginesize,-0.03393,-0.10579,0.569329,0.68336,0.735433,0.067149,0.850594,1.0,0.583774,0.203129,0.028971,0.809769,-0.24466,-0.653658,-0.67747,0.874145
boreratio,0.260064,-0.130051,0.48875,0.606454,0.55915,0.171071,0.64848,0.583774,1.0,-0.055909,0.005197,0.573677,-0.254976,-0.584532,-0.587012,0.553173
stroke,-0.160824,-0.008735,0.160959,0.129533,0.182942,-0.055307,0.16879,0.203129,-0.055909,1.0,0.18611,0.08094,-0.067964,-0.042145,-0.043931,0.079443


<p>Here we see that the parameters that price depends on the most are: carlength, carwidth, curbweight, enginesize, horsepower, citympg and highwaympg.</p>

In [4]:
df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [5]:
X = df[['horsepower', 'enginesize', 'citympg', 'highwaympg', 'curbweight', 'carlength', 'carwidth']]#independent variables 
y = df[['price']]#target variable

lr = linear_model.LinearRegression()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)#splitting variables into training and testing sets

model = lr.fit(X_train, y_train)#fitting and training the model
prediction = model.predict(X_test)#making predicts using the test set
print(prediction)#printing predicted values
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
print(model.score(X,y))#accuracy of the model

[[37200.97155967]
 [ 5662.03175894]
 [13858.78130483]
 [ 5760.21217482]
 [19261.49159467]
 [12103.98516483]
 [26254.86929349]
 [18760.43570155]
 [ 5545.16844874]
 [18592.41416235]
 [ 8664.78715253]
 [18731.98952683]
 [ 5758.70888992]
 [12572.73738433]
 [15251.7794495 ]
 [22205.74584446]
 [18925.44851626]
 [23927.70239154]
 [ 6261.95063465]
 [21750.95575988]
 [15378.54795499]
 [16736.90830634]
 [ 9957.2263788 ]
 [ 6461.42093136]
 [ 5953.0079286 ]
 [ 5712.11078092]
 [15755.24105698]
 [12027.61173791]
 [11422.78632803]
 [-1056.05725645]
 [ 8450.03659135]
 [39282.75399562]
 [ 6875.98398923]
 [19552.71348473]
 [34060.76759954]
 [19888.75656314]
 [ 6439.38135881]
 [24147.68864343]
 [ 8523.89557967]
 [16547.07276992]
 [15719.61349353]]
Intercept: [-62662.33702811]
Coefficients: [[ 50.69236636  78.14252008 -42.1923286   57.6493361    3.05493708
  -42.35721314 908.20619398]]
0.8164983116607906


<h3>Using Gradient Descent</h3>

In [6]:

Xfinal = X.copy()
Xfinal = X/np.max(X)
Xfinal = Xfinal.values#converting and normalizing independent variables 


rows, cols = Xfinal.shape
intercept = np.ones((rows,1))
Xfinal = np.append(Xfinal, intercept, axis=1)#adding bias for constant term
print(Xfinal)
m = np.random.randn(cols+1,1)#initializing m 
print(m)
yfinal = y.copy()
yfinal = y/np.max(y)
yfinal = yfinal.values#converting and normalizing target variable 
print(yfinal)


[[0.38541667 0.39877301 0.42857143 ... 0.81114849 0.88658368 1.        ]
 [0.38541667 0.39877301 0.42857143 ... 0.81114849 0.88658368 1.        ]
 [0.53472222 0.46625767 0.3877551  ... 0.8226814  0.90594744 1.        ]
 ...
 [0.46527778 0.53067485 0.36734694 ... 0.90725613 0.95297372 1.        ]
 [0.36805556 0.44478528 0.53061224 ... 0.90725613 0.95297372 1.        ]
 [0.39583333 0.43251534 0.3877551  ... 0.90725613 0.95297372 1.        ]]
[[-0.72012142]
 [-0.2060893 ]
 [ 0.6282948 ]
 [ 0.43907103]
 [ 1.35571759]
 [-1.30434111]
 [ 0.63657833]
 [-0.47593915]]
[[0.2972467 ]
 [0.36343612]
 [0.36343612]
 [0.30726872]
 [0.38436123]
 [0.33590308]
 [0.39008811]
 [0.41674009]
 [0.52588106]
 [0.39337372]
 [0.36189427]
 [0.37279736]
 [0.46189427]
 [0.46486784]
 [0.5410793 ]
 [0.67753304]
 [0.91002203]
 [0.8123348 ]
 [0.11345815]
 [0.13865639]
 [0.14482379]
 [0.12273128]
 [0.14046256]
 [0.17526432]
 [0.13720264]
 [0.14740088]
 [0.16759912]
 [0.1885022 ]
 [0.1964978 ]
 [0.28555066]
 [0.14270925]
 

In [7]:
n = len(y)
def gradientDescent(X, y, slope, alpha = 0.3, current_iterations = 0, stepsize = 1, costfunc = 0):#function for gradient descent
    while abs(stepsize) > 0.000001:
        h = np.dot(X, slope) 
        slope = slope - alpha*(1/n)*(X.T.dot(h-y))
        prevcf = costfunc 
        costfunc =  np.sum((h-y)**2)
        stepsize = prevcf - costfunc
        current_iterations = current_iterations+1
        print(costfunc)        
    return slope, costfunc

In [8]:
final, finalcf = gradientDescent(Xfinal, yfinal, m)#training the model
print("Final Values of Theta:\n", final, "\nFinal Cost Function:", finalcf)

25.84332944121399
17.15541713958387
16.370939498635398
15.768823544410992
15.194981619763771
14.645320175640155
14.11876184460731
13.61433105728993
13.131094517474422
12.668158324241652
12.224666288272529
11.79979834044331
11.39276900798031
11.002825954910945
10.629248584090423
10.271346698206115
9.928459217271476
9.599952950226355
9.28522141836103
8.983683728377596
8.694783492994688
8.417987797089706
8.152786207457444
7.898689824344995
7.655230373000555
7.421959333547914
7.198447107569819
6.984282219851535
6.779070553801219
6.582434619126394
6.394012850405689
6.213458935252413
6.040441170821602
5.874641847464697
5.715756658386626
5.563494134208263
5.417575101383565
5.277732163464993
5.143709204253333
5.015260911908587
4.892152323137719
4.774158386612152
4.661063544803856
4.552661333462891
4.448753997992217
4.349152126006855
4.253674295394663
4.162146737224697
4.0744030128768
3.9902837047924518
3.90963612027221
3.8323140077693285
3.758177285152398
3.6870917794320253
3.618928977467922
3

In [9]:
predictions = np.dot(Xfinal,final)
print(predictions*45400)#printing predictions
print("R^2:",np.sum((predictions - np.mean(yfinal))**2)/np.sum((yfinal - np.mean(yfinal))**2))


[[1.32678971e+04]
 [1.32678971e+04]
 [1.87726083e+04]
 [1.08039607e+04]
 [1.55931219e+04]
 [1.41323394e+04]
 [1.71819146e+04]
 [1.76528934e+04]
 [1.87858955e+04]
 [1.89395871e+04]
 [1.02217761e+04]
 [1.02217761e+04]
 [1.71892839e+04]
 [1.74247733e+04]
 [1.87762151e+04]
 [2.59893166e+04]
 [2.68403641e+04]
 [2.85695085e+04]
 [3.91793066e+01]
 [5.70410571e+03]
 [5.68093075e+03]
 [5.54166752e+03]
 [5.85073237e+03]
 [8.95721244e+03]
 [6.24036027e+03]
 [6.33455602e+03]
 [6.33455602e+03]
 [9.22695483e+03]
 [1.14596036e+04]
 [1.87417797e+04]
 [5.48083856e+03]
 [6.92343026e+03]
 [4.65044777e+03]
 [6.86511041e+03]
 [6.93361641e+03]
 [6.36529773e+03]
 [6.75161712e+03]
 [9.81393795e+03]
 [1.00408641e+04]
 [9.63372586e+03]
 [8.58783306e+03]
 [1.07657075e+04]
 [1.09606615e+04]
 [8.01262776e+03]
 [5.70410571e+03]
 [5.85396259e+03]
 [1.24628488e+04]
 [3.38895190e+04]
 [3.38895190e+04]
 [4.37055426e+04]
 [5.47633252e+03]
 [6.12735120e+03]
 [6.14875933e+03]
 [5.86059496e+03]
 [5.88200309e+03]
 [7.912431