In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import pearsonr
from statsmodels.stats.proportion import proportions_ztest
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv("imports-85.data",header=None,names=['symboling','normalizedLosses','make','fuelType','aspiration','numofdoors','bodyStyle','driveWheels','engineLocation','wheelBase','length','width','height','curbWeight','engineType','cylinders','engineSize','fuelSystem','bore','stroke','compRatio','horsepower','peak_rpm','city_mpg','highway_mpg','price'],na_values='?')

In [3]:
df.head()

Unnamed: 0,symboling,normalizedLosses,make,fuelType,aspiration,numofdoors,bodyStyle,driveWheels,engineLocation,wheelBase,...,engineSize,fuelSystem,bore,stroke,compRatio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [4]:
df.isnull().sum()

symboling            0
normalizedLosses    41
make                 0
fuelType             0
aspiration           0
numofdoors           2
bodyStyle            0
driveWheels          0
engineLocation       0
wheelBase            0
length               0
width                0
height               0
curbWeight           0
engineType           0
cylinders            0
engineSize           0
fuelSystem           0
bore                 4
stroke               4
compRatio            0
horsepower           2
peak_rpm             2
city_mpg             0
highway_mpg          0
price                4
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   symboling         205 non-null    int64  
 1   normalizedLosses  164 non-null    float64
 2   make              205 non-null    object 
 3   fuelType          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   numofdoors        203 non-null    object 
 6   bodyStyle         205 non-null    object 
 7   driveWheels       205 non-null    object 
 8   engineLocation    205 non-null    object 
 9   wheelBase         205 non-null    float64
 10  length            205 non-null    float64
 11  width             205 non-null    float64
 12  height            205 non-null    float64
 13  curbWeight        205 non-null    int64  
 14  engineType        205 non-null    object 
 15  cylinders         205 non-null    object 
 16  engineSize        205 non-null    int64  
 1

In [6]:
df.normalizedLosses.fillna(df.normalizedLosses.mean(),inplace=True)

In [7]:
df.numofdoors.fillna(df.numofdoors.mode().max(),inplace=True)

In [8]:
df.bore.fillna(df.bore.median(),inplace=True)

In [9]:
df.stroke.fillna(df.stroke.mean(),inplace=True)

In [10]:
df.horsepower.fillna(df.horsepower.mean(),inplace=True)

In [11]:
df.peak_rpm.fillna(df.peak_rpm.mean(),inplace=True)

In [12]:
df.price.fillna(df.price.mean(),inplace=True)

In [13]:
df.isnull().sum()

symboling           0
normalizedLosses    0
make                0
fuelType            0
aspiration          0
numofdoors          0
bodyStyle           0
driveWheels         0
engineLocation      0
wheelBase           0
length              0
width               0
height              0
curbWeight          0
engineType          0
cylinders           0
engineSize          0
fuelSystem          0
bore                0
stroke              0
compRatio           0
horsepower          0
peak_rpm            0
city_mpg            0
highway_mpg         0
price               0
dtype: int64

In [14]:
pearsonr(df.symboling,df.price)

(-0.08220143736318973, 0.24130433124264047)

In [15]:
pearsonr(df.normalizedLosses,df.price)

(0.13399873429274337, 0.05543057826318259)

In [16]:
pearsonr(df.wheelBase,df.price)

(0.5831681499789547, 4.527625545686765e-20)

In [17]:
pearsonr(df.length,df.price)

(0.6829862954386219, 1.6498873291218535e-29)

In [18]:
pearsonr(df.width,df.price)

(0.7286988175931839, 3.214520483804664e-35)

In [19]:
pearsonr(df.height,df.price)

(0.13438751871051807, 0.05471982048424776)

In [20]:
pearsonr(df.curbWeight,df.price)

(0.820824733354729, 2.8663321070270534e-51)

In [21]:
pearsonr(df.engineSize,df.price)

(0.8617522436859721, 9.669661923634776e-62)

In [22]:
pearsonr(df.bore,df.price)

(0.5323068133842814, 2.13761337330923e-16)

In [23]:
pearsonr(df.stroke,df.price)

(0.08209529554134327, 0.24191456857746316)

In [24]:
pearsonr(df.compRatio,df.price)

(0.07099036277901556, 0.31178249194905244)

In [25]:
pearsonr(df.horsepower,df.price)

(0.7579169537498177, 1.6076703978130332e-39)

In [26]:
pearsonr(df.peak_rpm,df.price)

(-0.10085406591169085, 0.15019246954232154)

In [27]:
pearsonr(df.city_mpg,df.price)

(-0.6674492651600004, 8.463700497727807e-28)

In [28]:
pearsonr(df.highway_mpg,df.price)

(-0.6905257341183488, 2.23246740583743e-30)

In [29]:
stats.f_oneway(df.price[df.fuelType=='gas'],df.price[df.fuelType=='diesel'])

F_onewayResult(statistic=2.4958589760681935, pvalue=0.11570297526361666)

In [30]:
stats.f_oneway(df.price[df.aspiration=='std'],df.price[df.aspiration=='turbo'])

F_onewayResult(statistic=6.587289855140993, pvalue=0.010991046478632666)

In [31]:
stats.f_oneway(df.price[df.numofdoors=='four'],df.price[df.numofdoors=='two'])

F_onewayResult(statistic=0.35778516571700403, pvalue=0.5504056367624801)

In [32]:
stats.f_oneway(df.price[df.driveWheels=='fwd'],df.price[df.driveWheels=='rwd'],df.price[df.driveWheels=='4wd'])

F_onewayResult(statistic=67.5036668266912, pvalue=3.5392707916047903e-23)

In [33]:
stats.f_oneway(df.price[df.bodyStyle=='convertible'],df.price[df.bodyStyle=='hatchback'],df.price[df.bodyStyle=='sedan'],df.price[df.bodyStyle=='wagon'],df.price[df.bodyStyle=='hardtop'])

F_onewayResult(statistic=9.183927252573389, pvalue=7.844575720929884e-07)

In [34]:
stats.f_oneway(df.price[df.engineLocation=='front'],df.price[df.engineLocation=='rear'])

F_onewayResult(statistic=24.979629190446918, pvalue=1.2486063656822578e-06)

In [35]:
stats.f_oneway(df.price[df.engineType=='ohc'],df.price[df.engineType=='ohcf'],df.price[df.engineType=='ohcv'],df.price[df.engineType=='l'],df.price[df.engineType=='rotor'],df.price[df.engineType=='dohcv'])

F_onewayResult(statistic=8.880041876905345, pvalue=1.3919774978921531e-07)

In [36]:
stats.f_oneway(df.price[df.cylinders=='four'],df.price[df.cylinders=='six'],df.price[df.cylinders=='five'],df.price[df.cylinders=='eight'],df.price[df.cylinders=='two'],df.price[df.cylinders=='three'],df.price[df.cylinders=='twelve'])

F_onewayResult(statistic=45.72705376201727, pvalue=7.149269882584999e-35)

In [37]:
stats.f_oneway(df.price[df.fuelSystem=='mpfi'],df.price[df.fuelSystem=='2bbl'],df.price[df.fuelSystem=='idi'],df.price[df.fuelSystem=='1bbl'],df.price[df.fuelSystem=='spdi'],df.price[df.fuelSystem=='4bbl'],df.price[df.fuelSystem=='mfi'],df.price[df.fuelSystem=='spfi'])

F_onewayResult(statistic=14.797402090222409, pvalue=1.8657347748825717e-15)

### Columns that Not affects price:

symboling, normalizedLosses, make, height, numofdoors, stroke, compRatio, peak_rpm, fuelType 

In [38]:
df.drop(['symboling','normalizedLosses','make','height','numofdoors','stroke','compRatio','peak_rpm','fuelType'],axis=1,inplace=True)

In [39]:
df.columns

Index(['aspiration', 'bodyStyle', 'driveWheels', 'engineLocation', 'wheelBase',
       'length', 'width', 'curbWeight', 'engineType', 'cylinders',
       'engineSize', 'fuelSystem', 'bore', 'horsepower', 'city_mpg',
       'highway_mpg', 'price'],
      dtype='object')

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   aspiration      205 non-null    object 
 1   bodyStyle       205 non-null    object 
 2   driveWheels     205 non-null    object 
 3   engineLocation  205 non-null    object 
 4   wheelBase       205 non-null    float64
 5   length          205 non-null    float64
 6   width           205 non-null    float64
 7   curbWeight      205 non-null    int64  
 8   engineType      205 non-null    object 
 9   cylinders       205 non-null    object 
 10  engineSize      205 non-null    int64  
 11  fuelSystem      205 non-null    object 
 12  bore            205 non-null    float64
 13  horsepower      205 non-null    float64
 14  city_mpg        205 non-null    int64  
 15  highway_mpg     205 non-null    int64  
 16  price           205 non-null    float64
dtypes: float64(6), int64(4), object(7)


In [42]:
df=pd.get_dummies(df)

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   wheelBase              205 non-null    float64
 1   length                 205 non-null    float64
 2   width                  205 non-null    float64
 3   curbWeight             205 non-null    int64  
 4   engineSize             205 non-null    int64  
 5   bore                   205 non-null    float64
 6   horsepower             205 non-null    float64
 7   city_mpg               205 non-null    int64  
 8   highway_mpg            205 non-null    int64  
 9   price                  205 non-null    float64
 10  aspiration_std         205 non-null    uint8  
 11  aspiration_turbo       205 non-null    uint8  
 12  bodyStyle_convertible  205 non-null    uint8  
 13  bodyStyle_hardtop      205 non-null    uint8  
 14  bodyStyle_hatchback    205 non-null    uint8  
 15  bodySt

In [44]:
x = df.drop(['price'],axis=1)

In [45]:
y = df['price']

In [46]:
train_x,test_x,train_y,test_y= train_test_split(x,y,test_size=0.2,random_state=42)

In [47]:
model = LinearRegression()
model.fit(train_x,train_y)

LinearRegression()

In [48]:
predictions = model.predict(test_x)

In [49]:
predictions

array([ 2.58099535e+04,  2.09769551e+04,  1.09153868e+04,  1.15238665e+04,
        2.69752815e+04,  5.68116409e+03,  7.71763062e+03,  8.07753161e+03,
        8.79434430e+03,  9.17163562e+03,  1.67840973e+04,  7.86741739e+03,
        1.70482342e+04,  1.11230070e+04,  4.30210851e+04,  6.17196882e+03,
       -4.10697297e+00,  1.37122133e+04,  1.13264635e+04,  1.00905565e+04,
        1.14158330e+04,  1.58924420e+04,  4.82047732e+03,  4.03720490e+03,
        7.58609895e+03,  2.67291985e+04,  1.26796324e+04,  1.56238784e+04,
        4.85743022e+03,  1.67639206e+04,  2.75441178e+04,  6.19114136e+03,
        6.00519466e+03,  1.75429504e+04,  6.56364644e+03,  2.48796029e+04,
        1.23754594e+04,  1.27323627e+04,  6.62231162e+03,  1.41576350e+04,
        7.07101040e+03])

In [50]:
score = str(model.score(test_x,test_y))

In [51]:
score

'0.8364728664975493'