In [17]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,MinMaxScaler

In [18]:
url='https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
columns=["symboling","normalized_losses","make","fuel_type","aspiration","num_doors",
        "body_style","drive_wheels","engine_location","wheel_base","length","width","height",
        "curb_weight","engine_type","num_cylinders","engine_size","fuel_system","bore","stroke",
        "compression_ratio","horsepower","peak_rpm","city_mpg","highway_mpg","price"]
df=pd.read_csv(url,names=columns)

In [19]:
df.replace('?',np.nan,inplace=True)

In [20]:
df

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
201,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
202,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
203,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470


In [21]:
df['price'].isnull().sum()

4

In [22]:
df.isnull().sum()

symboling             0
normalized_losses    41
make                  0
fuel_type             0
aspiration            0
num_doors             2
body_style            0
drive_wheels          0
engine_location       0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_cylinders         0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 4
dtype: int64

In [23]:
df.dropna(subset='price',inplace=True)

In [24]:
df.shape

(201, 26)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          201 non-null    int64  
 1   normalized_losses  164 non-null    object 
 2   make               201 non-null    object 
 3   fuel_type          201 non-null    object 
 4   aspiration         201 non-null    object 
 5   num_doors          199 non-null    object 
 6   body_style         201 non-null    object 
 7   drive_wheels       201 non-null    object 
 8   engine_location    201 non-null    object 
 9   wheel_base         201 non-null    float64
 10  length             201 non-null    float64
 11  width              201 non-null    float64
 12  height             201 non-null    float64
 13  curb_weight        201 non-null    int64  
 14  engine_type        201 non-null    object 
 15  num_cylinders      201 non-null    object 
 16  engine_size        201 non

In [26]:
for col in df.columns:
    if df[col].dtype in ['int64','float64']:
        df[col].fillna(df[col].mean(),inplace=True)
    else:
        df[col].fillna(df[col].mode()[0],inplace=True)

In [27]:
df.isnull().sum()

symboling            0
normalized_losses    0
make                 0
fuel_type            0
aspiration           0
num_doors            0
body_style           0
drive_wheels         0
engine_location      0
wheel_base           0
length               0
width                0
height               0
curb_weight          0
engine_type          0
num_cylinders        0
engine_size          0
fuel_system          0
bore                 0
stroke               0
compression_ratio    0
horsepower           0
peak_rpm             0
city_mpg             0
highway_mpg          0
price                0
dtype: int64

In [29]:
df['num_doors'].unique()

array(['two', 'four'], dtype=object)

In [30]:
df['num_cylinders'].unique()

array(['four', 'six', 'five', 'three', 'twelve', 'two', 'eight'],
      dtype=object)

In [31]:
word_to_num={
    'one':1,'two':2,'three':3,'four':4,'five':5,'six':6,'seven':7,'eight':8,'nine':9,'ten':10,'eleven':11,'twelve':12
}

In [32]:
df['num_doors']=df['num_doors'].map(word_to_num)
df['num_cylinders']=df['num_cylinders'].map(word_to_num)

In [36]:
df=pd.get_dummies(df,columns=['body_style','drive_wheels'])

In [38]:
le=LabelEncoder()

for col in ['make','aspiration','engine_location','fuel_type']:
    df[col]=le.fit_transform(df[col])

In [40]:
df['fuel_system']=df['fuel_system'].apply(lambda x:1 if x=='pfi' else 0)
df['engine_type']=df['engine_type'].apply(lambda x:1 if x=='ohc' else 0)

In [41]:
df

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,engine_location,wheel_base,length,width,...,highway_mpg,price,body_style_convertible,body_style_hardtop,body_style_hatchback,body_style_sedan,body_style_wagon,drive_wheels_4wd,drive_wheels_fwd,drive_wheels_rwd
0,3,161,0,1,0,2,0,88.6,168.8,64.1,...,27,13495,1,0,0,0,0,0,0,1
1,3,161,0,1,0,2,0,88.6,168.8,64.1,...,27,16500,1,0,0,0,0,0,0,1
2,1,161,0,1,0,2,0,94.5,171.2,65.5,...,26,16500,0,0,1,0,0,0,0,1
3,2,164,1,1,0,4,0,99.8,176.6,66.2,...,30,13950,0,0,0,1,0,0,1,0
4,2,164,1,1,0,4,0,99.4,176.6,66.4,...,22,17450,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,21,1,0,4,0,109.1,188.8,68.9,...,28,16845,0,0,0,1,0,0,0,1
201,-1,95,21,1,1,4,0,109.1,188.8,68.8,...,25,19045,0,0,0,1,0,0,0,1
202,-1,95,21,1,0,4,0,109.1,188.8,68.9,...,23,21485,0,0,0,1,0,0,0,1
203,-1,95,21,0,1,4,0,109.1,188.8,68.9,...,27,22470,0,0,0,1,0,0,0,1


In [43]:
X=df.drop('price',axis=1)
y=df['price']

In [44]:
scaler=MinMaxScaler()
X_scaled=scaler.fit_transform(X)

In [59]:
X_train,X_test,y_train,y_test=train_test_split(X_scaled,y,test_size=0.3,random_state=42)

In [60]:
model=LinearRegression()
model.fit(X_train,y_train)
model.predict(X_test)

array([ 7310.1514928 , 25249.86850518,  6254.27706211,  6615.04107352,
       12873.23310891,  5231.30707742, 27764.96962862,  9627.12379188,
       20064.10142984, 29855.00993948, 23066.20314378,  9953.12506768,
       14257.83777915,  9856.70661818, 13603.3287356 ,  7554.61406877,
       10731.37055669,  6603.02917665, 10261.45049678, 30528.53379438,
       32528.        , 29820.51088264,  6265.36269515,  7809.89111256,
       23157.03891036, 11996.94331477, 11231.65210117, 26447.78925174,
       35212.33148481, 14332.11981404, 15472.32133103,  7122.83457155,
       10216.68023124,  7115.23130497,  7669.26355055, 15657.67779156,
        5306.35891072,  7980.811066  , 14412.1896852 ,  7529.74956955,
       14068.63467653, 21574.10086733, 10635.82867356,  8149.7020834 ,
       17813.88452175, 18662.51106687,  3676.24970063,  7719.43000184,
        6865.76768288,  6799.12134463, 18732.69101716, 20027.32596315,
       15350.31380829,  8023.57220738,  6895.14595634,  6731.14016804,
      

In [61]:
model.score(X_train,y_train)

0.9068335669887636

In [62]:
pca=PCA(n_components=0.95)
X_pca=pca.fit_transform(X_scaled)
X_train,X_test,y_train,y_test=train_test_split(X_pca,y,test_size=0.3,random_state=42)

In [63]:
model.fit(X_train,y_train)
model.predict(X_test)

array([12737.75330533, 25447.13946542,  4270.32385618,  7460.07761148,
       13269.88739752,  3469.9087912 , 25905.07830144, 11005.37308006,
       18414.42566141, 30336.05004666, 28102.44413749,  7081.6132941 ,
       13667.64552604, 11286.26235611, 14066.60030702,  7108.08137705,
       12683.20276657,  6878.37820167, 12046.30861413, 28840.4579391 ,
       23440.22416519, 27585.32511573,  4003.98615267,  7344.97942482,
       21710.88463232, 12010.893952  , 11264.7618347 , 25556.78215733,
       18785.69419737, 14124.23892919, 14864.0919596 ,  4870.90069964,
       13527.98273525,  5563.89700338,  5435.69069231, 18615.09250961,
        7513.50455937,  8798.5939512 , 19763.75454624,  7301.56517625,
       15909.27655199, 19032.98840489, 11127.91529927,  7231.0238424 ,
       16102.92489003, 16991.86554271,  1802.04463884,  5900.38631215,
        8225.88342013,  7582.51024313, 18968.13299342, 18242.29391451,
       16831.61744305, 11603.39989221,  4250.22021774,  4531.97937112,
      

In [64]:
model.score(X_train,y_train)

0.8244339152904524

In [None]:
#no improvment