In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA

cols = ["symboling", "normalized_losses","make","fuel_type","aspiration","num_doors","body_style","drive_wheels",
        "engine_location","wheel_base","length","width","height","curb_weight","engine_type","num_cylinders",
        "engine_size","fuel_system","bore","stroke","compression_ratio","horsepower","peak_rpm","city_mpg",
        "highway_mpg","price"]

data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
                   names=cols)

data.replace("?", np.nan, inplace=True)

for c in data.columns:
    if c != "price":
        if data[c].dtype == "object":
            data[c].fillna(data[c].mode()[0], inplace=True)
        else:
            data[c] = pd.to_numeric(data[c], errors="coerce")
            data[c].fillna(data[c].mean(), inplace=True)

data = data.dropna(subset=["price"])
data["price"] = pd.to_numeric(data["price"], errors="coerce")

map_doors = {"two":2, "four":4}
map_cyl = {"two":2,"three":3,"four":4,"five":5,"six":6,"eight":8,"twelve":12}
data["num_doors"] = data["num_doors"].replace(map_doors)
data["num_cylinders"] = data["num_cylinders"].replace(map_cyl)

data = pd.get_dummies(data, columns=["body_style","drive_wheels"])

for c in ["make","aspiration","engine_location","fuel_type"]:
    le = LabelEncoder()
    data[c] = le.fit_transform(data[c])

data["fuel_system"] = data["fuel_system"].apply(lambda x: 1 if "pfi" in str(x) else 0)
data["engine_type"] = data["engine_type"].apply(lambda x: 1 if "ohc" in str(x) else 0)

X = data.drop("price", axis=1).values
y = data["price"].values

sc = StandardScaler()
X = sc.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print("R2 without PCA:", r2_score(y_test, y_pred))

pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X)

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_reduced, y, test_size=0.3, random_state=42)

reg2 = LinearRegression()
reg2.fit(X_train_r, y_train_r)
y_pred_r = reg2.predict(X_test_r)
print("R2 with PCA:", r2_score(y_test_r, y_pred_r))

R2 without PCA: 0.8684846435357864
R2 with PCA: 0.8680881801854045


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[c].fillna(data[c].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[c].fillna(data[c].mode()[0], inplace=True)
  data["num_doors"] = data["num_doors"].replace(map_doors)
  data["num_cylinders"] = data["num_cylinders"].replace(map_cyl)
  return X @ co