In [1]:
# Import libraries for data manipulation
import pandas as pd
import numpy as np

# Import libraries for data visualization
from matplotlib import pyplot
import matplotlib.pyplot as plt
import seaborn as sns

# Import libraries for building linear regression model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge

# Import library for preparing data
from sklearn.model_selection import train_test_split

# Import library for data preprocessing

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


# Import libraries for scoring models
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

#### 1. import, X-y separation, train_test_split
#### 2. EDA
#### 3. create 6 datasets
#### 4. impute, encoding, separate cat/num
#### 5. build models
#### 6. score models

In [2]:
data = pd.read_csv("auto_1993_adj.csv")
df = data.copy()

In [3]:
# df.head()

In [4]:
# df.describe().T

In [5]:
# df.info()

In [6]:
# We will drop "ID" as it is unlikely to contribute to our analysis
df = df.drop("ID", axis=1)
features = df.drop("mpg", axis=1)
target = df["mpg"]

#### Let's look at the data.

In [7]:
# df.hist(bins=25, figsize = (12,7))

In [8]:
# corr_matrix = df.corr()
# plt.figure(figsize=(8, 6))
# sns.heatmap(corr_matrix, annot=True, cmap="YlOrRd")
# plt.show()

In [9]:
# sns.pairplot(data=df, corner=True, height=3)
# plt.show()


In [10]:
df.head(1)

Unnamed: 0,displacement,cylinders,horsepower,weight,acceleration,model_year,origin,mpg
0,307.0,8,130.0,3504,12.0,70,1,18.0


In [20]:
X = data.copy()
X = X.dropna(axis=0)
y = X.pop("mpg")

# All discrete features should now have integer dtypes (double-check this before using MI!)
discrete_features = X.dtypes == int

In [18]:
from sklearn.feature_selection import mutual_info_regression
def make_mi_scores(X, y, discrete_features):
    print(X.head())
    print(y.shape)
    print(discrete_features)
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

mi_scores = make_mi_scores(X, y, discrete_features)
mi_scores[::3]  # show a few features with their MI scores

   ID  displacement  cylinders  horsepower  weight  acceleration  model_year  \
0   1         307.0          8       130.0    3504          12.0          70   
1   2         350.0          8       165.0    3693          11.5          70   
2   3         318.0          8       150.0    3436          11.0          70   
3   4         304.0          8       150.0    3433          12.0          70   
4   5         302.0          8       140.0    3449          10.5          70   

   origin  
0       1  
1       1  
2       1  
3       1  
4       1  
(392,)
ID               True
displacement    False
cylinders        True
horsepower      False
weight           True
acceleration    False
model_year       True
origin           True
dtype: bool


ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.

In [None]:
# 6 Distinct Feature Sets
# # Set 1
df1 = features
df2 = features[["displacement", "horsepower"]]
df3 = df[["cylinders"]]
df4 = df[["model_year"]]
# df5 = df[["displacement", "horsepower"]]
# df6 = df[["displacement", "horsepower"]]


In [None]:
def auto_regres(data, model):
    # Separate features and target
    X = data
    global target
    y = target
    # train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=16
    )

    # Separate CAT & NUM features
    cat_cols = ["cylinders", "origin"]
    num_cols = ["displacement", "horsepower", "weight", "acceleration", "model_year"]
    cat_cols = [col for col in data.columns if col in cat_cols]
    num_cols = [col for col in data.columns if col in num_cols]

    # Transform NUM Data
    numerical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer()),
            ("scale", StandardScaler()),
        ]
    )

    # Transform CAT Data
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )
    # Create Data Preprocessor Process
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numerical_transformer, num_cols),
            ("cat", categorical_transformer, cat_cols),
        ]
    )

    # # Define models
    # knn = KNeighborsRegressor()
    # svr = SVR()
    # ridge = Ridge()

    # Preprocess data then create model
    mod_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    # Preprocess training data, fit model
    mod_pipeline.fit(X_train, y_train)

    # Get predictions for training data
    preds = mod_pipeline.predict(X_train)

    print(" MAE:", mean_squared_error(y_train, preds))
    print("R_2:", r2_score(y_train, preds))

In [None]:
# Define models
knn = KNeighborsRegressor()
svr = SVR()
ridge = Ridge()

In [None]:
auto_regres(df4, ridge)

In [None]:
# def cv_regres(data, model):
#     # Separate features and target
#     X = data.drop("mpg", axis=1)
#     y = data["mpg"]

#     # train test split
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.25, random_state=16
#     )

#     # Separate CAT & NUM features
#     cat_cols = ["cylinders", "origin"]
#     num_cols = ["displacement", "horsepower", "weight", "acceleration", "model_year"]
#     cat_cols = [col for col in data.columns if col in cat_cols]
#     num_cols = [col for col in data.columns if col in num_cols]

#     # Transform NUM Data
#     numerical_transformer = Pipeline(
#         steps=[
#             ("imputer", SimpleImputer()),
#             ("scale", StandardScaler()),
#         ]
#     )

#     # Transform CAT Data
#     categorical_transformer = Pipeline(
#         steps=[
#             ("imputer", SimpleImputer(strategy="most_frequent")),
#             ("onehot", OneHotEncoder(handle_unknown="ignore")),
#         ]
#     )
#     # Create Data Preprocessor Process
#     preprocessor = ColumnTransformer(
#         transformers=[
#             ("num", numerical_transformer, num_cols),
#             ("cat", categorical_transformer, cat_cols),
#         ]
#     )

#     # Define models
#     knn = KNeighborsRegressor()
#     svr = SVR()
#     ridge = Ridge()

#     # Preprocess data then create model
#     mod_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

#     # Cross-Validation Code
#     scores = -1 * cross_val_score(
#         mod_pipeline, X, y, cv=5, scoring="neg_mean_absolute_error"
#     )

#     print("MAE scores:\n", scores)

In [None]:
auto_regres(df1,ridge)

In [None]:
# simple_imputer = SimpleImputer()
# X_train_imputed = pd.DataFrame(simple_imputer.fit_transform(X_train))
# X_test_imputed = pd.DataFrame(simple_imputer.transform(X_test))
# X_train_imputed.columns = X_train.columns
# X_test_imputed.columns = X_test.columns

In [None]:
# Preprocessing for numerical data
numerical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer()),
        ("scale", StandardScaler()),
    ]
)

# Preprocessing for categorical data
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

In [None]:
# # Apply ordinal encoder to each column with categorical data
# ordinal_encoder = OrdinalEncoder()
# label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
# label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

In [None]:
# # One Hot Encoder
# # Apply one-hot encoder to each column with categorical data
# OHE = OneHotEncoder(handle_unknown="ignore", sparse=False)
# OH_cols_train = pd.DataFrame(OHE.fit_transform(X_train_imputed[cat_cols]))
# OH_cols_valid = pd.DataFrame(OHE.transform(X_test_imputed[cat_cols]))

# # One-hot encoding removed index; put it back
# OH_cols_train.index = X_train_imputed.index
# OH_cols_valid.index = X_test_imputed.index

# # Remove categorical columns (will replace with one-hot encoding)
# num_X_train_imputed = X_train_imputed.drop(cat_cols, axis=1)
# num_X_test_imputed = X_test_imputed.drop(cat_cols, axis=1)

# # Add one-hot encoded columns to numerical features
# OH_X_train_imputed = pd.concat([num_X_train_imputed, OH_cols_train], axis=1)
# OH_X_test_imputed = pd.concat([num_X_test_imputed, OH_cols_valid], axis=1)

# # Ensure all columns have string type
# OH_X_train_imputed.columns = OH_X_train_imputed.columns.astype(str)
# OH_X_test_imputed.columns = OH_X_test_imputed.columns.astype(str)

# # print("MAE from Approach 3 (One-Hot Encoding):")
# # print(score_dataset(OH_X_train_imputed, OH_X_test_imputed, y_train, y_valid))

In [None]:
# OH_X_train_imputed.head(15)
# OH_X_train_imputed.rename(
#     columns={
#         "0": "cyl_3",
#         "1": "cyl_4",
#         "2": "cyl_5",
#         "3": "cyl_6",
#         "4": "cyl_8",
#         "5": "org_1",
#         "6": "org_2",
#         "7": "org_3"},
#     inplace=True,
# )

#### Modeling

In [None]:
# Define models
knn = KNeighborsRegressor()
svr = SVR()
ridge = Ridge()

# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[("preprocessor", preprocessor), ("model", knn)])

# Preprocessing of training data, fit model
clf.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = clf.predict(X_train)

# score = mean_absolute_error(y_valid, preds)
print(" MAE:", mean_squared_error(y_train, preds))
print("R_2:", r2_score(y_train, preds))