In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Load the raw table from the CSV next to the notebook, then report how many
# rows are exact duplicates of an earlier row.
data = pd.read_csv("auto-mpg.csv")
duplicate_rows = data.duplicated()
print(duplicate_rows.sum())

In [None]:
# Per-column count of missing values (shown as the cell output).
data.isnull().sum()

In [None]:
# Split the loaded table by dtype: int64/float64 columns form the numeric
# frame, object-dtype columns form the categorical frame.
numeric_kinds = ["int64", "float64"]
num_df = data.select_dtypes(include=numeric_kinds)
cat_df = data.select_dtypes(include="object")
for part in (num_df, cat_df):
    print(part)

In [None]:
# Draw one histogram per numeric column so each distribution can be inspected.
num_cols = num_df.columns.to_list()
print(num_cols)
for column_name in num_cols:
    observed = num_df[column_name].dropna()  # missing values are left out of the plot
    plt.hist(observed)
    plt.xlabel(column_name)
    plt.ylabel("counts")
    plt.show()

In [None]:
# Impute missing numeric values with the median of their own column.
# The medians are taken up front; filling one column does not affect another.
column_medians = num_df[num_cols].median()
for col in num_cols:
    filled = num_df[col].fillna(column_medians[col])
    num_df[col] = filled

# Every count printed here should now be zero.
remaining_missing = num_df.isna().sum()
print(remaining_missing)

In [None]:
# Box plot of every numeric column on one shared axis, before outlier clipping.
# The boxes are drawn at positions 1..N in column order, so the ticks are
# relabelled with the column names to make each box identifiable.
plt.boxplot(num_df)
box_labels = list(num_df.columns)
plt.xticks(range(1, len(box_labels) + 1), box_labels, rotation=45)
plt.title("Numeric columns before outlier clipping")
plt.show()

In [None]:
def clip_outliers(df, column_name, factor=1.5):
    """Return ``df[column_name]`` clipped to its inter-quartile-range fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of the column. Values outside the
    fences are replaced by the nearest fence; values inside are unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is NOT modified; assign the return value
        back (``df[col] = clip_outliers(df, col)``) to update it.
    column_name : str
        Name of the column to clip.
    factor : float, default 1.5
        Multiple of the IQR used for the fences. Must be non-negative.

    Returns
    -------
    pandas.Series
        A new, clipped series with the same index as ``df[column_name]``.

    Raises
    ------
    ValueError
        If ``factor`` is negative (the lower fence would exceed the upper one).
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = df[column_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # One clip call applies both fences and returns a new Series, so the
    # caller's frame is never written to from inside this function.
    return values.clip(lower=lower_bound, upper=upper_bound)

# Clip every numeric column to its IQR fences, then draw a single box plot of
# the result. The plot belongs after the loop: inside it, one full figure was
# produced per column, each showing a partially clipped frame.
# NOTE(review): num_cols includes the "mpg" column that is later used as the
# target, so the target is clipped too - confirm that is intended.
for col in num_cols:
    num_df[col] = clip_outliers(num_df, col)

plt.boxplot(num_df)
plt.show()

In [None]:
# Peek at the first two rows of the numeric frame (shown as the cell output).
num_df.iloc[:2]

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on the numeric columns and keep the fitted object in
# `min_max_scaler`. The scaled values go into their own variable instead of
# replacing `num_df`, so the frame keeps its column names, this cell can be
# re-run, and later cells do not scale already-scaled data.
# NOTE(review): the scaler is fitted on all rows (the train/test split happens
# later) and on every numeric column including "mpg" - confirm that is intended.
min_max_scaler = MinMaxScaler()
num_minmax = min_max_scaler.fit_transform(num_df)
print(num_minmax[:5])  # first rows only; the full array is long

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the current contents of num_df column by column and rebind
# num_df to the transformed values; later cells read the result from num_df.
standard_scaler = StandardScaler()
standard_scaler.fit(num_df)
num_df = standard_scaler.transform(num_df)
print(num_df)

In [None]:
import pickle

# Persist the fitted min-max scaler to disk so it can be reloaded later.
with open("min_max_scaler.pkl", "wb") as scaler_file:
    scaler_file.write(pickle.dumps(min_max_scaler))

In [None]:
# Reload the scaler from the pickle file and show its fitted parameters.
# Unpickling runs code stored in the file, so only load files you trust.
with open("min_max_scaler.pkl", "rb") as scaler_file:
    min_max_scaler = pickle.loads(scaler_file.read())

for label, value in (
    ("Min:", min_max_scaler.min_),
    ("Scale:", min_max_scaler.scale_),
    ("Feature range:", min_max_scaler.feature_range),
):
    print(label, value)

In [None]:
# Scale one hand-written sample with the reloaded scaler. The values must be
# listed in the same column order the scaler was fitted with.
test_input = np.array([[20, 4, 150, 2500, 15, 80, 1]])

# A scaler fitted on a DataFrame records the column names. Passing a frame
# carrying those names avoids scikit-learn's "X does not have valid feature
# names" warning and fails loudly if the sample has the wrong number of values.
# A scaler fitted on a plain array has no names, so the array is used as-is.
feature_names = getattr(min_max_scaler, "feature_names_in_", None)
if feature_names is not None:
    test_frame = pd.DataFrame(test_input, columns=feature_names)
else:
    test_frame = test_input

test_output = min_max_scaler.transform(test_frame)
print(test_output)

In [None]:
# Stand-alone one-hot encoding example on a toy "color" column.
# The toy values get their own name so the `data` DataFrame loaded from the CSV
# is not overwritten by this demo.
color_data = {'color': ['yellow', 'green', 'blue', 'yellow', 'green']}
df = pd.DataFrame(color_data)

encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(df[['color']])

# Name the indicator columns from the categories the encoder learned for the
# single input column, instead of a hand-typed list that has to match their order.
encoded_df = pd.DataFrame(encoded_data, columns=encoder.categories_[0], index=df.index)
final_df = pd.concat([df, encoded_df], axis=1)
print(final_df)

In [None]:
# Replace every categorical column with integer codes, one LabelEncoder per
# column. Each fitted encoder is kept in `label_encoders` (keyed by column
# name) so the codes can be mapped back with `inverse_transform` or applied to
# new rows with `transform`; previously each encoder was lost on the next
# loop iteration.
cat_cols = cat_df.columns.tolist()
label_encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    cat_df[col] = encoder.fit_transform(cat_df[col])
    label_encoders[col] = encoder
print(cat_df.head())


In [None]:
# Put the values currently held in num_df into a frame labelled with num_cols.
num_df_scaled = pd.DataFrame(num_df, columns=num_cols)

# "mpg" is the target; every remaining column is used as a feature.
X = num_df_scaled.drop(columns="mpg")
y = num_df_scaled["mpg"]

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
X_train