In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Auto-MPG linear-regression walkthrough: download the data, drop incomplete
# rows, min-max scale every numeric column, fit an ordinary least-squares
# model on an 80/20 split and report MSE / R² on the held-out part.

# Step 1: Load Dataset
# The file has no header row, so column names are supplied explicitly.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
           'acceleration', 'model_year', 'origin', 'car_name']
# `sep=r'\s+'` is the supported replacement for the deprecated
# `delim_whitespace=True` (which emits a FutureWarning on recent pandas).
# '?' entries are read as NaN so they can be dropped below.
df = pd.read_csv(url, names=columns, sep=r'\s+', na_values='?')

# Step 2: Data Cleaning
# Record shape / per-column NaN counts before and after dropping rows so the
# effect of the cleaning step can be printed at the end.
initial_shape = df.shape
missing_values = df.isnull().sum()
df_cleaned = df.dropna()
after_drop_shape = df_cleaned.shape
# car_name is dropped before scaling and modelling; every remaining column
# is passed to the scaler.
df_cleaned = df_cleaned.drop(columns=['car_name'])

# Step 3: Normalize Data
# Every remaining column, including the target 'mpg', is rescaled with
# MinMaxScaler, so the MSE reported below is in scaled units, not mpg².
# NOTE(review): the scaler is fitted on all rows *before* the train/test
# split, so test-set minima/maxima influence the scaling. Consider fitting
# on the training split only if this is reused with other estimators.
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(df_cleaned)
df_normalized = pd.DataFrame(normalized_data, columns=df_cleaned.columns)

# Step 4: Train-Test Split
# 80% train / 20% test; random_state is pinned so the split is reproducible.
X = df_normalized.drop(columns=['mpg'])
y = df_normalized['mpg']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 6: Evaluate Model
# Metrics are computed on the held-out test split only.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output
print("Initial shape:", initial_shape)
print("Missing values:\n", missing_values)
print("Shape after dropping missing values:", after_drop_shape)
print("Mean Squared Error:", mse)
print("R² Score:", r2)


  df = pd.read_csv(url, names=columns, delim_whitespace=True, na_values='?')


Initial shape: (398, 9)
Missing values:
 mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64
Shape after dropping missing values: (392, 9)
Mean Squared Error: 0.007576154664751017
R² Score: 0.7901500386760345
