In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score

# Load the scraped listings into a DataFrame
data = pd.read_csv('/Users/mac/Desktop/DoneDeal WebScraping/DoneDealCars.csv')

# Coerce mileage to numeric (unparseable entries become NaN),
# then keep only the rows that still have a mileage value
data['Miles'] = pd.to_numeric(data['Miles'], errors='coerce')
data = data.dropna(subset=['Miles'])

# Target (y) is the mileage; features (X) are price, year and engine
y = data['Miles']
X = data[['Price', 'Year', 'Engine']]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Preprocess the 'Engine' column using one-hot encoding.
#
# The encoder is fitted on the TRAINING rows only, so nothing about the
# held-out rows influences preprocessing. handle_unknown='ignore' makes an
# engine value that was not seen during fitting encode as an all-zero
# indicator row instead of raising, which is needed both for test rows with
# rare engines and for arbitrary new cars passed to ct.transform later on.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), ['Engine'])],
    remainder='passthrough'  # leave every non-'Engine' column untouched
)

# Learn the engine categories from the training set and encode it
X_train_encoded = ct.fit_transform(X_train)

# Encode the test set with the categories learned from training (no refit)
X_test_encoded = ct.transform(X_test)

# Fit a linear regression on the encoded training features
model = LinearRegression().fit(X_train_encoded, y_train)

# Predict mileage for the held-out rows
y_pred = model.predict(X_test_encoded)

# Score the predictions against the true held-out mileage
mse, r2 = (
    mean_squared_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'R-squared (R2): {r2:.2f}')

# SAMPLE - predict the mileage of a single hypothetical car
new_car_data = pd.DataFrame(
    [{'Price': 25000, 'Year': 2020, 'Engine': '2.0 Diesel'}]
)

# Encode the new row with the already-fitted transformer, then predict
new_car_data_encoded = ct.transform(new_car_data)
predicted_mileage = model.predict(new_car_data_encoded)

print(f'Predicted Mileage for the new car: {predicted_mileage[0]:.2f}')


Mean Squared Error (MSE): 9661990252.44
R-squared (R2): 0.24
Predicted Mileage for the new car: 144514.73
