# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

# --- Phase 1: Data Acquisition & Initial Exploration ---

print("--- Phase 1: Data Acquisition & Initial Exploration ---")

# Load the dataset into `df`.
# 'car_dataset.csv' is resolved relative to the current working directory;
# replace the path below with the full path if the file lives elsewhere.
try:
    df = pd.read_csv('car_dataset.csv')
except FileNotFoundError:
    print("Error: 'car_dataset.csv' not found. Please make sure the file is in the correct directory.")
    # Nothing below can run without the data, so stop here. SystemExit(1) is
    # raised instead of calling the interactive `exit()` helper: `exit()` is
    # only injected by the `site` module and reports success (status 0),
    # whereas this signals the failure to the calling shell.
    raise SystemExit(1)
else:
    print("Dataset loaded successfully!")

# Preview the leading rows to confirm the file parsed as expected.
print("\n--- First 5 rows of the dataset ---")
leading_rows = df.head()
print(leading_rows)

# Column dtypes and non-null counts; DataFrame.info() writes its own output.
print("\n--- Dataset Info ---")
df.info()

# Summary statistics as produced by DataFrame.describe().
print("\n--- Descriptive Statistics ---")
summary_stats = df.describe()
print(summary_stats)

# Per-column count of missing entries.
print("\n--- Missing Values Count ---")
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Cardinality of the categorical columns this script expects. Names that are
# absent from the loaded file are skipped, so the list can be edited freely.
typical_categorical_cols = ['Brand', 'Model', 'Fuel Type', 'Transmission', 'Ownership', 'Color']
print("\n--- Unique Values for Key Categorical Columns (Example Check) ---")
categoricals_found = [name for name in typical_categorical_cols if name in df.columns]
for column_name in categoricals_found:
    distinct_count = df[column_name].nunique()
    # Swap in df[column_name].value_counts() to see the count per category.
    print(f"\nUnique values in '{column_name}': {distinct_count}")

# --- Phase 2: Data Preprocessing ---

print("\n--- Phase 2: Data Preprocessing ---")

# 1. Drop 'Unnamed: 0' when present (presumably a leftover index column
#    written into the CSV; it is removed unconditionally if it exists).
if 'Unnamed: 0' in df.columns:
    df.drop('Unnamed: 0', axis=1, inplace=True)
    print("Dropped 'Unnamed: 0' column.")

# 2. Impute missing values.
# Only the columns named here are imputed; compare them against the
# "Missing Values Count" output printed earlier and adjust as needed.
numerical_cols_to_impute = ['Engine Size', 'Max Power', 'Torque']
categorical_cols_to_impute = ['Fuel Type', 'Transmission']

# Numerical columns: fill gaps with the median (robust to outliers).
for col in numerical_cols_to_impute:
    if col not in df.columns or not df[col].isnull().any():
        continue
    if not pd.api.types.is_numeric_dtype(df[col]):
        # .median() raises on text columns (e.g. values stored with units).
        print(f"Skipped median imputation for '{col}': column is not numeric (dtype {df[col].dtype}).")
        continue
    median_val = df[col].median()
    if pd.isna(median_val):
        # Every value is missing, so there is no median to fill with.
        print(f"Skipped median imputation for '{col}': column has no non-missing values.")
        continue
    # Assign the result back rather than calling fillna(inplace=True) on
    # df[col]: that chained form acts on an intermediate Series, warns on
    # pandas 2.x and leaves df unchanged under Copy-on-Write.
    df[col] = df[col].fillna(median_val)
    print(f"Filled missing values in '{col}' with median: {median_val}")

# Categorical columns: fill gaps with the mode (most frequent category).
for col in categorical_cols_to_impute:
    if col not in df.columns or not df[col].isnull().any():
        continue
    modes = df[col].mode()
    if modes.empty:
        # mode() is empty when every value is missing; indexing it would fail.
        print(f"Skipped mode imputation for '{col}': column has no non-missing values.")
        continue
    mode_val = modes.iloc[0]  # several modes are possible on ties; take the first
    df[col] = df[col].fillna(mode_val)
    print(f"Filled missing values in '{col}' with mode: {mode_val}")

# Report what is still missing; columns not listed above are left untouched.
print("\n--- Missing Values After Handling ---")
print(df.isnull().sum())

# 3. Feature engineering: derive 'Car_Age' from 'Year'.
# 'Year' is assumed to hold the manufacturing year; the age is measured
# against the year in which the script is executed.
if 'Year' not in df.columns:
    print("\n'Year' column not found for 'Car_Age' feature engineering.")
else:
    current_year = pd.Timestamp.now().year
    df['Car_Age'] = current_year - df['Year']
    # 'Year' is replaced by 'Car_Age', so remove the original column.
    df.drop(columns='Year', inplace=True)
    print("\n'Car_Age' feature created and 'Year' column dropped.")
    age_price_preview = df[['Car_Age', 'Price']].head()
    print(age_price_preview)

# Column groups fed to the preprocessing pipeline.
# Categorical columns are listed by name; numerical columns are every
# numeric-dtype column of df except the target 'Price'.
categorical_features = ['Brand', 'Model', 'Fuel Type', 'Transmission', 'Ownership', 'Color']
numerical_features = [
    name
    for name in df.select_dtypes(include=np.number).columns.tolist()
    if name != 'Price'
]


def _keep_existing(candidates):
    """Return the entries of *candidates* that are column labels of df."""
    return [name for name in candidates if name in df.columns]


# Restrict both groups to columns that are actually present in df.
categorical_features_present = _keep_existing(categorical_features)
numerical_features_present = _keep_existing(numerical_features)

# ColumnTransformer applies a different transformation to each column group:
# standard scaling for the numerical group and one-hot encoding for the
# categorical group (categories unseen during fit are ignored at transform).
numeric_step = ('num', StandardScaler(), numerical_features_present)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_features_present,
)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    # Columns belonging to neither group are forwarded unchanged.
    remainder='passthrough',
)

print(f"\nDefined numerical features for scaling: {numerical_features_present}")
print(f"Defined categorical features for encoding: {categorical_features_present}")
print("\n--- Data Preprocessing Complete (defined via ColumnTransformer) ---")


# --- Phase 3: Model Selection, Training & Evaluation ---

print("\n--- Phase 3: Model Selection, Training & Evaluation ---")

# Separate the target ('Price') from the predictors (every other column).
y = df['Price']
X = df.drop(columns='Price')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

split_shapes = {
    'X_train': X_train.shape,
    'X_test': X_test.shape,
    'y_train': y_train.shape,
    'y_test': y_test.shape,
}
shape_summary = ", ".join(f"{label}:{shape}" for label, shape in split_shapes.items())
print(f"\nData split: {shape_summary}")

def evaluate_model(model_name, y_true, y_pred):
    """Print and return regression metrics for one model.

    Args:
        model_name: Label used in the printed report header.
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A ``(mae, rmse, r2)`` tuple: mean absolute error, root mean squared
        error and the R-squared score.
    """
    abs_error = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    root_sq_error = np.sqrt(mean_squared_error(y_true, y_pred))
    r_squared = r2_score(y_true, y_pred)

    report_lines = (
        f"\n--- {model_name} Evaluation ---",
        f"MAE: {abs_error:.2f}",
        f"RMSE: {root_sq_error:.2f}",
        f"R-squared: {r_squared:.4f}",
    )
    for line in report_lines:
        print(line)

    return abs_error, root_sq_error, r_squared

# Baseline models. Each pipeline gets its own clone of `preprocessor`:
# Pipeline.fit() fits its steps in place, so sharing a single
# ColumnTransformer instance would let every later fit overwrite the fitted
# state that an earlier pipeline relies on at predict time.

# 1. Linear Regression Model
print("\n--- Training Linear Regression Model ---")
pipeline_lr = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                              ('regressor', LinearRegression())])
pipeline_lr.fit(X_train, y_train)
y_pred_lr = pipeline_lr.predict(X_test)
mae_lr, rmse_lr, r2_lr = evaluate_model("Linear Regression", y_test, y_pred_lr)

# 2. Decision Tree Regressor Model (fixed seed for reproducible trees)
print("\n--- Training Decision Tree Regressor Model ---")
pipeline_dt = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                              ('regressor', DecisionTreeRegressor(random_state=42))])
pipeline_dt.fit(X_train, y_train)
y_pred_dt = pipeline_dt.predict(X_test)
mae_dt, rmse_dt, r2_dt = evaluate_model("Decision Tree Regressor", y_test, y_pred_dt)

# 3. Neural Network (MLPRegressor) Model
print("\n--- Training Neural Network (MLPRegressor) Model ---")
pipeline_mlp = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                               ('regressor', MLPRegressor(hidden_layer_sizes=(100, 50),  # two hidden layers: 100 and 50 units
                                                          activation='relu',
                                                          solver='adam',
                                                          max_iter=500,  # upper bound on training iterations
                                                          random_state=42,
                                                          early_stopping=True,  # stop when the validation score stalls
                                                          n_iter_no_change=20))])  # patience, in epochs, before stopping
pipeline_mlp.fit(X_train, y_train)
y_pred_mlp = pipeline_mlp.predict(X_test)
mae_mlp, rmse_mlp, r2_mlp = evaluate_model("Neural Network (MLPRegressor)", y_test, y_pred_mlp)

# 4. Hyperparameter tuning for the decision-tree pipeline with GridSearchCV.
print("\n--- Hyperparameter Tuning for Decision Tree Regressor using GridSearchCV ---")

# The 'regressor__' prefix routes each setting to the pipeline's
# 'regressor' step.
param_grid_dt = {
    # Maximum depth of the tree (None leaves the depth unbounded).
    'regressor__max_depth': [None, 10, 20, 30],
    # Minimum number of samples required to split an internal node.
    'regressor__min_samples_split': [2, 5, 10],
    # Minimum number of samples required at a leaf node.
    'regressor__min_samples_leaf': [1, 2, 4],
}

# 5-fold cross-validation scored by negative MSE, using all available cores.
grid_search_dt = GridSearchCV(
    estimator=pipeline_dt,
    param_grid=param_grid_dt,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
grid_search_dt.fit(X_train, y_train)

winning_params = grid_search_dt.best_params_
winning_cv_score = grid_search_dt.best_score_
print(f"\nBest parameters for Decision Tree: {winning_params}")
print(f"Best cross-validation score (negative MSE): {winning_cv_score:.2f}")

# Evaluate the best pipeline found by the search on the held-out test set.
best_dt_model = grid_search_dt.best_estimator_
y_pred_dt_tuned = best_dt_model.predict(X_test)
mae_dt_tuned, rmse_dt_tuned, r2_dt_tuned = evaluate_model(
    "Tuned Decision Tree Regressor", y_test, y_pred_dt_tuned
)


# --- Phase 4: Model Analysis & Visualization ---

print("\n--- Phase 4: Model Analysis & Visualization ---")

# Choose the model to visualize from the metrics computed in Phase 3 rather
# than hard-coding one: the candidate with the lowest test RMSE wins. The
# tuned tree is listed first so it is kept on an exact tie (min() returns the
# first minimum it encounters).
# NOTE: ranking on the test set is fine for deciding what to plot, but the
# winner's test score is then an optimistic estimate of its true error.
model_candidates = [
    ("Tuned Decision Tree Regressor", best_dt_model, y_pred_dt_tuned, rmse_dt_tuned),
    ("Decision Tree Regressor", pipeline_dt, y_pred_dt, rmse_dt),
    ("Linear Regression", pipeline_lr, y_pred_lr, rmse_lr),
    ("Neural Network (MLPRegressor)", pipeline_mlp, y_pred_mlp, rmse_mlp),
]
best_model_name, best_model_instance, y_preds_best_model, _ = min(
    model_candidates, key=lambda candidate: candidate[3]
)

print(f"\n--- Visualizing Performance for {best_model_name} ---")

# Actual vs. Predicted Prices Plot
plt.figure(figsize=(10, 7))
sns.scatterplot(x=y_test, y=y_preds_best_model, alpha=0.6)
# Dashed diagonal marks perfect predictions (predicted == actual).
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title(f"Actual vs. Predicted Prices ({best_model_name})")
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()

# Residuals Distribution Plot
residuals = y_test - y_preds_best_model
plt.figure(figsize=(10, 7))
sns.histplot(residuals, kde=True, bins=50)
plt.title(f"Distribution of Residuals ({best_model_name})")
plt.xlabel("Residuals (Actual - Predicted)")
plt.ylabel("Frequency")
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()

# Residuals vs. Predicted Prices Plot
plt.figure(figsize=(10, 7))
sns.scatterplot(x=y_preds_best_model, y=residuals, alpha=0.6)
# Zero line: points above it were under-predicted, points below over-predicted.
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel("Predicted Prices")
plt.ylabel("Residuals (Actual - Predicted)")
plt.title(f"Residuals vs. Predicted Prices ({best_model_name})")
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()

# --- Feature Importance (for tree-based models like Decision Tree) ---
# Runs only when the selected model's regressor exposes feature_importances_;
# otherwise (e.g. Linear Regression, MLP) the else-branch prints a hint.
fitted_regressor = best_model_instance.named_steps['regressor']
if hasattr(fitted_regressor, 'feature_importances_'):
    print("\n--- Extracting Feature Importances ---")

    # Ask the fitted ColumnTransformer for its output names so they line up
    # one-to-one with the regressor's inputs. This also covers columns routed
    # through remainder='passthrough', which a list built from the 'num' and
    # 'cat' transformers alone would miss (making the DataFrame below fail on
    # a length mismatch).
    fitted_preprocessor = best_model_instance.named_steps['preprocessor']
    # Names arrive as '<transformer>__<feature>'; drop the transformer prefix
    # for readability.
    all_feature_names_transformed = [
        name.split('__', 1)[-1]
        for name in fitted_preprocessor.get_feature_names_out()
    ]

    feature_importances = fitted_regressor.feature_importances_
    importance_df = pd.DataFrame({'Feature': all_feature_names_transformed, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    print("\n--- Top 10 Feature Importances ---")
    print(importance_df.head(10))

    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(10))
    plt.title("Top 10 Feature Importances")
    plt.xlabel("Importance Score")
    plt.ylabel("Feature")
    plt.show()
else:
    print(f"\nFeature importances are not directly available for {best_model_name} in this manner.")
    print("Consider advanced techniques like Permutation Importance or SHAP values for model-agnostic explanations if needed (advanced topic).")

print("\n--- Project Code Execution Complete ---")
