In [None]:
# =============================================================================
# INSTALL LIBRARIES
# =============================================================================
!pip install scikit-learn==1.3.2 xgboost

# =============================================================================
# IMPORT LIBRARIES
# =============================================================================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import xgboost as xgb
import joblib
import warnings

# Install a blanket "ignore" filter so no warnings clutter the notebook output.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above — consider narrowing the filter by category if that matters.
warnings.filterwarnings('ignore')

# =============================================================================
# 1. LOAD AND PREPARE DATA
# =============================================================================
# Read the raw dataset and, in the same step, map the alternate column
# spellings onto the names the rest of this script refers to
# ('Seller_Type', 'Kms_Driven'). Columns that are absent are simply left alone
# by rename().
df = pd.read_csv('car data.csv').rename(
    columns={
        'Selling_type': 'Seller_Type',
        'Driven_kms': 'Kms_Driven',
    }
)
print("✅ Column names have been standardized.")

# Feature engineering:
#   brand   - text of 'Car_Name' before its first space
#   car_age - years between 'Year' and the fixed reference year 2025
#             (hard-coded constant, not read from the system clock)
# The two source columns are dropped once the derived features exist.
df = df.assign(
    brand=df['Car_Name'].map(lambda car_name: car_name.partition(' ')[0]),
    car_age=2025 - df['Year'],
).drop(columns=['Car_Name', 'Year'])

# =============================================================================
# 2. DEFINE PREPROCESSING PIPELINE
# =============================================================================
# Column groups, each routed to its own transformer below.
numerical_features = ['Present_Price', 'Kms_Driven', 'Owner', 'car_age']
categorical_features = ['Fuel_Type', 'Seller_Type', 'Transmission', 'brand']

# Target is 'Selling_Price'; every other column of df is a model input.
y = df['Selling_Price']
X = df.drop(columns='Selling_Price')

# Dense one-hot encoding; categories not seen during fit are encoded as
# all-zero rows instead of raising at transform time.
one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Scale the numeric columns, one-hot encode the categorical ones, and pass any
# column not named in either list through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', one_hot, categorical_features),
    ],
    remainder='passthrough',
)

# =============================================================================
# 3. TRAIN, EVALUATE AND SAVE THE MODEL PIPELINE
# =============================================================================
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the final pipeline object: preprocessing and the regressor are bundled
# together so the single saved artifact carries both steps.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
])

print("⏳ Training the model...")
pipeline.fit(X_train, y_train)
print("✅ Model training complete.")

# Score the fitted pipeline on the held-out rows before persisting it, so the
# test split created above is actually used and a poorly fitting model is
# visible before the file is downloaded. For a regressor, score() is R^2.
test_r2 = pipeline.score(X_test, y_test)
print(f"📊 Hold-out R^2 score: {test_r2:.4f}")

# Save ONLY the pipeline
joblib.dump(pipeline, 'xgboost_pipeline_car_data.pkl')
print("\n✅ Final model 'xgboost_pipeline_car_data.pkl' saved! Please download this one file.")

✅ Column names have been standardized.
⏳ Training the model...
✅ Model training complete.

✅ Final model 'xgboost_pipeline_car_data.pkl' saved! Please download this one file.
