In [None]:
import os
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime


from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder,OrdinalEncoder

# Import regression models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error
import catboost as cb
import xgboost as xgb
import lightgbm as lgb

# Hyperparameter tuning
import optuna
from optuna.samplers import TPESampler

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the competition files from the local data/ directory (paths are relative
# to the notebook's working directory).
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
# NOTE(review): `ss` is not referenced again in the cells visible here.
ss = pd.read_csv('data/sample_submission.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Vehicle age in years, measured from the calendar year in which the notebook
# is executed (so the values shift when the notebook is re-run in a later year).
year = datetime.datetime.now().year
train['age'] = year - train['model_year']
train

In [None]:
# Pairwise correlations between the numeric training columns; 'id' is left out
# of the matrix before it is drawn as an annotated heatmap.
corr_matrix = (
    train
    .select_dtypes(include=['number'])
    .drop(columns=['id'])
    .corr()
)
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')

In [None]:
# Summary statistics of the training frame (pandas `describe` defaults).
train.describe()

In [None]:
# Frequency of each accident category in the training data
# (value_counts leaves missing values out by default).
train['accident'].value_counts()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# NOTE(review): same call as the earlier train.describe() cell; none of the
# cells in between modify `train`, so the output is identical.
train.describe()

In [None]:
# Missing-value counts per column for both splits, side by side. Columns that
# exist in only one frame (e.g. 'age', which was added to train above) show NaN
# for the other frame.
null_train = train.isnull().sum()
null_test = test.isnull().sum()
null_df = pd.concat([null_train, null_test], axis=1, keys=['train', 'test'])
# Scale to 0-100 so the values agree with the '%' in the column labels
# (the plain count/len ratio is a 0-1 fraction).
null_df['% missing train'] = null_df['train'] / len(train) * 100
null_df['% missing test'] = null_df['test'] / len(test) * 100
null_df

In [None]:
# Count fully duplicated rows in each split. All columns are compared, and the
# frames still contain 'id' here, so rows differing only in 'id' do not count.
print("duplicate train: ", train.duplicated().sum())
print("duplicate test: ", test.duplicated().sum())

In [None]:
# Frequency of each fuel_type label in the training data.
train.fuel_type.value_counts()


In [None]:
# Frequency of each fuel_type label in the test data, for comparison with train.
test.fuel_type.value_counts()

In [None]:
# Show one raw engine description: this is the free-text field that
# feature_engineering later parses with regular expressions.
print(train['engine'].iloc[0])

In [None]:
# Training rows that have no fuel_type, counted per brand.
fuel_missing_df = train.loc[train['fuel_type'].isna()]
fuel_missing_df['brand'].value_counts()
# Tesla is electric

In [None]:
# Frequency of each clean_title label in the training data.
train.clean_title.value_counts()

In [None]:
# Training rows with no clean_title value. The frame is only stored here;
# nothing is displayed by this cell.
clean_title_missing_df = train[train['clean_title'].isnull()]

In [None]:
def feature_engineering(df, reference_year=None):
    """Add age-, mileage- and engine-derived feature columns to ``df``.

    The new columns are assigned on ``df`` itself and the same object is
    returned, so both ``feature_engineering(df)`` and
    ``df = feature_engineering(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``model_year``, ``milage`` and ``engine``
        (``engine`` is a free-text description).
    reference_year : int, optional
        Year that ``age`` is measured from. Defaults to the current calendar
        year; pass a fixed value to make the features reproducible.

    Returns
    -------
    pandas.DataFrame
        ``df`` with these columns added (or overwritten):

        * ``age`` - ``reference_year - model_year``
        * ``risk_dead_engine`` - 1 when ``milage`` exceeds 300000
        * ``overworked`` - 1 when the mileage is high for the car's age
        * ``fresh_engine`` - 1 when the mileage is low
        * ``Cylinder`` - integer before the word "Cylinder", -1 if absent
        * ``engine_Litr`` - decimal number before "L"/"Liter", -1.0 if absent
    """
    if reference_year is None:
        reference_year = datetime.datetime.now().year

    df['age'] = reference_year - df['model_year']
    age = df['age']
    milage = df['milage']

    # Comparisons against a missing milage/age evaluate to False, so such rows
    # get 0 in every flag below (same outcome as a row-by-row check).
    df['risk_dead_engine'] = (milage > 300000).astype(int)

    # Vectorised masks instead of DataFrame.apply(axis=1): one pass per
    # condition rather than one Python call per row, and valid on empty frames.
    overworked = (
        ((milage > 50000) & (age < 1))
        | ((milage > 100000) & (age < 2))
        | ((milage > 300000) & (age < 10))
    )
    df['overworked'] = overworked.astype(int)

    fresh_engine = (milage < 10000) | ((milage < 30000) & (age >= 2))
    df['fresh_engine'] = fresh_engine.astype(int)

    cylinders = df['engine'].str.extract(r'(\d+)\s+Cylinder', expand=False)
    df['Cylinder'] = pd.to_numeric(cylinders).fillna(-1).astype(int)

    # \s* rather than \s+: descriptions such as "1.6L" have no space before the
    # "L", while "3.5 Liter" does; both forms must be captured.
    litres = df['engine'].str.extract(r'(\d+\.\d+)\s*L', expand=False)
    df['engine_Litr'] = pd.to_numeric(litres).fillna(-1).astype(float)

    return df

In [None]:
# Keep the test ids now: preprocess_data drops the 'id' column, and the ids are
# needed again when the submission files are written.
test_ids = test['id']
def _infer_fuel_type(engine):
    """Guess a fuel-type label from a free-text engine description.

    Keywords are checked in a fixed order and the first hit wins. Anything
    unrecognised, including a missing (non-string) description, is 'Other'.
    """
    if not isinstance(engine, str):
        return 'Other'
    keyword_to_fuel = (
        ('Plug-In', 'Plug-In Hybrid'),  # checked before the plain 'Hybrid' keyword
        ('Hybrid', 'Hybrid'),
        ('Gasoline', 'Gasoline'),
        ('Flex Fuel', 'E85 Flex Fuel'),
        ('Diesel', 'Diesel'),
        ('GDI', 'Gasoline'),  # GDI stands for gasoline direct injection
    )
    for keyword, fuel in keyword_to_fuel:
        if keyword in engine:
            return fuel
    return 'Other'


def preprocess_data(train, test):
    """Impute, feature-engineer and encode the train and test frames together.

    Steps: drop duplicated training rows, stack train and test, fill missing
    ``fuel_type`` / ``clean_title`` / ``accident``, run ``feature_engineering``,
    drop ``id``, one-hot encode ``accident`` and integer-encode every remaining
    object column.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows, including the ``price`` target and an ``id`` column.
    test : pandas.DataFrame
        Test rows with an ``id`` column. The caller's frame is not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The processed train and test frames. The test frame carries a
        placeholder ``price`` column filled with -1. Categories that occur in
        test but not in train are encoded as -1.
    """
    # Only fully identical rows are removed; 'id' is still part of the
    # comparison at this point.
    train = train.drop_duplicates()

    # assign() returns a new frame, so the caller's test frame does not gain a
    # placeholder target column as a side effect.
    test = test.assign(price=-1)
    n_train = len(train)
    df = pd.concat([train, test], axis=0, ignore_index=True)

    # missing values :

    ## fuel_type: derive it from the engine description where possible
    mask = df['fuel_type'].isnull()
    df.loc[mask, 'fuel_type'] = df.loc[mask, 'engine'].map(_infer_fuel_type)
    ## clean_title
    df['clean_title'] = df['clean_title'].fillna('No')
    ## accident
    df['accident'] = df['accident'].fillna('None reported')

    # feature engineering
    df = feature_engineering(df)

    # drop cols
    df = df.drop(columns=['id'])

    # Encode categorical features
    ## feature to one hot encoding
    features_to_1hotEncode = ['accident']
    df = pd.get_dummies(df, columns=features_to_1hotEncode)

    # Split by position rather than by the -1 placeholder so a training row can
    # never be mistaken for a test row; copies avoid writing into slices of df.
    train = df.iloc[:n_train].copy()
    test = df.iloc[n_train:].copy()

    ## feature for label encoding
    cols_to_LabelEncode = train.select_dtypes(include=['object']).columns

    for col in tqdm(cols_to_LabelEncode, desc='Encoding categorical features'):
        le = LabelEncoder()
        train[col] = le.fit_transform(train[col])
        # One dictionary lookup for the whole column instead of calling
        # le.transform once per test row; unseen labels fall through to -1.
        codes = {label: code for code, label in enumerate(le.classes_)}
        test[col] = test[col].map(codes).fillna(-1).astype(int)

    return train, test

In [None]:
# Replace the raw frames with their preprocessed versions.
# NOTE: this cell cannot be re-run on its own output, because preprocess_data
# drops the 'id' column it expects; reload the CSVs first.
train, test = preprocess_data(train, test)

In [None]:
# Preview the processed training frame (engineered and encoded columns).
train.head()

In [None]:
# Hold out 20% of the training rows for validation (fixed random_state).
# Despite the names, X_test / y_test are taken from `train` and are distinct
# from the competition `test` frame.
X_train, X_test, y_train, y_test = train_test_split(train.drop('price', axis=1), train['price'], test_size=0.2, random_state=42)

In [None]:
# Regressors with library-default hyperparameters.
# NOTE(review): in the cells visible here these instances are never fitted;
# `objective` builds its own models, and the post-tuning cell rebinds these
# four names and `models`.
cat_model = cb.CatBoostRegressor(loss_function='RMSE', verbose=0)
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
lgb_model = lgb.LGBMRegressor(objective='regression', metric='rmse', random_state=42,verbose=-1)
rf_model = RandomForestRegressor(random_state=42)

models = [cat_model, xgb_model, lgb_model, rf_model]

In [None]:
def objective(trial):
    """Optuna objective: hold-out RMSE of an equal-weight four-model ensemble.

    Each trial samples one hyperparameter set per model (CatBoost, XGBoost,
    LightGBM, RandomForest), fits every model on the notebook-level
    ``X_train`` / ``y_train`` and averages their predictions on ``X_test``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters. The range-normalised RMSE is
        stored on it as the ``'nrmse'`` user attribute.

    Returns
    -------
    float
        RMSE of the averaged predictions against ``y_test``.
    """
    # Suggest hyperparameters for each model. Parameter names carry a model
    # prefix ('cat_', 'xgb_', 'lgb_', 'rf_') that later cells strip again.
    cat_params = {
        'iterations': trial.suggest_int('cat_iterations', 100, 1000),
        'learning_rate': trial.suggest_float('cat_learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('cat_depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('cat_l2_leaf_reg', 1, 10)
    }

    xgb_params = {
        'n_estimators': trial.suggest_int('xgb_n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('xgb_learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 10),
        'subsample': trial.suggest_float('xgb_subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.6, 1.0)
    }

    lgb_params = {
        'n_estimators': trial.suggest_int('lgb_n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('lgb_learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('lgb_max_depth', 3, 10),
        'subsample': trial.suggest_float('lgb_subsample', 0.6, 1.0),
        # LightGBM only applies `subsample` when the bagging frequency is
        # positive (the default 0 disables bagging), so the frequency is tuned
        # alongside it; being a trial parameter, it is part of best_params too.
        'subsample_freq': trial.suggest_int('lgb_subsample_freq', 1, 7),
        'colsample_bytree': trial.suggest_float('lgb_colsample_bytree', 0.6, 1.0)
    }

    rf_params = {
        'n_estimators': trial.suggest_int('rf_n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('rf_max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('rf_min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('rf_min_samples_leaf', 1, 5)
    }

    # Create models with suggested hyperparameters
    cat_model = cb.CatBoostRegressor(loss_function='RMSE', verbose=0, **cat_params)
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **xgb_params)
    lgb_model = lgb.LGBMRegressor(objective='regression', metric='rmse', random_state=42, verbose=-1, **lgb_params)
    rf_model = RandomForestRegressor(random_state=42, **rf_params)

    models = [cat_model, xgb_model, lgb_model, rf_model]

    # Fit every model and collect its hold-out predictions
    predictions = []
    for model in models:
        model.fit(X_train, y_train)
        predictions.append(model.predict(X_test))

    # Calculate ensemble prediction (average of all models)
    ensemble_pred = np.mean(predictions, axis=0)
    rmse = root_mean_squared_error(y_test, ensemble_pred)

    # NRMSE = RMSE divided by the target range; undefined for a constant target
    target_range = y_test.max() - y_test.min()
    nrmse = float(rmse / target_range) if target_range else float('nan')

    # Store NRMSE in trial user attributes for comparison
    trial.set_user_attr('nrmse', nrmse)

    return rmse  # Optimize for RMSE

# Create study and optimize
# direction='minimize' because `objective` returns an RMSE; the TPE sampler is
# given a fixed seed. Every one of the 30 trials fits all four models.
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=30)

In [None]:
# Report the best trial: its objective value (RMSE), the NRMSE that `objective`
# stored as a user attribute, and every tuned parameter under its prefixed name.
print("Best trial:")
print(f"RMSE: {study.best_value:.2f}")
print(f"NRMSE: {study.best_trial.user_attrs['nrmse']:.4f}")
print("Best params:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

In [None]:
# Update models with best parameters
best_params = study.best_params


def _params_for(prefix):
    """Return the tuned parameters named ``<prefix>...`` with the prefix removed.

    Only the leading prefix is cut off (by slicing), so a parameter name that
    happens to contain the same text further along is left intact.
    """
    return {
        name[len(prefix):]: value
        for name, value in best_params.items()
        if name.startswith(prefix)
    }


# Extract parameters for each model
cat_best_params = _params_for('cat_')
xgb_best_params = _params_for('xgb_')
lgb_best_params = _params_for('lgb_')
rf_best_params = _params_for('rf_')

# Create optimized models (unfitted; they are trained in the submission cell)
cat_model = cb.CatBoostRegressor(loss_function='RMSE', verbose=0, **cat_best_params)
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **xgb_best_params)
lgb_model = lgb.LGBMRegressor(objective='regression', metric='rmse', random_state=42, verbose=-1, **lgb_best_params)
rf_model = RandomForestRegressor(random_state=42, **rf_best_params)

models = [cat_model, xgb_model, lgb_model, rf_model]

In [None]:
# Fit every model on the full training frame, write one submission file per
# model, then write the equal-weight average of all predictions.
folder_name = 'submission'
os.makedirs(folder_name, exist_ok=True)

# Make predictions on the test set
test_predictions = []

# Remove 'price' column from test data if it exists
test_features = test.drop('price', axis=1) if 'price' in test.columns else test

# Feature/target split of the training frame, done once rather than per model
train_features = train.drop('price', axis=1)
train_target = train['price']

for model in models:
    model.fit(train_features, train_target)
    pred = model.predict(test_features)
    test_predictions.append(pred)

    # Create individual submission file for each model
    model_submission = pd.DataFrame({
        'id': test_ids,
        'price': pred
    })

    # The class name is used as the file suffix, so two models of the same
    # class would overwrite each other's file.
    model_name = model.__class__.__name__
    model_submission.to_csv(f'{folder_name}/submission_{model_name}.csv', index=False)
    print(f"{model_name} submission file created successfully!")

# Create ensemble prediction (average of all models)
ensemble_pred = np.mean(test_predictions, axis=0)

# Create ensemble submission file
ensemble_submission = pd.DataFrame({
    'id': test_ids,
    'price': ensemble_pred
})

ensemble_submission.to_csv(f'{folder_name}/submission_ensemble.csv', index=False)
print("Ensemble submission file created successfully!")
print(ensemble_submission.head())
