# Imports

In [None]:
import numpy as np
import pandas as pd

# Data Collection

In [None]:
!wget https://nkb-backend-otg-media-static.s3.ap-south-1.amazonaws.com/otg_prod/media/Tech_4.0/AI_ML/CCBP_Platform_Projects/Car_Price_Prediction/train_Y_cp.csv
!wget https://nkb-backend-otg-media-static.s3.ap-south-1.amazonaws.com/otg_prod/media/Tech_4.0/AI_ML/CCBP_Platform_Projects/Car_Price_Prediction/train_X_cp.csv
!wget https://nkb-backend-otg-media-static.s3.ap-south-1.amazonaws.com/otg_prod/media/Tech_4.0/AI_ML/CCBP_Platform_Projects/Car_Price_Prediction/test_X_cp.csv

In [None]:
# Feature tables for the training split and the test split.
train_X_df, test_X_df = (
    pd.read_csv(csv_name) for csv_name in ('train_X_cp.csv', 'test_X_cp.csv')
)

# The target file is read without a header row; squeeze() collapses the
# resulting single-column frame.
train_Y_df = pd.read_csv('train_Y_cp.csv', header=None).squeeze()

# Implementation

In [None]:
!pip install --upgrade category_encoders

In [None]:
!pip install -U scikit-learn

In [None]:
  # Fold the '4wd' drive-wheels level into 'fwd' in both splits.
  train_X_df['drive-wheels'] = train_X_df['drive-wheels'].replace('4wd', 'fwd')
  test_X_df['drive-wheels'] = test_X_df['drive-wheels'].replace('4wd', 'fwd')

  # Fill missing door counts with the most frequent value seen in training.
  # The fill value comes from the training split only and is applied to BOTH
  # splits, so a missing value in the test set cannot reach the encoders as NaN.
  # NOTE(review): the previous version hard-coded 'four' — presumably that is
  # the training mode; it is kept as the fallback when no mode exists.
  door_mode = train_X_df['num-of-doors'].mode()
  door_fill = door_mode.iloc[0] if not door_mode.empty else 'four'
  train_X_df['num-of-doors'] = train_X_df['num-of-doors'].replace(np.nan, door_fill)
  test_X_df['num-of-doors'] = test_X_df['num-of-doors'].replace(np.nan, door_fill)

### NUMERICAL PIPELINE

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Every column whose dtype is not 'object' goes through the numerical branch.
non_object_mask = train_X_df.dtypes.ne('object')
numerical_cols = train_X_df.columns[non_object_mask]

# Missing values in those columns are replaced by the column mean.
mean_imputer = SimpleImputer(strategy='mean')
numerical_pipeline = Pipeline(steps=[('imputer', mean_imputer)])

In [None]:
from category_encoders import TargetEncoder

# Columns handled by category_encoders' TargetEncoder (default settings).
target_cols = ['make']

target_encoder_step = ('target_encoder', TargetEncoder())
target_pipeline = Pipeline(steps=[target_encoder_step])

### CATEGORICAL DATA

In [None]:
# Columns stored with dtype 'object' are treated as categorical.
object_mask = train_X_df.dtypes.eq('object')
categorical_cols = train_X_df.columns[object_mask]

#### ORDINAL PIPELINE

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Ordered levels for each ordinal column, lowest first. The column list and
# the category list handed to the encoder are both derived from this mapping,
# so they always line up position by position.
ordinal_levels = {
    'num-of-doors': ['two', 'four'],
    'num-of-cylinders': ['two', 'three', 'four', 'five', 'six', 'eight', 'twelve'],
}

ordinal_cols = list(ordinal_levels.keys())
ordinal_categories = list(ordinal_levels.values())

ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)
ordinal_pipeline = Pipeline(steps=[('ordinal_encoder', ordinal_encoder)])

#### NOMINAL PIPELINE

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Nominal columns are the categorical columns not already claimed by the
# ordinal or target pipelines. categorical_cols is filtered in its original
# order instead of using set arithmetic: iteration order of a set of strings
# can differ between interpreter runs (hash randomisation), which would change
# the order of the one-hot features from one kernel restart to the next.
already_encoded = set(ordinal_cols) | set(target_cols)
nominal_cols = [col for col in categorical_cols if col not in already_encoded]

# handle_unknown='ignore': categories not seen during fit do not raise at
# transform time.
nominal_pipeline = Pipeline(steps=[('onehot_encoder', OneHotEncoder(handle_unknown='ignore'))])


## Column Transformer

In [None]:
from sklearn.compose import ColumnTransformer

# One (name, pipeline, columns) entry per preprocessing branch.
preprocessing_branches = [
    ('nom', numerical_pipeline, numerical_cols),
    ('target', target_pipeline, target_cols),
    ('ordinal', ordinal_pipeline, ordinal_cols),
    ('nominal', nominal_pipeline, nominal_cols),
]

# remainder='passthrough' keeps any column that is not listed in a branch.
column_transformer = ColumnTransformer(
    transformers=preprocessing_branches,
    remainder='passthrough',
)

## Creating a Pipeline

In [None]:
from scipy.stats import uniform
from sklearn.feature_selection import SelectKBest, SequentialFeatureSelector, f_regression
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

# Forward sequential feature selection, using a Ridge model to score candidates.
forward_selector = SequentialFeatureSelector(Ridge(), direction='forward')

# preprocess -> select features -> regress.
# The final step is named 'classifier' although it holds a regressor; the name
# is kept because the hyper-parameter grid addresses the step through it.
pipeline_steps = [
    ('preprocessor', column_transformer),
    ('selector', forward_selector),
    ('classifier', LinearRegression()),
]
pipe = Pipeline(steps=pipeline_steps)

## Hyperparameter Tuning

In [None]:
# Values explored for the number of selected features and for the
# regularisation strength; both candidate models share the same values.
feature_count_grid = [10, 15, 20, 25, 30, 35]
alpha_grid = [10, 20, 50, 100, 500, 1000, 2000, 5000]

# One search space per candidate final estimator (Ridge, then Lasso).
param_distributions = [
    {
        'selector__n_features_to_select': feature_count_grid,
        'classifier': [candidate],
        'classifier__alpha': alpha_grid,
    }
    for candidate in (Ridge(), Lasso())
]

In [None]:
# Randomized search over param_distributions with 5-fold cross-validation,
# scored by negated mean squared log error. refit=True retrains the best
# candidate on the full training data once the search has finished.
random_search_cv = RandomizedSearchCV(
    pipe,
    param_distributions=param_distributions,
    n_iter=100,
    scoring='neg_mean_squared_log_error',
    refit=True,
    cv=5,
    random_state=0,
)
random_search_cv.fit(train_X_df, train_Y_df)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('nom',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer())]),
                                                                               Index(['symboling', 'normalized-losses', 'wheel-base', 'length', 'width',
       'height', 'curb-weight', 'engine-size', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg...
                   n_iter=100,
                   param_distributions=[{'classifier': [Ridge(alpha=10)],
                                         'classifier__alpha': [10, 20, 50, 100,
                                             

In [None]:
# Hyper-parameter combination that achieved the best cross-validated score.
best_params = random_search_cv.best_params_
print(best_params)

{'selector__n_features_to_select': 20, 'classifier__alpha': 10, 'classifier': Ridge(alpha=10)}


In [None]:
# Cross-validated score of the best candidate. The scorer is the negated mean
# squared log error, so values closer to zero are better.
best_score = random_search_cv.best_score_
best_score

-0.03080040531852718

In [None]:
# The search was created with refit=True, so best_estimator_ is the winning
# pipeline fitted again after the search.
best_model = random_search_cv.best_estimator_

# Run the test features through the fitted pipeline to get predictions.
predicted_test_Y = best_model.predict(test_X_df)
predicted_test_Y

array([16041.36486749, 17554.54987628, 16107.44222876, 20515.96414269,
        7893.0155258 , 34850.37747528, 33280.14986657, 10389.52561784,
       14835.29137923,  8244.25149061,  8211.80071206, 18403.53588464,
        7008.29843391,  8844.29917504,  7745.35179182,  9777.20503185,
        6615.6814178 , 16100.24631533, 15085.24270432,  9551.28387291,
       17680.35769982, 30691.71454569, 10512.03458697,  7893.0155258 ,
        7893.0155258 ,  6317.63604941, 12811.74948262,  6317.63604941,
       18402.66508353, 17455.28875784,  7745.35179182,  7971.50861802,
       29796.74305642, 16852.51653557, 30381.34564153, 10503.40601207,
        8645.98995602,  9246.47155779,  8698.83913965,  6509.51436788,
       13894.77966778])

Writing the predicted values to a CSV file

In [None]:
# Write one prediction per row, with no header line and no index column.
# header=False is the documented way to omit the header; the previous
# header=None only worked because None is falsy.
pd.DataFrame(predicted_test_Y).to_csv('predicted_test_Y_cp.csv', header=False, index=False)