# Imports

In [None]:
import numpy as np
import pandas as pd

# Data Collection

In [None]:
!wget https://nkb-backend-otg-media-static.s3.ap-south-1.amazonaws.com/otg_prod/media/Tech_4.0/AI_ML/CCBP_Platform_Projects/Car_Price_Prediction/train_Y_cp.csv
!wget https://nkb-backend-otg-media-static.s3.ap-south-1.amazonaws.com/otg_prod/media/Tech_4.0/AI_ML/CCBP_Platform_Projects/Car_Price_Prediction/train_X_cp.csv
!wget https://nkb-backend-otg-media-static.s3.ap-south-1.amazonaws.com/otg_prod/media/Tech_4.0/AI_ML/CCBP_Platform_Projects/Car_Price_Prediction/test_X_cp.csv

In [None]:
# Feature tables: both CSVs are read with their first row as the header (pandas default).
train_X_df, test_X_df = (pd.read_csv(csv_path) for csv_path in ('train_X_cp.csv', 'test_X_cp.csv'))

# Targets: the file is read without a header row; squeeze() collapses the frame
# to a Series when it holds a single column (presumably one target value per
# training row — confirm against the file).
train_Y_df = pd.read_csv('train_Y_cp.csv', header=None).squeeze()

# Implementation

In [None]:
!pip install --upgrade category_encoders

In [None]:
!pip install -U scikit-learn

In [None]:
  # Merge the '4wd' level into 'fwd' in both splits so they expose the same categories.
  train_X_df['drive-wheels'] = train_X_df['drive-wheels'].replace('4wd', 'fwd')
  test_X_df['drive-wheels'] = test_X_df['drive-wheels'].replace('4wd', 'fwd')

  # Fill missing door counts with the most frequent value seen in the training
  # split (falling back to 'four' if the column has no values at all). The same
  # fill value is applied to the test split: the ordinal encoder defined later is
  # given only 'two' and 'four' as categories, so a NaN left in the test data
  # would not be a known category at prediction time.
  door_mode = train_X_df['num-of-doors'].mode()
  door_fill_value = door_mode.iloc[0] if not door_mode.empty else 'four'
  train_X_df['num-of-doors'] = train_X_df['num-of-doors'].fillna(door_fill_value)
  test_X_df['num-of-doors'] = test_X_df['num-of-doors'].fillna(door_fill_value)

## Regression Imputation - Normalized Losses

In [None]:
# Split the training rows on whether 'normalized-losses' is present: rows with a
# value are used to fit the imputation model, rows without one are the rows to fill.
is_loss_null = train_X_df['normalized-losses'].isnull()
test_losses_df, train_losses_df = train_X_df[is_loss_null], train_X_df[~is_loss_null]
# pop() removes the column from train_losses_df and returns it as the regression target.
train_losses_Y = train_losses_df.pop('normalized-losses')

In [None]:
# Remove the (all-NaN) 'normalized-losses' column from the rows to impute so their
# columns match the frame the model is fitted on. pop() returns the removed column,
# which is what this cell displays.
test_losses_df.pop('normalized-losses')

13    NaN
20    NaN
28    NaN
30    NaN
39    NaN
44    NaN
47    NaN
68    NaN
86    NaN
89    NaN
91    NaN
94    NaN
97    NaN
111   NaN
117   NaN
123   NaN
124   NaN
133   NaN
135   NaN
136   NaN
137   NaN
141   NaN
142   NaN
144   NaN
147   NaN
152   NaN
153   NaN
154   NaN
156   NaN
157   NaN
Name: normalized-losses, dtype: float64

## Numerical Pipeline for Regression Imputation

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Numeric feature columns, selected positively by dtype. Selecting "everything
# that is not object" would also route any category/string/bool/datetime column
# to the mean imputer below, which only works on numeric data.
numerical_cols = train_losses_df.select_dtypes(include='number').columns
# Replace missing numeric values with the column mean learned during fit.
numerical_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])

In [None]:
from category_encoders import TargetEncoder

# Columns handled by category_encoders' TargetEncoder (only 'make' here).
target_cols = ['make']

# Single-step pipeline wrapping a TargetEncoder with its default settings.
target_pipeline = Pipeline(steps=[('target_encoder', TargetEncoder())])

## Categorical Data for Regression Imputation

In [None]:
# Columns whose dtype is 'object' are treated as the categorical features.
categorical_cols = train_losses_df.columns[train_losses_df.dtypes == 'object']

## Ordinal Pipeline for Regression Imputation

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Ordered levels (lowest to highest) for each ordinal feature; the mapping keeps
# every column next to its own level list so the two derived lists stay aligned.
ordinal_levels = {
    'num-of-doors': ['two', 'four'],
    'num-of-cylinders': ['two', 'three', 'four', 'five', 'six', 'eight', 'twelve'],
}

ordinal_cols = list(ordinal_levels)
ordinal_categories = list(ordinal_levels.values())

# Encode each ordinal column as the position of its value in the explicit level
# lists above (one list per column, in the same order as ordinal_cols).
ordinal_pipeline = Pipeline(steps=[('ordinal_encoder', OrdinalEncoder(categories=ordinal_categories))])

## Nominal Pipeline for Regression Imputation

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Nominal = categorical columns that are neither ordinal nor target-encoded.
# Filter in the frame's own column order instead of subtracting sets: iteration
# order of a set of strings changes between interpreter runs, which would shuffle
# the one-hot feature order and make feature selection non-reproducible.
excluded_cols = set(ordinal_cols) | set(target_cols)
nominal_cols = [col for col in categorical_cols if col not in excluded_cols]

# One-hot encode the nominal columns; handle_unknown='ignore' keeps transform from
# failing on categories that were not seen during fit.
nominal_pipeline = Pipeline(steps=[('onehot_encoder', OneHotEncoder(handle_unknown='ignore'))])

## Column Transformer for Regression Imputation

In [None]:
from sklearn.compose import ColumnTransformer

# Route each group of columns to its own preprocessing pipeline.
# Note that the entry labelled 'nom' carries the *numerical* pipeline.
impute_transformers = [
    ('nom', numerical_pipeline, numerical_cols),
    ('target', target_pipeline, target_cols),
    ('ordinal', ordinal_pipeline, ordinal_cols),
    ('nominal', nominal_pipeline, nominal_cols),
]

# Any column not listed above is passed through unchanged.
column_transformer_impute = ColumnTransformer(
    transformers=impute_transformers,
    remainder='passthrough',
)

## Creating a Pipeline

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import SequentialFeatureSelector


# Preprocess -> forward sequential feature selection (scored with a Ridge model)
# -> final regressor.
# NOTE: the last step is named 'classifier' although it holds a regressor; the
# search space addresses it by that name, so the name is kept.
impute_steps = [
    ('preprocessor', column_transformer_impute),
    ('selector', SequentialFeatureSelector(estimator=Ridge(), direction='forward')),
    ('classifier', Ridge()),
]
pipe_impute = Pipeline(steps=impute_steps)

## Hyperparameter Tuning for Regression Imputation

In [None]:
# Search space: the same feature-count and alpha candidates are tried for a Ridge
# and for a Lasso final step (2 regressors x 6 feature counts x 8 alphas = 96 settings).
feature_counts = [10, 15, 20, 25, 30, 35]
alpha_grid = [10, 20, 50, 100, 500, 1000, 2000, 5000]

param_distributions = [
    {
        'selector__n_features_to_select': feature_counts,
        'classifier': [regressor_cls()],
        'classifier__alpha': alpha_grid,
    }
    for regressor_cls in (Ridge, Lasso)
]

In [None]:
# Randomized search over the imputation pipeline: up to 100 sampled settings,
# 5-fold cross-validation, scored by (negated) mean squared log error, and the
# best setting refit on all supplied rows. random_state fixes the sampling.
random_search_cv_impute = RandomizedSearchCV(
    estimator=pipe_impute,
    param_distributions=param_distributions,
    n_iter=100,
    scoring='neg_mean_squared_log_error',
    cv=5,
    refit=True,
    random_state=0,
)
# Fit on the training rows that have a known 'normalized-losses' value.
random_search_cv_impute.fit(train_losses_df, train_losses_Y)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('nom',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer())]),
                                                                               Index(['symboling', 'wheel-base', 'length', 'width', 'height', 'curb-weight',
       'engine-size', 'bore', 'stroke', 'compression-ratio', 'horsepower',
       'peak-rpm', 'city-mpg', 'highway-mpg'],
      dty...
                   n_iter=100,
                   param_distributions=[{'classifier': [Ridge(alpha=10)],
                                         'classifier__alpha': [10, 20, 50, 100,
                                      

In [None]:
# Show the parameter setting that achieved the best mean cross-validated score.
print(random_search_cv_impute.best_params_)

{'selector__n_features_to_select': 35, 'classifier__alpha': 10, 'classifier': Ridge(alpha=10)}


In [None]:
# best_estimator_ is the pipeline refit with the best parameters found
# (the search was created with refit=True).
model = random_search_cv_impute.best_estimator_
# Predict 'normalized-losses' for the training rows where it is missing.
predicted_test_losses_Y = model.predict(test_losses_df)

## Regression Imputation - Normalized Losses Prediction

In [None]:
# Display the imputed 'normalized-losses' values.
predicted_test_losses_Y

array([196.07519722, 196.72796835, 107.45527335, 156.49486101,
       150.18989081, 109.66937246, 114.32475732, 188.01083505,
       131.47216258, 133.75642715, 136.44139852, 153.33058816,
       196.07519722, 103.1187842 , 158.9023797 , 181.60043455,
       117.30324079, 137.72139421, 212.58723939, 157.99634112,
       135.66367295,  98.42191253, 152.37021059, 147.04261841,
       158.97655823, 194.96351377, 196.92623521, 181.60043455,
        93.52380331, 160.26140491])

In [None]:
# Recompute the missing-value mask on the full training frame and write the
# predictions into exactly those rows. The predictions are assigned positionally,
# so this relies on them being in the same row order as the mask selects.
# NOTE(review): not idempotent — once the NaNs are filled the mask is empty, so
# re-running this cell without re-running the cells above will not work.
rows_with_null_losses = train_X_df['normalized-losses'].isna()
train_X_df.loc[rows_with_null_losses, 'normalized-losses'] = predicted_test_losses_Y

In [None]:
# Select the test-set rows whose 'normalized-losses' is missing so the already
# fitted imputation model can fill them in. Only test-side names are assigned
# here: train_losses_df / train_losses_Y keep the training rows the model was
# fitted on, so re-running the fit cell can never train on test-set rows.
is_loss_null = test_X_df['normalized-losses'].isna()
test_losses_df = test_X_df[is_loss_null]
# pop() removes the all-NaN column and returns it, which is what this cell displays.
test_losses_df.pop('normalized-losses')

1    NaN
2    NaN
5    NaN
7    NaN
32   NaN
38   NaN
40   NaN
Name: normalized-losses, dtype: float64

In [None]:
# Reuse the imputation pipeline found by the earlier search; it is only asked to
# predict here, it is not refitted.
model = random_search_cv_impute.best_estimator_
# Predict 'normalized-losses' for the rows currently held in test_losses_df.
predicted_test_losses_Y = model.predict(test_losses_df)

In [None]:
# Recompute the missing-value mask on the test frame and write the predictions
# into exactly those rows. The assignment is positional, so it relies on the
# predictions being in the same row order as the mask selects.
# NOTE(review): not idempotent — after the NaNs are filled the mask is empty.
rows_with_null_losses = test_X_df['normalized-losses'].isna()
test_X_df.loc[rows_with_null_losses, 'normalized-losses'] = predicted_test_losses_Y

## Numerical Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Numeric feature columns, selected positively by dtype. Selecting "everything
# that is not object" would also route any category/string/bool/datetime column
# to the mean imputer below, which only works on numeric data.
numerical_cols = train_X_df.select_dtypes(include='number').columns
# Replace missing numeric values with the column mean learned during fit.
numerical_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])

In [None]:
from category_encoders import TargetEncoder

# Columns handled by category_encoders' TargetEncoder (only 'make' here).
target_cols = ['make']

# Single-step pipeline wrapping a TargetEncoder with its default settings.
target_pipeline = Pipeline(steps=[('target_encoder', TargetEncoder())])

## Categorical Data

In [None]:
# Columns whose dtype is 'object' are treated as the categorical features.
categorical_cols = train_X_df.columns[train_X_df.dtypes == 'object']

## Ordinal Data

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Ordered levels (lowest to highest) for each ordinal feature; the mapping keeps
# every column next to its own level list so the two derived lists stay aligned.
ordinal_levels = {
    'num-of-doors': ['two', 'four'],
    'num-of-cylinders': ['two', 'three', 'four', 'five', 'six', 'eight', 'twelve'],
}

ordinal_cols = list(ordinal_levels)
ordinal_categories = list(ordinal_levels.values())

# Encode each ordinal column as the position of its value in the explicit level
# lists above (one list per column, in the same order as ordinal_cols).
ordinal_pipeline = Pipeline(steps=[('ordinal_encoder', OrdinalEncoder(categories=ordinal_categories))])

## Nominal Pipeline

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Nominal = categorical columns that are neither ordinal nor target-encoded.
# Filter in the frame's own column order instead of subtracting sets: iteration
# order of a set of strings changes between interpreter runs, which would shuffle
# the one-hot feature order and make feature selection non-reproducible.
excluded_cols = set(ordinal_cols) | set(target_cols)
nominal_cols = [col for col in categorical_cols if col not in excluded_cols]

# One-hot encode the nominal columns; handle_unknown='ignore' keeps transform from
# failing on categories that were not seen during fit.
nominal_pipeline = Pipeline(steps=[('onehot_encoder', OneHotEncoder(handle_unknown='ignore'))])

## Column Transformer

In [None]:
from sklearn.compose import ColumnTransformer

# Route each group of columns to its own preprocessing pipeline.
# Note that the entry labelled 'nom' carries the *numerical* pipeline.
feature_transformers = [
    ('nom', numerical_pipeline, numerical_cols),
    ('target', target_pipeline, target_cols),
    ('ordinal', ordinal_pipeline, ordinal_cols),
    ('nominal', nominal_pipeline, nominal_cols),
]

# Any column not listed above is passed through unchanged.
column_transformer = ColumnTransformer(
    transformers=feature_transformers,
    remainder='passthrough',
)

## Creating a Pipeline

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import SequentialFeatureSelector


# Preprocess -> forward sequential feature selection (scored with a Ridge model)
# -> final regressor.
# NOTE: the last step is named 'classifier' although it holds a regressor; the
# search space addresses it by that name, so the name is kept.
model_steps = [
    ('preprocessor', column_transformer),
    ('selector', SequentialFeatureSelector(estimator=Ridge(), direction='forward')),
    # LinearRegression is only the initial occupant of this step; the search
    # space below substitutes Ridge or Lasso for it.
    ('classifier', LinearRegression()),
]
pipe = Pipeline(steps=model_steps)

## Hyperparameter Tuning

In [None]:
# Search space: the same feature-count and alpha candidates are tried for a Ridge
# and for a Lasso final step (2 regressors x 6 feature counts x 8 alphas = 96 settings).
feature_counts = [10, 15, 20, 25, 30, 35]
alpha_grid = [10, 20, 50, 100, 500, 1000, 2000, 5000]

param_distributions = [
    {
        'selector__n_features_to_select': feature_counts,
        'classifier': [regressor_cls()],
        'classifier__alpha': alpha_grid,
    }
    for regressor_cls in (Ridge, Lasso)
]

In [None]:
# Randomized search over the main pipeline: up to 100 sampled settings, 5-fold
# cross-validation, scored by (negated) mean squared log error, and the best
# setting refit on all supplied rows. random_state fixes the sampling.
random_search_cv_forward = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_distributions,
    n_iter=100,
    scoring='neg_mean_squared_log_error',
    cv=5,
    refit=True,
    random_state=42,
)
# Fit on the full training features (with 'normalized-losses' already imputed) and targets.
random_search_cv_forward.fit(train_X_df, train_Y_df)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('nom',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer())]),
                                                                               Index(['symboling', 'normalized-losses', 'wheel-base', 'length', 'width',
       'height', 'curb-weight', 'engine-size', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg...
                   n_iter=100,
                   param_distributions=[{'classifier': [Ridge()],
                                         'classifier__alpha': [10, 20, 50, 100,
                                                     

In [None]:
# Show the parameter setting that achieved the best mean cross-validated score.
print(random_search_cv_forward.best_params_)

{'selector__n_features_to_select': 35, 'classifier__alpha': 500, 'classifier': Lasso(alpha=500)}


In [None]:
# Mean cross-validated score of the best setting. The scorer is
# 'neg_mean_squared_log_error', so the value is negative and closer to 0 is better.
random_search_cv_forward.best_score_

-0.030835039811834265

In [None]:
# best_estimator_ is the pipeline refit with the best parameters found
# (the search was created with refit=True).
best_model = random_search_cv_forward.best_estimator_
# Predict the target for every row of the test features.
predicted_test_Y = best_model.predict(test_X_df)
# Last expression of the cell: displays the prediction array.
predicted_test_Y

array([15931.54509683, 16627.20646339, 15815.69894468, 20606.81918815,
        7804.11465499, 34710.94773977, 34491.44256968,  9581.58010052,
       14444.50689148,  7627.02838743,  8102.78089894, 18134.69877259,
        7210.43729169,  9071.51416289,  7818.26768505,  9993.05704453,
        6423.96925034, 15782.81024768, 15496.61314126,  9071.81549669,
       18429.5922591 , 31123.5078912 , 10642.54125565,  7881.79213698,
        7927.54490314,  6501.30209244, 12811.48878112,  6457.50426245,
       19081.18424771, 18312.62465344,  7798.35958051,  7007.86093148,
       29700.98293092, 16897.75239849, 30582.15826763, 10566.64501346,
        7587.22163348,  8298.73348003,  8122.48144778,  7062.37576121,
       14006.43385799])

Writing the predicted values to a CSV file

In [None]:
# Persist the predictions as a single CSV column with no header row and no index.
# header=False is the documented way to omit the header; header=None only had
# that effect because it is falsy.
pd.DataFrame(predicted_test_Y).to_csv('predicted_test_Y_cp.csv', header=False, index=False)