# ML Approach

## Model training and Evaluation

In [1]:
# import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy import stats


from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn import metrics
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,accuracy_score
from sklearn.decomposition import PCA

 
from sklearn.model_selection import  train_test_split, cross_val_score,RepeatedStratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler,RobustScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from kneed import KneeLocator
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
# import cleaned data
df = pd.read_csv('cleaned_data.csv')

In [3]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [4]:
df.head(1)

Unnamed: 0,brand_names,price,score,processor brand,processor type,processor gen,type of core,no of threads,Ram,ram type,storage type,storage capacity,ppi,OS,Warranty
0,others,84490,65.0,Intel,i9,12.0,14.0,32.0,32,lpddr5,SSD,1024.0,,Windows 11,


In [5]:
# checking shape of dataset
df.shape

(1016, 15)

In [6]:
data = df.copy() 

In [7]:
# Encoding
data = pd.get_dummies(data, columns = ['brand_names', 'processor brand', 'processor type','ram type', 'storage type', 'OS'],drop_first=True)

In [8]:
data.head(1)

Unnamed: 0,price,score,processor gen,type of core,no of threads,Ram,storage capacity,ppi,Warranty,brand_names_apple,brand_names_asus,brand_names_dell,brand_names_hp,brand_names_lenovo,brand_names_msi,brand_names_others,processor brand_Apple,processor brand_Intel,processor brand_others,processor type_celeron,processor type_i3,processor type_i5,processor type_i7,processor type_i9,processor type_m1,processor type_m2,processor type_others,processor type_pentium,processor type_ryzen,ram type_ddr4,ram type_ddr5,ram type_lpddr3,ram type_lpddr4,ram type_lpddr4x,ram type_lpddr5,ram type_ram,ram type_unified,storage type_SSD,OS_DOS,OS_Mac,OS_Windows,OS_Windows 10,OS_Windows 11,OS_others
0,84490,65.0,12.0,14.0,32.0,32,1024.0,,,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0


In [10]:
data.shape

(1016, 44)

In [None]:
# Splitting X and y for all Experiments
# NOTE(review): only 'price' is dropped, so the 'Unnamed: 0' column visible in
# data.head() (presumably the row index saved with the CSV — confirm) remains in X
# and is fed to every model as a feature. Dropping it would change the feature
# count that the hard-coded PCA range(2,44) further down relies on.
X= data.drop('price', axis=1)
y = data['price']

### Create function for model training and model evaluation

In [None]:
def evaluate_reg(y_true, y_pred):
    '''
    Score regression predictions against the ground truth.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    (score, rmse) : tuple
        ``score`` is the R^2 coefficient of determination (the notebook calls
        it "accuracy"); ``rmse`` is the root mean squared error.
    '''
    score = r2_score(y_true, y_pred)  # R^2, not a classification accuracy
    # The `squared=False` flag of mean_squared_error is deprecated/removed in
    # recent scikit-learn releases; taking the square root explicitly works on
    # every version and gives the same value for a single target.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return score, rmse

In [None]:
# Create a function which can evaluate models and return a report
def evaluate_models(X, y, models, threshold=0.1):
    '''
    Fit every model on one fixed train/test split and tabulate its metrics.

    Parameters
    ----------
    X : array-like
        Feature matrix (already imputed / scaled by the caller).
    y : array-like
        Target values.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.
    threshold : float, default 0.1
        Largest tolerated gap between train and test R^2.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns 'Model Name', 'train_accuracy',
        'test_accuracy', 'train_rmse', 'test_rmse', 'retraining', sorted by
        'test_accuracy' in descending order. The "accuracy" columns hold R^2.
        'retraining' is 'Overfitting' when train R^2 exceeds test R^2 by more
        than ``threshold``, 'Underfitting' when test exceeds train by more
        than ``threshold``, and 'accepted' otherwise.
    '''
    # separate dataset into train and test (fixed seed keeps reports comparable)
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.15,random_state=10)

    rows = []
    for name, model in models.items():
        # Train model
        model.fit(X_train, y_train)

        # Training / test set performance
        model_train_accuracy, model_train_rmse = evaluate_reg(y_train, model.predict(X_train))
        model_test_accuracy, model_test_rmse = evaluate_reg(y_test, model.predict(X_test))

        # Signed train-minus-test gap. The original condition
        # `(diff <= 0.1) or (diff <= -0.1)` collapsed to `diff <= 0.1`, which
        # made the 'Underfitting' label unreachable.
        diff = model_train_accuracy - model_test_accuracy
        if diff > threshold:
            verdict = 'Overfitting'
        elif diff < -threshold:
            verdict = 'Underfitting'
        else:
            verdict = 'accepted'

        print(name)

        print('Model performance for Training set')
        print("- Accuracy: {}".format(model_train_accuracy))
        print("- RMSE: {}".format(model_train_rmse))

        print('----------------------------------')

        print('Model performance for Test set')
        print("- Accuracy: {}".format(model_test_accuracy))
        print("- RMSE: {}".format(model_test_rmse))

        print('='*35)
        print('\n')

        rows.append((name, model_train_accuracy, model_test_accuracy,
                     model_train_rmse, model_test_rmse, verdict))

    report = pd.DataFrame(
        rows,
        columns=['Model Name', 'train_accuracy', 'test_accuracy', 'train_rmse', 'test_rmse', 'retraining'],
    ).sort_values(by=["test_accuracy"], ascending=False)

    return report

### Experiment: 1 = KNN Imputer for Null values

**Why Robust scaler and not Standard scaler?**
- Scaling the data using Robust scaler
- Since most of the independent variables are not normally distributed we cannot use Standardscaler

**Why Robust Scaler and not Minmax?** 
- Most of the features contain outliers, and MinMaxScaler scales each feature by its minimum and maximum values, so a single outlier would squeeze the rest of the data into a narrow range.
- This Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range). The IQR is the range between the 1st quartile (25th quantile) and the 3rd quartile (75th quantile).

In [None]:
# Fit with robust scaler for KNN best K-selection experiment
#robustscaler = RobustScaler()
#X1 = robustscaler.fit_transform(X)

**Why KNN Imputer**?
- KNNImputer by scikit-learn is a widely used method to impute missing values. It is widely being observed as a replacement for traditional imputation techniques.
- KNNImputer helps to impute missing values present in the observations by finding the nearest neighbors with the Euclidean distance matrix.
- Here we iterate through different K values, compare the resulting accuracy, and choose the best K value.

**Finding the optimal n_neighbour value for KNN imputer**

#### Pipeline for KNN imputer

In [None]:
# Labels of every non-object column of X.
num_features = X.select_dtypes(exclude="object").columns

# Impute missing values from the 3 nearest neighbours, then apply RobustScaler.
knn_pipeline = Pipeline([
    ('imputer', KNNImputer(n_neighbors=3)),
    ('RobustScaler', RobustScaler()),
])

In [None]:
X_knn =knn_pipeline.fit_transform(X)

#### Initialize Default Models in a dictionary

In [None]:
# Dictionary which contains models for experiment
# Estimators that use randomness are seeded so that Restart & Run All
# reproduces the same reports; the remaining models are left at their defaults.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": Lasso(),
    "Ridge Regression": Ridge(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "XGBRegressor": XGBRegressor(random_state=42),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42),
    "SVR": SVR()
}

#### Fit KNN imputed data for models in dictionary

In [None]:
report_knn = evaluate_models(X_knn, y, models)

### Report for KNN imputed data

In [None]:
report_knn

### Hyperparameter tuning

In [None]:
# splitting dataset into train and test
# Same test_size=0.15 / random_state=10 as the split inside evaluate_models, so
# the grid searches below tune on the rows those reports train on.
# NOTE(review): X_knn was imputed and scaled on the full dataset before this
# split, so statistics of the test rows leak into training — consider fitting
# the pipeline on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(X_knn,y,test_size=0.15,random_state=10)

In [None]:
# Random Forest

In [None]:
# The seed is fixed on the estimator instead of being searched: random_state is
# not a hyperparameter, and searching it triples the work while selecting on noise.
rf = RandomForestRegressor(random_state=1)
grid_param_rf = {
    "n_estimators": [20, 40, 75, 100],
    # Floats are fractions of the data/features. The original integer 1 meant
    # "one sample per tree" / "one feature per split", not 100 %.
    "max_samples": [0.1, 0.2, 0.5, 1.0],
    "max_features": [0.25, 0.50, 0.75, 1.0],
    "max_depth": [5, 10, 15],
}
# Exhaustive 5-fold CV search; the default scorer is the regressor's R^2.
grid_search_rf = GridSearchCV(estimator=rf, param_grid=grid_param_rf, cv=5)
grid_search_rf.fit(X_train, y_train)

In [None]:
grid_search_rf.best_params_

In [None]:
# Gradient Boosting

In [None]:
# The seed is fixed on the estimator instead of being searched as a hyperparameter.
gb = GradientBoostingRegressor(random_state=5)
grid_param_gb = {
    "learning_rate": [0.1, 0.5, 1],
    "n_estimators": [50, 75, 100],
    "max_depth": [2, 5, 10],
    # 'auto' was removed from GradientBoostingRegressor; None is its
    # equivalent (use all features) and is accepted by every version.
    "max_features": [None, 'sqrt', 'log2'],
}

# Exhaustive 5-fold CV search; the default scorer is the regressor's R^2.
grid_search_gb = GridSearchCV(estimator=gb, param_grid=grid_param_gb, cv=5)
grid_search_gb.fit(X_train, y_train)

In [None]:
grid_search_gb.best_params_

In [None]:
# XGB

In [None]:
# 5-fold grid search over the number of boosting rounds and the tree depth.
xgb = XGBRegressor()
grid_param_xgb = dict(
    n_estimators=[30, 50, 75, 100],
    max_depth=[2, 4, 10],
)
grid_search_xgb = GridSearchCV(xgb, grid_param_xgb, cv=5)
grid_search_xgb.fit(X_train, y_train)

In [None]:
grid_search_xgb.best_params_

In [None]:
# Dictionary which contains models for experiment
# Built from the grid searches fitted above rather than hand-copied values, so
# the re-evaluation cannot drift from what was tuned (the previous hard-coded
# max_depth=8 for Gradient Boosting was not even part of the searched grid).
# Fresh estimators are created so the fitted best_estimator_ objects stay untouched.
models_new = {
    "Random Forest": RandomForestRegressor(**grid_search_rf.best_estimator_.get_params()),
    "Gradient Boosting": GradientBoostingRegressor(**grid_search_gb.best_estimator_.get_params()),
    "XGBRegressor": XGBRegressor(**grid_search_xgb.best_estimator_.get_params()),
}

In [None]:
report_knn_new = evaluate_models(X_knn, y, models_new)

In [None]:
report_knn_new

### Experiment: 2 = Simple Imputer with Strategy Median 

- SimpleImputer is a class in the `sklearn.impute` module that can be used to replace missing values in a dataset, using a variety of input strategies.
- Here we use SimpleImputer to impute multiple columns at once: it replaces the missing values in every column it is fitted on, using that column's own median.

In [None]:
# Labels of every non-object column of X.
num_features = X.select_dtypes(exclude="object").columns

# Fill missing values with each column's median, then apply RobustScaler.
median_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('RobustScaler', RobustScaler()),
])

In [None]:
# Fit X with median_pipeline
X_median = median_pipeline.fit_transform(X)

In [None]:
# Training the models
report_median = evaluate_models(X_median, y, models)

### Report for Simple Imputer with median strategy

In [None]:
report_median

### Hyper parameter tuning

In [None]:
# splitting dataset into train and test
# Same test_size=0.15 / random_state=10 as the split inside evaluate_models, so
# the grid searches below tune on the rows those reports train on.
# NOTE(review): X_median was imputed and scaled on the full dataset before this
# split, so statistics of the test rows leak into training.
X_train, X_test, y_train, y_test = train_test_split(X_median,y,test_size=0.15,random_state=10)

In [None]:
# Gradient Boosting

In [None]:
# The seed is fixed on the estimator instead of being searched as a hyperparameter.
gb = GradientBoostingRegressor(random_state=5)
grid_param_gb = {
    "learning_rate": [0.1, 0.5, 1],
    "n_estimators": [50, 75, 100],
    "max_depth": [2, 5, 10],
    # 'auto' was removed from GradientBoostingRegressor; None is its
    # equivalent (use all features) and is accepted by every version.
    "max_features": [None, 'sqrt', 'log2'],
}

# Exhaustive 5-fold CV search; the default scorer is the regressor's R^2.
grid_search_gb = GridSearchCV(estimator=gb, param_grid=grid_param_gb, cv=5)
grid_search_gb.fit(X_train, y_train)

In [None]:
grid_search_gb.best_params_

In [None]:
# XGB

In [None]:
# 5-fold grid search over the number of boosting rounds and the tree depth.
xgb = XGBRegressor()
grid_param_xgb = dict(
    n_estimators=[30, 50, 75, 100],
    max_depth=[2, 4, 10],
)
grid_search_xgb = GridSearchCV(xgb, grid_param_xgb, cv=5)
grid_search_xgb.fit(X_train, y_train)

In [None]:
grid_search_xgb.best_params_

In [None]:
# RF

In [None]:
# The seed is fixed on the estimator instead of being searched: random_state is
# not a hyperparameter, and searching it triples the work while selecting on noise.
rf = RandomForestRegressor(random_state=1)
grid_param_rf = {
    "n_estimators": [20, 40, 75, 100],
    # Floats are fractions of the data/features. The original integer 1 meant
    # "one sample per tree" / "one feature per split", not 100 %.
    "max_samples": [0.1, 0.2, 0.5, 1.0],
    "max_features": [0.25, 0.50, 0.75, 1.0],
    "max_depth": [5, 10, 15],
}
# Exhaustive 5-fold CV search; the default scorer is the regressor's R^2.
grid_search_rf = GridSearchCV(estimator=rf, param_grid=grid_param_rf, cv=5)
grid_search_rf.fit(X_train, y_train)

In [None]:
grid_search_rf.best_params_

In [None]:
# Dictionary which contains models for experiment
# Built from the grid searches fitted above rather than hand-copied values, so
# the re-evaluation always matches what was actually tuned in this experiment.
# Fresh estimators are created so the fitted best_estimator_ objects stay untouched.
models_new = {
    "Random Forest": RandomForestRegressor(**grid_search_rf.best_estimator_.get_params()),
    "Gradient Boosting": GradientBoostingRegressor(**grid_search_gb.best_estimator_.get_params()),
    "XGBRegressor": XGBRegressor(**grid_search_xgb.best_estimator_.get_params()),
}

In [None]:
report_median_new = evaluate_models(X_median, y, models_new)

In [None]:
report_median_new

### Experiment: 3 = MICE for Imputing Null values

- MICE stands for Multivariate Imputation By Chained Equations algorithm
- This technique lets us impute missing values in a dataset by looking at the data in the other columns and estimating the best prediction for each missing value.
- `ImputationKernel` Creates a kernel dataset. This dataset can perform MICE on itself, and impute new data from models obtained during MICE.

In [None]:
import miceforest as mf

# Work on a copy so X itself is left untouched for the other experiments.
X_mice = X.copy()
# NOTE(review): miceforest trains LightGBM models, which reject feature names
# containing JSON special characters such as ':' (e.g. 'Unnamed: 0'). The
# names are sanitised here; they are discarded anyway once the data is scaled.
X_mice.columns = X_mice.columns.str.replace(r'[^0-9A-Za-z_]', '_', regex=True)

kernel = mf.ImputationKernel(
  X_mice,
  save_all_iterations=True,
  random_state=1989
)

# Run the MICE algorithm for 3 iterations.
# This call used to sit inside the trailing comment after the closing
# parenthesis, so it never executed and no MICE iterations were run.
kernel.mice(3)

In [None]:
X_mice = kernel.complete_data()

In [None]:
# Scaling-only pipeline: imputation is handled by the miceforest kernel.
mice_pipeline = Pipeline([('RobustScaler', RobustScaler())])

In [None]:
# Fit X with Mice imputer 
X_mice= mice_pipeline.fit_transform(X_mice)

In [None]:
# Training the models
report_mice = evaluate_models(X_mice, y, models)

### Report for MICE Imputer algorithm

In [None]:
report_mice

In [None]:
# splitting dataset into train and test
# Same test_size=0.15 / random_state=10 as the split inside evaluate_models, so
# the grid searches below tune on the rows those reports train on.
# NOTE(review): X_mice was imputed and scaled on the full dataset before this
# split, so statistics of the test rows leak into training.
X_train, X_test, y_train, y_test = train_test_split(X_mice,y,test_size=0.15,random_state=10)

In [None]:
# Gradient Boosting

In [None]:
# The seed is fixed on the estimator instead of being searched as a hyperparameter.
gb = GradientBoostingRegressor(random_state=5)
grid_param_gb = {
    "learning_rate": [0.1, 0.5, 1],
    "n_estimators": [50, 75, 100],
    "max_depth": [2, 5, 10],
    # 'auto' was removed from GradientBoostingRegressor; None is its
    # equivalent (use all features) and is accepted by every version.
    "max_features": [None, 'sqrt', 'log2'],
}

# Exhaustive 5-fold CV search; the default scorer is the regressor's R^2.
grid_search_gb = GridSearchCV(estimator=gb, param_grid=grid_param_gb, cv=5)
grid_search_gb.fit(X_train, y_train)

In [None]:
grid_search_gb.best_params_

In [None]:
# RF

In [None]:
# The seed is fixed on the estimator instead of being searched: random_state is
# not a hyperparameter, and searching it triples the work while selecting on noise.
rf = RandomForestRegressor(random_state=1)
grid_param_rf = {
    "n_estimators": [20, 40, 75, 100],
    # Floats are fractions of the data/features. The original integer 1 meant
    # "one sample per tree" / "one feature per split", not 100 %.
    "max_samples": [0.1, 0.2, 0.5, 1.0],
    "max_features": [0.25, 0.50, 0.75, 1.0],
    "max_depth": [5, 10, 15],
}
# Exhaustive 5-fold CV search; the default scorer is the regressor's R^2.
grid_search_rf = GridSearchCV(estimator=rf, param_grid=grid_param_rf, cv=5)
grid_search_rf.fit(X_train, y_train)

In [None]:
grid_search_rf.best_params_

In [None]:
# Ridge

In [None]:
# Only alpha is searched. random_state affects Ridge solely for the stochastic
# 'sag'/'saga' solvers, so including it in the grid just repeated each fit four times.
ridge = Ridge()
grid_param_ridge = {"alpha": [1, 2, 5, 10]}
# Exhaustive 5-fold CV search; the default scorer is the regressor's R^2.
grid_search_ridge = GridSearchCV(estimator=ridge, param_grid=grid_param_ridge, cv=5)
grid_search_ridge.fit(X_train, y_train)

In [None]:
grid_search_ridge.best_params_

In [None]:
# Dictionary which contains models for experiment
# Built from the grid searches fitted above rather than hand-copied values, so
# the re-evaluation always matches what was actually tuned in this experiment.
# Fresh estimators are created so the fitted best_estimator_ objects stay untouched.
models_new = {
    "Random Forest": RandomForestRegressor(**grid_search_rf.best_estimator_.get_params()),
    "Gradient Boosting": GradientBoostingRegressor(**grid_search_gb.best_estimator_.get_params()),
    "Ridge": Ridge(**grid_search_ridge.best_estimator_.get_params()),
}

In [None]:
report_mice_new = evaluate_models(X_mice, y, models_new)

In [None]:
report_mice_new 

### Experiment: 4 = Simple Imputer with Strategy Constant 

- Another strategy which can be used is replacing missing values with a fixed (constant) value.
- To do this, specify “constant” for strategy and specify the fill value using the fill_value parameter

In [None]:
# Replace every missing value with the constant 0, then apply RobustScaler.
constant_pipeline = Pipeline([
    ('Imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('RobustScaler', RobustScaler()),
])

In [None]:
X_const =constant_pipeline.fit_transform(X)

In [None]:
# training the models
report_const = evaluate_models(X_const, y, models)

### Report for Simple Imputer with Constant strategy

In [None]:
report_const

### Hyperparameter tuning

In [None]:
# splitting dataset into train and test
# Same test_size=0.15 / random_state=10 as the split inside evaluate_models, so
# the grid searches below tune on the rows those reports train on.
# NOTE(review): X_const was imputed and scaled on the full dataset before this
# split, so statistics of the test rows leak into training.
X_train, X_test, y_train, y_test = train_test_split(X_const,y,test_size=0.15,random_state=10)

In [None]:
# Gradient Boosting

In [None]:
# The seed is fixed on the estimator instead of being searched as a hyperparameter.
gb = GradientBoostingRegressor(random_state=5)
grid_param_gb = {
    "learning_rate": [0.1, 0.5, 1],
    "n_estimators": [50, 75, 100],
    "max_depth": [2, 5, 10],
    # 'auto' was removed from GradientBoostingRegressor; None is its
    # equivalent (use all features) and is accepted by every version.
    "max_features": [None, 'sqrt', 'log2'],
}

# Exhaustive 5-fold CV search; the default scorer is the regressor's R^2.
grid_search_gb = GridSearchCV(estimator=gb, param_grid=grid_param_gb, cv=5)
grid_search_gb.fit(X_train, y_train)

In [None]:
grid_search_gb.best_params_

In [None]:
# RF

In [None]:
# The seed is fixed on the estimator instead of being searched: random_state is
# not a hyperparameter, and searching it triples the work while selecting on noise.
rf = RandomForestRegressor(random_state=1)
grid_param_rf = {
    "n_estimators": [20, 40, 75, 100],
    # Floats are fractions of the data/features. The original integer 1 meant
    # "one sample per tree" / "one feature per split", not 100 %.
    "max_samples": [0.1, 0.2, 0.5, 1.0],
    "max_features": [0.25, 0.50, 0.75, 1.0],
    "max_depth": [5, 10, 15],
}
# Exhaustive 5-fold CV search; the default scorer is the regressor's R^2.
grid_search_rf = GridSearchCV(estimator=rf, param_grid=grid_param_rf, cv=5)
grid_search_rf.fit(X_train, y_train)

In [None]:
grid_search_rf.best_params_

In [None]:
# XGB

In [None]:
# 5-fold grid search over the number of boosting rounds and the tree depth.
xgb = XGBRegressor()
grid_param_xgb = dict(
    n_estimators=[30, 50, 75, 100],
    max_depth=[2, 4, 10],
)
grid_search_xgb = GridSearchCV(xgb, grid_param_xgb, cv=5)
grid_search_xgb.fit(X_train, y_train)

In [None]:
grid_search_xgb.best_params_

In [None]:
# Dictionary which contains models for experiment
# Built from the grid searches fitted above rather than hand-copied values, so
# the re-evaluation always matches what was actually tuned in this experiment.
# Fresh estimators are created so the fitted best_estimator_ objects stay untouched.
models_new = {
    "Random Forest": RandomForestRegressor(**grid_search_rf.best_estimator_.get_params()),
    "Gradient Boosting": GradientBoostingRegressor(**grid_search_gb.best_estimator_.get_params()),
    "XGBRegressor": XGBRegressor(**grid_search_xgb.best_estimator_.get_params()),
}

In [None]:
report_const_new = evaluate_models(X_const, y, models_new)

In [None]:
report_const_new

### Experiment: 5 = Simple Imputer with Strategy Mean 

- Another strategy which can be used is replacing missing values with mean
- Here we replace the missing values with the mean of the column

In [None]:
# Fill missing values with each column's mean, then apply RobustScaler.
mean_pipeline = Pipeline([
    ('Imputer', SimpleImputer(strategy='mean')),
    ('RobustScaler', RobustScaler()),
])

In [None]:
X_mean = mean_pipeline.fit_transform(X)

In [None]:
# Training all models
report_mean = evaluate_models(X_mean, y, models)

In [None]:
report_mean

### Hyperparameter tuning

In [None]:
# splitting dataset into train and test
# Same test_size=0.15 / random_state=10 as the split inside evaluate_models, so
# the grid searches below tune on the rows those reports train on.
# NOTE(review): X_mean was imputed and scaled on the full dataset before this
# split, so statistics of the test rows leak into training.
X_train, X_test, y_train, y_test = train_test_split(X_mean,y,test_size=0.15,random_state=10)

In [None]:
# Gradient Boosting

In [None]:
# The seed is fixed on the estimator instead of being searched as a hyperparameter.
gb = GradientBoostingRegressor(random_state=5)
grid_param_gb = {
    "learning_rate": [0.1, 0.5, 1],
    "n_estimators": [50, 75, 100],
    "max_depth": [2, 5, 10],
    # 'auto' was removed from GradientBoostingRegressor; None is its
    # equivalent (use all features) and is accepted by every version.
    "max_features": [None, 'sqrt', 'log2'],
}

# Exhaustive 5-fold CV search; the default scorer is the regressor's R^2.
grid_search_gb = GridSearchCV(estimator=gb, param_grid=grid_param_gb, cv=5)
grid_search_gb.fit(X_train, y_train)

In [None]:
grid_search_gb.best_params_

In [None]:
# RF

In [None]:
# The seed is fixed on the estimator instead of being searched: random_state is
# not a hyperparameter, and searching it triples the work while selecting on noise.
rf = RandomForestRegressor(random_state=1)
grid_param_rf = {
    "n_estimators": [20, 40, 75, 100],
    # Floats are fractions of the data/features. The original integer 1 meant
    # "one sample per tree" / "one feature per split", not 100 %.
    "max_samples": [0.1, 0.2, 0.5, 1.0],
    "max_features": [0.25, 0.50, 0.75, 1.0],
    "max_depth": [5, 10, 15],
}
# Exhaustive 5-fold CV search; the default scorer is the regressor's R^2.
grid_search_rf = GridSearchCV(estimator=rf, param_grid=grid_param_rf, cv=5)
grid_search_rf.fit(X_train, y_train)

In [None]:
grid_search_rf.best_params_

In [None]:
# XGB

In [None]:
# 5-fold grid search over the number of boosting rounds and the tree depth.
xgb = XGBRegressor()
grid_param_xgb = dict(
    n_estimators=[30, 50, 75, 100],
    max_depth=[2, 4, 10],
)
grid_search_xgb = GridSearchCV(xgb, grid_param_xgb, cv=5)
grid_search_xgb.fit(X_train, y_train)

In [None]:
grid_search_xgb.best_params_

In [None]:
# Dictionary which contains models for experiment
# Built from the grid searches fitted above rather than hand-copied values, so
# the re-evaluation cannot drift from what was tuned (the previous hard-coded
# n_estimators=15 for Random Forest was not even part of the searched grid).
# Fresh estimators are created so the fitted best_estimator_ objects stay untouched.
models_new = {
    "Random Forest": RandomForestRegressor(**grid_search_rf.best_estimator_.get_params()),
    "Gradient Boosting": GradientBoostingRegressor(**grid_search_gb.best_estimator_.get_params()),
    "XGBRegressor": XGBRegressor(**grid_search_xgb.best_estimator_.get_params()),
}

In [None]:
report_mean_new = evaluate_models(X_mean, y, models_new)

In [None]:
report_mean_new

### Experiment: 6 = Principal component analysis with median imputation

- Principal component analysis is a technique for feature extraction — so it combines our input variables in a specific way, then we can drop the “least important” variables while still retaining the most valuable parts of all of the variables! 
- As the encoded dataset has 43 feature columns, we can try PCA and compare the resulting metrics.

In [None]:
# Median imputation followed by RobustScaler; PCA is applied in a later cell.
# `fill_value` is only consulted for strategy='constant', so the stray
# `fill_value=np.median` argument has been dropped.
pca_pipeline = Pipeline(steps=[
    ('Imputer', SimpleImputer(strategy='median')),
    ('RobustScaler', RobustScaler())
])

In [None]:
X_pca = pca_pipeline.fit_transform(X)

In [None]:
#Applying PCA
# A single full-rank fit yields every component's explained-variance ratio, so
# the cumulative sum replaces the former 42 separate fits over a hard-coded
# range(2, 44). The exact 'full' solver also keeps the curve deterministic.
pc = PCA(svd_solver='full')
df_pca = pc.fit(X_pca)
cumulative_ratio = np.cumsum(df_pca.explained_variance_ratio_)

# var_ratio[n] = share of variance retained by the first n components (n >= 2).
var_ratio = {n: cumulative_ratio[n - 1] for n in range(2, len(cumulative_ratio) + 1)}

In [None]:
# plotting variance ratio
pd.Series(var_ratio).plot()

In [None]:

# Locate the knee of the cumulative explained-variance curve.
i = np.arange(len(var_ratio))            # positional index of each candidate
variance_ratio = list(var_ratio.values())
components = list(var_ratio.keys())      # candidate n_components values
knee = KneeLocator(i, variance_ratio, S=1, curve='concave', interp_method='polynomial')

# plot_knee opens its own figure, so the size is passed to it directly; the
# previous separate plt.figure(...) call only produced an empty extra figure.
knee.plot_knee(figsize=(5, 5))
plt.xlabel("Points")
plt.ylabel("Cumulative explained variance ratio")
plt.show()

# KneeLocator reports None when it finds no knee; fail with a clear message
# instead of a TypeError on the list lookup.
if knee.knee is None:
    raise ValueError("KneeLocator found no knee in the explained-variance curve")

# knee.knee is a position in `i`; map it back to the number of components.
k = components[int(knee.knee)]
print('Knee Locator k =', k)

In [None]:
# Reducing the dimensions of the data
# Fit and project in one step; the earlier extra .fit() call was immediately
# redone by fit_transform.
# NOTE(review): 7 is presumably the knee value `k` printed above — confirm, or
# pass `k` directly.
pca_final = PCA(n_components=7, random_state=42)

reduced = pca_final.fit_transform(X_pca)

In [None]:
# Training all models
# Train on the PCA-reduced matrix. The original passed X_pca (imputed and
# scaled only), so this experiment never used the principal components.
report_pca = evaluate_models(reduced, y, models)

### Report for PCA and Median imputed data

In [None]:
report_pca

### Hyperparameter tuning

In [None]:
# splitting dataset into train and test
# Tune on the PCA-reduced matrix (`reduced`), not on X_pca, which holds the
# imputed/scaled data before dimensionality reduction. The split parameters
# match those used inside evaluate_models.
X_train, X_test, y_train, y_test = train_test_split(reduced,y,test_size=0.15,random_state=10)

In [None]:
# Gradient Boosting

In [None]:
# The seed is fixed on the estimator instead of being searched as a hyperparameter.
gb = GradientBoostingRegressor(random_state=5)
grid_param_gb = {
    "learning_rate": [0.1, 0.5, 1],
    "n_estimators": [50, 75, 100],
    "max_depth": [2, 5, 10],
    # 'auto' was removed from GradientBoostingRegressor; None is its
    # equivalent (use all features) and is accepted by every version.
    "max_features": [None, 'sqrt', 'log2'],
}

# Exhaustive 5-fold CV search; the default scorer is the regressor's R^2.
grid_search_gb = GridSearchCV(estimator=gb, param_grid=grid_param_gb, cv=5)
grid_search_gb.fit(X_train, y_train)

In [None]:
grid_search_gb.best_params_

In [None]:
# RF

In [None]:
# The seed is fixed on the estimator instead of being searched: random_state is
# not a hyperparameter, and searching it triples the work while selecting on noise.
rf = RandomForestRegressor(random_state=1)
grid_param_rf = {
    "n_estimators": [20, 40, 75, 100],
    # Floats are fractions of the data/features. The original integer 1 meant
    # "one sample per tree" / "one feature per split", not 100 %.
    "max_samples": [0.1, 0.2, 0.5, 1.0],
    "max_features": [0.25, 0.50, 0.75, 1.0],
    "max_depth": [5, 10, 15],
}
# Exhaustive 5-fold CV search; the default scorer is the regressor's R^2.
grid_search_rf = GridSearchCV(estimator=rf, param_grid=grid_param_rf, cv=5)
grid_search_rf.fit(X_train, y_train)

In [None]:
grid_search_rf.best_params_

In [None]:
# XGB

In [None]:
# 5-fold grid search over the number of boosting rounds and the tree depth.
xgb = XGBRegressor()
grid_param_xgb = dict(
    n_estimators=[30, 50, 75, 100],
    max_depth=[2, 4, 10],
)
grid_search_xgb = GridSearchCV(xgb, grid_param_xgb, cv=5)
grid_search_xgb.fit(X_train, y_train)

In [None]:
grid_search_xgb.best_params_

In [None]:
# Dictionary which contains models for experiment
# Built from the grid searches fitted above rather than hand-copied values
# (the previous literals were identical to the median-imputation experiment's),
# so the re-evaluation always matches what was tuned in this experiment.
# Fresh estimators are created so the fitted best_estimator_ objects stay untouched.
models_new = {
    "Random Forest": RandomForestRegressor(**grid_search_rf.best_estimator_.get_params()),
    "Gradient Boosting": GradientBoostingRegressor(**grid_search_gb.best_estimator_.get_params()),
    "XGBRegressor": XGBRegressor(**grid_search_xgb.best_estimator_.get_params()),
}

In [None]:
# Re-evaluate the tuned models on the PCA-reduced matrix; X_pca is only the
# imputed/scaled data before dimensionality reduction.
report_pca_new = evaluate_models(reduced, y, models_new)

In [None]:
report_pca_new