# **Selecting the best model with best hyperparameters**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [13]:
X = df.drop('tip', axis=1)
y = df['tip']

# encode categorical variables
encoder = OrdinalEncoder()
X = encoder.fit_transform(X)


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [15]:
models = {
    'LinearRegression': LinearRegression(),
    'SVR': SVR(),
    'DecisionTree': DecisionTreeRegressor(),
    'RandomForest': RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor(),
    'KNN': KNeighborsRegressor(),
    'XGBoost': XGBRegressor()
}

In [17]:
%%time

model_scores = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, mae))

sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('MEA:', f'{model[0]} is {model[1]: .2f}')

MEA: SVR is  0.60
MEA: LinearRegression is  0.62
MEA: XGBoost is  0.67
MEA: KNN is  0.71
MEA: GradientBoosting is  0.72
MEA: RandomForest is  0.74
MEA: DecisionTree is  0.85
CPU times: user 263 ms, sys: 4.96 ms, total: 267 ms
Wall time: 233 ms


In [18]:
# load diamond dataset
df = sns.load_dataset('diamonds')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [21]:
X = df.drop(['price','x','y','z'], axis=1)
y = df['price']

In [22]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table
0,0.23,Ideal,E,SI2,61.5,55.0
1,0.21,Premium,E,SI1,59.8,61.0
2,0.23,Good,E,VS1,56.9,65.0
3,0.29,Premium,I,VS2,62.4,58.0
4,0.31,Good,J,SI2,63.3,58.0
...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0
53936,0.72,Good,D,SI1,63.1,55.0
53937,0.70,Very Good,D,SI1,62.8,60.0
53938,0.86,Premium,H,SI2,61.0,58.0


In [23]:
# encoding the categorical variables
encoder = OrdinalEncoder()
X = encoder.fit_transform(X)

In [24]:
models = {
          'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'n_estimators': [10, 100]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100]}),
          }

In [28]:
for name, (model, params) in models.items():
  pipeline = GridSearchCV(model, params, cv=5)
  pipeline.fit(X_train, y_train)
  y_pred = pipeline.predict(X_test)
  mae = mean_absolute_error(y_test, y_pred)
  print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
  print(name, 'R2: ', r2_score(y_test, y_pred))
  print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
  print('\n')

print(name, pipeline.best_params_)

LinearRegression MSE:  0.6565138933097732
LinearRegression R2:  0.474776845251208
LinearRegression MAE:  0.6160553405209656


SVR MSE:  0.635053061597389
SVR R2:  0.4919459042008757
SVR MAE:  0.669092672341374


DecisionTreeRegressor MSE:  0.8235850163311137
DecisionTreeRegressor R2:  0.34111688284232755
DecisionTreeRegressor MAE:  0.6958053058053059


RandomForestRegressor MSE:  0.8003287006122453
RandomForestRegressor R2:  0.35972236192536433
RandomForestRegressor MAE:  0.7239816326530614


KNeighborsRegressor MSE:  0.6021555034084395
KNeighborsRegressor R2:  0.5182645540750211
KNeighborsRegressor MAE:  0.6237327188940092


GradientBoostingRegressor MSE:  0.8185226615381059
GradientBoostingRegressor R2:  0.34516685951751525
GradientBoostingRegressor MAE:  0.7717845174722279


XGBRegressor MSE:  0.6624107100882575
XGBRegressor R2:  0.4700592836840687
XGBRegressor MAE:  0.6549163442728472


XGBRegressor {'n_estimators': 10}


## **Classification**

In [37]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# dont show warnings
import warnings
warnings.filterwarnings('ignore')

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Create a dictionary of classifiers to evaluate
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier()
}

# Perform k-fold cross-validation and calculate the mean accuracy
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for name, classifier in classifiers.items():
    scores = cross_val_score(classifier, X, y, cv=kfold)
    accuracy = np.mean(scores)
    print("Classifier:", name)
    print("Mean Accuracy:", accuracy)
    print()




Classifier: Logistic Regression
Mean Accuracy: 0.9733333333333334

LogisticRegression()
Classifier: Decision Tree
Mean Accuracy: 0.9533333333333335

DecisionTreeClassifier()
Classifier: Random Forest
Mean Accuracy: 0.9600000000000002

RandomForestClassifier()
Classifier: SVM
Mean Accuracy: 0.9666666666666668

SVC()
Classifier: KNN
Mean Accuracy: 0.9733333333333334

KNeighborsClassifier()
