# ML Algorithms - Classification Example

## Business Problem

For this example, we are trying to predict if it will rain tomorrow based on weather data from Australia. This could be something that would be useful for a weather station or a website to project. 

### Import Libraries

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

# Load in the data

In [2]:
df = pd.read_csv('weatherAUS.csv')

# Brief exploratory data analysis

In [3]:
df.shape

(145460, 23)

In [4]:
# Display the first few rows of the dataset
print(df.head())


         Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  2008-12-01   Albury     13.4     22.9       0.6          NaN       NaN   
1  2008-12-02   Albury      7.4     25.1       0.0          NaN       NaN   
2  2008-12-03   Albury     12.9     25.7       0.0          NaN       NaN   
3  2008-12-04   Albury      9.2     28.0       0.0          NaN       NaN   
4  2008-12-05   Albury     17.5     32.3       1.0          NaN       NaN   

  WindGustDir  WindGustSpeed WindDir9am  ... Humidity9am  Humidity3pm  \
0           W           44.0          W  ...        71.0         22.0   
1         WNW           44.0        NNW  ...        44.0         25.0   
2         WSW           46.0          W  ...        38.0         30.0   
3          NE           24.0         SE  ...        45.0         16.0   
4           W           41.0        ENE  ...        82.0         33.0   

   Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  RainToday  \
0       1007.7    

In [5]:
# Display basic information about the dataset
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [6]:
# Display summary statistics for numerical columns
print(df.describe())


             MinTemp        MaxTemp       Rainfall   Evaporation  \
count  143975.000000  144199.000000  142199.000000  82670.000000   
mean       12.194034      23.221348       2.360918      5.468232   
std         6.398495       7.119049       8.478060      4.193704   
min        -8.500000      -4.800000       0.000000      0.000000   
25%         7.600000      17.900000       0.000000      2.600000   
50%        12.000000      22.600000       0.000000      4.800000   
75%        16.900000      28.200000       0.800000      7.400000   
max        33.900000      48.100000     371.000000    145.000000   

           Sunshine  WindGustSpeed   WindSpeed9am   WindSpeed3pm  \
count  75625.000000  135197.000000  143693.000000  142398.000000   
mean       7.611178      40.035230      14.043426      18.662657   
std        3.785483      13.607062       8.915375       8.809800   
min        0.000000       6.000000       0.000000       0.000000   
25%        4.800000      31.000000       7.0000

In [7]:
# Check for missing values
print(df.isna().sum())

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64


In [9]:
# create months column
# figure out location data 
# what to do with imputed data 
# scale data 
#remove outliers 

## Naive Bayes 
    # Outlier removal 
    # Encoding 
## SVM & Logistic regression Logistic Regression
    # Remove Outliers 
    # Remove missing values 
    # Scaling 
    # dummy variables 
## Trees (Decision, RF, XGBoost, GB Tree)
## KNN
    # Feature scaling
    # Imputation 
    # dummy variables 
## ANN 
    # scaling 
    # get dummies 

# Data Cleaning 
1) Drop Null Values in y variable
2) Create a category for Month

In [8]:
#Data Cleaning 
df = df.dropna(subset=['RainTomorrow'])
df['month'] = df.Date.apply(lambda x: pd.to_datetime(x).month).astype('category')

# Create model baseline

The dataset is slightly imbalanced. We see that if we project it not to rain every time, our model will be right around 78% of the time. We want our model to perform at least to perform better than this 78% threshold. 

In [9]:
df['RainTomorrow'].value_counts()/ df.shape[0]

No     0.775819
Yes    0.224181
Name: RainTomorrow, dtype: float64

# Additional Data Preprocessing
1) Remove outliers
2) Create train test split
3) Create columns for continuous and categorical varaiables

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy import stats
from sklearn.base import BaseEstimator, TransformerMixin
import scipy 
#adjust for X & Y 

def z_score_removal(X,y, columns, z_score):
    df = pd.concat([X, y], axis=1)
    col_df = df[columns]    
    z_scores = scipy.stats.zscore(col_df).abs()
    outliers = (z_scores.max(axis=1) > z_score)
    df_out = df[~outliers]
    X_cleaned = df_out[X.columns]
    y_cleaned = df_out.drop(X.columns, axis =1)
    return X_cleaned, y_cleaned


In [11]:
#train test split 
from sklearn.model_selection import train_test_split    
X = df.drop(['RainTomorrow', 'Date'], axis=1)
y = df.loc[:,'RainTomorrow'].map({'Yes': 1, 'No': 0}).astype('category')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Identify continuous and categorical columns
continuous_columns = [col for col in X_train.columns if X_train[col].dtype == 'float64' or X_train[col].dtype == 'int64']
categorical_columns = [col for col in X_train.columns if X_train[col].dtype == 'object']

In [13]:
# pipeline for Naive Bayes - We need to impute continuous columns and encode categorical variables.
nb_preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
    ]), continuous_columns),
    ('cat', Pipeline([
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ]), categorical_columns)
],remainder = 'passthrough', verbose_feature_names_out= False).set_output(transform='pandas')


# Fit and transform the data
data_transformed = nb_preprocessor.fit_transform(X_train)
nb_X_train, nb_y_train = z_score_removal(data_transformed, y_train, continuous_columns, 3)

TypeError: __init__() got an unexpected keyword argument 'sparse_output'

In [14]:
# pipeline for Naive Bayes - We need to impute continuous columns and encode categorical variables.
nb_preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
    ]), continuous_columns),
    ('cat', Pipeline([
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_columns)
], remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')


TypeError: __init__() got an unexpected keyword argument 'verbose_feature_names_out'

In [15]:
# pipeline for Naive Bayes - We need to impute continuous columns and encode categorical variables.
nb_preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
    ]), continuous_columns),
    ('cat', Pipeline([
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_columns)
], remainder='passthrough').set_output(transform='pandas')


AttributeError: 'ColumnTransformer' object has no attribute 'set_output'

In [23]:
# Pipeline for Naive Bayes - We need to impute continuous columns and encode categorical variables.
nb_preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
    ]), continuous_columns),
    ('cat', Pipeline([
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_columns)
], remainder='passthrough')

# Fit and transform the data
data_transformed = nb_preprocessor.fit_transform(X_train)

# Get the feature names from the categorical encoder
categorical_encoder = nb_preprocessor.named_transformers_['cat']
encoded_categorical_features = list(categorical_encoder.named_steps['encoder'].get_feature_names(input_features=categorical_columns))

# Combine the feature names from both continuous and encoded categorical columns
feature_names = continuous_columns + encoded_categorical_features

# Create a DataFrame with the transformed data and feature names
data_transformed_df = pd.DataFrame(data_transformed, columns=feature_names)

# Now you can concatenate the DataFrames
df = pd.concat([data_transformed_df, y_train], axis=1)


ValueError: Shape of passed values is (113754, 1), indices imply (113754, 119)

In [24]:
# Check the length of feature_names and number of columns in transformed data
print("Length of feature_names:", len(feature_names))
print("Number of columns in transformed data:", data_transformed.shape[1])

# Create a DataFrame with the transformed data and feature names
data_transformed_df = pd.DataFrame(data_transformed, columns=feature_names)

# Now you can concatenate the DataFrames
df = pd.concat([data_transformed_df, y_train], axis=1)


Length of feature_names: 119
Number of columns in transformed data: 120


ValueError: Shape of passed values is (113754, 1), indices imply (113754, 119)

## Hyperparameter Tuning - Naive Bayes

Naive Bayes is a family of simple probabilistic classifiers based on applying Bayes' theorem with the "naive" assumption of conditional independence between every pair of features given the class. These classifiers are particularly useful for text classification and other high-dimensional problems. There are different Naive Bayes classifiers available in scikit-learn, such as GaussianNB, MultinomialNB, and BernoulliNB.

Relevant Parameters:

### GaussianNB
- **var_smoothing**: Portion of the largest variance of all features that is added to variances for calculation stability. It's used to smooth the likelihood estimates and avoid zero probabilities, which can lead to better generalization performance.

### MultinomialNB
- **alpha**: Additive (Laplace/Lidstone) smoothing parameter. It's used to control the trade-off between fitting the data and smoothing the probabilities, which helps prevent overfitting.
- **fit_prior**: Whether to learn class prior probabilities or not. If false, a uniform prior will be used. Learning the prior can help improve the classification performance in cases where the class distribution is imbalanced.

### BernoulliNB
- **alpha**: Additive (Laplace/Lidstone) smoothing parameter. It's used to control the trade-off between fitting the data and smoothing the probabilities, which helps prevent overfitting.
- **binarize**: Threshold for binarizing (mapping to booleans) of sample features. If None, input is presumed to already consist of binary vectors.
- **fit_prior**: Whether to learn class prior probabilities or not. If false, a uniform prior will be used. Learning the prior can help improve the classification performance in cases where the class distribution is imbalanced.

By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance.

In [None]:
#Naive Bayes Code 
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, GridSearchCV

# Naive Bayes Classifier
nb_model = GaussianNB()
nb_scores = cross_val_score(nb_model, nb_X_train, nb_y_train, cv=5)
print(nb_scores)

# No hyperparameters to tune for GaussianNB
nb_model.fit(nb_X_train, nb_y_train)
test_score_nb = nb_model.score(nb_X_train, nb_y_train)

print(f"Test score (accuracy): {test_score_nb}")

In [29]:
# pipeline for SVM & Logistic regression classifiers 
lr_preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), continuous_columns),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ]), categorical_columns)
],remainder = 'passthrough', verbose_feature_names_out= False).set_output(transform='pandas')

lr_transformed = lr_preprocessor.fit_transform(X_train)
lr_X_train, lr_y_train = z_score_removal(lr_transformed, y_train, continuous_columns, 3)

TypeError: __init__() got an unexpected keyword argument 'sparse_output'

## Hyperparameter Tuning - Logistic Regression

Logistic Regression is a linear model for classification that uses the logistic function to model the probability of a binary outcome. It's a simple yet powerful technique for solving binary and multi-class classification problems. In scikit-learn, the `LogisticRegression` class provides an implementation of logistic regression.

Relevant Parameters:
- **penalty**: The type of regularization applied to the model. Options include 'l1', 'l2', 'elasticnet', and 'none'. Regularization is used to control the trade-off between fitting the data and keeping the weights small, which helps prevent overfitting.
- **C**: Inverse of regularization strength (i.e., 1/lambda). Smaller values specify stronger regularization. It's used to control the amount of regularization applied to the model, which can impact the model's ability to generalize to unseen data.
- **fit_intercept**: Whether to include an intercept term in the model. If false, the data is assumed to be already centered. Including an intercept can improve the fit of the model, especially if the data is not centered.
- **solver**: The algorithm used for optimization. Choices are 'newton-cg', 'lbfgs', 'liblinear', 'sag', and 'saga'. Each solver has its own benefits and drawbacks, so it's essential to choose the one that best suits your problem and dataset.
- **max_iter**: Maximum number of iterations for the solver to converge. Increasing this value allows the model more time to converge but may increase the computation time.
- **multi_class**: Strategy for multi-class problems. Options are 'auto', 'ovr' (one-vs-rest), and 'multinomial'. 'auto' will choose the best strategy based on the data and solver. For multi-class problems, the choice of strategy can impact the classification performance.

By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance.

In [18]:
#Logistic Regression & SVM Code

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV

# Logistic Regression with default parameters
lr_model = LogisticRegression()
lr_scores = cross_val_score(lr_model, lr_X_train, lr_y_train, cv=5, scoring='accuracy')
print(lr_scores)
"""
# Parameter grid for GridSearchCV
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.1, 1, 10],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 500, 1000]
}

grid_search_lr = GridSearchCV(lr_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)
grid_search_lr.fit(lr_X_train, lr_y_train)
test_score_lr = grid_search_lr.best_estimator_.score(lr_X_train, lr_y_train)

print(f"Best penalty value: {grid_search_lr.best_params_['penalty']}")
print(f"Best C value: {grid_search_lr.best_params_['C']}")
print(f"Best solver value: {grid_search_lr.best_params_['solver']}")
print(f"Best max_iter value: {grid_search_lr.best_params_['max_iter']}")
print(f"Best accuracy: {grid_search_lr.best_score_}")
"""

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/

[0.85077628 0.8516648  0.85231949 0.8526936  0.8481575 ]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


'\n# Parameter grid for GridSearchCV\nparam_grid = {\n    \'penalty\': [\'l1\', \'l2\', \'elasticnet\', \'none\'],\n    \'C\': [0.1, 1, 10],\n    \'solver\': [\'newton-cg\', \'lbfgs\', \'liblinear\', \'sag\', \'saga\'],\n    \'max_iter\': [100, 500, 1000]\n}\n\ngrid_search_lr = GridSearchCV(lr_model, param_grid, scoring=\'accuracy\', cv=5, n_jobs=-1, verbose=1)\ngrid_search_lr.fit(lr_X_train, lr_y_train)\ntest_score_lr = grid_search_lr.best_estimator_.score(lr_X_train, lr_y_train)\n\nprint(f"Best penalty value: {grid_search_lr.best_params_[\'penalty\']}")\nprint(f"Best C value: {grid_search_lr.best_params_[\'C\']}")\nprint(f"Best solver value: {grid_search_lr.best_params_[\'solver\']}")\nprint(f"Best max_iter value: {grid_search_lr.best_params_[\'max_iter\']}")\nprint(f"Best accuracy: {grid_search_lr.best_score_}")\n'

## Hyperparameter Tuning - SVM Classification

Support Vector Machine (SVM) Classification is a versatile machine learning algorithm that can be used for both linear and non-linear classification tasks. It aims to find the best-fitting hyperplane that has the largest distance (margin) between the support vectors and the hyperplane.

Relevant Parameters:
- **kernel**: Specifies the kernel function to be used in the algorithm. Possible options are 'linear', 'poly', 'rbf', 'sigmoid', and 'precomputed'. The choice of the kernel function depends on the nature of the data and the problem to be solved.
- **C**: Regularization parameter (also called the cost parameter); must be a positive float. It determines the trade-off between achieving a low training error and a low testing error. In other words, it controls the balance between overfitting and underfitting. A smaller value of C creates a wider margin, which may result in more training errors but better generalization to the test data. A larger value of C creates a narrower margin, which may result in fewer training errors but poorer generalization to the test data.
- **degree**: The degree of the polynomial kernel function ('poly'). Ignored by all other kernels. It is the degree of the polynomial used for the 'poly' kernel and determines the flexibility of the model.
- **gamma**: Kernel coefficient for 'rbf', 'poly', and 'sigmoid'. If gamma is 'scale' (default), then it is calculated as 1 / (n_features * X.var()) for the input data X. If gamma is 'auto', then it is calculated as 1/n_features. A smaller gamma value will produce a more flexible model, while a larger gamma value will produce a more rigid model.
- **coef0**: Independent term in the kernel function. It is only significant in 'poly' and 'sigmoid'. It controls the influence of higher degree terms in the polynomial and sigmoid kernels.
- **shrinking**: Whether to use the shrinking heuristic. The shrinking heuristic is a technique used to speed up training by removing some of the support vectors that are not necessary for the final solution. True by default.

By tuning these parameters, you can find the best combination for your specific classification problem and achieve a better balance between model complexity and generalization performance.

In [19]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV

# SVM with default parameters
svm_model = SVC()
svm_scores = cross_val_score(svm_model, lr_X_train, lr_y_train, cv=5, scoring='accuracy')
print(svm_scores)

"""
# Parameter grid for GridSearchCV
param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.1, 1, 10]
}

grid_search_svm = GridSearchCV(svm_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)
grid_search_svm.fit(lr_X_train, lr_y_train)
test_score_svm = grid_search_svm.best_estimator_.score(lr_X_train, lr_y_train)

print(f"Best kernel value: {grid_search_svm.best_params_['kernel']}")
print(f"Best C value: {grid_search_svm.best_params_['C']}")
print(f"Best gamma value: {grid_search_svm.best_params_['gamma']}")
print(f"Best accuracy: {grid_search_svm.best_score_}")
"""

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[0.85760382 0.85891321 0.85793116 0.85895997 0.85582679]


'\n# Parameter grid for GridSearchCV\nparam_grid = {\n    \'kernel\': [\'linear\', \'rbf\'],\n    \'C\': [0.1, 1, 10],\n    \'gamma\': [\'scale\', \'auto\', 0.1, 1, 10]\n}\n\ngrid_search_svm = GridSearchCV(svm_model, param_grid, scoring=\'accuracy\', cv=5, n_jobs=-1, verbose=1)\ngrid_search_svm.fit(lr_X_train, lr_y_train)\ntest_score_svm = grid_search_svm.best_estimator_.score(lr_X_train, lr_y_train)\n\nprint(f"Best kernel value: {grid_search_svm.best_params_[\'kernel\']}")\nprint(f"Best C value: {grid_search_svm.best_params_[\'C\']}")\nprint(f"Best gamma value: {grid_search_svm.best_params_[\'gamma\']}")\nprint(f"Best accuracy: {grid_search_svm.best_score_}")\n'

In [26]:
# pipeline for Trees (Decision, RF, XGBoost, GB Tree) - Reuse nb without outlier removval
tree_X_train = nb_preprocessor.fit_transform(X_train)


## Hyperparameter Tuning - Decision Tree

Decision Trees are a popular machine learning algorithm used for both regression and classification tasks. They are easy to interpret and can naturally handle a mixture of continuous and categorical variables.

Relevant Parameters:
- **criterion**: The function to measure the quality of a split. Supported criteria for regression are 'mse' (mean squared error) and 'friedman_mse' (improvement in mean squared error). For classification, supported criteria are 'gini' and 'entropy'.
- **splitter**: The strategy used to choose the split at each node. Supported strategies are 'best' to choose the best split and 'random' to choose the best random split.
- **max_depth**: The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. Controlling the depth can help prevent overfitting.
- **min_samples_split**: The minimum number of samples required to split an internal node. A larger value prevents the tree from growing too deep, thus preventing overfitting.
- **min_samples_leaf**: The minimum number of samples required to be at a leaf node. A larger value prevents the tree from growing too deep, thus preventing overfitting.
- **min_weight_fraction_leaf**: The minimum weighted fraction of the sum total of weights required to be at a leaf node. Samples have equal weight when sample_weight is not provided.
- **max_features**: The number of features to consider when looking for the best split. If None, then max_features=n_features.
- **max_leaf_nodes**: Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None, then unlimited number of leaf nodes.
- **min_impurity_decrease**: A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
- **min_impurity_split**: Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf.

By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance.


In [28]:
# Trees (Decision, RF, XGBoost, GB Tree) Code
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

# Decision Tree with default parameters
dt_model = DecisionTreeClassifier()
dt_scores = cross_val_score(dt_model, tree_X_train, y_train, cv=5, scoring='accuracy')
print(dt_scores)

"""
# Expanded parameter grid for GridSearchCV
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 15, 20, 25, 30, 35, 40],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 10, 20, 30, 40, 50],
    'min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'class_weight': [None, 'balanced']
}

grid_search_dt = GridSearchCV(dt_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)
grid_search_dt.fit(tree_X_train, y_train)
test_score_dt = grid_search_dt.best_estimator_.score(tree_X_train, y_train)

print("Best hyperparameters found:")
for key, value in grid_search_dt.best_params_.items():
    print(f"{key}: {value}")

print(f"Best accuracy: {grid_search_dt.best_score_}")

"""

[0.7905147  0.79077843 0.78932794 0.79482221 0.79265934]


'\n# Expanded parameter grid for GridSearchCV\nparam_grid = {\n    \'criterion\': [\'gini\', \'entropy\'],\n    \'splitter\': [\'best\', \'random\'],\n    \'max_depth\': [None, 5, 10, 15, 20, 25, 30, 35, 40],\n    \'min_samples_split\': [2, 5, 10, 15, 20],\n    \'min_samples_leaf\': [1, 2, 4, 6, 8, 10],\n    \'min_weight_fraction_leaf\': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\n    \'max_features\': [None, \'sqrt\', \'log2\'],\n    \'max_leaf_nodes\': [None, 10, 20, 30, 40, 50],\n    \'min_impurity_decrease\': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\n    \'class_weight\': [None, \'balanced\']\n}\n\ngrid_search_dt = GridSearchCV(dt_model, param_grid, scoring=\'accuracy\', cv=5, n_jobs=-1, verbose=1)\ngrid_search_dt.fit(tree_X_train, y_train)\ntest_score_dt = grid_search_dt.best_estimator_.score(tree_X_train, y_train)\n\nprint("Best hyperparameters found:")\nfor key, value in grid_search_dt.best_params_.items():\n    print(f"{key}: {value}")\n\nprint(f"Best accuracy: {grid_search_dt.best_score_}")\n\n'

## Hyperparameter Tuning - Random Forest

Random Forest is an ensemble learning method that constructs a multitude of decision trees at training time and outputs the mode of the classes (classification) or mean prediction (regression) of the individual trees. It is highly flexible and can handle a wide variety of tasks.

Relevant Parameters:
- **n_estimators**: The number of trees in the forest. Increasing the number of trees can improve the model's performance, but may also increase the computation time.
- **criterion**: The function to measure the quality of a split. Supported criteria for regression are 'mse' (mean squared error) and 'mae' (mean absolute error). For classification, supported criteria are 'gini' and 'entropy'.
- **max_depth**: The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. Controlling the depth can help prevent overfitting.
- **min_samples_split**: The minimum number of samples required to split an internal node. A larger value prevents the tree from growing too deep, thus preventing overfitting.
- **min_samples_leaf**: The minimum number of samples required to be at a leaf node. A larger value prevents the tree from growing too deep, thus preventing overfitting.
- **min_weight_fraction_leaf**: The minimum weighted fraction of the sum total of weights required to be at a leaf node. Samples have equal weight when sample_weight is not provided.
- **max_features**: The number of features to consider when looking for the best split. If None, then max_features=n_features. It can also be a float, int, or string ('auto', 'sqrt', or 'log2').
- **max_leaf_nodes**: Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None, then unlimited number of leaf nodes.
- **min_impurity_decrease**: A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
- **bootstrap**: Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.
- **oob_score**: Whether to use out-of-bag samples to estimate the generalization accuracy.

By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance.

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

# Random Forest with default parameters
rf_model = RandomForestClassifier()
rf_scores = cross_val_score(rf_model, tree_X_train, y_train, cv=5, scoring='accuracy')
print(rf_scores)

"""
# Expanded parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [10, 50, 100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20, 25, 30, 35, 40],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 10, 20, 30, 40, 50],
    'min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced'],
    'warm_start': [False, True],
    'oob_score': [False, True]
}

grid_search_rf = GridSearchCV(rf_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)
grid_search_rf.fit(tree_X_train, y_train)
test_score_rf = grid_search_rf.best_estimator_.score(tree_X_train, y_train)

print("Best hyperparameters found:")
for key, value in grid_search_rf.best_params_.items():
    print(f"{key}: {value}")

print(f"Best accuracy: {grid_search_rf.best_score_}")
"""

[0.85736891 0.85776449 0.85530306 0.85859962 0.85327473]


'\n# Expanded parameter grid for GridSearchCV\nparam_grid = {\n    \'n_estimators\': [10, 50, 100, 200, 300],\n    \'criterion\': [\'gini\', \'entropy\'],\n    \'max_depth\': [None, 5, 10, 15, 20, 25, 30, 35, 40],\n    \'min_samples_split\': [2, 5, 10, 15, 20],\n    \'min_samples_leaf\': [1, 2, 4, 6, 8, 10],\n    \'min_weight_fraction_leaf\': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\n    \'max_features\': [None, \'sqrt\', \'log2\'],\n    \'max_leaf_nodes\': [None, 10, 20, 30, 40, 50],\n    \'min_impurity_decrease\': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\n    \'bootstrap\': [True, False],\n    \'class_weight\': [None, \'balanced\'],\n    \'warm_start\': [False, True],\n    \'oob_score\': [False, True]\n}\n\ngrid_search_rf = GridSearchCV(rf_model, param_grid, scoring=\'accuracy\', cv=5, n_jobs=-1, verbose=1)\ngrid_search_rf.fit(tree_X_train, y_train)\ntest_score_rf = grid_search_rf.best_estimator_.score(tree_X_train, y_train)\n\nprint("Best hyperparameters found:")\nfor key, value in grid_search_rf.bes

## Hyperparameter Tuning - Gradient Boosted Classifier

Gradient Boosting is an ensemble learning method that builds an additive model in a forward stage-wise fashion. It allows for the optimization of arbitrary differentiable loss functions. In each stage, a regression tree is fit on the negative gradient of the given loss function.

Relevant Parameters:
- **loss**: The loss function to be optimized. For classification, supported options are 'deviance' (default) for the exponential loss and 'exponential' for AdaBoost-like exponential loss.
- **learning_rate**: The learning rate shrinks the contribution of each tree by learning_rate. There is a trade-off between learning_rate and n_estimators.
- **n_estimators**: The number of boosting stages to perform. Gradient boosting is fairly robust to overfitting, so a large number of estimators usually results in better performance.
- **subsample**: The fraction of samples to be used for fitting the individual base learners. If smaller than 1.0, this results in Stochastic Gradient Boosting. subsample interacts with the parameter n_estimators. Choosing subsample < 1.0 leads to a reduction of variance and an increase in bias.
- **criterion**: The function to measure the quality of a split. Supported criteria are 'friedman_mse' (default) for the mean squared error with improvement score by Friedman, 'mse' for mean squared error, and 'mae' for the mean absolute error.
- **min_samples_split**: The minimum number of samples required to split an internal node. A larger value prevents the tree from growing too deep, thus preventing overfitting.
- **min_samples_leaf**: The minimum number of samples required to be at a leaf node. A larger value prevents the tree from growing too deep, thus preventing overfitting.
- **min_weight_fraction_leaf**: The minimum weighted fraction of the sum total of weights required to be at a leaf node. Samples have equal weight when sample_weight is not provided.
- **max_depth**: The maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree. Tune this parameter for best performance; the best value depends on the interaction of the input variables.
- **min_impurity_decrease**: A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
- **max_features**: The number of features to consider when looking for the best split. If None, then max_features=n_features. It can also be a float, int, or string ('auto', 'sqrt', or 'log2').

By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance.

In [23]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

# Gradient Boosting with default parameters
gb_model = GradientBoostingClassifier()
gb_scores = cross_val_score(gb_model, tree_X_train, y_train, cv=5, scoring='accuracy')
print(gb_scores)

"""
# Expanded parameter grid for GridSearchCV
param_grid = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'n_estimators': [10, 50, 100, 200, 300],
    'subsample': [0.5, 0.8, 1.0],
    'criterion': ['friedman_mse', 'mse', 'mae'],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'max_depth': [None, 5, 10, 15, 20, 25, 30, 35, 40],
    'min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 10, 20, 30, 40, 50],
    'n_iter_no_change': [None, 5, 10, 15],
    'validation_fraction': [0.1, 0.2, 0.3, 0.4, 0.5],
    'tol': [1e-4, 1e-3, 1e-2, 1e-1]
}

grid_search_gb = GridSearchCV(gb_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)
grid_search_gb.fit(tree_X_train, y_train)
test_score_gb = grid_search_gb.best_estimator_.score(tree_X_train, y_train)

print("Best hyperparameters found:")
for key, value in grid_search_gb.best_params_.items():
    print(f"{key}: {value}")

print(f"Best accuracy: {grid_search_gb.best_score_}")
"""

[0.85117138 0.85033625 0.85020439 0.85266582 0.84606593]


'\n# Expanded parameter grid for GridSearchCV\nparam_grid = {\n    \'loss\': [\'deviance\', \'exponential\'],\n    \'learning_rate\': [0.01, 0.1, 0.2, 0.3],\n    \'n_estimators\': [10, 50, 100, 200, 300],\n    \'subsample\': [0.5, 0.8, 1.0],\n    \'criterion\': [\'friedman_mse\', \'mse\', \'mae\'],\n    \'min_samples_split\': [2, 5, 10, 15, 20],\n    \'min_samples_leaf\': [1, 2, 4, 6, 8, 10],\n    \'min_weight_fraction_leaf\': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\n    \'max_depth\': [None, 5, 10, 15, 20, 25, 30, 35, 40],\n    \'min_impurity_decrease\': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\n    \'max_features\': [None, \'sqrt\', \'log2\'],\n    \'max_leaf_nodes\': [None, 10, 20, 30, 40, 50],\n    \'n_iter_no_change\': [None, 5, 10, 15],\n    \'validation_fraction\': [0.1, 0.2, 0.3, 0.4, 0.5],\n    \'tol\': [1e-4, 1e-3, 1e-2, 1e-1]\n}\n\ngrid_search_gb = GridSearchCV(gb_model, param_grid, scoring=\'accuracy\', cv=5, n_jobs=-1, verbose=1)\ngrid_search_gb.fit(tree_X_train, y_train)\ntest_score_gb = 

## Hyperparameter Tuning - XGBoost

XGBoost (eXtreme Gradient Boosting) is an optimized distributed gradient boosting library designed to be highly efficient, flexible, and portable. It implements machine learning algorithms under the Gradient Boosting framework, offering several regularization techniques to prevent overfitting.

Relevant Parameters:
- **learning_rate**: Boosting learning rate. Controls the contribution of each tree in the ensemble. Lower learning rates lead to more robust models but require more trees (n_estimators).
- **n_estimators**: Number of boosting rounds to be run. Larger values result in more complex models but can increase the risk of overfitting.
- **max_depth**: Maximum tree depth for base learners. Controls the depth of each individual tree in the ensemble. Deeper trees can capture more complex patterns, but may also overfit the data.
- **min_child_weight**: Minimum sum of instance weight (hessian) needed in a child. Defines the minimum number of instances required for a node to be split.
- **gamma**: Minimum loss reduction required to make a further partition on a leaf node of the tree. Controls the complexity of the tree by reducing the number of splits made.
- **subsample**: Subsample ratio of the training instances. Setting it to a value less than 1.0 can help prevent overfitting.
- **colsample_bytree**: Subsample ratio of columns when constructing each tree. A smaller value can reduce overfitting and speed up the training process.
- **colsample_bylevel**: Subsample ratio of columns for each level. Specifies the fraction of features to choose for each level in the tree building process.
- **colsample_bynode**: Subsample ratio of columns for each split. Specifies the fraction of features to choose for each split in the tree building process.
- **reg_alpha**: L1 regularization term on weights. Controls the sparsity of feature weights, effectively performing feature selection.
- **reg_lambda**: L2 regularization term on weights. Smoothens the weights, preventing extreme values and reducing the risk of overfitting.

By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance.

In [24]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score, GridSearchCV

# XGBoost with default parameters
xgb_model = xgb.XGBClassifier(use_label_encoder=False)
xgb_scores = cross_val_score(xgb_model, tree_X_train.values, y_train.values, cv=5, scoring='accuracy')
print(xgb_scores)
"""
# Expanded parameter grid for GridSearchCV
param_grid = {
    'max_depth': [3, 6, 9, 12],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'n_estimators': [10, 50, 100, 200, 300],
    'booster': ['gbtree', 'gblinear', 'dart'],
    'min_child_weight': [1, 5, 10],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0],
    'colsample_bylevel': [0.5, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.2, 0.3, 0.4],
    'reg_lambda': [1, 2, 3, 4],
    'scale_pos_weight': [1, 2, 3],
    'max_delta_step': [0, 1, 2, 3, 4],
    'base_score': [0.5, 0.6, 0.7, 0.8, 0.9],
    'random_state': [0, 1, 2, 3]
}

grid_search_xgb = GridSearchCV(xgb_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)
grid_search_xgb.fit(tree_X_train, y_train)
test_score_xgb = grid_search_xgb.best_estimator_.score(tree_X_train, y_train)

print("Best hyperparameters found:")
for key, value in grid_search_xgb.best_params_.items():
    print(f"{key}: {value}")

print(f"Best accuracy: {grid_search_xgb.best_score_}")
"""



[0.86031383 0.85816008 0.86079733 0.86154455 0.85745055]


'\n# Expanded parameter grid for GridSearchCV\nparam_grid = {\n    \'max_depth\': [3, 6, 9, 12],\n    \'learning_rate\': [0.01, 0.1, 0.2, 0.3],\n    \'n_estimators\': [10, 50, 100, 200, 300],\n    \'booster\': [\'gbtree\', \'gblinear\', \'dart\'],\n    \'min_child_weight\': [1, 5, 10],\n    \'gamma\': [0, 0.1, 0.2, 0.3, 0.4],\n    \'subsample\': [0.5, 0.8, 1.0],\n    \'colsample_bytree\': [0.5, 0.8, 1.0],\n    \'colsample_bylevel\': [0.5, 0.8, 1.0],\n    \'reg_alpha\': [0, 0.1, 0.2, 0.3, 0.4],\n    \'reg_lambda\': [1, 2, 3, 4],\n    \'scale_pos_weight\': [1, 2, 3],\n    \'max_delta_step\': [0, 1, 2, 3, 4],\n    \'base_score\': [0.5, 0.6, 0.7, 0.8, 0.9],\n    \'random_state\': [0, 1, 2, 3]\n}\n\ngrid_search_xgb = GridSearchCV(xgb_model, param_grid, scoring=\'accuracy\', cv=5, n_jobs=-1, verbose=1)\ngrid_search_xgb.fit(tree_X_train, y_train)\ntest_score_xgb = grid_search_xgb.best_estimator_.score(tree_X_train, y_train)\n\nprint("Best hyperparameters found:")\nfor key, value in grid_searc

In [25]:
#pipeline for KNN & ANN - Same as lr but wihout outlier removal
knn_X_train = lr_transformed.copy()

k-Nearest Neighbors (KNN) is a simple, yet powerful, non-parametric supervised learning algorithm used for classification and regression. It assigns a new instance to the majority class or computes the mean (for regression tasks) of its k nearest neighbors in the feature space.

Relevant Parameters:
- **n_neighbors**: Number of neighbors to use for the query. This is the main hyperparameter controlling the complexity of the KNN model. Larger values of k lead to smoother decision boundaries, while smaller values can capture more complex patterns but may overfit the data.
- **weights**: Weight function used in prediction. There are two options: 'uniform' (all points in each neighborhood are weighted equally) and 'distance' (assign weights proportional to the inverse of the distance from the query point). Using 'distance' can help reduce the impact of noise in the data.
- **algorithm**: Algorithm used to compute the nearest neighbors. Options include 'auto', 'ball_tree', 'kd_tree', and 'brute'. 'auto' will attempt to decide the most appropriate algorithm based on the values passed to fit() method. Choose the algorithm that best suits your data and computational requirements.
- **leaf_size**: Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.
- **p**: Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and for p = 2, it's equivalent to using euclidean_distance (l2). A larger value of p can help capture the specific geometry of your feature space.

By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance.

In [26]:
# KNN Code
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier()
knn_scores = cross_val_score(knn_model, knn_X_train, y_train, cv=5, scoring='accuracy')
print(knn_scores)

"""
# Expanded parameter grid for GridSearchCV
param_grid = {
    'n_neighbors': list(range(1, 31)),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': list(range(1, 50)),
    'p': [1, 2],
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski', 'wminkowski', 'seuclidean', 'mahalanobis']
}

grid_search_knn = GridSearchCV(knn_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)
grid_search_knn.fit(knn_X_train, y_train)
test_score_knn = grid_search_knn.best_estimator_.score(knn_X_train, y_train)

print("Best hyperparameters found:")
for key, value in grid_search_knn.best_params_.items():
    print(f"{key}: {value}")

print(f"Best accuracy: {grid_search_knn.best_score_}")
"""

[0.84079821 0.83477649 0.83895213 0.83890818 0.83573626]


'\n# Expanded parameter grid for GridSearchCV\nparam_grid = {\n    \'n_neighbors\': list(range(1, 31)),\n    \'weights\': [\'uniform\', \'distance\'],\n    \'algorithm\': [\'auto\', \'ball_tree\', \'kd_tree\', \'brute\'],\n    \'leaf_size\': list(range(1, 50)),\n    \'p\': [1, 2],\n    \'metric\': [\'euclidean\', \'manhattan\', \'chebyshev\', \'minkowski\', \'wminkowski\', \'seuclidean\', \'mahalanobis\']\n}\n\ngrid_search_knn = GridSearchCV(knn_model, param_grid, scoring=\'accuracy\', cv=5, n_jobs=-1, verbose=1)\ngrid_search_knn.fit(knn_X_train, y_train)\ntest_score_knn = grid_search_knn.best_estimator_.score(knn_X_train, y_train)\n\nprint("Best hyperparameters found:")\nfor key, value in grid_search_knn.best_params_.items():\n    print(f"{key}: {value}")\n\nprint(f"Best accuracy: {grid_search_knn.best_score_}")\n'

## Hyperparameter Tuning - MLPClassifier

Multi-layer Perceptron (MLP) is a class of feedforward artificial neural network that can be used for classification and regression tasks. It consists of multiple layers of nodes, where each layer is fully connected to the next one. MLPClassifier is a popular implementation in scikit-learn for solving classification problems.

Relevant Parameters:
- **hidden_layer_sizes**: A tuple representing the number of neurons in each hidden layer. By adjusting this parameter, you can control the complexity of the model. Adding more hidden layers and neurons can increase the capacity of the model to learn complex patterns but may also lead to overfitting.
- **activation**: Activation function for the hidden layers. Options include 'identity', 'logistic' (sigmoid), 'tanh', and 'relu'. Different activation functions can lead to different model behaviors and convergence properties.
- **solver**: The solver for weight optimization. Choices are 'lbfgs', 'sgd', and 'adam'. Each solver has its own benefits and drawbacks, so it's essential to choose the one that best suits your problem and dataset.
- **alpha**: L2 penalty (regularization term) parameter. It's used to control the trade-off between fitting the data and keeping the weights small, which helps prevent overfitting.
- **batch_size**: The size of mini-batches for stochastic optimizers. If the solver is 'lbfgs', the classifier will not use mini-batch. For 'sgd' and 'adam', using smaller batch sizes can provide a regularizing effect but may increase the time required for convergence.
- **learning_rate**: Learning rate schedule for weight updates. Options are 'constant', 'invscaling', and 'adaptive'. The learning rate determines how quickly the model adapts to the data, with larger values leading to faster convergence but potentially oscillating around the optimum.
- **max_iter**: Maximum number of iterations. The solver iterates until convergence or this number of iterations is reached. Increasing this value allows the model more time to converge but may increase the computation time.

By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance.

In [27]:
# ANN Code
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

# MLP with default parameters
mlp_model = MLPClassifier(random_state=42)
mlp_scores = cross_val_score(mlp_model, knn_X_train, y_train, cv=5, scoring='accuracy')
print(mlp_scores)

"""
# Parameter grid for GridSearchCV
param_grid = {
    'hidden_layer_sizes': [(10,), (20,), (50,), (10, 10), (20, 20), (50, 50)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'max_iter': [200, 500, 1000],
}

grid_search_mlp = GridSearchCV(mlp_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)
grid_search_mlp.fit(knn_X_train, y_train)
test_score_mlp = grid_search_mlp.best_estimator_.score(knn_X_train, y_train)

print("Best hyperparameters found:")
for key, value in grid_search_mlp.best_params_.items():
    print(f"{key}: {value}")

print(f"Best accuracy: {grid_search_mlp.best_score_}")
"""



[0.8428201  0.84013889 0.84611665 0.83816096 0.84391209]




'\n# Parameter grid for GridSearchCV\nparam_grid = {\n    \'hidden_layer_sizes\': [(10,), (20,), (50,), (10, 10), (20, 20), (50, 50)],\n    \'activation\': [\'identity\', \'logistic\', \'tanh\', \'relu\'],\n    \'solver\': [\'lbfgs\', \'sgd\', \'adam\'],\n    \'alpha\': [0.0001, 0.001, 0.01, 0.1],\n    \'learning_rate\': [\'constant\', \'invscaling\', \'adaptive\'],\n    \'max_iter\': [200, 500, 1000],\n}\n\ngrid_search_mlp = GridSearchCV(mlp_model, param_grid, scoring=\'accuracy\', cv=5, n_jobs=-1, verbose=1)\ngrid_search_mlp.fit(knn_X_train, y_train)\ntest_score_mlp = grid_search_mlp.best_estimator_.score(knn_X_train, y_train)\n\nprint("Best hyperparameters found:")\nfor key, value in grid_search_mlp.best_params_.items():\n    print(f"{key}: {value}")\n\nprint(f"Best accuracy: {grid_search_mlp.best_score_}")\n'

In [28]:
X_test_Nb = nb_preprocessor.transform(X_test).values
X_test_LR = lr_preprocessor.transform(X_test).values

In [30]:
nb_model.fit(nb_X_train, nb_y_train)
lr_model.fit(lr_X_train, lr_y_train)
svm_model.fit(lr_X_train, lr_y_train)
dt_model.fit(tree_X_train, y_train)
rf_model.fit(tree_X_train, y_train)
gb_model.fit(tree_X_train, y_train)
xgb_model.fit(tree_X_train.values, y_train.values)
knn_model.fit(knn_X_train, y_train)
mlp_model.fit(knn_X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)


In [32]:
from sklearn.metrics import accuracy_score

# Naive Bayes
y_pred_nb = nb_model.predict(X_test_Nb)
acc_nb = accuracy_score(y_test, y_pred_nb)
print(f"Naive Bayes Accuracy: {acc_nb}")

# Logistic Regression
y_pred_lr = lr_model.predict(X_test_LR)
acc_lr = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {acc_lr}")

# SVM Classification
y_pred_svm = svm_model.predict(X_test_LR)
acc_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Classification Accuracy: {acc_svm}")

# Decision Tree
y_pred_dt = dt_model.predict(X_test_Nb)
acc_dt = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {acc_dt}")

# Random Forest
y_pred_rf = rf_model.predict(X_test_Nb)
acc_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {acc_rf}")

# Gradient Boosted Classifier
y_pred_gb = gb_model.predict(X_test_Nb)
acc_gb = accuracy_score(y_test, y_pred_gb)
print(f"Gradient Boosted Classifier Accuracy: {acc_gb}")

# XGBoost
y_pred_xgb = xgb_model.predict(X_test_Nb)
acc_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {acc_xgb}")

# KNN
y_pred_knn = knn_model.predict(X_test_LR)
acc_knn = accuracy_score(y_test, y_pred_knn)
print(f"KNN Accuracy: {acc_knn}")

# ANN
y_pred_ann = mlp_model.predict(X_test_LR)
acc_ann = accuracy_score(y_test, y_pred_ann)
print(f"ANN Accuracy: {acc_ann}")



Naive Bayes Accuracy: 0.6615211505327192
Logistic Regression Accuracy: 0.8455641900207461
SVM Classification Accuracy: 0.8542494461830584
Decision Tree Accuracy: 0.7893034213579943




Random Forest Accuracy: 0.8554801504975562
Gradient Boosted Classifier Accuracy: 0.8475684799043567
XGBoost Accuracy: 0.8601919898730617




KNN Accuracy: 0.8368789338584338
ANN Accuracy: 0.8481662505713984


