In [3]:
from multiprocess import Pool
#from functools import partial
import numpy as np
#from numba import jit

In [4]:
#TODO: loss of least square regression and binary logistic regression
'''
    pred() takes GBDT/RF outputs, i.e., the "score", as its inputs, and returns predictions.
    g() is the gradient/1st order derivative, which takes true values "true" and scores as input, and returns gradient.
    h() is the heassian/2nd order derivative, which takes true values "true" and scores as input, and returns hessian.
'''
class leastsquare(object):
    '''Loss class for mse. As for mse, pred function is pred=score.'''
    def pred(self,score):
        return score

    def g(self,true,score):
        return 2 * (score - true)

    def h(self,true,score):
        return 2 * np.ones(score.shape[0])

class logistic(object):
    '''Loss class for log loss. As for log loss, pred function is logistic transformation.'''
    def pred(self,score):
        score = 1 / (1 + np.exp(-score))
        return score

    def g(self,true,score):
        return self.pred(score) - true

    def h(self,true,score):
        sigmoid = self.pred(score)
        return sigmoid * (1 - sigmoid)

In [2]:
# TODO: class of a node on a tree
class TreeNode(object):
    '''
    Data structure that are used for storing a node on a tree.
    
    A tree is presented by a set of nested TreeNodes,
    with one TreeNode pointing two child TreeNodes,
    until a tree leaf is reached.
    
    A node on a tree can be either a leaf node or a non-leaf node.
    '''
    
    #TODO
    #def __init__(self, X, y, depth):
    def __init__(
        self, 
        split_feature = None, 
        split_threshold = None, 
        depth = None,
        weight = None,
        left_child = None, 
        right_child = None):

        # store essential information in every tree node
        self.split_feature = split_feature 
        self.split_threshold = split_threshold
        self.left_child = left_child
        self.right_child = right_child
        self.depth = depth
        self.weight = weight
        
        self.is_leaf = (left_child is None) and (right_child is None)
    

In [39]:
# TODO: class of single tree
class Tree(object):
    '''
    Class of a single decision tree in GBDT

    Parameters:
        n_threads: The number of threads used for fitting and predicting.
        max_depth: The maximum depth of the tree.
        min_sample_split: The minimum number of samples required to further split a node.
        lamda: The regularization coefficient for leaf prediction, also known as lambda.
        gamma: The regularization coefficient for number of TreeNode, also know as gamma.
        rf: rf*m is the size of random subset of features, from which we select the best decision rule,
            rf = 0 means we are training a GBDT.
    '''
    
    def __init__(self, n_threads = None, 
                 max_depth = 3, min_sample_split = 10,
                 lamda = 1, gamma = 0, rf = 0):
        self.n_threads = n_threads
        self.max_depth = max_depth
        self.min_sample_split = min_sample_split
        self.lamda = lamda
        self.gamma = gamma
        self.rf = rf
        self.int_member = 0

    def fit(self, train, g, h):
        '''
        train is the training data matrix, and must be numpy array (an n_train x m matrix).
        g and h are gradient and hessian respectively.
        '''
        #TODO
        self.tree = self.construct_tree(train, g, h, self.max_depth)
        return self

    def predict(self,test):
        '''
        test is the test data matrix, and must be numpy arrays (an n_test x m matrix).
        Return predictions (scores) as an array.
        '''
        #TODO
        result = []
        n, m = test.shape
        for i in range(n):
            sample = test[i, :]
            cur = self.tree
            while not cur.is_leaf:
                if cur.left_child is not None and sample[cur.split_feature] < cur.split_threshold:
                    cur = cur.left_child
                else:
                    cur = cur.right_child
            result.append(cur.weight)
        result = np.array(result)
        return result

    def construct_tree(self, train, g, h, max_depth):
        '''
        Tree construction, which is recursively used to grow a tree.
        First we should check if we should stop further splitting.
        
        The stopping conditions include:
            1. tree reaches max_depth $d_{max}$
            2. The number of sample points at current node is less than min_sample_split, i.e., $n_{min}$
            3. gain <= 0
        '''
        #TODO
        (n, m) = train.shape
        G, H = np.sum(g), np.sum(h)
        weight = -G / (H + self.lamda)
        if n < self.min_sample_split or max_depth == 0:
            return TreeNode(weight = weight)
        best_p, best_t, gain = 0, 0, 0

        feature, threshold, gain = self.find_best_decision_rule(train, g, h)
        if gain <= 0:
            return TreeNode(weight = weight)
        
        left = train[:, feature] <= threshold
        right = train[:, feature] > threshold
        left_child = self.construct_tree(train[left, :], g[left], h[left], max_depth - 1)
        right_child = self.construct_tree(train[right, :], g[right], h[right], max_depth - 1)
        return TreeNode(
            split_feature = feature, split_threshold = threshold, 
            left_child = left_child, right_child = right_child)

    def find_best_decision_rule(self, train, g, h):
        '''
        Return the best decision rule [feature, threshold], i.e., $(p_j, \tau_j)$ on a node j, 
        train is the training data assigned to node j
        g and h are the corresponding 1st and 2nd derivatives for each data point in train
        g and h should be vectors of the same length as the number of data points in train
        
        for each feature, we find the best threshold by find_threshold(),
        a [threshold, best_gain] list is returned for each feature.
        Then we select the feature with the largest best_gain,
        and return the best decision rule [feature, treshold] together with its gain.
        '''
        #TODO
        (n, m) = train.shape

        if self.rf != 0:
            idx = np.random.choice(np.arange(m), int(self.rf * m), replace = False)
        else:
            idx = np.arange(m)
       
        feature, threshold, best_gain = 0, 0, 0
        # multiprocessing version of selecting decision rule; can be applied to RF and GBDT
        if self.n_threads is not None and self.n_threads > 1:
            with Pool(processes = self.n_threads) as pool:
                res = pool.starmap(self.find_threshold, [(g, h, train[:, p],) for p in idx])
            #print(res)
            for p, (t, gain) in enumerate(res):
                if gain > best_gain:
                    feature, threshold, best_gain = p, t, gain
        else:
            for p in idx:
                t, gain = self.find_threshold(g, h, train[:, p])
                if gain > best_gain:
                    feature, threshold, best_gain = p, t, gain
        return feature, threshold, best_gain

    
    def find_threshold(self, g, h, train):
        '''
        Given a particular feature $p_j$,
        return the best split threshold $\tau_j$ together with the gain that is achieved.
        '''
        #TODO
        #
        n = train.shape[0]
        threshold, best_gain = 0, -1
        idx = sorted(np.arange(n), key = lambda i : train[i])
        sorted_train = train[idx]
        thresholds = set(sorted_train)
        sorted_g, sorted_h = g[idx], h[idx]
        G, H = np.sum(sorted_g), np.sum(sorted_h)
        GL, GR, HL, HR = 0, G, 0, H
        i = 0
        for t in list(thresholds)[:-1]:
            while i < len(idx) - 1 and sorted_train[i] <= t:
                GL += sorted_g[i]
                HL += sorted_h[i]
                GR -= sorted_g[i]
                HR -= sorted_h[i] 
                i += 1
            gain = 0.5 * (
                GL ** 2 / (HL + self.lamda) + \
                GR ** 2 / (HR + self.lamda) - \
                G ** 2 / (H + self.lamda)) - self.gamma
            if gain > best_gain:
                threshold = (sorted_train[i - 1] + sorted_train[i]) / 2
                #threshold = t
                best_gain = gain
        '''for t in range(len(idx) - 1):
            GL += sorted_g[t]
            HL += sorted_h[t]
            GR -= sorted_g[t]
            HR -= sorted_h[t] 
            gain = 0.5 * (
                GL ** 2 / (HL + self.lamda) + \
                GR ** 2 / (HR + self.lamda) - \
                G ** 2 / (H + self.lamda)) - self.gamma
            if gain > best_gain:
                threshold = (sorted_train[t - 1] + sorted_train[t]) / 2
                best_gain = gain'''
        return [threshold, best_gain]

In [40]:
# TODO: class of Random Forest
class RF(object):
    '''
    Class of Random Forest
    
    Parameters:
        n_threads: The number of threads used for fitting and predicting.
        loss: Loss function for gradient boosting.
            'mse' for regression task and 'log' for classfication task.
            A child class of the loss class could be passed to implement customized loss.
        max_depth: The maximum depth d_max of a tree.
        min_sample_split: The minimum number of samples required to further split a node.
        lamda: The regularization coefficient for leaf score, also known as lambda.
        gamma: The regularization coefficient for number of tree nodes, also know as gamma.
        rf: rf*m is the size of random subset of features, from which we select the best decision rule.
        num_trees: Number of trees.
    '''
    def __init__(self,
        n_threads = None, loss = 'mse',
        max_depth = 3, min_sample_split = 10, 
        lamda = 1, gamma = 0,
        rf = 0.99, num_trees = 100):
        
        self.n_threads = n_threads
        self.loss = loss
        self.max_depth = max_depth
        self.min_sample_split = min_sample_split
        self.lamda = lamda
        self.gamma = gamma
        self.rf = rf
        self.num_trees = num_trees
        self.trees = []

        self.loss = leastsquare() if loss == 'mse' else logistic()

    def fit(self, train, target):
        # train is n x m 2d numpy array
        # target is n-dim 1d array
        #TODO
        (n, m) = train.shape

        for i in range(self.num_trees):
            idx = np.random.choice(np.arange(n), n, replace = True)
            X_sample,y_sample = train[idx, :], target[idx]
            pred = np.full(y_sample.shape[0], 0)
            #pred = np.full(y_sample.shape[0], np.mean(y_sample))
            g, h = self.loss.g(y_sample, pred), self.loss.h(y_sample, pred)

            new_tree = Tree(n_threads = self.n_threads, rf = self.rf)
            new_tree.fit(X_sample, g, h)
            self.trees.append(new_tree)

        return self

    def predict(self, test):
        #TODO
        predictions = np.array([tree.predict(test) for tree in self.trees])
        score = np.mean(predictions, axis = 0)
        return self.loss.pred(score)

In [81]:
# TODO: class of GBDT
class GBDT(object):
    '''
    Class of gradient boosting decision tree (GBDT)
    
    Parameters:
        n_threads: The number of threads used for fitting and predicting.
        loss: Loss function for gradient boosting.
            'mse' for regression task and 'log' for classfication task.
            A child class of the loss class could be passed to implement customized loss.
        max_depth: The maximum depth D_max of a tree.
        min_sample_split: The minimum number of samples required to further split a node.
        lamda: The regularization coefficient for leaf score, also known as lambda.
        gamma: The regularization coefficient for number of tree nodes, also know as gamma.
        learning_rate: The learning rate eta of GBDT.
        num_trees: Number of trees.
    '''
    def __init__(self,
        n_threads = None, loss = 'mse',
        max_depth = 3, min_sample_split = 10, 
        lamda = 1, gamma = 0,
        learning_rate = 0.1, num_trees = 100):
        
        self.n_threads = n_threads
        self.loss = loss
        self.max_depth = max_depth
        self.min_sample_split = min_sample_split
        self.lamda = lamda
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.num_trees = num_trees
        self.trees = []

        self.loss = leastsquare() if loss == 'mse' else logistic()
        

    def fit(self, train, target):
        # train is n x m 2d numpy array
        # target is n-dim 1d array
        #TODO
        pred = np.full(target.shape[0], 0)
        g, h = self.learning_rate * self.loss.g(target, pred), self.learning_rate ** 2 *self.loss.h(target, pred)

        (n, m) = train.shape
        for i in range(self.num_trees):
            #idx = np.random.choice(np.arange(n), int(0.99 * n))
            #X_sample,y_sample = train[idx, :], target[idx]

            new_tree = Tree(n_threads = self.n_threads)
            new_tree.fit(train, g, h)
            #new_tree.fit(X_sample, g, h)
            self.trees.append(new_tree)

            pred = self.predict(train)
            #pred = self.predict(X_sample)
            g, h = self.learning_rate * self.loss.g(target, pred), self.learning_rate ** 2 * self.loss.h(target, pred)
            #g, h = self.learning_rate * self.loss.g(y_sample, pred), self.learning_rate ** 2 * self.loss.h(y_sample, pred)
        return self

    def predict(self, test):
        #TODO
        predictions = np.array([tree.predict(test) for tree in self.trees])
        score = np.sum(predictions, axis = 0)
        return self.loss.pred(score)

In [30]:
# TODO: Evaluation functions (you can use code from previous homeworks)

# RMSE
def root_mean_square_error(pred, y):
    #TODO
    n = pred.shape[0]
    error = pred - y
    rmse = np.sqrt(np.dot(pred - y, pred - y) / n)
    return rmse

# precision
def accuracy(pred, y):
    #TODO
    n = pred.shape[0]
    return sum(pred == y) / n

In [41]:
# TODO: GBDT regression on boston house price dataset

# load data
import numpy as np
import pandas as pd

data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]


# train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)
print(X.shape, y.shape, X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(506, 13) (506,) (354, 13) (354,) (152, 13) (152,)


In [42]:
# fit housing price with Random Forest
model = RF(num_trees = 100, )
model.fit(X_train, y_train)

pred_train = model.predict(X_train)
rmse_train = root_mean_square_error(pred_train, y_train)

pred_test = model.predict(X_test)
rmse_test = root_mean_square_error(pred_test, y_test)

print('train rmse: {} | test rmse: {}'.format(rmse_train, rmse_test))

train rmse: 3.724229667511438 | test rmse: 4.215613270781268


In [100]:
# fit housing price with GBDT
model = GBDT(num_trees = 100, learning_rate = 0.001, n_threads = 1)
model.fit(X_train, y_train)

pred_train = model.predict(X_train)
rmse_train = root_mean_square_error(pred_train, y_train)

pred_test = model.predict(X_test)
rmse_test = root_mean_square_error(pred_test, y_test)

print('train rmse: {} | test rmse: {}'.format(rmse_train, rmse_test))

train rmse: 2.153766997451514 | test rmse: 3.785467676530208


In [65]:
# comparison with closed-form linear regression
W = np.matmul(
    np.linalg.inv(np.matmul(X_train.T, X_train)),
    np.matmul(X_train.T, y_train))


pred_train = np.matmul(X_train, W)
rmse_train = root_mean_square_error(pred_train, y_train)

pred_test = np.matmul(X_test, W)
rmse_test = root_mean_square_error(pred_test, y_test)

print('train rmse: {} | test rmse: {}'.format(rmse_train, rmse_test))

train rmse: 4.820626531838223 | test rmse: 5.2092175105308245


In [23]:
# comparison with closed-form ridge regression
lmda = 0.5

W = np.matmul(
    np.linalg.inv(np.matmul(X_train.T, X_train) + lmda),
    np.matmul(X_train.T, y_train))


pred_train = np.matmul(X_train, W)
rmse_train = root_mean_square_error(pred_train, y_train)

pred_test = np.matmul(X_test, W)
rmse_test = root_mean_square_error(pred_test, y_test)

print('train rmse: {} | test rmse: {}'.format(rmse_train, rmse_test))

train rmse: 4.822434482543469 | test rmse: 5.186569984437025


In [24]:
# comparison with sklearn ridge regression
from sklearn.linear_model import Ridge

rdg = Ridge(alpha = 0.5)
rdg.fit(X_train, y_train)

pred_train = rdg.predict(X_train)
rmse_train = root_mean_square_error(pred_train, y_train)

pred_test = rdg.predict(X_test)
rmse_test = root_mean_square_error(pred_test, y_test)

print('train rmse: {} | test rmse: {}'.format(rmse_train, rmse_test))

train rmse: 4.6364947168982935 | test rmse: 4.924357692077932


In [25]:
# comparison with sklearn lasso regression
from sklearn.linear_model import Lasso

las = Lasso(alpha = 0.5)
las.fit(X_train, y_train)

pred_train = las.predict(X_train)
rmse_train = root_mean_square_error(pred_train, y_train)

pred_test = las.predict(X_test)
rmse_test = root_mean_square_error(pred_test, y_test)

print('train rmse: {} | test rmse: {}'.format(rmse_train, rmse_test))

train rmse: 4.811833896637408 | test rmse: 5.218761727804395


In [43]:
# TODO: GBDT classification on credit-g dataset

# load data
from sklearn.datasets import fetch_openml
X, y = fetch_openml('credit-g', version=1, return_X_y=True, data_home='credit/', parser='auto')
y = np.array(list(map(lambda x: 1 if x == 'good' else 0, y)))

In [44]:
# Inspect the dataset
#print(X.columns)

def unique(lst):
    res = []
    for each in lst:
        if each not in res:
            res.append(each)
    return res

for col in X.columns:
    #break
    print('{} | values: {}'.format(col, unique(X[col].tolist())))

print('y | {}'.format(y))

checking_status | values: ['<0', '0<=X<200', 'no checking', '>=200']
duration | values: [6, 48, 12, 42, 24, 36, 30, 15, 9, 10, 7, 60, 18, 45, 11, 27, 8, 54, 20, 14, 33, 21, 16, 4, 47, 13, 22, 39, 28, 5, 26, 72, 40]
credit_history | values: ['critical/other existing credit', 'existing paid', 'delayed previously', 'no credits/all paid', 'all paid']
purpose | values: ['radio/tv', 'education', 'furniture/equipment', 'new car', 'used car', 'business', 'domestic appliance', 'repairs', 'other', 'retraining']
credit_amount | values: [1169, 5951, 2096, 7882, 4870, 9055, 2835, 6948, 3059, 5234, 1295, 4308, 1567, 1199, 1403, 1282, 2424, 8072, 12579, 3430, 2134, 2647, 2241, 1804, 2069, 1374, 426, 409, 2415, 6836, 1913, 4020, 5866, 1264, 1474, 4746, 6110, 2100, 1225, 458, 2333, 1158, 6204, 6187, 6143, 1393, 2299, 1352, 7228, 2073, 5965, 1262, 3378, 2225, 783, 6468, 9566, 1961, 6229, 1391, 1537, 1953, 14421, 3181, 5190, 2171, 1007, 1819, 2394, 8133, 730, 1164, 5954, 1977, 1526, 3965, 4771, 9436, 383

In [45]:
# Preprocess the dataset
non_numeric = X.select_dtypes(exclude = 'number').columns
for col in non_numeric:
    unique = X[col].unique()
    mapping = {each : i / len(unique) - 0.5 for i, each in enumerate(unique)}
    X[col] = X[col].map(mapping)

X = X.values

# train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)
print(X.shape, y.shape, X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1000, 20) (1000,) (700, 20) (700,) (300, 20) (300,)


In [46]:
# classify breast cancer with RF
model = RF(num_trees = 100)
model.fit(X_train, y_train)

threshold = 0.5
pred_train = model.predict(X_train) > threshold
acc_train = accuracy(pred_train, y_train)

pred_test = model.predict(X_test) > threshold
acc_test = accuracy(pred_test, y_test)

print('train accuracy: {} | test accuracy: {}'.format(acc_train, acc_test))

train accuracy: 0.7771428571428571 | test accuracy: 0.7366666666666667


In [70]:
# classify breast cancer with GBDT
model = GBDT(num_trees = 80, learning_rate = 7.5e-5)
model.fit(X_train, y_train)

threshold = 0.5
pred_train = model.predict(X_train) > threshold
acc_train = accuracy(pred_train, y_train)

pred_test = model.predict(X_test) > threshold
acc_test = accuracy(pred_test, y_test)

print('train accuracy: {} | test accuracy: {}'.format(acc_train, acc_test))

train accuracy: 0.7757142857142857 | test accuracy: 0.7366666666666667


In [71]:
# Comparison with logistic regression
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=16)

logreg.fit(X_train, y_train)

pred_train = logreg.predict(X_train)
acc_train = accuracy(pred_train, y_train)

pred_test = logreg.predict(X_test)
acc_test = accuracy(pred_test, y_test)

print('train accuracy: {} | test accuracy: {}'.format(acc_train, acc_test))

train accuracy: 0.7328571428571429 | test accuracy: 0.71


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
# TODO: GBDT classification on breast cancer dataset

# load data
from sklearn import datasets
breast_cancer = datasets.load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

# train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)
print(X.shape, y.shape, X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(X_train[:3], y_train[:3])

(569, 30) (569,) (398, 30) (398,) (171, 30) (171,)
[[1.026e+01 1.658e+01 6.585e+01 3.208e+02 8.877e-02 8.066e-02 4.358e-02
  2.438e-02 1.669e-01 6.714e-02 1.144e-01 1.023e+00 9.887e-01 7.326e+00
  1.027e-02 3.084e-02 2.613e-02 1.097e-02 2.277e-02 5.890e-03 1.083e+01
  2.204e+01 7.108e+01 3.574e+02 1.461e-01 2.246e-01 1.783e-01 8.333e-02
  2.691e-01 9.479e-02]
 [1.298e+01 1.935e+01 8.452e+01 5.140e+02 9.579e-02 1.125e-01 7.107e-02
  2.950e-02 1.761e-01 6.540e-02 2.684e-01 5.664e-01 2.465e+00 2.065e+01
  5.727e-03 3.255e-02 4.393e-02 9.811e-03 2.751e-02 4.572e-03 1.442e+01
  2.195e+01 9.921e+01 6.343e+02 1.288e-01 3.253e-01 3.439e-01 9.858e-02
  3.596e-01 9.166e-02]
 [1.469e+01 1.398e+01 9.822e+01 6.561e+02 1.031e-01 1.836e-01 1.450e-01
  6.300e-02 2.086e-01 7.406e-02 5.462e-01 1.511e+00 4.795e+00 4.945e+01
  9.976e-03 5.244e-02 5.278e-02 1.580e-02 2.653e-02 5.444e-03 1.646e+01
  1.834e+01 1.141e+02 8.092e+02 1.312e-01 3.635e-01 3.219e-01 1.108e-01
  2.827e-01 9.208e-02]] [1 1 1]


In [48]:
# classify breast cancer with RF
model = RF(num_trees = 100)
model.fit(X_train, y_train)

threshold = 0.5
pred_train = model.predict(X_train) > threshold
acc_train = accuracy(pred_train, y_train)

pred_test = model.predict(X_test) > threshold
acc_test = accuracy(pred_test, y_test)

print('train accuracy: {} | test accuracy: {}'.format(acc_train, acc_test))

train accuracy: 0.9849246231155779 | test accuracy: 0.9473684210526315


In [79]:
# classify breast cancer with GBDT
model = GBDT(num_trees = 100, learning_rate = 7.5e-4)
model.fit(X_train, y_train)

threshold = 0.5
pred_train = model.predict(X_train) > threshold
acc_train = accuracy(pred_train, y_train)

pred_test = model.predict(X_test) > threshold
acc_test = accuracy(pred_test, y_test)

print('train accuracy: {} | test accuracy: {}'.format(acc_train, acc_test))

train accuracy: 0.9949748743718593 | test accuracy: 0.9590643274853801


In [77]:
# Comparison with logistic regression
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=16)

logreg.fit(X_train, y_train)

pred_train = logreg.predict(X_train)
acc_train = accuracy(pred_train, y_train)

pred_test = logreg.predict(X_test)
acc_test = accuracy(pred_test, y_test)

print('train accuracy: {} | test accuracy: {}'.format(acc_train, acc_test))

train accuracy: 0.9547738693467337 | test accuracy: 0.9473684210526315


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
