In [103]:
!pip install striprtf
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [104]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold, KFold
from sklearn.feature_selection import SelectFromModel, SelectKBest, r_regression

In [105]:
#converting rtf to text for accessing the json script easily
from striprtf.striprtf import rtf_to_text
with open('algoparams_from_ui.json.rtf', 'r') as file:
    rtf = file.read()
    text = rtf_to_text(rtf)
print(text)

{
    "session_name": "test",
    "session_description": "test",
    "design_state_data": {

      "session_info" : {
        "project_id": "1",
        "experiment_id": "kkkk-11",
        "dataset":"iris_modified.csv",
        "session_name": "test",
        "session_description": "test"
        },

      "target": {
        "prediction_type": "Regression",
        "target": "petal_width",
        "type":"regression",
        "partitioning": true
      },
      "train": {
        "policy": "Split the dataset",
        "time_variable": "sepal_length",
        "sampling_method": "No sampling(whole data)",
        "split": "Randomly",
        "k_fold": false,
        "train_ratio": 0,
        "random_seed": 0
      },
      "metrics": {
        "optomize_model_hyperparameters_for": "AUC",
        "optimize_threshold_for": "F1 Score",
        "compute_lift_at": 0,
        "cost_matrix_gain_for_true_prediction_true_result": 1,
        "cost_matrix_gain_for_true_prediction_false_result": 0,

In [106]:
import json
out_file = open("algoparams_from_ui.json", "w") #creating a json file using the text from rtf
json.dump(text, out_file, indent = 4)
json_text = open('algoparams_from_ui.json','r')
data = json.load(json_text) # loading the json file
data = json.loads(data) # json is in the form of string, therefore using loads

## 1. Read the target and type of regression to be run

In [107]:
data['design_state_data']['target']

{'prediction_type': 'Regression',
 'target': 'petal_width',
 'type': 'regression',
 'partitioning': True}

In [108]:
data['design_state_data']['target']['type']

'regression'

## 2. Read the features(which are column names in the csv) and figure out what missing imputation needs to be applied and apply that to the columns loaded in a dataframe 

In [109]:
import pandas as pd
df = pd.read_csv('iris.csv')
column_names = list(df.columns.values)
column_names

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

In [110]:
if 'feature_handling' in data['design_state_data']:  #checking if feature_handling exists 
  for key in data['design_state_data']['feature_handling']:
    #checking if the features mentioned in feature_handling are present in the column names and checking if the missing_values are handled
    if key in column_names and 'missing_values' in data['design_state_data']['feature_handling'][key]['feature_details']:
      #checking if average values need to be used for imputations and confirming that the variable type is numerical
      if data['design_state_data']['feature_handling'][key]['feature_details']['impute_with'] == 'Average of values' and data['design_state_data']['feature_handling'][key]['feature_variable_type'] == 'numerical':
        df[key].fillna(df[key].mean(), inplace = True)
      elif data['design_state_data']['feature_handling'][key]['feature_details']['impute_with'] == 'custom':
        df[key].fillna(data['design_state_data']['feature_handling'][key]['feature_details']['impute_value'], inplace = True) #for custom imputations, the missing value is imputed by the impute value
      else:
        print("No imputations specified for {}".format(key))
else:
  print('Feature handling has not been specified in the JSON file')


## 3. Compute feature reduction based on input.

In [111]:
feat_red_options = ['no reduction', 'correlation with target', 'tree-based', 'principal component analysis'] #options existing for feature reduction methods
feat_red = input()
if feat_red.lower() not in feat_red_options:
  print('Please provide a method among the options',(feat_red_options))

tree-based


In [112]:
data['design_state_data']['feature_reduction']['feature_reduction_method'] = feat_red

In [113]:
X = df.iloc[:,[0,1,2,3]]
Y = df.iloc[:,[-1]]
#Encoding labels from text to integers
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(Y)
Y_encoded = le.transform(Y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [114]:
def tree_based(X, Y_encoded, data): #returns the reduced features using RandomForestRegressor and SelectFromModel methods
  num_trees = data['design_state_data']['feature_reduction']['num_of_trees']
  depth_trees = data['design_state_data']['feature_reduction']['depth_of_trees']
  max_features = data['design_state_data']['feature_reduction']['num_of_features_to_keep']
  sel_model = SelectFromModel(RandomForestRegressor(n_estimators = int(num_trees),max_depth = int(depth_trees)), max_features = int(max_features) )
  sel_model.fit(X, Y_encoded)
  final_features = sel_model.transform(X)
  #final_features = [X.columns[(sel_model.get_support())],Y_encoded.columns[(sel_model.get_support())]]
  return final_features

def no_reduction(X): #returns the input as it is, since no reduction
  return X

def corr_with_target(X, Y_encoded, data): #returns reduced features based on pearson correlation
  num_features_to_keep = data['design_state_data']['feature_reduction']['num_of_features_to_keep']
  final_features = SelectKBest(r_regression, k=int(num_features_to_keep)).fit_transform(X, Y_encoded)
  return final_features

def PCA(X, data): #returns reduced features by performing PCA
  num_features_to_keep = data['design_state_data']['feature_reduction']['num_of_features_to_keep']
  final_features = PCA(n_components=int(num_features_to_keep)).fit(X)
  return final_features

In [115]:
if feat_red.lower() == 'tree-based':
  reduced_features = tree_based(X, Y_encoded, data)
elif feat_red.lower() == 'no reduction':
  reduced_features = no_reduction(X)
elif feat_red.lower() == 'correlation with target':
  reduced_features = corr_with_target(X, Y_encoded, data)
elif feat_red.lower() == 'principal component analysis':
  reduced_features = corr_with_target(X, data)

## 4. Parse the json and make the model objects(using sklean)  

In [116]:
#Regression and classification models seggregated
if data['design_state_data']['target']['prediction_type'] == 'Regression':
  models = {"RandomForestRegressor":  RandomForestRegressor(),
            "GBTRegressor": GradientBoostingRegressor(),
            "LinearRegression": LinearRegression(),
            "RidgeRegression": Ridge(),
            "LassoRegression": Lasso(),
            "ElasticNetRegression": ElasticNet(),
            "xg_boost": XGBRegressor(),
            "DecisionTreeRegressor": DecisionTreeRegressor()}
elif data['design_state_data']['target']['prediction_type'] == 'Classification':
  models = {"RandomForestClassifier": RandomForestClassifier(),
            "GBTClassifier": GradientBoostingClassifier(),
            "LogisticRegression": LogisticRegression(),
            "DecisionTreeClassifier": DecisionTreeClassifier(),
            "SVM": SVC(),
            "SGD": SGDClassifier(),
            "KNN": KNeighborsClassifier(),
            "extra_random_trees": ExtraTreesClassifier(),
            "neural_network": MLPClassifier()}
else:
  print('The prediction_type can be Regression or Classification only')


## 5. Run the fit and predict on each model and 6. Log to the console the standard model metrics that apply

In [117]:
hyp_params = data['design_state_data']['hyperparameters']
if (hyp_params['stratified']) == True and hyp_params['max_iterations'] < 1:
  cv = StratifiedKFold(n_splits = hyp_params['num_of_folds'], shuffle = hyp_params['shuffle_grid'], random_state = hyp_params['random_state'])
elif hyp_params['stratified'] == True and hyp_params['max_iterations'] > 1:
  cv = RepeatedStratifiedKFold(n_splits = hyp_params['num_of_folds'], n_repeats = hyp_params['max_iterations'], random_state = hyp_params['random_state'])
elif hyp_params['stratified'] == False and hyp_params['max_iterations'] > 1:
  cv = RepeatedKFold(n_splits = hyp_params['num_of_folds'], n_repeats = hyp_params['max_iterations'], random_state = hyp_params['random_state'])
else:
  cv = KFold(n_splits = hyp_params['num_of_folds'], shuffle = hyp_params['shuffle_grid'], random_state = hyp_params['random_state'])


In [240]:
def RFparams(data, model_name):
  RF_params = data['design_state_data']['algorithms'][model_name]
  parameters = {'n_estimators': [RF_params['min_trees'],RF_params['max_trees']],
                'max_depth': [RF_params['min_depth'],RF_params['max_depth']],
                'min_samples_leaf': [RF_params['min_samples_per_leaf_min_value'],RF_params['min_samples_per_leaf_max_value']]}
  return parameters

def GBTparams(data,model_name):
  GBTparams = data['design_state_data']['algorithms'][model_name]
  parameters = {}
  #parameters['n_estimators'] = GBTparams['num_of_BoostingStages']
  parameters['subsample'] = [GBTparams['min_subsample'], GBTparams['max_subsample']]
  parameters['learning_rate'] = [GBTparams['min_stepsize'], GBTparams['max_stepsize']]
  parameters['n_iter_no_change'] = [GBTparams['min_iter'], GBTparams['max_iter']]
  parameters['max_depth'] = [GBTparams['min_depth'], GBTparams['max_depth']]
  return parameters

def LRparams(data,model_name):
  LRparams = data['design_state_data']['algorithms'][model_name]
  parameters = {}
  #parameters['n_jobs'] = LRparams['parallelism']
  parameters['max_iter'] = [LRparams['min_iter'],LRparams['max_iter']]
  parameters['C'] = [LRparams['min_regparam'],LRparams['max_regparam']]
  parameters['l1_ratio'] = [LRparams['min_elasticnet'],LRparams['max_elasticnet']]
  return parameters 

def Rid_Laso_ENetparams(data,model_name):
  Rid_Lasoparams = data['design_state_data']['algorithms'][model_name]
  parameters = {}
  if type(Rid_Lasoparams['regularization_term']) == 'list':
    parameters['solver'] = Rid_Lasoparams['regularization_term']
  parameters['max_iter'] = [Rid_Lasoparams['min_iter'],Rid_Lasoparams['max_iter']]
  parameters['alpha'] = [Rid_Lasoparams['min_regparam'],Rid_Lasoparams['max_regparam']]
  if model_name == 'ElasticNet':
    parameters['l1_ratio'] = [Rid_Lasoparams['min_elasticnet'],Rid_Lasoparams['max_elasticnet']] 
  return parameters 

def DTparams(data,model_name):
  DTparams = data['design_state_data']['algorithms'][model_name]
  parameters = {}
  parameters['max_depth'] = [DTparams['min_depth'],DTparams['max_depth']]
  parameters['min_samples_leaf'] = DTparams['min_samples_per_leaf']
  '''
  if DTparams['use_best'] == True:
    parameters['splitter'] = 'best'
  if DTparams['use_random'] == True:
    parameters['splitter'] = 'random'
  '''
  return parameters

def xgboost(data,model_name):
  xgboost = data['design_state_data']['algorithms'][model_name]
  parameters = {}
  parameters['max_depth'] = xgboost['max_depth_of_tree']
  #parameters['learning_rate'] = xgboost['learningRate']
  return parameters

def SVMparams(data,model_name):
  SVMparams = data['design_state_data']['algorithms'][model_name]
  parameters = {'C': SVMparams['c_value']}
  return parameters

def SGDparams(data,model_name):
  SGDparams = data['design_state_data']['algorithms'][model_name]
  parameters = {'alpha': SGDparams['alpha_value']}
  return parameters

def KNNparams(data,model_name):
  KNNparams = data['design_state_data']['algorithms'][model_name]
  for keys in KNNparams:
    if type(KNNparams[keys]) == 'list' and len(KNNparams[keys])>1:
      parameters = {'n_neighbors': SGDparams['k_value']}
    else:
      parameters = {}
  return parameters

def ertparams(data,model_name):
  ertparams = data['design_state_data']['algorithms'][model_name]
  parameters = {}
  parameters['n_estimators'] = ertparams['num_of_trees']
  parameters['max_depth'] = ertparams['max_depth']
  parameters['min_samples_leaf'] = ertparams['min_samples_per_leaf']
  return parameters

def nnparams(data,model_name):
  nnparams = data['design_state_data']['algorithms'][model_name]
  parameters = {'hidden_layer_sizes':nnparams['hidden_layer_sizes']}
  return parameters


##RandomForestRegressor

In [142]:
model_name = 'RandomForestRegressor'
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model RandomForestRegressor() has been selected


In [143]:
if model_name == 'RandomForestRegressor' or 'RandomForestClassifier':
  model = models[model_name]
  clf = GridSearchCV(model, RFparams(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)

In [144]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=6, random_state=1),
             estimator=RandomForestRegressor(), n_jobs=5,
             param_grid={'max_depth': [20, 25], 'min_samples_leaf': [5, 10],
                         'n_estimators': [10, 20]})

In [145]:
clf.cv_results_

{'mean_fit_time': array([0.06412659, 0.12107549, 0.06878752, 0.10117151, 0.05508318,
        0.11682417, 0.04902808, 0.09367784]),
 'std_fit_time': array([0.01698023, 0.02488072, 0.01200999, 0.02801626, 0.00924941,
        0.03416579, 0.01003389, 0.02168163]),
 'mean_score_time': array([0.01348025, 0.01290162, 0.00670715, 0.00958719, 0.00522788,
        0.01117345, 0.00837322, 0.00901131]),
 'std_score_time': array([0.00874494, 0.00393404, 0.00525052, 0.00521179, 0.00483702,
        0.00521997, 0.00554527, 0.00707996]),
 'param_max_depth': masked_array(data=[20, 20, 20, 20, 25, 25, 25, 25],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[5, 5, 10, 10, 5, 5, 10, 10],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[10, 20, 10, 20, 10, 20, 10,

In [146]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is 0.9479396192622999
Best parameters are {'max_depth': 20, 'min_samples_leaf': 10, 'n_estimators': 10}


##GBTRegressor

In [150]:
model_name = 'GBTRegressor' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model GradientBoostingRegressor() has been selected


In [151]:
if model_name == 'GBTRegressor' or 'GBTClassifier':
  model = models[model_name]
  clf = GridSearchCV(model, GBTparams(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)

In [152]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new)

96 fits failed out of a total of 192.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/ensemble/_gb.py", line 525, in fit
    self._check_params()
  File "/usr/local/lib/python3.8/dist-packages/sklearn/ensemble/_gb.py", line 317, in _check_params
    raise ValueError("subsample must be in (0,1] but was %r" % self.subsample)
ValueError: subsample must be in (0,1] but was 2

 0.94423382        nan 0.9395258         nan 0.93501328        nan
 0.92773

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=6, random_state=1),
             estimator=GradientBoostingRegressor(), n_jobs=5,
             param_grid={'learning_rate': [0.1, 0.5], 'max_depth': [5, 7],
                         'n_iter_no_change': [20, 40], 'subsample': [1, 2]})

In [153]:
clf.cv_results_

{'mean_fit_time': array([0.08487833, 0.00174675, 0.1062727 , 0.00283136, 0.10243229,
        0.00317464, 0.09528997, 0.00285594, 0.04088541, 0.00419698,
        0.07005858, 0.00364143, 0.04781775, 0.00355287, 0.0672097 ,
        0.00117381]),
 'std_fit_time': array([0.03187752, 0.0018994 , 0.02289673, 0.00409155, 0.0398629 ,
        0.00321823, 0.02813023, 0.00296949, 0.01384878, 0.00540399,
        0.01076958, 0.00602436, 0.01228296, 0.00505846, 0.01879549,
        0.00011501]),
 'mean_score_time': array([0.00158546, 0.        , 0.00292885, 0.        , 0.00334648,
        0.        , 0.00591292, 0.        , 0.00174999, 0.        ,
        0.0039466 , 0.        , 0.00139827, 0.        , 0.00330279,
        0.        ]),
 'std_score_time': array([0.00202356, 0.        , 0.00380436, 0.        , 0.00435788,
        0.        , 0.00508542, 0.        , 0.00173979, 0.        ,
        0.00543941, 0.        , 0.00142336, 0.        , 0.00410244,
        0.        ]),
 'param_learning_rate': ma

In [154]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is 0.9442338216186107
Best parameters are {'learning_rate': 0.1, 'max_depth': 7, 'n_iter_no_change': 40, 'subsample': 1}


##LinearRegression

In [162]:
model_name = 'LinearRegression' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model LinearRegression() has been selected


In [164]:
if model_name == 'LinearRegression':
  model = models[model_name]
  LRprmts = data['design_state_data']['algorithms'][model_name]
  params = {'n_jobs': [LRprmts['parallelism'],hyp_params['parallelism']]}
  clf = GridSearchCV(model, params, cv = cv)

In [165]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=6, random_state=1),
             estimator=LinearRegression(), param_grid={'n_jobs': [2, 5]})

In [166]:
clf.cv_results_

{'mean_fit_time': array([0.00288409, 0.00059454]),
 'std_fit_time': array([7.47127378e-03, 8.13884540e-05]),
 'mean_score_time': array([0.00057358, 0.00035894]),
 'std_score_time': array([7.41447411e-04, 3.87285890e-05]),
 'param_n_jobs': masked_array(data=[2, 5],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_jobs': 2}, {'n_jobs': 5}],
 'split0_test_score': array([0.94835887, 0.94835887]),
 'split1_test_score': array([0.92509356, 0.92509356]),
 'split2_test_score': array([0.93433085, 0.93433085]),
 'split3_test_score': array([0.89722349, 0.89722349]),
 'split4_test_score': array([0.93323749, 0.93323749]),
 'split5_test_score': array([0.90589291, 0.90589291]),
 'split6_test_score': array([0.95007922, 0.95007922]),
 'split7_test_score': array([0.92092985, 0.92092985]),
 'split8_test_score': array([0.9235643, 0.9235643]),
 'split9_test_score': array([0.94013407, 0.94013407]),
 'split10_test_score': array([0.90739558, 0.90739558]),
 

In [167]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is 0.9237978929792084
Best parameters are {'n_jobs': 2}


##Ridge Regression

In [168]:
model_name = 'RidgeRegression' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model Ridge() has been selected


In [170]:
if model_name == 'RidgeRegression' or 'LassoRegression' or 'ElasticNetRegression':
  model = models[model_name]
  clf = GridSearchCV(model, Rid_Laso_ENetparams(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)


In [171]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=6, random_state=1),
             estimator=Ridge(), n_jobs=5,
             param_grid={'alpha': [0.5, 0.8], 'max_iter': [30, 50]})

In [172]:
clf.cv_results_

{'mean_fit_time': array([0.0020563 , 0.00253419, 0.00478288, 0.00116547]),
 'std_fit_time': array([0.00207605, 0.00341435, 0.00626494, 0.0001409 ]),
 'mean_score_time': array([0.00071804, 0.00298045, 0.00179402, 0.00350247]),
 'std_score_time': array([0.00032462, 0.00380174, 0.00280238, 0.00503971]),
 'param_alpha': masked_array(data=[0.5, 0.5, 0.8, 0.8],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_iter': masked_array(data=[30, 50, 30, 50],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'alpha': 0.5, 'max_iter': 30},
  {'alpha': 0.5, 'max_iter': 50},
  {'alpha': 0.8, 'max_iter': 30},
  {'alpha': 0.8, 'max_iter': 50}],
 'split0_test_score': array([0.94688091, 0.94688091, 0.94601677, 0.94601677]),
 'split1_test_score': array([0.9259236 , 0.9259236 , 0.92622704, 0.92622704]),
 'split2_test_score': array([0.9328643 , 0.9328643 , 0.93204962, 0.93204962]),
 

In [173]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is 0.9237860057774075
Best parameters are {'alpha': 0.5, 'max_iter': 30}


##Lasso

In [174]:
model_name = 'LassoRegression' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model Lasso() has been selected


In [176]:
if model_name == 'RidgeRegression' or 'LassoRegression' or 'ElasticNetRegression':
  model = models[model_name]
  clf = GridSearchCV(model, Rid_Laso_ENetparams(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)

In [177]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=6, random_state=1),
             estimator=Lasso(), n_jobs=5,
             param_grid={'alpha': [0.5, 0.8], 'max_iter': [30, 50]})

In [178]:
clf.cv_results_

{'mean_fit_time': array([0.00459117, 0.0011586 , 0.00213784, 0.00129463]),
 'std_fit_time': array([0.00478719, 0.00029212, 0.00210917, 0.00023525]),
 'mean_score_time': array([0.0010099 , 0.00063829, 0.00070111, 0.00072865]),
 'std_score_time': array([7.63222087e-04, 9.75486945e-05, 9.38234583e-05, 1.63932520e-04]),
 'param_alpha': masked_array(data=[0.5, 0.5, 0.8, 0.8],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_iter': masked_array(data=[30, 50, 30, 50],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'alpha': 0.5, 'max_iter': 30},
  {'alpha': 0.5, 'max_iter': 50},
  {'alpha': 0.8, 'max_iter': 30},
  {'alpha': 0.8, 'max_iter': 50}],
 'split0_test_score': array([0.77525223, 0.77525223, 0.58337983, 0.58337983]),
 'split1_test_score': array([0.77006375, 0.77006375, 0.57725948, 0.57725948]),
 'split2_test_score': array([0.78452998, 0.78452998, 0.59856807,

In [179]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is 0.7787163441585195
Best parameters are {'alpha': 0.5, 'max_iter': 30}


##Elastic

In [180]:
model_name = 'ElasticNetRegression' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model ElasticNet() has been selected


In [181]:
if model_name == 'RidgeRegression' or 'LassoRegression' or 'ElasticNetRegression':
  model = models[model_name]
  clf = GridSearchCV(model, Rid_Laso_ENetparams(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)

In [182]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=6, random_state=1),
             estimator=ElasticNet(), n_jobs=5,
             param_grid={'alpha': [0.5, 0.8], 'max_iter': [30, 50]})

In [183]:
clf.cv_results_

{'mean_fit_time': array([0.00224199, 0.00371265, 0.00195007, 0.00120366]),
 'std_fit_time': array([0.00204255, 0.0029561 , 0.0017252 , 0.00016654]),
 'mean_score_time': array([0.00320995, 0.00142572, 0.00147825, 0.00082115]),
 'std_score_time': array([0.0043803 , 0.00251924, 0.00185376, 0.00042943]),
 'param_alpha': masked_array(data=[0.5, 0.5, 0.8, 0.8],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_iter': masked_array(data=[30, 50, 30, 50],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'alpha': 0.5, 'max_iter': 30},
  {'alpha': 0.5, 'max_iter': 50},
  {'alpha': 0.8, 'max_iter': 30},
  {'alpha': 0.8, 'max_iter': 50}],
 'split0_test_score': array([0.84491795, 0.84491795, 0.77030829, 0.77030829]),
 'split1_test_score': array([0.84262701, 0.84262701, 0.76517238, 0.76517238]),
 'split2_test_score': array([0.84956123, 0.84956123, 0.77935063, 0.77935063]),
 

In [184]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is 0.846074517921123
Best parameters are {'alpha': 0.5, 'max_iter': 30}


##xgboost

In [185]:
model_name = 'xg_boost' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model XGBRegressor() has been selected


In [187]:
if model_name == 'xg_boost':
  xgbparams = data['design_state_data']['algorithms'][model_name]
  early_stopping_rounds = xgbparams['early_stopping_rounds']
  reg_alpha = xgbparams['l1_regularization']
  reg_lambda = xgbparams['l2_regularization']
  gamma = xgbparams['gamma']
  min_child_weight = xgbparams['min_child_weight']
  subsample = xgbparams['sub_sample']
  colsample_bytree = xgbparams['col_sample_by_tree']
  model = XGBRegressor(early_stopping_rounds = early_stopping_rounds, reg_alpha = reg_alpha, reg_lambda = reg_lambda, gamma = gamma, 
                min_child_weight = min_child_weight, subsample = subsample, colsample_bytree = colsample_bytree)
  clf = GridSearchCV(model, xgboost(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)

In [188]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new) ##error due to inputs provided in json

24 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/xgboost/sklearn.py", line 391, in fit
    self._Booster = train(params, trainDmatrix,
  File "/usr/local/lib/python3.8/dist-packages/xgboost/training.py", line 212, in train
    return _train_internal(params, dtrain,
  File "/usr/local/lib/python3.8/dist-packages/xgboost/training.py", line 74, in _train_internal
    bst.update(dtrain, i, obj)
  File "/usr/local/lib/pytho



XGBoostError: ignored

In [None]:
clf.cv_results_

In [None]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

##DTRegressor

In [189]:
model_name = 'DecisionTreeRegressor' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model DecisionTreeRegressor() has been selected


In [191]:
if model_name == 'DecisionTreeRegressor' or 'DecisionTreeClassifier':
  DTprmts = data['design_state_data']['algorithms'][model_name]
  if DTprmts['use_best'] == True:
    splitter = 'best'
  if DTprmts['use_random'] == True:
    splitter = 'random'

  model = DecisionTreeRegressor(splitter = splitter)
  clf = GridSearchCV(model, DTparams(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)

In [192]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=6, random_state=1),
             estimator=DecisionTreeRegressor(splitter='random'), n_jobs=5,
             param_grid={'max_depth': [4, 7], 'min_samples_leaf': [12, 6]})

In [193]:
clf.cv_results_

{'mean_fit_time': array([0.0016259 , 0.00094336, 0.00289486, 0.00273496]),
 'std_fit_time': array([0.00152069, 0.00057002, 0.00367691, 0.00380048]),
 'mean_score_time': array([0.0027253 , 0.00112206, 0.00089594, 0.00266995]),
 'std_score_time': array([0.00384529, 0.00117234, 0.00050219, 0.00368293]),
 'param_max_depth': masked_array(data=[4, 4, 7, 7],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[12, 6, 12, 6],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 4, 'min_samples_leaf': 12},
  {'max_depth': 4, 'min_samples_leaf': 6},
  {'max_depth': 7, 'min_samples_leaf': 12},
  {'max_depth': 7, 'min_samples_leaf': 6}],
 'split0_test_score': array([0.87129567, 0.93924537, 0.76300295, 0.97958545]),
 'split1_test_score': array([0.90813193, 0.92900462, 0.94552252, 0.95099581]),
 'split2_test_score': array([0.        ,

In [194]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is 0.9456901005365596
Best parameters are {'max_depth': 7, 'min_samples_leaf': 6}


##Classifiers

In [198]:
#using the classification models from here

models = {"RandomForestClassifier": RandomForestClassifier(),
            "GBTClassifier": GradientBoostingClassifier(),
            "LogisticRegression": LogisticRegression(),
            "DecisionTreeClassifier": DecisionTreeClassifier(),
            "SVM": SVC(),
            "SGD": SGDClassifier(),
            "KNN": KNeighborsClassifier(),
            "extra_random_trees": ExtraTreesClassifier(),
            "neural_network": MLPClassifier()}


##Random Forest Classifier

In [199]:
model_name = 'RandomForestClassifier' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model RandomForestClassifier() has been selected


In [200]:
if model_name == 'RandomForestRegressor' or 'RandomForestClassifier':
  model = models[model_name]
  clf = GridSearchCV(model, RFparams(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)

In [201]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=6, random_state=1),
             estimator=RandomForestClassifier(), n_jobs=5,
             param_grid={'max_depth': [20, 30], 'min_samples_leaf': [5, 50],
                         'n_estimators': [10, 30]})

In [202]:
clf.cv_results_

{'mean_fit_time': array([0.06708113, 0.21205048, 0.06089719, 0.18878053, 0.06686407,
        0.19286791, 0.06038463, 0.16238016]),
 'std_fit_time': array([0.01431798, 0.03715446, 0.02180424, 0.03437108, 0.02662736,
        0.02705041, 0.02099798, 0.05175007]),
 'mean_score_time': array([0.01036847, 0.01759801, 0.00702397, 0.01527804, 0.00711028,
        0.01663182, 0.00736296, 0.01682379]),
 'std_score_time': array([0.00587295, 0.00896026, 0.00677888, 0.00469373, 0.00536969,
        0.00600052, 0.00490459, 0.00884308]),
 'param_max_depth': masked_array(data=[20, 20, 20, 20, 30, 30, 30, 30],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[5, 5, 50, 50, 5, 5, 50, 50],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[10, 30, 10, 30, 10, 30, 10,

In [203]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is 0.9533333333333335
Best parameters are {'max_depth': 20, 'min_samples_leaf': 5, 'n_estimators': 10}


##GBTClassifier

In [204]:
model_name = 'GBTClassifier' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model GradientBoostingClassifier() has been selected


In [205]:
if model_name == 'GBTRegressor' or 'GBTClassifier':
  model = models[model_name]
  clf = GridSearchCV(model, GBTparams(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)

In [206]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new)

96 fits failed out of a total of 192.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/ensemble/_gb.py", line 525, in fit
    self._check_params()
  File "/usr/local/lib/python3.8/dist-packages/sklearn/ensemble/_gb.py", line 317, in _check_params
    raise ValueError("subsample must be in (0,1] but was %r" % self.subsample)
ValueError: subsample must be in (0,1] but was 2

 0.95333333        nan 0.94666667        nan 0.95              nan
 0.94666

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=6, random_state=1),
             estimator=GradientBoostingClassifier(), n_jobs=5,
             param_grid={'learning_rate': [0.1, 0.5], 'max_depth': [5, 7],
                         'n_iter_no_change': [20, 40], 'subsample': [1, 2]})

In [207]:
clf.cv_results_

{'mean_fit_time': array([0.57270191, 0.00379646, 0.92061422, 0.00660962, 0.82639625,
        0.00489253, 0.99786087, 0.0028923 , 0.35412397, 0.00303088,
        0.57504894, 0.00478325, 0.33285534, 0.00667614, 0.51047397,
        0.00461626]),
 'std_fit_time': array([0.31785081, 0.00364916, 0.41545652, 0.00529602, 0.44576212,
        0.00366913, 0.3455331 , 0.0015488 , 0.10238829, 0.00358803,
        0.09047212, 0.00497716, 0.0813437 , 0.00472408, 0.10354517,
        0.00469081]),
 'mean_score_time': array([0.00271694, 0.        , 0.00329608, 0.        , 0.0043444 ,
        0.        , 0.00365718, 0.        , 0.00643243, 0.        ,
        0.00363135, 0.        , 0.00518878, 0.        , 0.00550338,
        0.        ]),
 'std_score_time': array([0.00236029, 0.        , 0.00417628, 0.        , 0.00492288,
        0.        , 0.00401022, 0.        , 0.00550891, 0.        ,
        0.00420413, 0.        , 0.00497266, 0.        , 0.00526904,
        0.        ]),
 'param_learning_rate': ma

In [208]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is 0.9533333333333335
Best parameters are {'learning_rate': 0.1, 'max_depth': 5, 'n_iter_no_change': 40, 'subsample': 1}


##Logistic Regression

In [209]:
model_name = 'LogisticRegression' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model LogisticRegression() has been selected


In [213]:
if model_name ==  'LogisticRegression':
  model = models[model_name]
  clf = GridSearchCV(model, LRparams(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)

In [214]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=6, random_state=1),
             estimator=LogisticRegression(), n_jobs=5,
             param_grid={'C': [0.5, 0.8], 'l1_ratio': [0.5, 0.8],
                         'max_iter': [30, 50]})

In [215]:
clf.cv_results_

{'mean_fit_time': array([0.05180156, 0.057166  , 0.05322971, 0.0578807 , 0.05070702,
        0.07268629, 0.06290066, 0.04058333]),
 'std_fit_time': array([0.01733044, 0.01023873, 0.01294388, 0.01271839, 0.011108  ,
        0.02980458, 0.03378334, 0.0139    ]),
 'mean_score_time': array([0.00328505, 0.00059485, 0.00296493, 0.00109698, 0.00061828,
        0.00504762, 0.00127101, 0.00059706]),
 'std_score_time': array([4.82055634e-03, 3.96899508e-05, 4.95394594e-03, 1.50654050e-03,
        7.32388298e-05, 5.61885402e-03, 1.96622041e-03, 5.25691003e-05]),
 'param_C': masked_array(data=[0.5, 0.5, 0.5, 0.5, 0.8, 0.8, 0.8, 0.8],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_l1_ratio': masked_array(data=[0.5, 0.5, 0.8, 0.8, 0.5, 0.5, 0.8, 0.8],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_iter': masked_array(d

In [216]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is 0.9666666666666668
Best parameters are {'C': 0.5, 'l1_ratio': 0.5, 'max_iter': 30}


##DTClassifier

In [217]:
model_name = 'DecisionTreeClassifier' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model DecisionTreeClassifier() has been selected


In [219]:
if model_name == 'DecisionTreeRegressor' or 'DecisionTreeClassifier':
  DTprmts = data['design_state_data']['algorithms'][model_name]
  if DTprmts['use_best'] == True:
    splitter = 'best'
  if DTprmts['use_random'] == True:
    splitter = 'random'

  model = DecisionTreeClassifier(splitter = splitter)
  clf = GridSearchCV(model, DTparams(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)

In [220]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=6, random_state=1),
             estimator=DecisionTreeClassifier(splitter='random'), n_jobs=5,
             param_grid={'max_depth': [4, 7], 'min_samples_leaf': [12, 6]})

In [221]:
clf.cv_results_

{'mean_fit_time': array([0.0019262 , 0.00099049, 0.00573446, 0.002895  ]),
 'std_fit_time': array([0.00189113, 0.00014964, 0.00473969, 0.00289793]),
 'mean_score_time': array([0.0021625 , 0.00269101, 0.00073892, 0.00119742]),
 'std_score_time': array([0.00289597, 0.00315692, 0.00055639, 0.00148985]),
 'param_max_depth': masked_array(data=[4, 4, 7, 7],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[12, 6, 12, 6],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 4, 'min_samples_leaf': 12},
  {'max_depth': 4, 'min_samples_leaf': 6},
  {'max_depth': 7, 'min_samples_leaf': 12},
  {'max_depth': 7, 'min_samples_leaf': 6}],
 'split0_test_score': array([0.88, 1.  , 0.84, 0.96]),
 'split1_test_score': array([0.88, 0.96, 0.8 , 0.96]),
 'split2_test_score': array([0.96, 1.  , 0.64, 0.96]),
 'split3_test_score': array([0.88

In [222]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is 0.9466666666666667
Best parameters are {'max_depth': 7, 'min_samples_leaf': 6}


##SVM

In [223]:
model_name = 'SVM' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model SVC() has been selected


In [225]:
if model_name == 'SVM':
  svmprms = data['design_state_data']['algorithms'][model_name]
  tol = svmprms['tolerance']
  max_iter = svmprms['max_iterations']
  if svmprms['linear_kernel'] == True:
    kernel = 'linear'
  elif svmprms['rep_kernel'] == True:
    kernel = 'rbf' #assuming rep_kernel represents rbf kernel
  elif svmprms['polynomial_kernel'] == True:
    kernel = 'poly'
  elif svmprms['sigmoid_kernel'] == True:
    kernel = 'sigmoid'
  if svmprms['auto'] == True:
    gamma = 'auto'
  elif svmprms['scale'] == True:
    gamma = 'scale'
  
  model = SVC(tol = tol, max_iter = max_iter, kernel = kernel, gamma = gamma)
  clf = GridSearchCV(model, SVMparams(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)


In [226]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=6, random_state=1),
             estimator=SVC(gamma='auto', kernel='linear', max_iter=7, tol=7),
             n_jobs=5, param_grid={'C': [566, 79]})

In [227]:
clf.cv_results_

{'mean_fit_time': array([0.00522786, 0.0061939 ]),
 'std_fit_time': array([0.00541965, 0.00564068]),
 'mean_score_time': array([0.00082346, 0.00054942]),
 'std_score_time': array([5.72458316e-04, 5.69451566e-05]),
 'param_C': masked_array(data=[566, 79],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 566}, {'C': 79}],
 'split0_test_score': array([0.32, 0.32]),
 'split1_test_score': array([0.32, 0.32]),
 'split2_test_score': array([0.32, 0.32]),
 'split3_test_score': array([0.32, 0.32]),
 'split4_test_score': array([0.36, 0.36]),
 'split5_test_score': array([0.36, 0.36]),
 'split6_test_score': array([0.32, 0.32]),
 'split7_test_score': array([0.32, 0.32]),
 'split8_test_score': array([0.32, 0.32]),
 'split9_test_score': array([0.32, 0.32]),
 'split10_test_score': array([0.36, 0.36]),
 'split11_test_score': array([0.36, 0.36]),
 'mean_test_score': array([0.33333333, 0.33333333]),
 'std_test_score': array([0.01885618, 0.01885618]),

In [228]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is 0.33333333333333326
Best parameters are {'C': 566}


##SGD

In [229]:
model_name = 'SGD' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model SGDClassifier() has been selected


In [232]:
if model_name == 'SGD':
  sgdprms = data['design_state_data']['algorithms'][model_name]
  if sgdprms['use_modified_hubber_loss'] == True:
    loss = 'modified_huber'
  else: 
    loss='hinge'
  tol = sgdprms['tolerance']
  if sgdprms['use_l1_regularization'] == True:
    penalty = 'l1'
  if sgdprms['use_l2_regularization'] == True:
    penalty = 'l2'
  if sgdprms['use_elastic_net_regularization'] == True:
    penalty = 'elasticnet'

  model = SGDClassifier(loss = loss, tol = tol, penalty = penalty)
  clf = GridSearchCV(model, SGDparams(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)


In [233]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=6, random_state=1),
             estimator=SGDClassifier(penalty='elasticnet', tol=56), n_jobs=5,
             param_grid={'alpha': [79, 56]})

In [234]:
clf.cv_results_

{'mean_fit_time': array([0.01240194, 0.00840565]),
 'std_fit_time': array([0.00814806, 0.007223  ]),
 'mean_score_time': array([0.00290807, 0.00115919]),
 'std_score_time': array([0.00446131, 0.00170846]),
 'param_alpha': masked_array(data=[79, 56],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'alpha': 79}, {'alpha': 56}],
 'split0_test_score': array([0.32, 0.36]),
 'split1_test_score': array([0.32, 0.32]),
 'split2_test_score': array([0.32, 0.36]),
 'split3_test_score': array([0.32, 0.36]),
 'split4_test_score': array([0.36, 0.36]),
 'split5_test_score': array([0.32, 0.32]),
 'split6_test_score': array([0.36, 0.32]),
 'split7_test_score': array([0.32, 0.36]),
 'split8_test_score': array([0.32, 0.32]),
 'split9_test_score': array([0.32, 0.36]),
 'split10_test_score': array([0.32, 0.32]),
 'split11_test_score': array([0.36, 0.36]),
 'mean_test_score': array([0.33      , 0.34333333]),
 'std_test_score': array([0.01732051, 0.01972027]

In [235]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is 0.34333333333333327
Best parameters are {'alpha': 56}


##KNN

In [237]:
model_name = 'KNN' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model KNeighborsClassifier() has been selected


In [245]:
if model_name == 'KNN':
  KNNprms = data['design_state_data']['algorithms'][model_name]
  n_neighbors = KNNprms['k_value'][0]
  if KNNprms['distance_weighting'] == True:
    weights = 'distance'
  else: 
    weights = 'uniform'
  if 'auto' not in KNNprms['neighbour_finding_algorithm'].lower():
    algorithm = KNNprms['neighbour_finding_algorithm']
  else:
    algorithm = 'auto'

  model = KNeighborsClassifier(n_neighbors = n_neighbors, weights = weights, algorithm = algorithm)
  clf = GridSearchCV(model, KNNparams(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)

In [246]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=6, random_state=1),
             estimator=KNeighborsClassifier(n_neighbors=78, weights='distance'),
             n_jobs=5, param_grid={})

In [247]:
clf.cv_results_

{'mean_fit_time': array([0.00100676]),
 'std_fit_time': array([6.04713005e-05]),
 'mean_score_time': array([0.00474717]),
 'std_score_time': array([0.00458932]),
 'params': [{}],
 'split0_test_score': array([1.]),
 'split1_test_score': array([0.96]),
 'split2_test_score': array([0.96]),
 'split3_test_score': array([0.96]),
 'split4_test_score': array([0.96]),
 'split5_test_score': array([0.92]),
 'split6_test_score': array([1.]),
 'split7_test_score': array([0.96]),
 'split8_test_score': array([0.96]),
 'split9_test_score': array([0.96]),
 'split10_test_score': array([0.96]),
 'split11_test_score': array([0.96]),
 'mean_test_score': array([0.96333333]),
 'std_test_score': array([0.01972027]),
 'rank_test_score': array([1], dtype=int32)}

In [248]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is 0.9633333333333335
Best parameters are {}


##ExtraRandomTrees

In [249]:
model_name = 'extra_random_trees' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model ExtraTreesClassifier() has been selected


In [250]:
if model_name == 'extra_random_trees':
  model = models[model_name]
  clf = GridSearchCV(model, ertparams(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)

In [251]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=6, random_state=1),
             estimator=ExtraTreesClassifier(), n_jobs=5,
             param_grid={'max_depth': [12, 45], 'min_samples_leaf': [78, 56],
                         'n_estimators': [45, 489]})

In [252]:
clf.cv_results_

{'mean_fit_time': array([0.19420159, 2.15656573, 0.21972982, 3.04480716, 0.23685308,
        2.26849626, 0.19005891, 2.33187173]),
 'std_fit_time': array([0.06201081, 0.47538584, 0.05880652, 0.66962346, 0.03571167,
        0.17888228, 0.03141438, 0.37573963]),
 'mean_score_time': array([0.02340704, 0.23675525, 0.03285964, 0.29140184, 0.02741347,
        0.2351807 , 0.02388124, 0.26764603]),
 'std_score_time': array([0.0079298 , 0.06327348, 0.01706501, 0.05002689, 0.00671272,
        0.0620893 , 0.0068124 , 0.08698584]),
 'param_max_depth': masked_array(data=[12, 12, 12, 12, 45, 45, 45, 45],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[78, 78, 56, 56, 78, 78, 56, 56],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[45, 489, 45, 489, 45, 4

In [253]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is 0.6933333333333332
Best parameters are {'max_depth': 12, 'min_samples_leaf': 56, 'n_estimators': 489}


##Neural network

In [254]:
model_name = 'neural_network' 
for key in models:
  if model_name == key:
    print('The model {} has been selected'.format(models[key]))

The model MLPClassifier() has been selected


In [262]:
if model_name == 'neural_network':
  nnprms = data['design_state_data']['algorithms'][model_name]
  alpha = nnprms['alpha_value']
  max_iter = nnprms['max_iterations']
  tol = nnprms['convergence_tolerance']
  early_stopping = nnprms['early_stopping']
  solver = nnprms['solver'].lower()
  shuffle = nnprms['shuffle_data']
  learning_rate_init = nnprms['initial_learning_rate']
  if nnprms['automatic_batching'] == True:
    batch_size = 'auto'
  beta_1 = nnprms['beta_1']
  beta_2 = nnprms['beta_2']
  epsilon = nnprms['epsilon']
  power_t = nnprms['power_t']
  nesterovs_momentum = nnprms['use_nesterov_momentum']

  model = MLPClassifier(alpha = alpha, max_iter = max_iter, tol = tol, early_stopping = early_stopping, solver = solver, shuffle = shuffle, learning_rate_init=learning_rate_init,
                batch_size = batch_size, beta_1 = beta_1, beta_2 = beta_2, epsilon = epsilon, power_t = power_t, nesterovs_momentum = nesterovs_momentum)
  clf = GridSearchCV(model, nnparams(data, model_name), n_jobs = hyp_params['parallelism'], cv = cv)

In [263]:
X_new = reduced_features
Y_new = Y_encoded
clf.fit(X_new, Y_new) ##error due to input values

24 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/neural_network/_multilayer_perceptron.py", line 752, in fit
    return self._fit(X, y, incremental=False)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/neural_network/_multilayer_perceptron.py", line 384, in _fit
    self._validate_hyperparameters()
  File "/usr/local/lib/python3.8/dist-packages/sklearn/neural_network/_multilayer_perceptron.py", line 451

ValueError: ignored

In [259]:
clf.cv_results_

AttributeError: ignored

In [264]:
print("Best score is {}".format(clf.best_score_))
print("Best parameters are {}".format(clf.best_params_))

Best score is nan
Best parameters are {'hidden_layer_sizes': 67}
