In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer, f1_score, roc_auc_score

In [2]:
# Read JSON
json_model = pd.read_json("model.json")
json_model

Unnamed: 0,session_name,session_description,design_state_data
algorithms,test,test,{'RandomForestClassifier': {'model_name': 'Ran...
feature_generation,test,test,"{'linear_interactions': [['petal_length', 'sep..."
feature_handling,test,test,{'sepal_length': {'feature_name': 'sepal_lengt...
feature_reduction,test,test,"{'feature_reduction_method': 'Tree-based', 'nu..."
hyperparameters,test,test,"{'strategy': 'Grid Search', 'shuffle_grid': Tr..."
metrics,test,test,"{'optimize_model_hyperparameters_for': 'AUC', ..."
probability_calibration,test,test,{'probability_calibration_method': 'Sigmoid - ...
session_info,test,test,"{'project_id': '1', 'experiment_id': 'kkkk-11'..."
target,test,test,"{'prediction_type': 'Regression', 'target': 'p..."
train,test,test,"{'policy': 'Split the dataset', 'time_variable..."


In [3]:
design_state_data = json_model['design_state_data']
design_state_data

algorithms                 {'RandomForestClassifier': {'model_name': 'Ran...
feature_generation         {'linear_interactions': [['petal_length', 'sep...
feature_handling           {'sepal_length': {'feature_name': 'sepal_lengt...
feature_reduction          {'feature_reduction_method': 'Tree-based', 'nu...
hyperparameters            {'strategy': 'Grid Search', 'shuffle_grid': Tr...
metrics                    {'optimize_model_hyperparameters_for': 'AUC', ...
probability_calibration    {'probability_calibration_method': 'Sigmoid - ...
session_info               {'project_id': '1', 'experiment_id': 'kkkk-11'...
target                     {'prediction_type': 'Regression', 'target': 'p...
train                      {'policy': 'Split the dataset', 'time_variable...
weighting_stratergy        {'weighting_stratergy_method': 'Sample weights...
Name: design_state_data, dtype: object

In [4]:
design_state_data['session_info']

{'project_id': '1',
 'experiment_id': 'kkkk-11',
 'dataset': 'iris_modified.csv',
 'session_name': 'test',
 'session_description': 'test'}

In [5]:
# Load the dataset
data = pd.read_csv(json_model['design_state_data']['session_info']['dataset'])
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [6]:
feature_variables = list(design_state_data['feature_handling'].keys())
feature_variables

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

In [7]:
numerical_variables = [v for v in feature_variables if design_state_data['feature_handling'][v]['feature_variable_type'] == 'numerical']
numerical_variables

['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

In [8]:
# Data Imputation
for x in numerical_variables:
    data[x] = data[x].fillna(design_state_data['feature_handling'][x]['feature_details']['impute_value'])
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [9]:
categorical_variables = [v for v in feature_variables if design_state_data['feature_handling'][v]['feature_variable_type'] == 'text']
categorical_variables

['species']

In [10]:
# Feature Generation 
from sklearn.preprocessing import PolynomialFeatures, RobustScaler

# Define the linear interaction pipeline
linear_interactions = PolynomialFeatures(interaction_only=True, include_bias=False)
# Apply the linear interaction pipeline to the selected features
features_to_interact = design_state_data['feature_generation']['linear_interactions'][0]
new_linear_feature = linear_interactions.fit_transform(data[features_to_interact])
new_linear_feature
nf = pd.DataFrame(new_linear_feature).drop([0,1],axis=1)
nf = nf.rename(columns={2: 'new_linear_feature'})
data = pd.concat([data,nf],axis=1)
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,new_linear_feature
0,5.1,3.5,1.4,0.2,Iris-setosa,4.90
1,4.9,3.0,1.4,0.2,Iris-setosa,4.20
2,4.7,3.2,1.3,0.2,Iris-setosa,4.16
3,4.6,3.1,1.5,0.2,Iris-setosa,4.65
4,5.0,3.6,1.4,0.2,Iris-setosa,5.04
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica,15.60
146,6.3,2.5,5.0,1.9,Iris-virginica,12.50
147,6.5,3.0,5.2,2.0,Iris-virginica,15.60
148,6.2,3.4,5.4,2.3,Iris-virginica,18.36


In [11]:
# Hashing of categorical features
from sklearn.feature_extraction.text import HashingVectorizer
vectorizer = HashingVectorizer(n_features=1000)

for x in categorical_variables:
    data[x] = vectorizer.fit_transform(data[x].astype(str)).toarray()
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,new_linear_feature
0,5.1,3.5,1.4,0.2,0.0,4.90
1,4.9,3.0,1.4,0.2,0.0,4.20
2,4.7,3.2,1.3,0.2,0.0,4.16
3,4.6,3.1,1.5,0.2,0.0,4.65
4,5.0,3.6,1.4,0.2,0.0,5.04
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0.0,15.60
146,6.3,2.5,5.0,1.9,0.0,12.50
147,6.5,3.0,5.2,2.0,0.0,15.60
148,6.2,3.4,5.4,2.3,0.0,18.36


In [12]:
target_variable = design_state_data['target']['target']
# Separate target variable from feature variables
y = data[target_variable]
X = data.drop(target_variable,axis=1)
X

Unnamed: 0,sepal_length,sepal_width,petal_length,species,new_linear_feature
0,5.1,3.5,1.4,0.0,4.90
1,4.9,3.0,1.4,0.0,4.20
2,4.7,3.2,1.3,0.0,4.16
3,4.6,3.1,1.5,0.0,4.65
4,5.0,3.6,1.4,0.0,5.04
...,...,...,...,...,...
145,6.7,3.0,5.2,0.0,15.60
146,6.3,2.5,5.0,0.0,12.50
147,6.5,3.0,5.2,0.0,15.60
148,6.2,3.4,5.4,0.0,18.36


In [13]:
sampling_method = design_state_data['train']['sampling_method']
sampling_ratio = design_state_data['train']['train_ratio']
sampling_seed = design_state_data['train']['random_seed']

# Split into train and test
n = design_state_data['hyperparameters']['num_of_folds']
if design_state_data['train']['k_fold']:
    kf = KFold(n_splits=n)
    for train_idx, val_idx in kf.split(X):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[val_idx], y[val_idx]
else:
    # Split the data into training and testing sets based on the given sampling policy
    if sampling_method == 'No sampling(whole data)':
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=sampling_seed)
    elif sampling_method == 'Stratified sampling':
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=sampling_ratio, stratify=y, random_state=sampling_seed)
    elif sampling_method == 'Random sampling':
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=sampling_ratio, random_state=sampling_seed)

In [14]:
# Set up the algorithm and parameter grid
if design_state_data['hyperparameters']['shuffle_grid']:
    random_state = design_state_data['hyperparameters']['random_state']
else:
    random_state = None

if design_state_data['hyperparameters']['parallelism'] == 1:
    n_jobs = None
else:
    n_jobs = design_state_data['hyperparameters']['parallelism']

if design_state_data['hyperparameters']['stratified']:
    scoring = 'accuracy'
else:
    scoring = None
    
if design_state_data['hyperparameters']['cross_validation_strategy'] == 'Time-based K-fold(with overlap)':
    cv = TimeSeriesSplit(n_splits=design_state_data['hyperparameters']['num_of_folds'], max_train_size=None, test_size=None)
else:
    raise ValueError(f"Unsupported cross-validation strategy: {design_state_data['hyperparameters']['cross_validation_strategy']}")


In [15]:
# prediction_type = 'Regression' if design_state_data['target']['prediction_type'] == 'Regression' else 'Classification'

for algorithm, params in design_state_data['algorithms'].items():
    if params['is_selected']:
        if algorithm == 'RandomForestClassifier':
            clf = RandomForestClassifier(n_estimators=params['max_trees'],
                                         max_depth=params['max_depth'],
                                         min_samples_leaf=params['min_samples_per_leaf_min_value'],
                                         max_samples=params['min_samples_per_leaf_max_value'],
                                         random_state=random_state, n_jobs=n_jobs 
                                        )
        elif algorithm == 'RandomForestRegressor':
            clf = RandomForestRegressor(n_estimators=params['max_trees'],
                                        max_depth=params['max_depth'],
                                        min_samples_leaf=params['min_samples_per_leaf_min_value'],
                                        max_samples=params['min_samples_per_leaf_max_value'],
                                        random_state=random_state, n_jobs=n_jobs
                                       )
        elif algorithm == 'GBTClassifier':
            clf = GBTClassifier()
        elif algorithm == 'GBTRegressor':
            clf = GBTRegressor()
        elif algorithm == 'LinearRegression':
            clf = LogisticRegression()
        elif algorithm == 'RidgeRegression':
            clf = RidgeRegression()
        elif algorithm == 'LassoRegression':
            clf = LassoRegression()
        elif algorithm == 'ElasticNetRegression':
            clf = ElasticNet()
        elif algorithm == 'LogisticRegression':
            clf = LogisticRegression()
        elif algorithm == 'xg_boost':
            clf = GradientBoostingClassifier()
        elif algorithm == 'DecisionTreeRegressor':
            clf = DecisionTreeRegressor()
        elif algorithm == 'DecisionTreeClassifier':
            clf = DecisionTreeClassifier()
        elif algorithm == 'SVM':
            clf = SVC(C=params['c_value'])
        elif algorithm == 'KNN':
            clf = KNeighborsClassifier(n_neighbors=params['k_value'])
        elif algorithm == 'neural_network':
            clf = MLPClassifier()
        # Add more elif statements for other algorithms here

print("Model selected: ",clf)

Model selected:  RandomForestRegressor(max_depth=25, max_samples=10, min_samples_leaf=5,
                      n_estimators=20, n_jobs=5, random_state=1)


In [16]:
model = clf.fit(X_train, y_train)
#model_acc = model.score(X_test, y_test)

# Evaluate the best models on the test data
model_acc = model.score(X_test, y_test)

print(f'Accuracy: {model_acc:.2f}')

Accuracy: 0.75
