In [23]:
!pip install smac



In [24]:
from ConfigSpace import Configuration, ConfigurationSpace

import numpy as np
from smac import HyperparameterOptimizationFacade, Scenario
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

iris = datasets.load_iris()


def train(config: Configuration, seed: int = 0) -> float:
    classifier = SVC(C=config["C"], random_state=seed)
    scores = cross_val_score(classifier, iris.data, iris.target, cv=5)
#     print (np.mean(scores))
    return 1 - np.mean(scores)


configspace = ConfigurationSpace({"C": (0.100, 1000.0)})

# Scenario object specifying the optimization environment
scenario = Scenario(configspace, deterministic=True, n_trials=200)

# Use SMAC to find the best configuration/hyperparameters
smac = HyperparameterOptimizationFacade(scenario, train)
incumbent = smac.optimize()

[INFO][abstract_initial_design.py:147] Using 10 initial design configurations and 0 additional configurations.
[INFO][smbo.py:497] Continuing from previous run.
[INFO][smbo.py:270] Optimization process was already finished. Returning incumbent...


In [25]:
incumbent.values


<bound method Mapping.values of Configuration(values={
  'C': 11.346459377614869,
})>

In [26]:
classifier = SVC(C=6.724148702862232)
scores = cross_val_score(classifier, iris.data, iris.target, cv=5)
print(np.mean(scores))

0.9866666666666667


In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, ShuffleSplit
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
import seaborn as sn
import warnings

warnings.filterwarnings('ignore')

In [28]:
# Load the Credit Card Fraud Detection dataset
url = "https://www.openml.org/data/get_csv/31/dataset_31.csv"
df = pd.read_csv(url)

# Drop missing values
df = df.dropna()

# Convert categorical features to numerical
df = pd.get_dummies(df, drop_first=True)

In [29]:
df.head()

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,checking_status_'<0',checking_status_'>=200',checking_status_'no checking',...,other_payment_plans_none,other_payment_plans_stores,housing_own,housing_rent,job_'unemp/unskilled non res',job_'unskilled resident',job_skilled,own_telephone_yes,foreign_worker_yes,class_good
0,6,1169,4,4,67,2,1,True,False,False,...,True,False,True,False,False,False,True,True,True,True
1,48,5951,2,2,22,1,1,False,False,False,...,True,False,True,False,False,False,True,False,True,False
2,12,2096,2,3,49,1,2,False,False,True,...,True,False,True,False,False,True,False,False,True,True
3,42,7882,2,4,45,1,2,True,False,False,...,True,False,False,False,False,False,True,False,True,True
4,24,4870,3,4,53,2,2,True,False,False,...,True,False,False,False,False,False,True,False,True,False


In [30]:
df['class_good'].value_counts()

class_good
True     700
False    300
Name: count, dtype: int64

In [31]:
target = "class_good"
X = df.drop(columns=target)
y = df[target]

# Split the dataset into training and testing sets
X_train_raw, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
from sklearn.feature_selection import SelectKBest, f_classif

def Which_features(X_train,y_train,number_of_columns):

    # Select the top k features based on ANOVA F-statistic
    selector = SelectKBest(score_func=f_classif, k=number_of_columns)
    X_train_selected = selector.fit_transform(X_train, y_train)

    # Get the column names of the selected features
    selected_feature_names = X_train.columns[selector.get_support()]

    
    return list(selected_feature_names)

selected_features = Which_features(X_train_raw,y_train,number_of_columns=15)
selected_features

['duration',
 'credit_amount',
 'age',
 "checking_status_'<0'",
 "checking_status_'no checking'",
 "credit_history_'critical/other existing credit'",
 "credit_history_'no credits/all paid'",
 "purpose_'new car'",
 'purpose_radio/tv',
 "savings_status_'<100'",
 "property_magnitude_'no known property'",
 "property_magnitude_'real estate'",
 'other_payment_plans_none',
 'housing_own',
 'housing_rent']

In [33]:
X =X[selected_features]
X_train_raw = X_train_raw[selected_features]
X_test = X_test[selected_features]

In [34]:
X_train_raw.shape

(800, 15)

In [35]:
classifer=KNeighborsClassifier(n_neighbors=7)
classifer.fit(X_train_raw,y_train)
y_pred = classifer.predict(X_test)
loss=1-accuracy_score(y_test,y_pred)
print(loss)


0.32999999999999996


In [42]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import datasets
iris = datasets.load_iris()

X = iris.data[:, 1:]
y = iris.data[:, 0] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.10212647866320387


In [47]:
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Float, Integer
from ConfigSpace.conditions import InCondition,EqualsCondition
from smac import HyperparameterOptimizationFacade, Scenario
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
#potential optimizations
#ass timeseries and regression models
#search for the best hypers parameters for the each model
#add random state parameters
#what is the problem with linear svc
class Models:
    def __init__(self,similarModels,problemType):
        self.Models=similarModels
        self.Problemtype=problemType
    def configspace(self):
        confs = ConfigurationSpace(seed=0)
        #HPOs
        if self.Problemtype=='Classification':
            # models are ['KNN','LR',"RF",'SVC']
            models=Categorical('Models',self.Models)
            #KNN parameters
            Kneighbors=Integer('Ks',(1,10),default=1)
            #LR  and svc Parameters
            rc=Float('regularizationStre',(0.01,1))
            #RF parameters
            nestimators=Integer('n_estimators',(1,20),default=10)
            #SVC parameters
            kernel=Categorical('kernel',['linear','rbf'])
            #dependencies

            useks=InCondition(child=Kneighbors,parent=models,values=['KNN'])
            userc=InCondition(child=rc,parent=models,values=['LR','SVC'])
            usekernel=InCondition(child=kernel,parent=models,values=['SVC'])
            useEst=EqualsCondition(child=nestimators,parent=models,value='RF')



            #adding conditions and HPs
            confs.add_hyperparameters([models,Kneighbors,rc,nestimators,kernel])
            confs.add_conditions([useks,userc,usekernel,useEst])
        elif self.Problemtype=='Regression':
            models=Categorical('Models',self.Models)
            #linear regression parameters
            #lasso and ridge regression parameters
            alpha=Float('alpha',(0.01,100))
            
            #random forest and XGboost parameters
            nestimators=Integer('n_estimators',(1,20),default=10)
            #dependencies 
            usealpha=InCondition(child=alpha,parent=models,values=['Lasso','Ridge'])
            useEst=InCondition(child=nestimators,parent=models,values=['RF','XGboost'])
            #adding conditions and HPs
            confs.add_hyperparameters([models,alpha,nestimators])
            confs.add_conditions([usealpha,useEst])



            
        return confs
    def train(self,config:Configuration,seed: int=0):
        
        config_dict=config.get_dictionary()
        model=config_dict['Models']
        print(f"config_dict:{config_dict}")
        if self.Problemtype=='Classification':
            if model =='KNN':
                Classifier=KNeighborsClassifier(n_neighbors=config_dict['Ks'])
                Classifier.fit(X_train_raw,y_train)
                y_pred = Classifier.predict(X_test)
                loss=1-accuracy_score(y_test,y_pred)
                print("the loss is: ",loss)
                return loss
            elif model=='LR':
                Classifier=LogisticRegression(C=config_dict['regularizationStre'])
                Classifier.fit(X_train_raw,y_train)
                y_pred = Classifier.predict(X_test)
                loss=1-accuracy_score(y_test,y_pred)
                print("the losss is: ",loss)
                return loss
            elif model=='RF':
                Classifier=RandomForestClassifier(n_estimators=config_dict['n_estimators'])
                Classifier.fit(X_train_raw,y_train)
                y_pred = Classifier.predict(X_test)
                loss=1-accuracy_score(y_test,y_pred)
                print("the loss is: ",loss)
                return loss
            elif model=='SVC':
                Classifier=SVC(C=config_dict['regularizationStre'],kernel=config_dict['kernel'])
                Classifier.fit(X_train_raw,y_train)
                y_pred = Classifier.predict(X_test)
                loss=1-accuracy_score(y_test,y_pred)
                print("the loss is: ",loss)
                return loss
        elif self.Problemtype=='Regression':
            return self.regression(config_dict)
    def regression(self, configDict):
            model=configDict['Models']
            if model=='LR':
                regressor=LinearRegression()
                regressor.fit(X_train,y_train)
                y_pred = regressor.predict(X_test)
                mse = mean_squared_error(y_test, y_pred)
                print("Mean Squared Error:", mse)
                return mse
            elif model=='Lasso':
                regressor=Lasso(alpha=configDict['alpha'])
                regressor.fit(X_train,y_train)
                y_pred = regressor.predict(X_test)
                mse = mean_squared_error(y_test, y_pred)
                print("Mean Squared Error:", mse)
                return mse
            elif model=='Ridge':
                regressor=Ridge(alpha=configDict['alpha'])
                regressor.fit(X_train,y_train)
                y_pred = regressor.predict(X_test)
                mse = mean_squared_error(y_test, y_pred)
                print("Mean Squared Error:", mse)
                return mse
            elif model=='RF':
                regressor=RandomForestRegressor(n_estimators=configDict['n_estimators'])
                regressor.fit(X_train,y_train)
                y_pred = regressor.predict(X_test)
                mse = mean_squared_error(y_test, y_pred)
                print("Mean Squared Error:", mse)
                return mse
            elif model=='XGboost':
                regressor=XGBRegressor(n_estimators=configDict['n_estimators'])
                regressor.fit(X_train,y_train)
                y_pred = regressor.predict(X_test)
                mse = mean_squared_error(y_test, y_pred)
                print("Mean Squared Error:", mse)
                return mse

In [50]:
if __name__ =="__main__":
    #classifier=Models(['KNN','LR','RF','SVC'],'Classification')
    classifier=Models(['LR','Lasso','Ridge','RF','XGboost'],'Regression')
    scenario = Scenario(classifier.configspace(), deterministic=True, n_trials=100)
    smac = HyperparameterOptimizationFacade(scenario, classifier.train)
    incumbent = smac.optimize()
    incumbent.values
    print(incumbent)

    


[INFO][abstract_initial_design.py:95] Reducing the number of initial configurations from 30 to 25 (max_ratio == 0.25).
[INFO][abstract_initial_design.py:147] Using 20 initial design configurations and 0 additional configurations.
[INFO][abstract_intensifier.py:305] Using only one seed for deterministic scenario.
config_dict:{'Models': 'Lasso', 'alpha': 62.060263897813854}
Mean Squared Error: 0.719450694444445
[INFO][abstract_intensifier.py:515] Added config bababd as new incumbent because there are no incumbents yet.
config_dict:{'Models': 'Ridge', 'alpha': 30.637191347517074}
Mean Squared Error: 0.11162686091350908
[INFO][abstract_intensifier.py:594] Added config 3559a2 and rejected config bababd as incumbent because it is not better than the incumbents on 1 instances:
config_dict:{'Models': 'XGboost', 'n_estimators': 7}
Mean Squared Error: 0.09114166605050956
[INFO][abstract_intensifier.py:594] Added config 255114 and rejected config 3559a2 as incumbent because it is not better than 

In [78]:
regressor=RandomForestRegressor(n_estimators=19)
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.0918528657535652


In [39]:
Classifier=RandomForestClassifier(n_estimators=19)
Classifier.fit(X_train_raw,y_train)
y_pred = Classifier.predict(X_test)
loss=1-accuracy_score(y_test,y_pred)
print("the loss is: ",loss)

ValueError: Unknown label type: 'continuous'