In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from fairness.pre_processing import categorical_to_numeric_converter
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the comma-separated ULL dataset file into a DataFrame.
dataset = pd.read_csv(
    './dataset/ull/ULL_dataset.csv',
    sep=',',
)

In [3]:
# Rebind `dataset` to whatever the project helper from `fairness.pre_processing` returns.
# NOTE(review): judging by its name, the helper presumably encodes categorical
# columns as numbers — confirm against its implementation.
# The input name is overwritten, so re-running this cell feeds the already
# converted frame back into the helper.
dataset = categorical_to_numeric_converter(dataset)

In [4]:
# Columns used as prediction targets; the training loop runs once per entry.
output_columns = [
    'level_mat',
    'level_len',
    'level_ing',
]

In [5]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Prototype estimators handed to GridSearchCV, one per model family.
svc_classifier = SVC()
# Seed the tree-based learners (same seed as the train/test split) so that
# repeated runs of the notebook produce the same scores and the same winner.
random_forest_classifier = RandomForestClassifier(random_state=42)
decision_tree_classifier = DecisionTreeClassifier(random_state=42)
knn_classifier = KNeighborsClassifier()

# Order matters: `models[i]` is tuned with `model_params[i]`.
models = [
    svc_classifier,
    random_forest_classifier,
    decision_tree_classifier,
    knn_classifier
]

# One sub-grid per kernel family. `degree` only affects the 'poly' kernel and
# `gamma` is not used by 'linear', so crossing them with every kernel would
# refit identical models many times (72 candidates instead of 23).
svc_params = [
    {
        'kernel' : ['linear'],
    },
    {
        'kernel' : ['poly'],
        'degree' : range(1, 10),
        'gamma' : ['scale', 'auto'],
    },
    {
        'kernel' : ['rbf', 'sigmoid'],
        'gamma' : ['scale', 'auto'],
    },
]

random_forest_params = {
    'min_samples_leaf': [5, 7, 9, 11],
    'n_estimators': [200, 500],
    'max_depth' : [10, 20, 50, 80, 100, 150],
    'criterion' :['gini', 'entropy']
}

decision_tree_params = {
    'min_samples_leaf': [5, 7, 9, 11],
    'max_depth' : [10, 20, 50, 80, 100, 150],
    'criterion' :['gini', 'entropy']
}

# NOTE(review): `algorithm` and `leaf_size` control how neighbours are searched
# (speed / memory), not which neighbours are used, so these 120 candidates
# should score (near-)identically. Consider tuning `n_neighbors` / `weights`
# instead — left unchanged here to keep the search space as authored.
knn_params = {
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': range(20, 50),
}


# Parameter grids aligned index-by-index with `models`.
model_params = [svc_params, random_forest_params, decision_tree_params, knn_params]

In [None]:
# For every target column: split the data, scale the features, grid-search each
# candidate model on the training split and report the model with the highest
# accuracy on the held-out split.
for output_column in output_columns:

    # Features are every column except the current target.
    # NOTE(review): the other entries of `output_columns` remain in X. If they
    # are labels rather than legitimate inputs this leaks target information —
    # confirm the intended feature set.
    X = dataset.loc[:, dataset.columns != output_column]
    y = dataset[output_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    # Learn the scaling statistics from the training split only and apply the
    # very same transformation to the test split. Fitting a second scaler on
    # the test data would put the two splits on different scales and let
    # test-set statistics influence the evaluation.
    standard_scaler_train = StandardScaler()
    X_train = standard_scaler_train.fit_transform(X_train)
    X_test = standard_scaler_train.transform(X_test)

    best_params = None
    max_accuracy = 0
    best_model = None

    # `models` and `model_params` are parallel lists (estimator, grid).
    for model, param_grid in zip(models, model_params):

        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            scoring='accuracy',
            return_train_score=True,
            cv=3
        )

        grid_search_classifier = grid_search.fit(X_train, y_train)
        y_pred = grid_search_classifier.predict(X_test)

        # Score once and reuse the value for both the comparison and the record.
        accuracy = accuracy_score(y_test, y_pred)

        # NOTE(review): the winner is chosen by test-set accuracy, so the
        # reported figure is an optimistic estimate; selecting on
        # `grid_search.best_score_` (cross-validated) would avoid that.
        if accuracy > max_accuracy:
            max_accuracy = accuracy
            # Keep the fitted, tuned estimator and the winning parameter
            # combination — not the untouched prototype and the whole grid.
            best_model = grid_search_classifier.best_estimator_
            best_params = grid_search_classifier.best_params_

    print("OUTPUT COLUMN: ", output_column)
    print("Best model: ", best_model)
    print("Best_params: ", best_params)
    print("Accuracy: ", max_accuracy)