In [1]:
# NOTE(review): the star import is kept first so the explicit imports below
# still take precedence on any name clash. Names used in this notebook but not
# imported here (np, plt, PipeLineColumnTransformer, PipelineModel,
# PredictionQualityInfo, getTrainingData, getTestData) presumably come from it —
# confirm and import them explicitly.
from ColumnTransformers import *

import json
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
def KNNSearch(df_X, y, metric='recall'):
    """Grid-search KNeighborsClassifier hyper-parameters and save the best ones.

    The features are passed through a freshly fitted
    ``PipeLineColumnTransformer`` and a 5-fold ``GridSearchCV`` scored with
    ``metric`` is run over ``n_neighbors`` and ``weights``. The winning
    combination is written to ``savedModels/KNN.json`` and returned.

    Parameters
    ----------
    df_X : feature table accepted by ``PipeLineColumnTransformer``
        (presumably a pandas DataFrame — TODO confirm).
    y : target labels aligned with ``df_X``.
    metric : scoring name forwarded to ``GridSearchCV`` (default ``'recall'``).

    Returns
    -------
    dict
        The best parameter combination found by the search.
    """
    # Create the output folder before the search starts, so a missing
    # directory cannot discard the result of a finished search.
    os.makedirs("savedModels", exist_ok=True)

    param_grid = {
        # Plain Python ints on purpose: values taken from np.arange are numpy
        # integers, they end up in best_params_ and json.dump rejects them.
        'n_neighbors': list(range(1, 20, 3)),
        'weights': ['uniform', 'distance'],
    }

    # NOTE(review): the transformer is fitted on all of df_X before the CV
    # split, so anything it learns also covers each fold's validation rows —
    # confirm this is acceptable.
    transformer = PipeLineColumnTransformer()
    transformer.fit(df_X, y)
    X = transformer.transform(df_X)

    model = KNeighborsClassifier(n_jobs=-1)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=metric, n_jobs=-1)
    grid_search.fit(X, y)

    best_params = grid_search.best_params_
    with open("savedModels/KNN.json", "w") as f:
        json.dump(best_params, f, indent=4)
    return best_params

def LogRegSearch(df_X, y, metric='recall'):
    """Grid-search an L1-penalised LogisticRegression and save the best parameters.

    The features are passed through a freshly fitted
    ``PipeLineColumnTransformer`` and a 5-fold ``GridSearchCV`` scored with
    ``metric`` is run over ``C`` and ``class_weight``. The winning combination
    is written to ``savedModels/LogisticRegression.json`` and returned.

    Parameters
    ----------
    df_X : feature table accepted by ``PipeLineColumnTransformer``
        (presumably a pandas DataFrame — TODO confirm).
    y : target labels aligned with ``df_X``; the class-weight candidates use
        the labels 0 and 1.
    metric : scoring name forwarded to ``GridSearchCV`` (default ``'recall'``).

    Returns
    -------
    dict
        The best parameter combination found by the search.
    """
    # Create the output folder before the search starts, so a missing
    # directory cannot discard the result of a finished search.
    os.makedirs("savedModels", exist_ok=True)

    param_grid = {
        'C': [1e-5, 1e-3, 1e-1, 1, 100, 1000],
        'class_weight': ['balanced', {0: 1, 1: 2}, {0: 1, 1: 4}, {0: 1, 1: 7}, {0: 1, 1: 10}],
    }

    # NOTE(review): the transformer is fitted on all of df_X before the CV
    # split, so anything it learns also covers each fold's validation rows —
    # confirm this is acceptable.
    transformer = PipeLineColumnTransformer()
    transformer.fit(df_X, y)
    X = transformer.transform(df_X)

    model = LogisticRegression(penalty='l1', solver='saga', n_jobs=-1, random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=metric, n_jobs=-1)
    grid_search.fit(X, y)

    best_params = grid_search.best_params_
    # json.dump writes the integer labels of a class_weight dict as the strings
    # "0"/"1"; whoever reads the file back has to turn them into ints again.
    with open("savedModels/LogisticRegression.json", "w") as f:
        json.dump(best_params, f, indent=4)
    return best_params

def SVMSearch(df_X, y, metric='recall'):
    """Grid-search LinearSVC hyper-parameters and save the best ones.

    The features are passed through a freshly fitted
    ``PipeLineColumnTransformer`` and a 5-fold ``GridSearchCV`` scored with
    ``metric`` is run over ``C``, ``max_iter``, ``tol``, ``penalty`` and
    ``class_weight``. The winning combination is written to
    ``savedModels/SVM.json`` and returned.

    Parameters
    ----------
    df_X : feature table accepted by ``PipeLineColumnTransformer``
        (presumably a pandas DataFrame — TODO confirm).
    y : target labels aligned with ``df_X``; the class-weight candidates use
        the labels 0 and 1.
    metric : scoring name forwarded to ``GridSearchCV`` (default ``'recall'``).

    Returns
    -------
    dict
        The best parameter combination found by the search.
    """
    # Create the output folder before the search starts, so a missing
    # directory cannot discard the result of a finished search.
    os.makedirs("savedModels", exist_ok=True)

    param_grid = {
        'C': [1e-5, 1e-3, 1e-1, 1, 10, 100],
        'max_iter': [1000, 5000, 10000],
        'tol': [1e-3, 1e-5],
        # NOTE(review): with LinearSVC's default loss, penalty='l1' needs
        # dual=False; depending on the scikit-learn version these candidates
        # may fail to fit and be scored as NaN — verify.
        'penalty': ['l2', 'l1'],
        'class_weight': ['balanced', {0: 1, 1: 2}, {0: 1, 1: 4}, {0: 1, 1: 7}, {0: 1, 1: 10}],
    }

    # NOTE(review): the transformer is fitted on all of df_X before the CV
    # split, so anything it learns also covers each fold's validation rows —
    # confirm this is acceptable.
    transformer = PipeLineColumnTransformer()
    transformer.fit(df_X, y)
    X = transformer.transform(df_X)

    model = LinearSVC(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=metric, n_jobs=-1)
    grid_search.fit(X, y)

    best_params = grid_search.best_params_
    # json.dump writes the integer labels of a class_weight dict as the strings
    # "0"/"1"; whoever reads the file back has to turn them into ints again.
    with open("savedModels/SVM.json", "w") as f:
        json.dump(best_params, f, indent=4)
    return best_params

def DecisionTreeSearch(df_X, y, metric='recall'):
    """Grid-search DecisionTreeClassifier hyper-parameters and save the best ones.

    The features are passed through a freshly fitted
    ``PipeLineColumnTransformer`` and a 5-fold ``GridSearchCV`` scored with
    ``metric`` is run over the tree-size, split and class-weight candidates
    below. The winning combination is written to
    ``savedModels/DecisionTree.json`` and returned.

    Parameters
    ----------
    df_X : feature table accepted by ``PipeLineColumnTransformer``
        (presumably a pandas DataFrame — TODO confirm).
    y : target labels aligned with ``df_X``; the class-weight candidates use
        the labels 0 and 1.
    metric : scoring name forwarded to ``GridSearchCV`` (default ``'recall'``).

    Returns
    -------
    dict
        The best parameter combination found by the search.
    """
    # Create the output folder before the search starts, so a missing
    # directory cannot discard the result of a finished search.
    os.makedirs("savedModels", exist_ok=True)

    param_grid = {
        'max_depth': [2, 5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [2, 5, 10],
        'max_features': ['sqrt', 'log2', 0.2, 0.5, 0.8],
        'max_leaf_nodes': [10, 20, 50],
        'class_weight': ['balanced', {0: 1, 1: 2}, {0: 1, 1: 4}, {0: 1, 1: 7}, {0: 1, 1: 10}],
    }

    # NOTE(review): the transformer is fitted on all of df_X before the CV
    # split, so anything it learns also covers each fold's validation rows —
    # confirm this is acceptable.
    transformer = PipeLineColumnTransformer()
    transformer.fit(df_X, y)
    X = transformer.transform(df_X)

    model = DecisionTreeClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=metric, n_jobs=-1)
    grid_search.fit(X, y)

    best_params = grid_search.best_params_
    # json.dump writes the integer labels of a class_weight dict as the strings
    # "0"/"1"; whoever reads the file back has to turn them into ints again.
    with open("savedModels/DecisionTree.json", "w") as f:
        json.dump(best_params, f, indent=4)
    return best_params

def RandomForestSearch(df_X, y, metric='recall'):
    """Grid-search RandomForestClassifier hyper-parameters and save the best ones.

    The features are passed through a freshly fitted
    ``PipeLineColumnTransformer`` and a 5-fold ``GridSearchCV`` scored with
    ``metric`` is run over the forest-size, tree-shape and class-weight
    candidates below. The winning combination is written to
    ``savedModels/RandomForest.json`` and returned.

    Parameters
    ----------
    df_X : feature table accepted by ``PipeLineColumnTransformer``
        (presumably a pandas DataFrame — TODO confirm).
    y : target labels aligned with ``df_X``; the class-weight candidates use
        the labels 0 and 1.
    metric : scoring name forwarded to ``GridSearchCV`` (default ``'recall'``).

    Returns
    -------
    dict
        The best parameter combination found by the search.
    """
    # Create the output folder before the search starts: this grid has
    # 4*3*3*3*5*5 = 2700 combinations per fold, so losing the result to a
    # missing directory at the very end would be costly.
    os.makedirs("savedModels", exist_ok=True)

    param_grid = {
        'n_estimators': [50, 200, 500, 1000],
        'max_depth': [2, 5, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 5, 10],
        'max_features': ['sqrt', 'log2', 0.2, 0.5, 0.8],
        'class_weight': ['balanced', {0: 1, 1: 2}, {0: 1, 1: 4}, {0: 1, 1: 7}, {0: 1, 1: 10}],
    }

    # NOTE(review): the transformer is fitted on all of df_X before the CV
    # split, so anything it learns also covers each fold's validation rows —
    # confirm this is acceptable.
    transformer = PipeLineColumnTransformer()
    transformer.fit(df_X, y)
    X = transformer.transform(df_X)

    model = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=metric, n_jobs=-1)
    grid_search.fit(X, y)

    best_params = grid_search.best_params_
    # json.dump writes the integer labels of a class_weight dict as the strings
    # "0"/"1"; whoever reads the file back has to turn them into ints again.
    with open("savedModels/RandomForest.json", "w") as f:
        json.dump(best_params, f, indent=4)
    return best_params

def NaiveBayesSearch(df_X, y, metric='recall'):
    """Grid-search CategoricalNB hyper-parameters and save the best ones.

    The features are passed through a freshly fitted
    ``PipeLineColumnTransformer(NB=True)`` and a 5-fold ``GridSearchCV`` scored
    with ``metric`` is run over ``alpha``, ``fit_prior`` and
    ``min_categories``. The winning combination is written to
    ``savedModels/CategoricalNB.json`` and returned.

    Parameters
    ----------
    df_X : feature table accepted by ``PipeLineColumnTransformer``
        (presumably a pandas DataFrame — TODO confirm).
    y : target labels aligned with ``df_X``.
    metric : scoring name forwarded to ``GridSearchCV`` (default ``'recall'``).

    Returns
    -------
    dict
        The best parameter combination found by the search.
    """
    # Create the output folder before the search starts, so a missing
    # directory cannot discard the result of a finished search.
    os.makedirs("savedModels", exist_ok=True)

    param_grid = {
        'alpha': [0.01, 0.1, 1.0, 10.0],
        'fit_prior': [True, False],
        'min_categories': [2, 5],
    }

    # NB=True selects the transformer's Naive-Bayes mode (presumably a
    # categorical encoding suitable for CategoricalNB — TODO confirm).
    # NOTE(review): the transformer is fitted on all of df_X before the CV
    # split, so anything it learns also covers each fold's validation rows —
    # confirm this is acceptable.
    transformer = PipeLineColumnTransformer(NB=True)
    transformer.fit(df_X, y)
    X = transformer.transform(df_X)

    model = CategoricalNB()
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=metric, n_jobs=-1)
    grid_search.fit(X, y)

    best_params = grid_search.best_params_
    with open("savedModels/CategoricalNB.json", "w") as f:
        json.dump(best_params, f, indent=4)
    return best_params

In [3]:
def ReadCreatedModel(modelName):
    """Rebuild an unfitted classifier from the hyper-parameters saved for it.

    Reads ``savedModels/{modelName}.json`` and passes its contents as keyword
    arguments to the matching estimator, together with the same fixed
    arguments the corresponding search function uses.

    Parameters
    ----------
    modelName : str
        One of "KNN", "LogisticRegression", "SVM", "DecisionTree",
        "RandomForest" or "CategoricalNB".

    Returns
    -------
    The configured, unfitted estimator.

    Raises
    ------
    FileNotFoundError
        If no parameter file exists for ``modelName``.
    ValueError
        If a parameter file exists but ``modelName`` is not a supported model.
    """
    with open(f"savedModels/{modelName}.json", "r") as f:
        best_params = json.load(f)

    def _restore_label(key):
        # Turn "0"/"1" back into 0/1; leave anything non-numeric untouched.
        try:
            return int(key)
        except (TypeError, ValueError):
            return key

    # JSON object keys are always strings, so a class_weight saved as
    # {0: 1, 1: 4} is read back as {"0": 1, "1": 4}. Those string keys no
    # longer match the integer class labels, so restore them before the
    # parameters reach the estimator.
    class_weight = best_params.get('class_weight')
    if isinstance(class_weight, dict):
        best_params['class_weight'] = {
            _restore_label(label): weight for label, weight in class_weight.items()
        }

    if modelName == "KNN":
        model = KNeighborsClassifier(n_jobs=-1, **best_params)
    elif modelName == "LogisticRegression":
        model = LogisticRegression(penalty='l1', solver='saga', n_jobs=-1, random_state=42, **best_params)
    elif modelName == "SVM":
        model = LinearSVC(random_state=42, **best_params)
    elif modelName == "DecisionTree":
        model = DecisionTreeClassifier(random_state=42, **best_params)
    elif modelName == "RandomForest":
        model = RandomForestClassifier(random_state=42, **best_params)
    elif modelName == "CategoricalNB":
        model = CategoricalNB(**best_params)
    else:
        # Previously this fell through to `return model` with `model` unbound.
        raise ValueError(f"Unsupported model name: {modelName!r}")
    return model
    
def TestingModel(modelName, X_train, X_test, y_train, y_test, threshold):
    """Fit a saved model configuration and report its ROC/AUC and recall on test data.

    The estimator built by ``ReadCreatedModel(modelName)`` is wrapped in
    ``PipelineModel``, fitted on the training split and scored on the test
    split. The ROC curve is plotted, the scores are binarised with
    ``threshold``, ``PredictionQualityInfo`` is called on the resulting
    predictions, and AUC and recall are printed.

    Parameters
    ----------
    modelName : name understood by ``ReadCreatedModel``.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    threshold : scores strictly greater than this value are predicted as 1.

    Returns
    -------
    The fitted ``PipelineModel`` wrapper.
    """
    fitted = PipelineModel(ReadCreatedModel(modelName))
    fitted.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the positive-class score.
    # NOTE(review): LinearSVC (modelName == "SVM") has no predict_proba of its
    # own; unless PipelineModel supplies one this line fails for that model —
    # confirm.
    positive_scores = fitted.predict_proba(X_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    roc_auc = auc(fpr, tpr)
    aucPlot(fpr, tpr, roc_auc)

    y_pred = (np.array(positive_scores) > threshold).astype(int)
    PredictionQualityInfo(y_pred, y_test)
    print(f"AUC: {roc_auc}, Recall: {recall_score(y_test,y_pred)}")
    return fitted


def aucPlot(fpr, tpr, roc_auc):
    """Plot a ROC curve together with the chance diagonal and show the figure.

    Parameters
    ----------
    fpr : false-positive rates (x values of the curve).
    tpr : true-positive rates (y values of the curve).
    roc_auc : area under the curve, shown in the legend with two decimals.
    """
    plt.figure()
    plt.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.2f})")
    # Dashed grey diagonal from (0, 0) to (1, 1) as a visual baseline.
    plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    # Without a legend the curve's label (and with it the AUC value) is never drawn.
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()

def is_number(s):
    """Return True if ``s`` can be converted with ``float()``, otherwise False.

    Values that ``float()`` rejects with a ``TypeError`` (for example ``None``
    or a list) are reported as not-a-number instead of raising.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True

In [4]:
# Load the train and test splits; X_train / y_train are what the search cells
# below pass to the *Search functions.
# NOTE(review): getTrainingData / getTestData are not defined in this notebook —
# presumably supplied by the star import from ColumnTransformers; confirm.
X_train, y_train = getTrainingData()
X_test, y_test = getTestData()

In [None]:
# Tune KNN on the training split. The call writes savedModels/KNN.json and, as
# the cell's last expression, displays the returned best parameters.
KNNSearch(X_train,y_train)

In [None]:
# Tune logistic regression on the training split. A failure is reported but
# does not stop the rest of the notebook from running.
try:
    LogRegSearch(X_train, y_train)
except Exception as err:
    # The message is Polish, roughly "something went wrong".
    print("Cosik nie poszło")
    print(err)

In [None]:
# Run the remaining searches one after another, in the same order as before.
# Each one is isolated so that a failure in one search is reported and the
# next search still runs.
for search_fn in (DecisionTreeSearch, RandomForestSearch, SVMSearch):
    try:
        search_fn(X_train, y_train)
    except Exception as err:
        # The message is Polish, roughly "something went wrong".
        print("Cosik nie poszło")
        print(err)


KeyboardInterrupt: 

In [None]:
# Tune CategoricalNB on the training split. The call writes
# savedModels/CategoricalNB.json and, as the cell's last expression, displays
# the returned best parameters.
NaiveBayesSearch(X_train,y_train)

In [None]:
# WARNING: this cell runs the Windows `shutdown` command with no delay, i.e. it
# powers the machine off as soon as it executes (presumably so an unattended
# overnight run ends with the PC switched off — confirm the intent).
# NOTE(review): "Restart Kernel -> Run All" will therefore shut the machine
# down; consider guarding it behind an explicit opt-in flag.
import os
os.system("shutdown /s /t 0")