In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import pickle

In [2]:
# Function to perform Sequential Feature Selection
def perform_sfs_feature_selection(X, y):
    """Reduce X to six columns with forward Sequential Feature Selection, once per selector model.

    Two estimators drive the selection, in this order: a logistic regression
    and a 5-tree entropy random forest. Each selector is scored by accuracy
    with 5-fold cross-validation.

    Args:
        X: Feature matrix handed to the selector's ``fit`` and ``transform``.
        y: Target values handed to the selector's ``fit``.

    Returns:
        list: One transformed feature matrix per selector model, in the order
        [logistic regression, random forest].
    """
    selector_estimators = (
        LogisticRegression(solver='lbfgs', max_iter=1000),
        RandomForestClassifier(n_estimators=5, criterion='entropy', random_state=0),
    )

    def _reduce_with(estimator):
        # Fit a forward (non-floating) selector, then keep only the chosen columns.
        selector = SFS(estimator, k_features=6, forward=True, floating=False, scoring='accuracy', cv=5)
        selector.fit(X, y)
        return selector.transform(X)

    return [_reduce_with(estimator) for estimator in selector_estimators]

In [3]:

# Function to get feature names after feature selection
def get_sfs_feature_names(X, y):
    """Return the names of the features picked by forward SFS, once per selector model.

    The selectors are configured exactly like those in
    ``perform_sfs_feature_selection`` (same estimators, ``k_features=6``,
    accuracy scoring, 5-fold CV) so that the names reported here describe the
    columns that function returns.

    Args:
        X: Feature matrix handed to the selector's ``fit``.
        y: Target values handed to the selector's ``fit``.

    Returns:
        list[list]: One list of selected feature names per selector model, in
        the order [logistic regression, random forest]. Names are read from
        the fitted selector's ``k_feature_names_`` attribute.
    """
    log_model = LogisticRegression(solver='lbfgs', max_iter=1000)
    # Must match the 5-tree forest used by perform_sfs_feature_selection;
    # a differently sized forest can select different columns, which would
    # make the reported names disagree with the features actually used.
    rf_model = RandomForestClassifier(n_estimators=5, criterion='entropy', random_state=0)

    sfs_feature_names_list = []
    for model in (log_model, rf_model):
        sfs = SFS(model, k_features=6, forward=True, floating=False, scoring='accuracy', cv=5)
        sfs.fit(X, y)
        sfs_feature_names_list.append(list(sfs.k_feature_names_))

    return sfs_feature_names_list

In [4]:
# Function to split and scale the dataset
def split_and_scale_data(X, y):
    """Split the data 75/25 into train and test sets and standardize the features.

    The scaler is fitted on the training features only and then applied to the
    test features, so the test split does not influence the scaling statistics.

    Args:
        X: Feature matrix.
        y: Target values aligned with X.

    Returns:
        tuple: (scaled train features, scaled test features, train targets,
        test targets, fitted StandardScaler).
    """
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, test_size=0.25, random_state=0
    )

    standardizer = StandardScaler()
    scaled_train = standardizer.fit_transform(features_train)
    scaled_test = standardizer.transform(features_test)

    return scaled_train, scaled_test, target_train, target_test, standardizer

In [5]:
# Function to evaluate a classifier
def evaluate_classifier(classifier, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Args:
        classifier: Object exposing ``predict(X_test)``.
        X_test: Test features passed to ``predict``.
        y_test: True labels for the test features.

    Returns:
        tuple: (accuracy, classification report, confusion matrix), each
        computed from ``y_test`` and the classifier's predictions.
    """
    predictions = classifier.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        classification_report(y_test, predictions),
        confusion_matrix(y_test, predictions),
    )

In [6]:
# Function to train Logistic Regression model
def train_logistic_regression(X_train, y_train, X_test, y_test):
    """Fit a logistic regression on the training split and score it on the test split.

    Returns:
        tuple: (fitted classifier, accuracy, report, confusion matrix), the
        last three as produced by ``evaluate_classifier``.
    """
    fitted = LogisticRegression(random_state=0).fit(X_train, y_train)
    test_accuracy, test_report, test_cm = evaluate_classifier(fitted, X_test, y_test)
    return fitted, test_accuracy, test_report, test_cm

In [7]:
# Function to train SVM with linear kernel
def train_svm_linear(X_train, y_train, X_test, y_test):
    """Fit a linear-kernel support vector classifier and score it on the test split.

    Returns:
        tuple: (fitted classifier, accuracy, report, confusion matrix), the
        last three as produced by ``evaluate_classifier``.
    """
    fitted = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
    test_accuracy, test_report, test_cm = evaluate_classifier(fitted, X_test, y_test)
    return fitted, test_accuracy, test_report, test_cm

In [8]:

# Function to train SVM with RBF kernel
def train_svm_rbf(X_train, y_train, X_test, y_test):
    """Fit an RBF-kernel support vector classifier and score it on the test split.

    Returns:
        tuple: (fitted classifier, accuracy, report, confusion matrix), the
        last three as produced by ``evaluate_classifier``.
    """
    fitted = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
    test_accuracy, test_report, test_cm = evaluate_classifier(fitted, X_test, y_test)
    return fitted, test_accuracy, test_report, test_cm

In [9]:
# Function to train Naive Bayes model
def train_naive_bayes(X_train, y_train, X_test, y_test):
    """Fit a Gaussian naive Bayes classifier and score it on the test split.

    Returns:
        tuple: (fitted classifier, accuracy, report, confusion matrix), the
        last three as produced by ``evaluate_classifier``.
    """
    fitted = GaussianNB().fit(X_train, y_train)
    test_accuracy, test_report, test_cm = evaluate_classifier(fitted, X_test, y_test)
    return fitted, test_accuracy, test_report, test_cm

In [10]:
# Function to train K-Nearest Neighbors model
def train_knn(X_train, y_train, X_test, y_test):
    """Fit a 5-nearest-neighbours classifier and score it on the test split.

    The distance is Minkowski with ``p=2``, i.e. Euclidean distance.

    Returns:
        tuple: (fitted classifier, accuracy, report, confusion matrix), the
        last three as produced by ``evaluate_classifier``.
    """
    fitted = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(X_train, y_train)
    test_accuracy, test_report, test_cm = evaluate_classifier(fitted, X_test, y_test)
    return fitted, test_accuracy, test_report, test_cm

In [11]:
# Function to train Decision Tree model
def train_decision_tree(X_train, y_train, X_test, y_test):
    """Fit an entropy-criterion decision tree and score it on the test split.

    Returns:
        tuple: (fitted classifier, accuracy, report, confusion matrix), the
        last three as produced by ``evaluate_classifier``.
    """
    fitted = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)
    test_accuracy, test_report, test_cm = evaluate_classifier(fitted, X_test, y_test)
    return fitted, test_accuracy, test_report, test_cm

In [12]:
# Function to train Random Forest model
def train_random_forest(X_train, y_train, X_test, y_test):
    """Fit a 10-tree entropy-criterion random forest and score it on the test split.

    Returns:
        tuple: (fitted classifier, accuracy, report, confusion matrix), the
        last three as produced by ``evaluate_classifier``.
    """
    fitted = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0).fit(X_train, y_train)
    test_accuracy, test_report, test_cm = evaluate_classifier(fitted, X_test, y_test)
    return fitted, test_accuracy, test_report, test_cm

In [13]:
# Function to compare model accuracies
def compare_models_accuracies(accuracies_logistic, accuracies_svm_linear, accuracies_svm_rbf, accuracies_knn, accuracies_naive, accuracies_decision_tree, accuracies_random_forest):
    """Tabulate classifier accuracies per feature-selection model.

    Each argument is an indexable sequence of accuracies for one classifier;
    element ``i`` belongs to row ``i`` of the table. Rows are labelled
    'Logistic' and 'RandomForest', so every sequence needs at least two
    elements (extra elements are ignored).

    Returns:
        pandas.DataFrame: Rows ['Logistic', 'RandomForest'] and one column per
        classifier ('Logistic', 'SVM_Linear', 'SVM_RBF', 'KNN', 'NaiveBayes',
        'DecisionTree', 'RandomForest') holding the supplied accuracies.

    Raises:
        IndexError: if a list-like argument has fewer than two elements.
    """
    accuracies_by_classifier = {
        'Logistic': accuracies_logistic,
        'SVM_Linear': accuracies_svm_linear,
        'SVM_RBF': accuracies_svm_rbf,
        'KNN': accuracies_knn,
        'NaiveBayes': accuracies_naive,
        'DecisionTree': accuracies_decision_tree,
        'RandomForest': accuracies_random_forest,
    }
    models_df = pd.DataFrame(index=['Logistic', 'RandomForest'], columns=list(accuracies_by_classifier))

    # Write through .loc in a single indexing step. Chained assignment
    # (df[col][row] = value) may operate on a temporary copy and leave the
    # frame unchanged, notably with pandas Copy-on-Write.
    for i, row_label in enumerate(models_df.index):
        for column, accuracies in accuracies_by_classifier.items():
            models_df.loc[row_label, column] = accuracies[i]

    return models_df

In [16]:
def main():
    """Run the end-to-end pipeline: select features, train classifiers, persist the best one.

    Steps:
        1. Load "prep.csv", one-hot encode it (``drop_first=True``) and use the
           ``classification_yes`` column as the target.
        2. Build one reduced feature set per selector model via
           ``perform_sfs_feature_selection``.
        3. For each feature set, split/scale the data and train every classifier.
        4. Keep the classifier with the highest test accuracy (the earliest one
           wins ties) together with the scaler fitted for its feature set.
        5. Pickle the winner to "finalized_model_<name>.sav" and its scaler to
           "scaler.pkl", then reload the model and print a prediction for a
           hard-coded example row.

    Raises:
        RuntimeError: if no classifier reaches an accuracy above 0, so there is
            no model to save.
    """
    # Load dataset and preprocess
    dataset = pd.read_csv("prep.csv")
    df = pd.get_dummies(dataset, drop_first=True)

    X = df.drop('classification_yes', axis=1)
    y = df['classification_yes']

    # Perform feature selection.
    # NOTE(review): selection is fitted on all rows before the train/test split
    # below, so the held-out rows also influence which features are chosen and
    # the reported test accuracy may be optimistic.
    selected_features_list = perform_sfs_feature_selection(X, y)

    # (display name, training function) pairs. The order matters: with the
    # strict ">" comparison below, the earliest entry wins accuracy ties.
    trainers = [
        ("LogisticRegression", train_logistic_regression),
        ("SVM_Linear", train_svm_linear),
        ("SVM_RBF", train_svm_rbf),
        ("KNN", train_knn),
        ("NaiveBayes", train_naive_bayes),
        ("DecisionTree", train_decision_tree),
        ("RandomForest", train_random_forest),
    ]

    best_model = None
    best_accuracy = 0
    best_model_name = ""
    best_scaler = None

    # Train and evaluate every classifier on each selected feature set
    for features in selected_features_list:
        X_train, X_test, y_train, y_test, scaler = split_and_scale_data(features, y)

        for model_name, train in trainers:
            classifier, accuracy, _report, _cm = train(X_train, y_train, X_test, y_test)

            # Remember the model and the scaler fitted for this feature set,
            # since predictions must be scaled the same way.
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = classifier
                best_model_name = model_name
                best_scaler = scaler

    if best_model is None:
        # Fail before writing any file instead of pickling None and crashing
        # later on best_scaler.transform.
        raise RuntimeError("No model achieved an accuracy above 0; nothing to save.")

    print(f"Best Model: {best_model_name}")
    print(f"Accuracy: {best_accuracy}")

    # Save the best model and scaler; context managers close the files even on error
    model_filename = f"finalized_model_{best_model_name.lower()}.sav"
    with open(model_filename, 'wb') as model_file:
        pickle.dump(best_model, model_file)
    with open('scaler.pkl', 'wb') as scaler_file:
        pickle.dump(best_scaler, scaler_file)

    # Example input for prediction.
    # NOTE(review): assumes these six values line up, in order, with the six
    # columns selected for the winning feature set - confirm.
    example_input = best_scaler.transform([[5, 50, 0, 0, 148.1126761,7]])

    # Load the saved model and make a prediction. The file was written just
    # above by this process; do not unpickle files from untrusted sources.
    with open(model_filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    prediction_result = loaded_model.predict(example_input)
    prediction_result = prediction_result.astype(int)

    print("Prediction result:", prediction_result)
    
    
# Entry point: run the full pipeline when this code is executed directly (not on import).
if __name__ == "__main__":
    main()




Best Model: LogisticRegression
Accuracy: 1.0
Prediction result: [1]
