In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pickle

In [2]:
# Dimensionality reduction via principal component analysis
def perform_pca(X, n_components=6):
    """Fit a PCA model on ``X`` and return the projected data with the model.

    Parameters
    ----------
    X : array-like
        Feature matrix the PCA is fitted on and that gets projected.
    n_components : int, default 6
        Forwarded unchanged to ``PCA(n_components=...)``.

    Returns
    -------
    tuple
        ``(X_pca, pca)``: the output of ``fit_transform`` on ``X`` and the
        fitted ``PCA`` instance, so callers can reuse it on other data.
    """
    reducer = PCA(n_components=n_components)
    return reducer.fit_transform(X), reducer

In [3]:
# Hold out a test split, then standardise both splits
def split_and_scale_data(X, y):
    """Split ``X``/``y`` into train and test parts and standardise the features.

    A quarter of the rows go to the test split (``test_size=0.25``) and the
    split is seeded with ``random_state=0``.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values aligned with ``X``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, scaler)`` where the two feature
        splits have been passed through the returned ``StandardScaler``.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, test_size=0.25, random_state=0
    )

    # The scaler is fitted on the training split only; the test split is
    # merely transformed with the statistics learned from training data.
    standardizer = StandardScaler()
    scaled_train = standardizer.fit_transform(features_train)
    scaled_test = standardizer.transform(features_test)

    return scaled_train, scaled_test, target_train, target_test, standardizer

In [4]:
# Score an already-fitted classifier on held-out data
def evaluate_classifier(classifier, X_test, y_test):
    """Predict on ``X_test`` and summarise the predictions against ``y_test``.

    Parameters
    ----------
    classifier : object
        Any object exposing ``predict(X_test)``.
    X_test : array-like
        Features to predict on.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    tuple
        ``(accuracy, report, cm)``: the accuracy score, the text
        classification report, and the confusion matrix, in that order.
    """
    predicted = classifier.predict(X_test)

    return (
        accuracy_score(y_test, predicted),
        classification_report(y_test, predicted),
        confusion_matrix(y_test, predicted),
    )

In [5]:
# Logistic Regression: fit, then score on the test split
def train_logistic_regression(X_train, y_train, X_test, y_test):
    """Fit a ``LogisticRegression`` (``random_state=0``, ``max_iter=1000``) and evaluate it.

    Returns
    -------
    tuple
        ``(classifier, accuracy, report, cm)`` where the last three values
        come from ``evaluate_classifier`` on ``X_test``/``y_test``.
    """
    model = LogisticRegression(random_state=0, max_iter=1000)
    model.fit(X_train, y_train)
    return (model, *evaluate_classifier(model, X_test, y_test))

In [6]:
# Support Vector Machine (linear kernel): fit, then score on the test split
def train_svm_linear(X_train, y_train, X_test, y_test):
    """Fit an ``SVC`` with ``kernel='linear'`` and ``random_state=0`` and evaluate it.

    Returns
    -------
    tuple
        ``(classifier, accuracy, report, cm)`` where the last three values
        come from ``evaluate_classifier`` on ``X_test``/``y_test``.
    """
    model = SVC(kernel='linear', random_state=0)
    model.fit(X_train, y_train)
    return (model, *evaluate_classifier(model, X_test, y_test))

In [7]:
# Support Vector Machine (RBF kernel): fit, then score on the test split
def train_svm_rbf(X_train, y_train, X_test, y_test):
    """Fit an ``SVC`` with ``kernel='rbf'`` and ``random_state=0`` and evaluate it.

    Returns
    -------
    tuple
        ``(classifier, accuracy, report, cm)`` where the last three values
        come from ``evaluate_classifier`` on ``X_test``/``y_test``.
    """
    model = SVC(kernel='rbf', random_state=0)
    model.fit(X_train, y_train)
    return (model, *evaluate_classifier(model, X_test, y_test))

In [8]:
# Gaussian Naive Bayes: fit, then score on the test split
def train_naive_bayes(X_train, y_train, X_test, y_test):
    """Fit a ``GaussianNB`` with default settings and evaluate it.

    Returns
    -------
    tuple
        ``(classifier, accuracy, report, cm)`` where the last three values
        come from ``evaluate_classifier`` on ``X_test``/``y_test``.
    """
    model = GaussianNB()
    model.fit(X_train, y_train)
    return (model, *evaluate_classifier(model, X_test, y_test))

In [9]:
# K-Nearest Neighbors: fit, then score on the test split
def train_knn(X_train, y_train, X_test, y_test):
    """Fit a ``KNeighborsClassifier`` and evaluate it.

    The classifier is built with ``n_neighbors=5``, ``metric='minkowski'``
    and ``p=2``.

    Returns
    -------
    tuple
        ``(classifier, accuracy, report, cm)`` where the last three values
        come from ``evaluate_classifier`` on ``X_test``/``y_test``.
    """
    model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    model.fit(X_train, y_train)
    return (model, *evaluate_classifier(model, X_test, y_test))

In [10]:
# Decision Tree: fit, then score on the test split
def train_decision_tree(X_train, y_train, X_test, y_test):
    """Fit a ``DecisionTreeClassifier`` (``criterion='entropy'``, ``random_state=0``) and evaluate it.

    Returns
    -------
    tuple
        ``(classifier, accuracy, report, cm)`` where the last three values
        come from ``evaluate_classifier`` on ``X_test``/``y_test``.
    """
    model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    model.fit(X_train, y_train)
    return (model, *evaluate_classifier(model, X_test, y_test))

In [11]:
# Random Forest: fit, then score on the test split
def train_random_forest(X_train, y_train, X_test, y_test):
    """Fit a ``RandomForestClassifier`` and evaluate it.

    The forest is built with ``n_estimators=10``, ``criterion='entropy'``
    and ``random_state=0``.

    Returns
    -------
    tuple
        ``(classifier, accuracy, report, cm)`` where the last three values
        come from ``evaluate_classifier`` on ``X_test``/``y_test``.
    """
    model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    model.fit(X_train, y_train)
    return (model, *evaluate_classifier(model, X_test, y_test))

In [12]:
# Main function to process the dataset and train models
def main():
    """Train every candidate model, keep the most accurate one, and persist it.

    Steps performed:

    1. Read ``prep.csv``, one-hot encode it with ``pd.get_dummies`` and use
       the ``classification_yes`` column as the target.
    2. Split/standardise the data, then project it with a 6-component PCA
       that is fitted on the training split only.
    3. Train each model in ``models``, print its metrics, and remember the
       one with the highest test accuracy (the first one wins on ties).
    4. Pickle the best model, the scaler and the PCA transformer.
    5. Reload the saved model and print its prediction for one hard-coded
       example row.

    Side effects: writes ``finalized_model_<name>.sav``, ``scaler.pkl`` and
    ``pca.pkl`` into the current working directory and prints to stdout.
    """
    # Load dataset and preprocess
    dataset = pd.read_csv("prep.csv")
    df = pd.get_dummies(dataset, drop_first=True)

    X = df.drop('classification_yes', axis=1)
    y = df['classification_yes']

    # Split and scale the data
    X_train, X_test, y_train, y_test, scaler = split_and_scale_data(X, y)

    # Perform PCA (fitted on the training split, then applied to the test split)
    X_train_pca, pca = perform_pca(X_train, n_components=6)
    X_test_pca = pca.transform(X_test)

    # Train and evaluate models
    models = {
        "Logistic Regression": train_logistic_regression,
        "SVM Linear": train_svm_linear,
        "SVM RBF": train_svm_rbf,
        "Naive Bayes": train_naive_bayes,
        "K-Nearest Neighbors": train_knn,
        "Decision Tree": train_decision_tree,
        "Random Forest": train_random_forest
    }

    best_model = None
    best_accuracy = 0
    best_model_name = ""

    for model_name, train_model in models.items():
        classifier, accuracy, report, cm = train_model(X_train_pca, y_train, X_test_pca, y_test)
        print(f"Model: {model_name}")
        print(f"Accuracy: {accuracy}")
        print(f"Confusion Matrix:\n{cm}")
        print(f"Classification Report:\n{report}\n")

        # Update the best model if current model has better accuracy.
        # The `best_model is None` guard guarantees a model is always selected,
        # even in the degenerate case where every accuracy is 0.0.
        if best_model is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = classifier
            best_model_name = model_name

    print(f"Best Model: {best_model_name}")
    print(f"Accuracy: {best_accuracy}")

    # Save the best model, scaler, and PCA transformer.
    # Context managers make sure each file is flushed and closed before it is
    # read back below, instead of relying on garbage collection.
    model_filename = f"finalized_model_{best_model_name.lower().replace(' ', '_')}.sav"
    with open(model_filename, 'wb') as model_file:
        pickle.dump(best_model, model_file)
    with open('scaler.pkl', 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)
    with open('pca.pkl', 'wb') as pca_file:
        pickle.dump(pca, pca_file)

    # Example input for prediction (must match the original feature set)
    example_input = [[76.45994832, 3, 0, 148.1126761, 3.077356021, 137.528754, 4.62724359, 12.51815562, 38.86890244, 8408.191126, 4.705597015, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]]

    # The scaler was fitted on a DataFrame, so give the example row the same
    # column labels; a bare list would trigger scikit-learn's feature-name
    # mismatch warning. This also fails fast if the row length is wrong.
    example_frame = pd.DataFrame(example_input, columns=X.columns)

    # Scale the example input
    example_input_scaled = scaler.transform(example_frame)

    # Apply PCA to the scaled input
    example_input_pca = pca.transform(example_input_scaled)

    # Load the saved model and make a prediction.
    # NOTE: unpickling executes arbitrary code; this is safe only because the
    # file was written by this very function a few lines above.
    with open(model_filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    prediction_result = loaded_model.predict(example_input_pca)
    prediction_result = prediction_result.astype(int)

    print("Prediction result:", prediction_result)

# Entry point: run the whole pipeline when this code is executed directly
# (script or notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()

Model: Logistic Regression
Accuracy: 0.99
Confusion Matrix:
[[36  0]
 [ 1 63]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        36
           1       1.00      0.98      0.99        64

    accuracy                           0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100


Model: SVM Linear
Accuracy: 0.99
Confusion Matrix:
[[36  0]
 [ 1 63]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        36
           1       1.00      0.98      0.99        64

    accuracy                           0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100


Model: SVM RBF
Accuracy: 1.0
Confusion Matrix:
[[36  0]
 [ 0 64]]
Classification Report:
              precision    recall  f1-score   support

     

