In [None]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator

def train_svm_validation_split(train_data, max_iter=100, reg_param=0.1):
    """
    Fits a linear SVM classifier on the supplied training data.

    The estimator reads labels from the "label" column and feature vectors
    from the "features" column. All of ``train_data`` is passed to ``fit``;
    no further split is made inside this function.

    Args:
        train_data (pyspark.sql.DataFrame): The training dataset.
        max_iter (int): Maximum number of iterations (default: 100).
        reg_param (float): Regularization parameter (default: 0.1).

    Returns:
        pyspark.ml.classification.LinearSVCModel: The fitted SVM model.
    """
    estimator = LinearSVC(
        labelCol="label",
        featuresCol="features",
        maxIter=max_iter,
        regParam=reg_param,
    )
    return estimator.fit(train_data)

def predict_svm_result(model, test_data):
    """
    Scores the testing data with a fitted SVM model.

    Args:
        model (pyspark.ml.classification.LinearSVCModel): The trained SVM model.
        test_data (pyspark.sql.DataFrame): The testing data.

    Returns:
        pyspark.sql.DataFrame: The transformed data reduced to the
        "prediction" and "label" columns, in that order.
    """
    # Keep only the predicted class and the ground-truth label.
    return model.transform(test_data).select("prediction", "label")

def tenfold_svm_tuning_model(data, num_iterations=10, test_fraction=0.1):
    """
    Evaluates the one-model and two-model SVM setups over repeated random splits.

    On every iteration the normal and abnormal groups are each split at
    random into train and test parts, seeded with the iteration index.
    The splits are drawn independently per iteration, so the test sets are
    NOT disjoint folds and may overlap between iterations. With the default
    arguments this performs ten 90/10 splits with seeds 0..9.

    Model 1 is trained on the combined train data using ``feature1``.
    Model 2 consists of two sub-models: 2.1 trained on the normal train data
    with ``feature2_1`` and 2.2 trained on the abnormal train data with
    ``feature2_2``; their test predictions are merged by
    ``combine_predictions``.

    Args:
        data (pyspark.sql.DataFrame): The original dataset.
        num_iterations (int): Number of random train/test splits to evaluate
            (default: 10).
        test_fraction (float): Share of each group held out for testing;
            must lie strictly between 0 and 1 (default: 0.1).

    Returns:
        tuple: A tuple containing two lists:
            - model1_results: A list of tuples (iteration, sensitivity, specificity) for model 1.
            - model2_results: A list of tuples (iteration, sensitivity, specificity) for model 2.
          Iterations are numbered from 1.

    Raises:
        ValueError: If ``num_iterations`` is less than 1 or ``test_fraction``
            is not strictly between 0 and 1.
    """
    # Validate first so bad arguments fail before any Spark work is started.
    if num_iterations < 1:
        raise ValueError(
            "num_iterations must be at least 1, got {}".format(num_iterations)
        )
    if not 0.0 < test_fraction < 1.0:
        raise ValueError(
            "test_fraction must be strictly between 0 and 1, got {}".format(test_fraction)
        )

    # Train weight first, test weight second (0.9 / 0.1 with the defaults).
    split_weights = [1.0 - test_fraction, test_fraction]

    # Separate the dataset into the normal and abnormal groups.
    normal_people, abnormal_people = separate_datasets_by_glucose(data)

    # Feature sets for model 1, model 2.1 and model 2.2 respectively.
    feature1, feature2_1, feature2_2 = retrive_features()

    model1_results = []
    model2_results = []

    for i in range(num_iterations):
        # Same seed for both groups so each iteration is reproducible.
        abnormal_train, abnormal_test = abnormal_people.randomSplit(split_weights, seed=i)
        normal_train, normal_test = normal_people.randomSplit(split_weights, seed=i)

        # Build the combined train set and the combined test set.
        original_train_data = combine_train_datasets(abnormal_train, normal_train)
        main_test_data, dataset_number = combine_testing_datasets(normal_test, abnormal_test)

        # Model 1: trained on the combined data.
        original_train_data = retrieve_selected_features(feature1, original_train_data)
        model1 = train_svm_validation_split(original_train_data)

        # Model 2.1 (normal group) and model 2.2 (abnormal group).
        normal_train = retrieve_selected_features(feature2_1, normal_train)
        model2_1 = train_svm_validation_split(normal_train)

        abnormal_train = retrieve_selected_features(feature2_2, abnormal_train)
        model2_2 = train_svm_validation_split(abnormal_train)

        # Score model 1 on the combined test set.
        test_data_model1 = retrieve_selected_features(feature1, main_test_data)
        prediction1 = predict_svm_result(model1, test_data_model1)
        sensitivity1, specificity1 = calculate_sensitivity_specificity(prediction1)
        model1_results.append((i + 1, sensitivity1, specificity1))

        # Score both sub-models on the combined test set, each with its own
        # feature selection; combine_predictions merges the two outputs
        # using dataset_number.
        test_data_model2_1 = retrieve_selected_features(feature2_1, main_test_data)
        prediction2_1 = predict_svm_result(model2_1, test_data_model2_1)

        test_data_model2_2 = retrieve_selected_features(feature2_2, main_test_data)
        prediction2_2 = predict_svm_result(model2_2, test_data_model2_2)

        prediction2 = combine_predictions(prediction2_1, prediction2_2, dataset_number)
        sensitivity2, specificity2 = calculate_sensitivity_specificity(prediction2)
        model2_results.append((i + 1, sensitivity2, specificity2))

    return model1_results, model2_results

In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

def train_random_forest_validation_split(train_data, max_depth=5, num_trees=100, seed=42):
    """
    Fits a random forest classifier on the supplied training data.

    The estimator reads labels from the "label" column and feature vectors
    from the "features" column. All of ``train_data`` is passed to ``fit``;
    no further split is made inside this function.

    Args:
        train_data (pyspark.sql.DataFrame): The training dataset.
        max_depth (int): Maximum depth of the tree (default: 5).
        num_trees (int): Number of trees to train (default: 100).
        seed (int): Random seed for reproducibility (default: 42).

    Returns:
        pyspark.ml.classification.RandomForestClassificationModel: The fitted random forest model.
    """
    forest_params = {
        "labelCol": "label",
        "featuresCol": "features",
        "maxDepth": max_depth,
        "numTrees": num_trees,
        "seed": seed,
    }
    return RandomForestClassifier(**forest_params).fit(train_data)

def predict_random_forest_result(model, test_data):
    """
    Scores the testing data with a fitted random forest model.

    Args:
        model (pyspark.ml.classification.RandomForestClassificationModel): The trained random forest model.
        test_data (pyspark.sql.DataFrame): The testing data.

    Returns:
        pyspark.sql.DataFrame: The transformed data reduced to the
        "prediction" and "label" columns, in that order.
    """
    scored = model.transform(test_data)
    # Keep only the predicted class and the ground-truth label.
    wanted_columns = ("prediction", "label")
    return scored.select(*wanted_columns)

def tenfold_random_forest_tuning_model(data, num_iterations=10, test_fraction=0.1):
    """
    Evaluates the one-model and two-model random forest setups over repeated random splits.

    On every iteration the normal and abnormal groups are each split at
    random into train and test parts, seeded with the iteration index.
    The splits are drawn independently per iteration, so the test sets are
    NOT disjoint folds and may overlap between iterations. With the default
    arguments this performs ten 90/10 splits with seeds 0..9.

    Model 1 is trained on the combined train data using ``feature1``.
    Model 2 consists of two sub-models: 2.1 trained on the normal train data
    with ``feature2_1`` and 2.2 trained on the abnormal train data with
    ``feature2_2``; their test predictions are merged by
    ``combine_predictions``.

    Args:
        data (pyspark.sql.DataFrame): The original dataset.
        num_iterations (int): Number of random train/test splits to evaluate
            (default: 10).
        test_fraction (float): Share of each group held out for testing;
            must lie strictly between 0 and 1 (default: 0.1).

    Returns:
        tuple: A tuple containing two lists:
            - model1_results: A list of tuples (iteration, sensitivity, specificity) for model 1.
            - model2_results: A list of tuples (iteration, sensitivity, specificity) for model 2.
          Iterations are numbered from 1.

    Raises:
        ValueError: If ``num_iterations`` is less than 1 or ``test_fraction``
            is not strictly between 0 and 1.
    """
    # Validate first so bad arguments fail before any Spark work is started.
    if num_iterations < 1:
        raise ValueError(
            "num_iterations must be at least 1, got {}".format(num_iterations)
        )
    if not 0.0 < test_fraction < 1.0:
        raise ValueError(
            "test_fraction must be strictly between 0 and 1, got {}".format(test_fraction)
        )

    # Train weight first, test weight second (0.9 / 0.1 with the defaults).
    split_weights = [1.0 - test_fraction, test_fraction]

    # Separate the dataset into the normal and abnormal groups.
    normal_people, abnormal_people = separate_datasets_by_glucose(data)

    # Feature sets for model 1, model 2.1 and model 2.2 respectively.
    feature1, feature2_1, feature2_2 = retrive_features()

    model1_results = []
    model2_results = []

    for i in range(num_iterations):
        # Same seed for both groups so each iteration is reproducible.
        abnormal_train, abnormal_test = abnormal_people.randomSplit(split_weights, seed=i)
        normal_train, normal_test = normal_people.randomSplit(split_weights, seed=i)

        # Build the combined train set and the combined test set.
        original_train_data = combine_train_datasets(abnormal_train, normal_train)
        main_test_data, dataset_number = combine_testing_datasets(normal_test, abnormal_test)

        # Model 1: trained on the combined data.
        original_train_data = retrieve_selected_features(feature1, original_train_data)
        model1 = train_random_forest_validation_split(original_train_data)

        # Model 2.1 (normal group) and model 2.2 (abnormal group).
        normal_train = retrieve_selected_features(feature2_1, normal_train)
        model2_1 = train_random_forest_validation_split(normal_train)

        abnormal_train = retrieve_selected_features(feature2_2, abnormal_train)
        model2_2 = train_random_forest_validation_split(abnormal_train)

        # Score model 1 on the combined test set.
        test_data_model1 = retrieve_selected_features(feature1, main_test_data)
        prediction1 = predict_random_forest_result(model1, test_data_model1)
        sensitivity1, specificity1 = calculate_sensitivity_specificity(prediction1)
        model1_results.append((i + 1, sensitivity1, specificity1))

        # Score both sub-models on the combined test set, each with its own
        # feature selection; combine_predictions merges the two outputs
        # using dataset_number.
        test_data_model2_1 = retrieve_selected_features(feature2_1, main_test_data)
        prediction2_1 = predict_random_forest_result(model2_1, test_data_model2_1)

        test_data_model2_2 = retrieve_selected_features(feature2_2, main_test_data)
        prediction2_2 = predict_random_forest_result(model2_2, test_data_model2_2)

        prediction2 = combine_predictions(prediction2_1, prediction2_2, dataset_number)
        sensitivity2, specificity2 = calculate_sensitivity_specificity(prediction2)
        model2_results.append((i + 1, sensitivity2, specificity2))

    return model1_results, model2_results