In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Import statements

In [13]:
# Import necessary libraries
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, dok_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
from scipy.sparse import csr_matrix
from scipy.sparse import dok_matrix

#SCGIS

In [14]:
import numpy as np
from scipy.sparse import csr_matrix

def train_SCGIS(SX, max_iterations=100, tolerance=1e-5):
    """
    Trains the SCGIS algorithm based on the provided sparse incidence matrix SX.
    The observed supports (OS) are calculated internally based on SX.

    Parameters:
    - SX: Sparse incidence matrix (scipy.sparse.csr_matrix) or pandas DataFrame
    - max_iterations: Maximum number of iterations for convergence
    - tolerance: Convergence tolerance

    Returns:
    - weights: Estimated weights for the frequent itemsets
    - wnorm: Normalization factor
    """
    # Check if the input matrix is empty
    if SX.shape[0] == 0 or SX.shape[1] == 0:
        raise ValueError("Sparse incidence matrix SX is empty")

    # If SX is a DataFrame, convert it to a sparse matrix
    if isinstance(SX, pd.DataFrame):
        SX = csr_matrix(SX)

    # Calculate observed supports (OS) as the sum of each column
    OS = np.array(SX.sum(axis=0)).flatten()  # Sum over rows to get supports for each itemset

    # Initialize weights and expected supports (ES)
    weights = np.zeros(SX.shape[1], dtype=np.float32)
    ES = np.zeros(SX.shape[1], dtype=np.float32)

    for iteration in range(max_iterations):
        # Calculate probabilities
        P = np.exp(SX.dot(weights))
        if np.sum(P) == 0:
            raise ValueError("Sum of probabilities P is zero")

        P /= np.sum(P)  # Normalize the probabilities

        # Update expected supports
        ES.fill(0)
        for i in range(SX.shape[0]):
            ES += P[i] * SX.getrow(i).toarray().flatten()  # Access each row with getrow(i)

        # Update weights
        wold = weights.copy()
        weights += np.log(OS + 1e-10) - np.log(ES + 1e-10)

        # Normalization
        wnorm = np.sum(np.exp(weights))
        if wnorm == 0:
            raise ValueError("Normalization factor wnorm is zero")

        weights /= wnorm

        # Check for convergence
        wprecision = np.max(np.abs(weights - wold))
        if wprecision < tolerance:
            break

    return weights, wnorm


#Categorize->Dice->intervals->missing->binarize->transform->filter

In [15]:
def categorize_column(dataset, dependent_column_name, num_intervals):
    # Fill NaN values with the mean of the column
    dataset_filled = dataset.fillna(dataset[dependent_column_name].mean())

    # Extract the specified dependent column
    value_ag_filled = dataset_filled[dependent_column_name]

    # Define the target range (0-1)
    target_range = (0, 1)

    # Scale to the target range [0, 1] using np.interp
    value_ag_scaled_filled = np.interp(value_ag_filled, (value_ag_filled.min(), value_ag_filled.max()), target_range)

    # Create intervals and assign labels for scaled values
    data_inter_filled = pd.cut(value_ag_scaled_filled, bins=num_intervals, labels=[chr(ord('A') + i) for i in range(num_intervals)])

    # Store intervals in a DataFrame
    result_df = pd.DataFrame({
        'Category': data_inter_filled,
        dependent_column_name: value_ag_scaled_filled
    })

    return result_df

import pandas as pd
import os
import numpy as np

# Define the path to your Temporary_SXmat directory where X_interval files are saved
save_path = '/content/drive/MyDrive/Temporary_SXmat'

# Load your frequent_patterns dataframe (replace with your actual frequent patterns dataframe)
# For example, assuming `support_df` is loaded from a CSV
# support_df = pd.read_csv('path_to_support_patterns.csv')

def calculate_dice_similarity(dataset_df, support_df):
    """
    Calculate the Dice-Sørensen Coefficient (DSC) matrix for a given dataset and frequent patterns.

    Args:
        dataset_df (pd.DataFrame): The dataset dataframe.
        support_df (pd.DataFrame): The frequent patterns dataframe (support patterns).

    Returns:
        pd.DataFrame: A dataframe containing the DSC matrix.
    """
    unique_items = set()
    for pattern in support_df.iloc[:, 0]:  # Assuming patterns are in the first column
        pattern = str(pattern)
        if pd.notna(pattern):
            try:
                items = pattern.split(', ')
                unique_items.update(items)
            except AttributeError:
                print(f"Skipping pattern {pattern} as it's not a valid string.")

    unique_item_mapping = {item: idx for idx, item in enumerate(unique_items)}
    bit_vector_length = len(unique_item_mapping)

    # Convert dataset rows to bit vectors
    def row_to_bit_vector(row, item_mapping, bit_length):
        bit_vector = np.zeros(bit_length, dtype=int)
        for col, value in row.items():
            item = f"{col}={value}"
            if item in item_mapping:
                bit_vector[item_mapping[item]] = 1
        return bit_vector

    dataset_bit_vectors = np.array([row_to_bit_vector(row, unique_item_mapping, bit_vector_length) for _, row in dataset_df.iterrows()])

    # Convert support patterns to bit vectors
    def pattern_to_bit_vector(pattern, item_mapping, bit_length):
        bit_vector = np.zeros(bit_length, dtype=int)
        try:
            pattern = str(pattern)
            for item in pattern.split(', '):
                if item in item_mapping:
                    bit_vector[item_mapping[item]] = 1
        except AttributeError:
            print(f"Skipping pattern {pattern} as it's not a valid string.")
        return bit_vector

    support_pattern_bit_vectors = np.array([
        pattern_to_bit_vector(pattern, unique_item_mapping, bit_vector_length)
        for pattern in support_df.iloc[:, 0]  # Assuming patterns are in the first column
        if pd.notna(pattern)
    ])

    # Initialize the DSC matrix with the correct shape
    dsc_matrix = np.zeros((len(dataset_bit_vectors), len(support_pattern_bit_vectors)))

    # Calculate DSC for each combination of dataset row and support pattern
    for i, data_vector in enumerate(dataset_bit_vectors):
        for j, pattern_vector in enumerate(support_pattern_bit_vectors):
            intersection = np.sum(data_vector & pattern_vector)
            union_cardinality = np.sum(data_vector) + np.sum(pattern_vector)
            if union_cardinality > 0:
                dsc_matrix[i, j] = (2 * intersection) / union_cardinality
            else:
                dsc_matrix[i, j] = 0

    # Create a dataframe with the DSC matrix
    dsc_df = pd.DataFrame(dsc_matrix, columns=[f'Pattern_{k}' for k in range(len(support_pattern_bit_vectors))])
    return dsc_df

def create_intervals_df(data_frame, column_name, num_intervals):
    # Extract the specified column
    value_ag = data_frame[column_name]

    #print("Extracted column values:")
    #print(value_ag)

    # Check if the column contains numeric data
    if not pd.api.types.is_numeric_dtype(value_ag):
        raise ValueError(f"The '{column_name}' column must contain numeric data.")

    # Check for missing values in the column
    if value_ag.isnull().any():
        raise ValueError(f"The '{column_name}' column contains missing values. Please handle them before processing.")

    # Check if scaling and clipping is necessary
    if value_ag.min() < 0 or value_ag.max() > 1:
        # Rescale to the range [0, 1]
        value_ag_scaled = (value_ag - value_ag.min()) / (value_ag.max() - value_ag.min())

        # Clip values to the range [0, 1]
        value_ag_scaled_clipped = np.clip(value_ag_scaled, 0, 1)
    else:
        # No need to rescale or clip
        value_ag_scaled_clipped = value_ag

    #print("Scaled and clipped column values:")
    #print(value_ag_scaled_clipped)

    # Calculate bin edges dynamically within the [0, 1] range
    bin_edges = np.linspace(0, 1, num=num_intervals + 1)

    # Create intervals and assign labels for clipped values
    data_inter_clipped = pd.cut(value_ag_scaled_clipped.values, bins=bin_edges, labels=[chr(ord('A') + i) for i in range(len(bin_edges) - 1)])

    #print("Computed intervals:")
    #print(data_inter_clipped)

    # Store intervals in a DataFrame
    intervals_df = pd.DataFrame({
        'Category': data_inter_clipped.categories,
        'Min_Value': [value_ag_scaled_clipped[data_inter_clipped == category].min() for category in data_inter_clipped.categories],
        'Max_Value': [value_ag_scaled_clipped[data_inter_clipped == category].max() for category in data_inter_clipped.categories]
    })

    return intervals_df



def handle_missing_values(dataset):
    # Check if there are missing values in the dataset
    if dataset.isnull().any().any():
        # Identify numerical and categorical columns
        numerical_cols = dataset.select_dtypes(include=['number']).columns
        categorical_cols = dataset.select_dtypes(exclude=['number']).columns

        # Impute numerical columns with mean
        imputer_numeric = SimpleImputer(strategy='mean')
        dataset[numerical_cols] = imputer_numeric.fit_transform(dataset[numerical_cols])

        # Impute categorical columns with the most frequent value (mode)
        imputer_categorical = SimpleImputer(strategy='most_frequent')
        dataset[categorical_cols] = imputer_categorical.fit_transform(dataset[categorical_cols])

        return dataset
    else:
        # If no missing values, return the original dataset
        return dataset

def binarize_dataframe(input_df, num_bins):
    df = pd.DataFrame(input_df)
    df_binarized = pd.DataFrame()

    for column in df.columns:
        column_name = column+'_binarized'
        bins = pd.qcut(df[column], q=num_bins, labels=False, duplicates='drop')
        df_binarized[column_name] = (bins == bins.max()).astype(int)

    return df_binarized

def transform_matrix_dice(matrix):
    # Create a copy of the input matrix
    transformed_matrix = matrix.copy()
    print(transformed_matrix)
    # Apply the transformation
    transformed_matrix[transformed_matrix > 0.40] = 1
    transformed_matrix[transformed_matrix <= 0.40] = 0
    return transformed_matrix

def transform_matrix_dice(matrix):
    # Create a copy of the input matrix
    transformed_matrix = matrix.copy()
    #print("Inside Transformed")
    # Apply the transformation
    transformed_matrix[transformed_matrix > 0.40] = 1
    transformed_matrix[transformed_matrix <= 0.40] = 0
    return transformed_matrix


def filter_data_for_interval(df, category_col, category):
    # Filter data for a specific interval category
    return df[df[category_col] == category]

def transform_matrix_for_interval(X_interval, support_df):
    jaccard_result = calculate_dice_similarity(X_interval, support_df)
    #print("Inside Transform")
    #print(jaccard_result.shape)
    transformed_matrix = transform_matrix_dice(jaccard_result)
    print(transformed_matrix.shape)
    return transformed_matrix

#Bayesian model for FP(with interestingness)

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Binarizer
from scipy.special import logsumexp

def find_separate_itemsets_for_measures(dataset_path, freq_pattern_path, drop_column=None, max_iterations=100, convergence_threshold=1e-4, thresholds=None):
    # Load dataset and frequent patterns
    dataset = pd.read_csv(dataset_path)
    freq_patterns = pd.read_csv(freq_pattern_path, sep='\t')

    # Default thresholds if not provided
    if thresholds is None:
        thresholds = {
            "support": 0.01,
            "confidence": 0.000000000001,
            "lift": 0.01,
            "leverage": 0.0,
            "jaccard": 0.5,
            "cosine": 0.5
        }

    # Preprocess dataset - drop column if specified
    if drop_column:
        X = dataset.drop(columns=[drop_column])
    else:
        X = dataset  # If no column is specified to drop, use the full dataset

    binary_data = Binarizer().fit_transform(X)

    # Initialize itemset probabilities pi_s for each pattern
    pi = np.ones(len(freq_patterns))  # Initialize all itemsets with probability 1 (uniform prior)

    # Helper functions for Bayesian Mixture Model

    def e_step(data, pi):
        epsilon = 1e-10  # Small value to avoid log of zero
        log_resp = np.zeros((data.shape[0], len(pi)))  # Initialize the responsibilities
        for i, pattern in enumerate(freq_patterns['Itemsets']):
            # Parse the pattern to get item columns
            pattern_items = [item.split('=')[0] for item in pattern.split()]
            pattern_columns = [col for col in X.columns if col in pattern_items]
            if not pattern_columns:
                continue  # Skip patterns that don't match any columns

            # Indicator function: 1 if itemset is present, 0 if not
            si = np.zeros(data.shape[0])
            for j, row in enumerate(data):
                si[j] = 1 if all(row[X.columns.get_loc(col)] for col in pattern_columns) else 0

            # Calculate log-probability of the data under each itemset
            log_prob = si * np.log(pi[i] + epsilon) + (1 - si) * np.log(1 - pi[i] + epsilon)
            log_resp[:, i] = log_prob

        # Normalize the responsibilities across all itemsets (log-sum-exp trick)
        log_resp -= logsumexp(log_resp, axis=1)[:, np.newaxis]
        return np.exp(log_resp)

    def m_step(data, responsibilities):
        N, D = data.shape
        Nk = responsibilities.sum(axis=0)
        pi_new = Nk / N  # Update component priors (probabilities of each itemset)
        return pi_new

    # Run EM algorithm until convergence or maximum iterations
    for iteration in range(max_iterations):
        prev_pi = pi.copy()

        # Perform E-step and M-step
        responsibilities = e_step(binary_data, pi)
        pi = m_step(binary_data, responsibilities)

        # Check for convergence
        delta_pi = np.linalg.norm(pi - prev_pi)
        if delta_pi < convergence_threshold:
            print(f"Converged after {iteration + 1} iterations.")
            break
    else:
        print("Reached maximum iterations without convergence.")

    # Initialize separate results for each interestingness measure
    support_results = []
    confidence_results = []
    lift_results = []
    leverage_results = []
    jaccard_results = []
    cosine_results = []

    # Calculate interestingness measures for each pattern
    for _, row in freq_patterns.iterrows():
        pattern_string = row['Itemsets']

        # Parse the pattern string into column names and values
        pattern_items = [
            item.strip().split('=')[0]  # Extract the attribute (column name)
            for item in pattern_string.split()  # Split by spaces
            if '=' in item  # Ignore parts without '=' (e.g., "count=8")
        ]

        # Match column names in the dataset
        pattern_columns = [col for col in X.columns if col in pattern_items]
        if not pattern_columns:
            continue  # Skip patterns with no matching columns

        # Calculate support
        support_count = (binary_data[:, [X.columns.get_loc(col) for col in pattern_columns]].sum(axis=1)
                         == len(pattern_columns)).sum()
        support = support_count / binary_data.shape[0]

        # Confidence (Assuming a simple rule A -> B)
        if len(pattern_columns) > 1:
            A, B = pattern_columns[:-1], pattern_columns[-1]
            support_A = (binary_data[:, [X.columns.get_loc(col) for col in A]].sum(axis=1) == len(A)).sum() / binary_data.shape[0]
            confidence = support / (support_A + 1e-10)
        else:
            confidence = np.nan

        # Lift
        if len(pattern_columns) > 1:
            support_B = (binary_data[:, X.columns.get_loc(B)] == 1).sum() / binary_data.shape[0]
            lift = support / (support_A * support_B + 1e-10)
        else:
            lift = np.nan

        # Leverage
        leverage = support - (support_A * support_B) if len(pattern_columns) > 1 else np.nan

        # Jaccard Index
        jaccard = support / (support_A + support_B - support) if len(pattern_columns) > 1 else np.nan

        # Cosine Similarity
        cosine = support / (support_A * support_B + 1e-10) if len(pattern_columns) > 1 else np.nan

        # Store results if they exceed the threshold
        if support >= thresholds['support']:
            support_results.append({"Pattern": pattern_string, "Support": support})
        if confidence >= thresholds['confidence']:
            confidence_results.append({"Pattern": pattern_string, "Confidence": confidence})
        if lift >= thresholds['lift']:
            lift_results.append({"Pattern": pattern_string, "Lift": lift})
        if leverage >= thresholds['leverage']:
            leverage_results.append({"Pattern": pattern_string, "Leverage": leverage})
        if jaccard >= thresholds['jaccard']:
            jaccard_results.append({"Pattern": pattern_string, "Jaccard": jaccard})
        if cosine >= thresholds['cosine']:
            cosine_results.append({"Pattern": pattern_string, "Cosine": cosine})

    # Convert results to DataFrames
    support_df = pd.DataFrame(support_results)
    confidence_df = pd.DataFrame(confidence_results)
    lift_df = pd.DataFrame(lift_results)
    leverage_df = pd.DataFrame(leverage_results)
    jaccard_df = pd.DataFrame(jaccard_results)
    cosine_df = pd.DataFrame(cosine_results)

    return support_df, confidence_df, lift_df, leverage_df, jaccard_df, cosine_df

#Density->process_interval

In [17]:
def calculate_density(matrix):
    num_ones = matrix.nnz if isinstance(matrix, csr_matrix) else np.count_nonzero(matrix)
    total_elements = matrix.shape[0] * matrix.shape[1]
    density = num_ones / total_elements
    return density

# def process_interval(row):
#     category = row['Category']
#     interval_min = row['Min_Value']
#     interval_max = row['Max_Value']

#     filtered_data = filter_data_for_interval(Data_discretized, 'Category', category)
#     if not filtered_data.empty:
#         X_interval = filtered_data.drop(columns=['Category'])
#         print(X_interval.shape)
#         val = calculate_dice_similarity(X_interval, support_df)
#         #print(val.shape)
#         val1 = transform_matrix_for_interval(X_interval, support_df)
#         SXmat = val1
#         SXmat = pd.DataFrame(SXmat)
#         # Calculate Density
#         density = calculate_density(SXmat)
#         print(f"Density of incidence matrix for this interval (Category {category}): {density}")
#         weights, w_norm = fis_algorithm_with_auxiliary(SXmat)
#         return category, {'weights': weights, 'wnorm': w_norm}
#     else:
#         print(f"No data for interval {category}")
#         return category, None

def process_interval(row):
    category = row['Category']
    interval_min = row['Min_Value']
    interval_max = row['Max_Value']

    filtered_data = filter_data_for_interval(Data_discretized, 'Category', category)
    if not filtered_data.empty:
        X_interval = filtered_data.drop(columns=['Category'])
        print(X_interval.shape)
        val = calculate_dice_similarity(X_interval, support_df)
        print(val.shape)
        val1 = transform_matrix_for_interval(X_interval, support_df)
        SXmat = val1
        SXmat = pd.DataFrame(SXmat)
        print(SXmat.head(1))
        df_SXmat = pd.DataFrame(SXmat)
        sxmat_file_path = os.path.join(save_path, f'SXmat_Category_{category}.csv')
        df_SXmat.to_csv(sxmat_file_path, index=False)
        print(f"SXmat for Category {category} saved at {sxmat_file_path}")
        # Calculate Density
        density = calculate_density(SXmat)
        print(f"Density of incidence matrix for this interval (Category {category}): {density}")
        weights, w_norm = train_SCGIS(SXmat)
        print(weights.shape)
        #print(wnorm.shape)
        return category, {'weights': weights, 'wnorm': w_norm}
    else:
        print(f"No data for interval {category}")
        return category, None



#Process_test_dataset

In [18]:
# def process_test_data(Data_test, support_df, categories_weights, constant):
#     # Create a temporary DataFrame as a copy of Data_test
#     Data_test_temp = Data_test.copy()

#     # Iterate through each row in Data_test_temp
#     for index, row in Data_test_temp.iterrows():
#         # Convert the row to a DataFrame for compatibility with transform_matrix_for_interval
#         row_df = pd.DataFrame([row])

#         # Calculate the Jaccard/Dice similarity for the current row with support_df
#         jaccard_result = calculate_dice_similarity_demo(row_df, support_df)

#         # Directly apply binarization with a threshold of 0.4
#         transformed_row = jaccard_result.copy()
#         transformed_row[transformed_row > 0.4] = 1
#         transformed_row[transformed_row <= 0.4] = 0

#         # Flatten the transformed_row to ensure it is 1D (1, 87) -> (87,)
#         transformed_row_flat = transformed_row.values.flatten()

#         max_value = 0.0  # Initialize the max value
#         predicted_category = None  # Store the category with the highest value

#         # Iterate over each category and calculate the weighted product
#         for category, params in categories_weights.items():
#             # Ensure the structure is valid
#             if 'weights' in params and 'wnorm' in params:
#                 weights = params['weights']  # Extract weights for the category
#                 wnorm = params['wnorm']      # Extract normalization value

#                 # Reshape weights to (1, 87) to match the transformed_row shape (1, 87)
#                 weights_reshaped = weights.reshape(1, -1)

#                 # Ensure both weights and transformed_row are compatible (both should be (1, 87))
#                 if weights_reshaped.shape != transformed_row.shape:
#                     print(f"Shape mismatch between weights and transformed_row for category {category}")
#                     continue

#                 product = weights_reshaped * transformed_row  # Element-wise multiplication
#                 # Sum the product over all columns (axis=1 sums across columns)
#                 product_sum = product.sum(axis=1)  # This will give a single value
#                 # Calculate the final product with the normalization value and constant
#                 final_product = (product_sum + wnorm) * constant
#                 final_product = final_product.item()
#                 # Check if the current product is higher than max_value
#                 if final_product > max_value:
#                     max_value = final_product
#                     predicted_category = category  # Store the category with the highest product

#         # After processing all categories for the current row, add the predicted category to Data_test_temp
#         Data_test_temp.loc[index, 'predicted_category'] = predicted_category

#     # Return the updated DataFrame with the predicted category column
#     return Data_test_temp

In [26]:
def process_test_data(Data_test, support_df, categories_weights, constant):
    # Create a temporary DataFrame as a copy of Data_test
    Data_test_temp = Data_test.copy()

    # Iterate through each row in Data_test_temp
    for index, row in Data_test_temp.iterrows():
        # Convert the row to a DataFrame for compatibility with transform_matrix_for_interval
        row_df = pd.DataFrame([row])

        # Calculate the Jaccard/Dice similarity for the current row with support_df
        jaccard_result = calculate_dice_similarity(row_df, support_df)

        # Directly apply binarization with a threshold of 0.4
        transformed_row = jaccard_result.copy()
        transformed_row[transformed_row > 0.4] = 1
        transformed_row[transformed_row <= 0.4] = 0

        # Flatten the transformed_row to ensure it is 1D (1, 87) -> (87,)
        transformed_row_flat = transformed_row.values.flatten()

        max_value = 0.0  # Initialize the max value
        predicted_category = None  # Store the category with the highest value

        # Iterate over each category and calculate the weighted product
        for category, params in categories_weights.items():
            # Ensure the structure is valid
            if 'weights' in params and 'wnorm' in params:
                weights = params['weights']  # Extract weights for the category
                wnorm = params['wnorm']      # Extract normalization value

                # Reshape weights to (1, 87) to match the transformed_row shape (1, 87)
                weights_reshaped = weights.reshape(1, -1)

                # Ensure both weights and transformed_row are compatible (both should be (1, 87))
                if weights_reshaped.shape != transformed_row.shape:
                    print(f"Shape mismatch between weights and transformed_row for category {category}")
                    continue

                # Raise each weight to the power of the corresponding binary value in transformed_row
                product = weights_reshaped ** transformed_row  # Element-wise exponentiation

                # Sum the product over all columns (axis=1 sums across columns)
                product_sum = product.sum(axis=1)  # This will give a single value

                # Calculate the final product with the normalization value and constant
                final_product = (product_sum + wnorm) * constant
                final_product = final_product.item()

                # Check if the current product is higher than max_value
                if final_product > max_value:
                    max_value = final_product
                    predicted_category = category  # Store the category with the highest product

        # After processing all categories for the current row, add the predicted category to Data_test_temp
        Data_test_temp.loc[index, 'predicted_category'] = predicted_category

    # Return the updated DataFrame with the predicted category column
    return Data_test_temp


#RMSE calculation

In [20]:
import numpy as np

def calculate_rmse_category(X_test, intervals_df, updated_data):
    sum_squared_diff = 0.0
    sum_absolute_diff = 0.0
    actual_values = []
    predicted_values = []

    # Iterate through the test dataset to calculate RMSE, MAE, and collect values for PCC
    for i, row in X_test.iterrows():
        actual_category = row['Category']  # Actual category
        predicted_category = updated_data.loc[i, 'predicted_category']  # Predicted category from updated_data
        #print(f"Predicted Category: {predicted_category}")

        # Get the actual 'Max_Value' based on the actual category
        actual_max_value = intervals_df[intervals_df['Category'] == actual_category]['Max_Value'].values

        # Check if predicted_category exists in intervals_df
        if predicted_category in intervals_df['Category'].values:
            predicted_max_value = intervals_df[intervals_df['Category'] == predicted_category]['Max_Value'].values
        else:
            # Handle case where predicted category is not found in intervals_df
            #print(f"Predicted category '{predicted_category}' not found in intervals_df.")
            predicted_max_value = np.nan  # Set predicted_max_value to NaN or handle it as needed

        #print(f"Actual Max Value: {actual_max_value}, Predicted Max Value: {predicted_max_value}")

        # Ensure the actual max value exists and predicted max value is not NaN
        if len(actual_max_value) > 0:
            actual_max_value = actual_max_value[0]

            # Handle NaN values in actual_max_value or predicted_max_value
            if np.isnan(actual_max_value) or np.isnan(predicted_max_value):
                #print(f"Skipping pair due to NaN value (Actual: {actual_max_value}, Predicted: {predicted_max_value})")

                # Option 1: Set difference to 0 if both are NaN
                squared_diff = 0
                absolute_diff = 0
            else:
                # Calculate squared difference for RMSE
                squared_diff = (predicted_max_value - actual_max_value) ** 2
                absolute_diff = abs(predicted_max_value - actual_max_value)

            sum_squared_diff += squared_diff
            sum_absolute_diff += absolute_diff

            # Collect values for PCC calculation
            actual_values.append(actual_max_value)
            predicted_values.append(predicted_max_value)

    # Ensure we have data to calculate the metrics
    if len(X_test) > 0:
        # Calculate RMSE
        mean_squared_error = sum_squared_diff / len(X_test)
        rmse = np.sqrt(mean_squared_error).item()  # Convert numpy array to scalar

        # Calculate MAE
        mae = (sum_absolute_diff / len(X_test)).item()  # Convert numpy array to scalar

        return rmse, mae

    # Return None if there are no rows in X_test
    return None, None


#Check dataset

In [21]:
import pandas as pd
# Load Dataset while skipping the 'datetime' column
dataset = "/content/drive/MyDrive/socmob_transformed.csv"
concrete = pd.read_csv(dataset)
concrete.head(2)

Unnamed: 0,id,fathers_occupation,sons_occupation,family_structure,race,counts_for_sons_first_occupation,counts_for_sons_current_occupation
0,1,12,12,0,1,22.9,31.3
1,2,12,11,0,1,96.2,86.6


#Calling the function with dataset input

In [22]:
import pandas as pd
# Load Dataset while skipping the 'datetime' column
dataset = "/content/drive/MyDrive/bolts.csv"
concrete = pd.read_csv(dataset)
dependent = "T20BOLT"
concrete_temp = concrete.copy()
X = concrete_temp.drop(dependent, axis=1).copy()
y = concrete[dependent]
# Usage example:
dataset_path = "/content/drive/MyDrive/bolts.csv"
freq_pattern_path = '/content/drive/MyDrive/AprioriOutput/closed_itemsets_bolts.csv'
drop_column = 'T20BOLT'  # Example of the column to drop
support_df, confidence_df, lift_df, leverage_df, jaccard_df, cosine_df = find_separate_itemsets_for_measures(dataset_path, freq_pattern_path, drop_column=drop_column)
support_df = pd.read_csv(freq_pattern_path,sep='\t')
#support_df = support_df.drop(columns=['Support'])
# Handle Missing Values
X = handle_missing_values(X)
# Binarize DataFrame
num_bins = 15
df_binarized = binarize_dataframe(pd.DataFrame(concrete), num_bins)
# Create Intervals DataFrame
num_intervals = 4
intervals_df = create_intervals_df(concrete, dependent, num_intervals)
print(intervals_df)
# Categorize Column
result_df = categorize_column(concrete, dependent, num_intervals)
P_c = 1 / num_intervals
P_x = 1 / concrete.shape[1]
constant = P_c / P_x
Data_discretized = pd.read_csv('/content/drive/MyDrive/Discretized_datasets/discretized_bolts.csv')
Data_discretized['Category'] = result_df['Category']

Converged after 4 iterations.
  Category  Min_Value  Max_Value
0        A        NaN        NaN
1        B   0.420886   0.462025
2        C   0.515823   0.750000
3        D   0.750316   1.000000


In [23]:
# Calculate Jaccard Similarity and Train GIS for Each Interval
categories_weights = {}
categories = Data_discretized['Category'].unique()
# Use ThreadPoolExecutor to run computations in parallel
with ThreadPoolExecutor() as executor:
    # Create a list of futures
    futures = {executor.submit(process_interval, row): row for _, row in intervals_df.iterrows()}
    # Process completed futures
    for future in concurrent.futures.as_completed(futures):
        category, weights_wnorm = future.result()
        if weights_wnorm is not None:
            categories_weights[category] = weights_wnorm

(1, 6)(2, 6)

(1, 138)
(2, 138)
(1, 138)(837, 6)

   Pattern_0  Pattern_1  Pattern_2  Pattern_3  Pattern_4  Pattern_5  \
0        0.0        0.0        0.0        0.0        0.0        0.0   

   Pattern_6  Pattern_7  Pattern_8  Pattern_9  ...  Pattern_128  Pattern_129  \
0        0.0        0.0        0.0        0.0  ...          1.0          0.0   

   Pattern_130  Pattern_131  Pattern_132  Pattern_133  Pattern_134  \
0          1.0          0.0          0.0          0.0          1.0   

   Pattern_135  Pattern_136  Pattern_137  
0          0.0          1.0          1.0  

[1 rows x 138 columns]
(2, 138)
   Pattern_0  Pattern_1  Pattern_2  Pattern_3  Pattern_4  Pattern_5  \
0        0.0        0.0        0.0        0.0        0.0        0.0   

   Pattern_6  Pattern_7  Pattern_8  Pattern_9  ...  Pattern_128  Pattern_129  \
0        0.0        0.0        0.0        0.0  ...          1.0          0.0   

   Pattern_130  Pattern_131  Pattern_132  Pattern_133  Pattern_134  \
0          1

In [24]:
# from google.colab import drive
# drive.mount('/content/drive')

In [27]:
# Split the data into training and testing datasets for model evaluation
Data_train, Data_test = train_test_split(Data_discretized, test_size=0.2, random_state=5)
# Example of calling the function
updated_data = process_test_data(Data_test, support_df, categories_weights, constant)
# Calculate the root mean squared error (RMSE) for the test data
rmse = calculate_rmse_category(Data_test,intervals_df,updated_data)  # Use a custom RMSE calculation function
print("RMSE:", rmse)

RMSE: (0.13546642516684076, 0.07240831942692011)


In [None]:
from sklearn.model_selection import train_test_split

def find_best_rmse_random_state(Data_discretized, intervals_df, support_df, categories_weights, constant):
    best_random_state = None
    min_rmse = float('inf')  # Initialize with a large value to ensure any RMSE is smaller

    # Iterate through random states from 5 to 200
    for random_state in range(3, 101):
        # Split the data into training and testing datasets
        Data_train, Data_test = train_test_split(Data_discretized, test_size=0.2, random_state=random_state)

        # Process the test data
        updated_data = process_test_data(Data_test, support_df, categories_weights, constant)

        # Calculate the root mean squared error (RMSE) and MAE
        rmse, mae = calculate_rmse_category(Data_test, intervals_df, updated_data)

        # Compare only the first value (RMSE) from the tuple
        if rmse is not None and rmse < min_rmse:
            min_rmse = rmse
            min_mae = mae
            best_random_state = random_state

    return best_random_state, min_rmse,min_mae

# Example usage
best_random_state, min_rmse,min_mae = find_best_rmse_random_state(Data_discretized, intervals_df, support_df, categories_weights, constant)
print(f"Best Random State: {best_random_state}, RMSE: {min_rmse,min_mae}")
