In [4]:
import pandas as pd

# Load the CSV file
creditcard_df = pd.read_csv('creditcard.csv' )

preprocess and quantile-based discretization and then convert to catgorical variable

In [5]:
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np

# data pre-processing

# Handle missing values
creditcard_df.dropna(inplace=True)  # Drop rows with missing values


# Drop the "Time" column because it is unnecessary
creditcard_df.drop(columns='Time', inplace=True)

# Use quantile-based discretization
discretizer = KBinsDiscretizer(n_bins=20,  strategy='quantile')  # Use quantile-based discretization

# Specify the numerical columns to be discretized
num_cols = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']

# Perform data discretization for each numerical feature
# Loop through each numerical column and discretize
for col in num_cols:
    # Extract the column values as a 2D numpy array
    col_values = creditcard_df[col].values.reshape(-1, 1)
    
    # Apply KBinsDiscretizer and get the transformed feature
    discretized_feature = discretizer.fit_transform(col_values)
    
    # Convert the sparse matrix to a dense numpy array
    discretized_feature = discretized_feature.toarray()
    
    # Create new column names for the discretized feature
    col_names = [col + '_bin_' + str(i) for i in range(discretized_feature.shape[1])]
    
    # Create a DataFrame from the discretized feature
    discretized_df = pd.DataFrame(discretized_feature.astype(int), columns=col_names)
    
    # Concatenate the discretized_df to creditcard_df
    creditcard_df = pd.concat([creditcard_df, discretized_df], axis=1)

# Drop the original numerical columns
creditcard_df.drop(columns=num_cols, inplace=True)

# Display the updated creditcard_df
print(creditcard_df.head())

   Class  V1_bin_0  V1_bin_1  V1_bin_2  V1_bin_3  V1_bin_4  V1_bin_5  \
0    0.0         0         0         0         1         0         0   
1    0.0         0         0         0         0         0         0   
2    0.0         0         0         0         1         0         0   
3    0.0         0         0         0         0         0         1   
4    0.0         0         0         0         0         1         0   

   V1_bin_6  V1_bin_7  V1_bin_8  ...  Amount_bin_10  Amount_bin_11  \
0         0         0         0  ...              0              0   
1         0         0         0  ...              0              0   
2         0         0         0  ...              0              0   
3         0         0         0  ...              0              0   
4         0         0         0  ...              0              0   

   Amount_bin_12  Amount_bin_13  Amount_bin_14  Amount_bin_15  Amount_bin_16  \
0              0              0              0              0     

In [7]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


# Determine the minimum and maximum frequency values within the desired range
min_freq = 400
max_freq = 1000
min_support = min_freq / len(creditcard_df)
max_support = max_freq / len(creditcard_df)

# Define the frequent itemsets using Apriori algorithm
frequent_itemsets = apriori(creditcard_df, min_support=min_support, use_colnames=True)

# Filter the frequent itemsets based on length of 3 or 4
frequent_itemsets = frequent_itemsets[(frequent_itemsets['itemsets'].apply(len) == 3) | (frequent_itemsets['itemsets'].apply(len) == 4)]

# Filter the frequent itemsets based on support threshold
frequent_itemsets = frequent_itemsets[(frequent_itemsets['support'] >= min_support) & (frequent_itemsets['support'] <= max_support)]

# Generate the association rules from the frequent itemsets (only support values)
rules = association_rules(frequent_itemsets, metric="support", support_only=True)

# Filter the rules based on left side length of 2 or 3
rules = rules[(rules['antecedents'].apply(len) == 2) | (rules['antecedents'].apply(len) == 3)]

# Filter the rules based on right side being the desired class
rules = rules[rules['consequents'] == frozenset({'Class_1'})]

# Print the filtered association rules
print(rules)


Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []


In [14]:
print(rules.head())
# Generate association rules with support only
rules = association_rules(frequent_itemsets, metric="support", min_threshold=0.01, support_only=True)

# Filter rules with antecedent length of 3 or 4
rules_3 = rules[rules['antecedents'].apply(lambda x: len(x) == 3)]
rules_4 = rules[rules['antecedents'].apply(lambda x: len(x) == 4)]

# Calculate confidence and interest for rules_3
antecedents_3 = rules_3['antecedents'].apply(lambda x: frozenset(x))
confidence_3 = []
interest_3 = []
for antecedent in antecedents_3:
    support_antecedent = frequent_itemsets[frequent_itemsets['itemsets'] == antecedent]['support'].values[0]
    support_rule = rules_3[rules_3['antecedents'].apply(lambda x: frozenset(x)) == antecedent]['support'].values[0]
    confidence = support_rule / support_antecedent
    interest = confidence / support_antecedent
    confidence_3.append(confidence)
    interest_3.append(interest)
rules_3['confidence'] = confidence_3
rules_3['interest'] = interest_3

# Calculate confidence and interest for rules_4
antecedents_4 = rules_4['antecedents'].apply(lambda x: frozenset(x))
confidence_4 = []
interest_4 = []
for antecedent in antecedents_4:
    support_antecedent = frequent_itemsets[frequent_itemsets['itemsets'] == antecedent]['support'].values[0]
    support_rule = rules_4[rules_4['antecedents'].apply(lambda x: frozenset(x)) == antecedent]['support'].values[0]
    confidence = support_rule / support_antecedent
    interest = confidence / support_antecedent
    confidence_4.append(confidence)
    interest_4.append(interest)
rules_4['confidence'] = confidence_4
rules_4['interest'] = interest_4

# Sort rules by confidence and interest
top_5_rules_3_confidence = rules_3.sort_values(by='confidence', ascending=False).head(5)
top_5_rules_3_interest = rules_3.sort_values(by='interest', ascending=False).head(5)

top_5_rules_4_confidence = rules_4.sort_values(by='confidence', ascending=False).head(5)
top_5_rules_4_interest = rules_4.sort_values(by='interest', ascending=False).head(5)

# Print the results
print("Top 5 rules with highest confidence (3 features):")
print(top_5_rules_3_confidence[['antecedents', 'consequents', 'confidence']])
print("\nTop 5 rules with highest interest (3 features):")
print(top_5_rules_3_interest[['antecedents', 'consequents', 'interest']])

print("\nTop 5 rules with highest confidence (4 features):")
print(top_5_rules_4_confidence[['antecedents', 'consequents', 'confidence']])
print("\nTop 5 rules with highest interest (4 features):")
print(top_5_rules_4_interest[['antecedents', 'consequents', 'interest']])


Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []


KeyError: ignored