In [10]:
import pandas as pd

# Read the data
df = pd.read_csv('www.csv', sep='\t')

# Check for missing values
missing_data = df.isnull().sum()

# Remove duplicates and keep the first occurrence
print(f"Number of rows before removing duplicates: {len(df)}")
df = df.drop_duplicates('Query', keep='first')
df.reset_index(drop=True, inplace=True)
print(f"Number of rows after removing duplicates: {len(df)}")

# Count number of records and attributes (excluding User and Query)
num_records = len(df)
num_attributes = df.shape[1] - 2
print(f"Number of records: {num_records}")
print(f"Number of attributes: {num_attributes}")


Number of rows before removing duplicates: 9999
Number of rows after removing duplicates: 9816
Number of records: 9816
Number of attributes: 500


In [11]:
import pyfpgrowth

# Convert the Query column into a list of lists
associations = df['Query'].apply(lambda x: x.split()).tolist()

# Set minimum support
sigma = 100
min_support = sigma / len(associations)
print(f"Minimum support: {min_support}")

# Find frequent itemsets
patterns = pyfpgrowth.find_frequent_patterns(associations, sigma)

# Number of frequent itemsets and maximum size
num_frequent_itemsets = len(patterns)
max_itemset_size = max(len(itemset) for itemset in patterns)
print(f"Number of frequent itemsets: {num_frequent_itemsets}")
print(f"Maximum size of frequent itemsets: {max_itemset_size}")

# Calculate support
support = {key: value / len(df) for key, value in patterns.items()}
print("Support of itemsets:", support)


Minimum support: 0.010187449062754686
Number of frequent itemsets: 28
Maximum size of frequent itemsets: 2
Support of itemsets: {('high',): 0.010289323553382234, ('with',): 0.010594947025264874, ('my',): 0.011206193969030154, ('home',): 0.011511817440912795, ('you',): 0.011919315403422982, ('i',): 0.011919315403422982, ('is',): 0.01202118989405053, ('state',): 0.012530562347188265, ('what',): 0.013039934800325998, ('city',): 0.013447432762836185, ('florida',): 0.013854930725346373, ('school',): 0.015688671556642216, ('lyrics',): 0.0178280358598207, ('how',): 0.018133659331703342, ('how', 'to'): 0.013243683781581092, ('new',): 0.022616136919315404, ('http',): 0.022819885900570498, ('free',): 0.022921760391198046, ('on',): 0.023634881825590873, ('county',): 0.024144254278728607, ('a',): 0.03463732681336593, ('to',): 0.04339853300733496, ('and',): 0.055623471882640586, ('for',): 0.05633659331703342, ('in', 'the'): 0.011919315403422982, ('of', 'the'): 0.0210880195599022, ('in',): 0.0859820

In [12]:
min_support_values = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01]

for min_support in min_support_values:
    support_count = min_support * len(df)
    patterns = pyfpgrowth.find_frequent_patterns(associations, support_count)

    num_frequent_itemsets = len(patterns)
    max_itemset_size = max(len(itemset) for itemset in patterns)
    print(f"Min support: {min_support}, Number of frequent itemsets: {num_frequent_itemsets}, Max itemset size: {max_itemset_size}")


Min support: 0.001, Number of frequent itemsets: 2281, Max itemset size: 10
Min support: 0.002, Number of frequent itemsets: 484, Max itemset size: 6
Min support: 0.003, Number of frequent itemsets: 256, Max itemset size: 5
Min support: 0.004, Number of frequent itemsets: 156, Max itemset size: 4
Min support: 0.005, Number of frequent itemsets: 93, Max itemset size: 3
Min support: 0.006, Number of frequent itemsets: 70, Max itemset size: 2
Min support: 0.007, Number of frequent itemsets: 57, Max itemset size: 2
Min support: 0.008, Number of frequent itemsets: 46, Max itemset size: 2
Min support: 0.009, Number of frequent itemsets: 35, Max itemset size: 2
Min support: 0.01, Number of frequent itemsets: 29, Max itemset size: 2


In [13]:
min_support = 0.004090909090909091

support_count = min_support * num_records
confidence = 0.8

patterns = pyfpgrowth.find_frequent_patterns(associations, support_count)
rules = pyfpgrowth.generate_association_rules(patterns, confidence)

# Find an example of high-confidence rule X -> Y where Y -> X has lower confidence
print("Generated rules:")
rules


Generated rules:


{('estate',): (('real',), 0.9166666666666666),
 ('york',): (('new',), 0.9139784946236559),
 ('sale',): (('for',), 0.8333333333333334),
 ('i', 'to'): (('the',), 2.309090909090909),
 ('the', 'to'): (('i',), 1.5301204819277108),
 ('i', 'the', 'the'): (('to',), 3.106382978723404),
 ('i', 'the', 'to'): ((), 1.3804347826086956),
 ('the', 'the', 'to'): (('i',), 2.4745762711864407),
 ('a', 'how'): (('to',), 0.9230769230769231),
 ('in', 'to'): (('the',), 1.0681818181818181),
 ('in', 'of', 'the'): ((), 0.8679245283018868),
 ('in', 'the', 'the'): (('of',), 1.0),
 ('of', 'the', 'the'): (('in',), 0.92),
 ('in', 'of'): (('the',), 0.828125)}

In [15]:
min_confidence_range = [0.1, 0.3, 0.5, 0.7, 0.9]

for min_conf in min_confidence_range:
    rules = pyfpgrowth.generate_association_rules(patterns, min_conf)
    num_rules = len(rules)
    print(f"Minimum confidence: {min_conf}")
    print(f"Number of generated rules: {num_rules}")
    print()

Minimum confidence: 0.1
Number of generated rules: 30

Minimum confidence: 0.3
Number of generated rules: 25

Minimum confidence: 0.5
Number of generated rules: 17

Minimum confidence: 0.7
Number of generated rules: 15

Minimum confidence: 0.9
Number of generated rules: 11



In [17]:
# Set a very low support to extract a larger number of frequent patterns
very_low_support = 0.001
support_count = very_low_support * len(df)  # support count in terms of absolute occurrences
patterns = pyfpgrowth.find_frequent_patterns(associations, support_count)

# Generate rules with the low support frequent itemsets and given confidence
confidence = 0.8  # example confidence level, adjust as needed
rules = pyfpgrowth.generate_association_rules(patterns, confidence)

# Sort rules by confidence (index 1 of the tuple) and lift (index 2 of the tuple)
sorted_by_confidence = sorted(rules.items(), key=lambda x: x[1][1], reverse=True)  # sort by confidence
sorted_by_lift = sorted(rules.items(), key=lambda x: x[1][2], reverse=True)  # sort by lift

# Select top 3 interesting and unexpected rules
interesting_rules = sorted_by_confidence[:3]  # Top 3 rules by confidence
unexpected_rules = sorted_by_lift[:3]  # Top 3 rules by lift

# Output the results
print("Interesting rules (by confidence):")
for rule in interesting_rules:
    print(rule)

print("\nUnexpected rules (by lift):")
for rule in unexpected_rules:
    print(rule)


IndexError: tuple index out of range