In [4]:
#importing libraries
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import fpgrowth

#importing the sales dataset transaction
dataset = pd.read_csv("Sales1998_normalized.csv",header=None, engine='python')
# printing the shape of the dataset
dataset.shape


(34070, 27)

In [5]:
# Dynamically determine the number of columns
num_columns = dataset.shape[1]
num_columns


27

In [6]:
 # Convert the transactions to a list
transactions = []
for i in range(len(dataset)):
     transactions.append([str(dataset.values[i, j]) for j in range(num_columns)])

transactions

[['177.0',
  '1065.0',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan'],
 ['477.0',
  '653.0',
  '154.0',
  '537.0',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan'],
 ['1141.0',
  '1241.0',
  '519.0',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan'],
 ['530.0',
  '709.0',
  '1308.0',
  '1012.0',
  '552.0',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
 

In [7]:
print(f"Number of rows processed: {len(transactions)}")

Number of rows processed: 34070


In [8]:
print(dataset.isnull().sum())  # Check for missing values in each column

0         0
1      2451
2      6857
3     11432
4     16692
5     20691
6     25357
7     29908
8     32502
9     32835
10    33206
11    33425
12    33675
13    33771
14    33945
15    33979
16    34019
17    34034
18    34051
19    34058
20    34063
21    34065
22    34069
23    34069
24    34069
25    34069
26    34069
dtype: int64


In [9]:

# Encode the transaction dataset into a one-hot encoded DataFrame, where each column represents a unique item.
# Remove the 'nan' column to clean the data and prepare it for further analysis.
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)




In [10]:
# Convert column names to integers if they are numeric, otherwise keep them as strings
def convert_column_names(column):
    try:
        return int(float(column))  # Convert to integer if possible
    except ValueError:
        return str(column)  # Keep as string if conversion fails

df.columns = [convert_column_names(col) for col in df.columns]

# Verify the updated column names
print("Updated Column Names:")
print(df.columns)

Updated Column Names:
Index([    1,    10,   100,  1000,  1001,  1002,  1003,  1004,  1005,  1006,
       ...
         991,   992,   993,   994,   995,   996,   997,   998,   999, 'nan'],
      dtype='object', length=1560)


In [11]:
dataset = dataset.dropna()

In [12]:
print(dataset.isnull().sum())  # Check for missing values in each column

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
dtype: int64


In [13]:
print(df.head())

       1     10    100   1000   1001   1002   1003   1004   1005   1006  ...  \
0  False  False  False  False  False  False  False  False  False  False  ...   
1  False  False  False  False  False  False  False  False  False  False  ...   
2  False  False  False  False  False  False  False  False  False  False  ...   
3  False  False  False  False  False  False  False  False  False  False  ...   
4  False  False  False  False  False  False  False  False  False  False  ...   

     991    992    993    994    995    996    997    998    999   nan  
0  False  False  False  False  False  False  False  False  False  True  
1  False  False  False  False  False  False  False  False  False  True  
2  False  False  False  False  False  False  False  False  False  True  
3  False  False  False  False  False  False  False  False  False  True  
4  False  False  False  False  False  False  False  False  False  True  

[5 rows x 1560 columns]


In [14]:
print("Columns in df:", df.columns)

Columns in df: Index([    1,    10,   100,  1000,  1001,  1002,  1003,  1004,  1005,  1006,
       ...
         991,   992,   993,   994,   995,   996,   997,   998,   999, 'nan'],
      dtype='object', length=1560)


In [15]:
print("Shape of df:", df.shape)

Shape of df: (34070, 1560)


In [16]:
# Filter out infrequent items before encoding
# Count the frequency of each item
item_counts = df.sum(axis=0)
item_counts
# Filter out items that appear in fewer than 5 transactions
frequent_items = item_counts[item_counts >= 5].index
df = df[frequent_items]
df

Unnamed: 0,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,991,992,993,994,995,996,997,998,999,nan
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34065,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
34066,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
34067,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
34068,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [17]:
# Check for and remove 'nan' column after encoding
if 'nan' in df.columns:
    df = df.drop(columns='nan')
df

Unnamed: 0,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34065,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
34066,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
34067,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
34068,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [18]:
# Debug: Check the one-hot encoded DataFrame
print("One-Hot Encoded DataFrame:")
print(df.head())
print("Columns:", df.columns)

One-Hot Encoded DataFrame:
    1      10     100    1000   1001   1002   1003   1004   1005   1006  ...  \
0  False  False  False  False  False  False  False  False  False  False  ...   
1  False  False  False  False  False  False  False  False  False  False  ...   
2  False  False  False  False  False  False  False  False  False  False  ...   
3  False  False  False  False  False  False  False  False  False  False  ...   
4  False  False  False  False  False  False  False  False  False  False  ...   

    990    991    992    993    994    995    996    997    998    999   
0  False  False  False  False  False  False  False  False  False  False  
1  False  False  False  False  False  False  False  False  False  False  
2  False  False  False  False  False  False  False  False  False  False  
3  False  False  False  False  False  False  False  False  False  False  
4  False  False  False  False  False  False  False  False  False  False  

[5 rows x 1559 columns]
Columns: Index([   1,  

In [20]:
# Debug: Check the filtered DataFrame
print("Filtered DataFrame:")
print(df)

Filtered DataFrame:
        1      10     100    1000   1001   1002   1003   1004   1005   1006  \
0      False  False  False  False  False  False  False  False  False  False   
1      False  False  False  False  False  False  False  False  False  False   
2      False  False  False  False  False  False  False  False  False  False   
3      False  False  False  False  False  False  False  False  False  False   
4      False  False  False  False  False  False  False  False  False  False   
...      ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
34065  False  False  False  False  False  False  False  False  False  False   
34066  False  False  False  False  False  False  False  False  False  False   
34067  False  False  False  False  False  False  False  False  False  False   
34068  False  False  False  False  False  False  False  False  False  False   
34069  False  False  False  False  False  False  False  False  False  False   

       ...   990    991    992 

In [21]:
# Analyze co-occurrence of items
co_occurrence = df.T.dot(df)
print("Co-occurrence Matrix:")
print(co_occurrence)

# Filter out infrequent items
item_counts = df.sum(axis=0)
frequent_items = item_counts[item_counts >= 10].index
df = df[frequent_items]

co_occurrence_sum = co_occurrence.sum(axis=1)
print("Items with the highest co-occurrence:")
print(co_occurrence_sum.sort_values(ascending=False).head(10))

# Lower the min_support threshold
min_support = 0.0001  # Further reduced support threshold
model = fpgrowth(df, min_support=min_support, use_colnames=True)

# Debug: Check the frequent itemsets
print("Frequent Itemsets:")
print(model)

# Check for larger itemsets
print("Frequent Itemsets with More Than One Item:")
larger_itemsets = model[model['itemsets'].apply(lambda x: len(x) > 1)]
print(larger_itemsets)



Co-occurrence Matrix:
       1      10     100    1000   1001   1002   1003   1004   1005   1006  \
1      True  False  False  False  False  False  False   True  False  False   
10    False   True   True  False  False   True  False  False  False  False   
100   False   True   True  False  False  False   True  False   True  False   
1000  False  False  False   True  False  False  False  False  False   True   
1001  False  False  False  False   True  False  False  False  False  False   
...     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
995   False  False  False  False   True  False  False  False  False   True   
996   False  False  False   True  False  False   True  False  False   True   
997   False  False  False  False  False  False   True  False  False  False   
998   False  False  False  False  False  False  False   True  False   True   
999   False  False   True  False  False  False   True  False  False  False   

      ...   990    991    992    993    9

In [None]:
# Check if any frequent itemsets were found
if model.empty:
    print("No frequent itemsets found. Try lowering the min_support value.")
elif larger_itemsets.empty:
    print("No larger itemsets found. Try lowering the min_support value.")
else:
    # Lower the min_confidence threshold
    min_confidence = 0.1  # Reduced confidence threshold
    rules = association_rules(model, metric='confidence', min_threshold=min_confidence)

    # Debug: Check the generated rules
    print("Generated Rules:")
    print(rules)

    # Check if any rules were generated
    if rules.empty:
        print("No rules generated. Try lowering the min_confidence value.")
    else:
        # Sort rules by lift
        rules = rules.sort_values(by='lift', ascending=False)

        # Print the rules
        print("Association Rules:")
        print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

In [None]:
# Check if any frequent itemsets were found
if model.empty:
    print("No frequent itemsets found. Try lowering the min_support value.")
elif larger_itemsets.empty:
    print("No larger itemsets found. Try lowering the min_support value.")
else:
    # Lower the min_confidence threshold
    min_confidence = 0.1  # Reduced confidence threshold
    rules = association_rules(model, metric='confidence', min_threshold=min_confidence)

    # Debug: Check the generated rules
    print("Generated Rules:")
    print(rules)

    # Check if any rules were generated
    if rules.empty:
        print("No rules generated. Try lowering the min_confidence value.")
    else:
        # Filter strong associations
        strong_rules = rules[(rules['confidence'] > 0.99) & (rules['lift'] > 8400.0)]

        # Check if any strong rules were found
        if strong_rules.empty:
            print("No strong rules found. Try lowering the confidence or lift thresholds.")
        else:
            # Sort strong rules by lift
            strong_rules = strong_rules.sort_values(by='lift', ascending=False)

            # Print the strong rules
            print("Strong Association Rules:")
            print(strong_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

            # Optionally, save the strong rules to a CSV file
            strong_rules.to_csv("Strong_Association_Rules.csv", index=False)
            print("Strong rules saved to 'Strong_Association_Rules.csv'")

Generated Rules:
       antecedents consequents  antecedent support  consequent support  \
0       (177, 876)       (653)            0.000176            0.003082   
1       (177, 653)       (876)            0.000147            0.003493   
2       (876, 653)       (177)            0.000117            0.002994   
3       (177, 420)       (653)            0.000147            0.003082   
4       (177, 653)       (420)            0.000147            0.003170   
...            ...         ...                 ...                 ...   
3827   (184, 1332)      (1010)            0.000147            0.003082   
3828  (1010, 1332)       (184)            0.000117            0.003023   
3829    (113, 307)       (806)            0.000117            0.003229   
3830    (113, 806)       (307)            0.000147            0.003463   
3831    (307, 806)       (113)            0.000117            0.002231   

       support  confidence        lift  representativity  leverage  \
0     0.000117    0.6666