In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
import math

In [2]:
df = pd.read_csv("Dataset1-3.csv")
print("Original Data:")
df.head(5)

Original Data:


Unnamed: 0,Temperature,Humidity,Rainfall,Soil,Crop,Fertilizer
0,2487,8284,29561,Clayey,rice,DAP
1,2869,9665,17896,laterite,Coconut,Good NPK
2,2027,8164,27044,silty clay,rice,MOP
3,2507,9502,1929,sandy,Coconut,Urea
4,2504,959,1748,coastal,Coconut,Urea


In [3]:
def generate_labels(name, number):
    return [name+"_"+str(i+1) for i in range(number)]

In [4]:
def equal_width(data, num_bins, labels=None, include_lowest=False, right=True):
    data_min, data_max = min(data), max(data)
    bin_width = (data_max - data_min) / num_bins
    bin_edges = [data_min + i * bin_width for i in range(num_bins + 1)]
    if labels is None:
        labels = [f'Interval {i+1}' for i in range(num_bins)]
    bin_labels = []
    for value in data:
        for i in range(len(bin_edges) - 1):
            if (include_lowest and i == 0) or (right and value <= bin_edges[i + 1] and value > bin_edges[i]) or (
                    not right and value < bin_edges[i + 1] and value >= bin_edges[i]):
                bin_labels.append(labels[i])
                break
        else:
            bin_labels.append(labels[-1])

    return bin_labels

In [5]:
def equal_freq(data, q, labels):
    quantiles = np.percentile(data, np.linspace(0, 100, q+1))
    labeled_data = np.digitize(data, quantiles)
    labeled_series = pd.Series(labeled_data, name=data.name)
    labeled_series.replace(range(1, q + 1), labels, inplace=True)
    return labeled_series

In [6]:
while True:
    try:
        bins = int(input("Bins: "))
        break
    except:
        print("Tape a valide number")

for attribute in ["Temperature", "Humidity", "Rainfall"]:
    df[attribute] = pd.to_numeric(df[attribute].str.replace(',', '.'), errors='coerce')
    df[attribute+'_equal_freq'] = equal_freq(df[attribute], bins, generate_labels(attribute, bins))
    df[attribute+'_equal_width'] = equal_width(df[attribute], bins, generate_labels(attribute, bins))


In [7]:
df = df[[f'Temperature_equal_width',f'Humidity_equal_width', f'Rainfall_equal_width', f'Soil', f'Crop', f'Fertilizer']]
df

Unnamed: 0,Temperature_equal_width,Humidity_equal_width,Rainfall_equal_width,Soil,Crop,Fertilizer
0,Temperature_2,Humidity_1,Rainfall_3,Clayey,rice,DAP
1,Temperature_3,Humidity_3,Rainfall_1,laterite,Coconut,Good NPK
2,Temperature_1,Humidity_1,Rainfall_3,silty clay,rice,MOP
3,Temperature_2,Humidity_3,Rainfall_2,sandy,Coconut,Urea
4,Temperature_2,Humidity_3,Rainfall_1,coastal,Coconut,Urea
...,...,...,...,...,...,...
290,Temperature_2,Humidity_3,Rainfall_1,sandy,Coconut,MOP
291,Temperature_2,Humidity_1,Rainfall_2,silty clay,rice,MOP
292,Temperature_2,Humidity_1,Rainfall_2,Clayey,rice,MOP
293,Temperature_2,Humidity_1,Rainfall_2,Clayey,rice,MOP


In [8]:
def get_k_itemsets(data, k):
    itemsets = []
    for row in data:
        itemsets.extend(combinations(row, k))
    return itemsets

In [9]:
def calculate_support(data, itemset):
    count = 0
    for row in data:
        if all(item in row for item in itemset):
            count += 1
    return count / len(data)

In [10]:
def find_frequent_itemsets(data, min_support):
    itemsets = get_k_itemsets(data, 1)
    frequent_itemsets = []

    k = 1
    while itemsets:
        frequent_itemsets_k = []

        for itemset in itemsets:
            support = calculate_support(data, itemset)
            if support >= min_support:
                frequent_itemsets_k.append((itemset, support))

        frequent_itemsets.extend(frequent_itemsets_k)

        k += 1
        itemsets = list(set(combinations(set(item for itemset in frequent_itemsets_k for item in itemset[0]), k)))

    return frequent_itemsets

In [11]:
def generate_rules(frequent_itemsets, min_confidence):
    rules = []

    for itemset, support in frequent_itemsets:
        if len(itemset) > 1:
            for i in range(1, len(itemset)):
                antecedent = itemset[:i]
                consequent = itemset[i:]

                support_antecedent = calculate_support(df.values.tolist(), antecedent)
                support_consequent = calculate_support(df.values.tolist(), consequent)
                confidence = support / support_antecedent
                rule_support = support
                if confidence >= min_confidence:
                    rules.append((antecedent, support_antecedent, consequent, support_consequent, rule_support, confidence))

    return rules

In [12]:
min_confidence = 0.8
min_support = 0.2
frequent_itemsets = find_frequent_itemsets(df.values.tolist(), min_support=min_support)
association_rules = generate_rules(frequent_itemsets, min_confidence)
association_rules = sorted(association_rules, key=lambda x: x[3], reverse=True)

for antecedent, support_antecedent, consequent, support_consequent, rule_support, confidence in association_rules:
    print(f"Rule: {antecedent}:{support_antecedent:.2f} -> {consequent}:{support_consequent:.2f}, Support: {rule_support:.2f}, Confidence: {confidence:.2f}")
    

Rule: ('silty clay',):0.22 -> ('Humidity_1',):0.50, Support: 0.22, Confidence: 0.98
Rule: ('Rainfall_3',):0.21 -> ('Humidity_1',):0.50, Support: 0.20, Confidence: 0.98
Rule: ('silty clay',):0.22 -> ('rice',):0.48, Support: 0.22, Confidence: 1.00
Rule: ('Rainfall_3',):0.21 -> ('rice',):0.48, Support: 0.20, Confidence: 0.98
Rule: ('Humidity_1',):0.50 -> ('rice',):0.48, Support: 0.45, Confidence: 0.90
Rule: ('silty clay', 'Humidity_1'):0.22 -> ('rice',):0.48, Support: 0.22, Confidence: 1.00
Rule: ('Rainfall_3', 'Humidity_1'):0.20 -> ('rice',):0.48, Support: 0.20, Confidence: 1.00
Rule: ('Humidity_1', 'Temperature_2'):0.27 -> ('rice',):0.48, Support: 0.25, Confidence: 0.91
Rule: ('silty clay',):0.22 -> ('Humidity_1', 'rice'):0.45, Support: 0.22, Confidence: 0.98
Rule: ('Rainfall_3',):0.21 -> ('Humidity_1', 'rice'):0.45, Support: 0.20, Confidence: 0.98


In [13]:
def correlation(rules):
    rules_with_correlation = []
    for antecedent, support_antecedent, consequent, support_consequent, rule_support, confidence in rules:
        lift = rule_support/(support_antecedent*support_consequent)
        confidence = rule_support/max(support_antecedent, support_consequent)
        cosine = rule_support/math.sqrt(support_antecedent*support_consequent)

        rules_with_correlation.append((antecedent, consequent, lift, confidence, cosine))
    return rules_with_correlation

In [14]:
rules_with_correlation = correlation(association_rules)
for antecedent, consequent, lift, confidence, cosine in rules_with_correlation:
    print(f"Rule: {antecedent} -> {consequent}, Corrélation: lift/confidence/cosine: {lift:.2f}/{confidence:.2f}/{cosine:.2f}")

Rule: ('silty clay',) -> ('Humidity_1',), Corrélation: lift/confidence/cosine: 1.98/0.44/0.65
Rule: ('Rainfall_3',) -> ('Humidity_1',), Corrélation: lift/confidence/cosine: 1.97/0.41/0.63
Rule: ('silty clay',) -> ('rice',), Corrélation: lift/confidence/cosine: 2.06/0.45/0.67
Rule: ('Rainfall_3',) -> ('rice',), Corrélation: lift/confidence/cosine: 2.03/0.42/0.64
Rule: ('Humidity_1',) -> ('rice',), Corrélation: lift/confidence/cosine: 1.85/0.90/0.91
Rule: ('silty clay', 'Humidity_1') -> ('rice',), Corrélation: lift/confidence/cosine: 2.06/0.45/0.67
Rule: ('Rainfall_3', 'Humidity_1') -> ('rice',), Corrélation: lift/confidence/cosine: 2.06/0.42/0.65
Rule: ('Humidity_1', 'Temperature_2') -> ('rice',), Corrélation: lift/confidence/cosine: 1.88/0.51/0.68
Rule: ('silty clay',) -> ('Humidity_1', 'rice'), Corrélation: lift/confidence/cosine: 2.20/0.48/0.69
Rule: ('Rainfall_3',) -> ('Humidity_1', 'rice'), Corrélation: lift/confidence/cosine: 2.20/0.45/0.67
