Wybierz dowolny zbiór danych transakcyjnych z https://www.kaggle.com/datasets lub https://archive.ics.uci.edu/ml/index.php. Wygeneruj i zaprezentuj reguły asocjacyjne z różnymi wartościami progowymi (trzy ustawienia). Porównaj i skomentuj wyniki. Można korzystać z dowolnego narzędzia Python, Rapid Miner lub Statistica.

Przygotuj sprawozdanie z wykonanych prac - skrypt Python z komentarzami, nagrany film, raport pdf. Wyślij do oceny.

### **Importy i wczytanie danych**

In [102]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from ucimlrepo import fetch_ucirepo

# fetch dataset
wine_quality = fetch_ucirepo(id=186)

# data (as pandas dataframes)
X = wine_quality.data.features
y = wine_quality.data.targets

# Połączenie features i target
wine = pd.concat([X, y.rename(columns={'quality': 'quality'})], axis=1)

wine



Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


### Kodowanie danych kategorycznych do formatu One-Hot

In [97]:
# Quality na kategorie
wine['quality'] = pd.cut(wine['quality'],
                         bins=[0, 5, 7, 10],
                         labels=['Low_Quality', 'Medium_Quality', 'High_Quality'])

# Wszystkie cechy na tercyle
for col in wine.columns[:-1]:  # wszystkie oprócz 'quality'
    wine[col] = pd.qcut(wine[col],
                        q=3,
                        labels=['Low', 'Medium', 'High'],
                        duplicates='drop')

# One-hot encoding dla Apriori
data_encoded = pd.get_dummies(wine, dtype=bool)
data_encoded

Unnamed: 0,fixed_acidity_Low,fixed_acidity_Medium,fixed_acidity_High,volatile_acidity_Low,volatile_acidity_Medium,volatile_acidity_High,citric_acid_Low,citric_acid_Medium,citric_acid_High,residual_sugar_Low,...,pH_High,sulphates_Low,sulphates_Medium,sulphates_High,alcohol_Low,alcohol_Medium,alcohol_High,quality_Low_Quality,quality_Medium_Quality,quality_High_Quality
0,False,True,False,False,False,True,True,False,False,True,...,True,False,True,False,True,False,False,True,False,False
1,False,False,True,False,False,True,True,False,False,False,...,False,False,False,True,False,True,False,True,False,False
2,False,False,True,False,False,True,True,False,False,False,...,False,False,False,True,False,True,False,True,False,False
3,False,False,True,False,True,False,False,False,True,True,...,False,False,False,True,False,True,False,False,True,False
4,False,True,False,False,False,True,True,False,False,True,...,True,False,True,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,True,False,False,True,False,False,False,True,False,True,...,False,False,True,False,False,False,True,False,True,False
6493,True,False,False,False,True,False,False,True,False,False,...,False,True,False,False,True,False,False,True,False,False
6494,True,False,False,True,False,False,True,False,False,True,...,False,True,False,False,True,False,False,False,True,False
6495,True,False,False,False,True,False,False,True,False,True,...,True,True,False,False,False,False,True,False,True,False


In [98]:
thresholds = [
    {'min_support': 0.05, 'min_confidence': 0.7},  # ZESTAW 1 - wysoka pewność, umiarkowany support
    {'min_support': 0.03, 'min_confidence': 0.5},  # ZESTAW 2 - umiarkowana pewność i support
    {'min_support': 0.01, 'min_confidence': 0.3}   # ZESTAW 3 - niski support, niska pewność
]

In [99]:
import pandas as pd

all = []

for i, params in enumerate(thresholds, start=1):
    frequent_itemsets = apriori(data_encoded,
                                min_support=params['min_support'],
                                use_colnames=True)

    rules = association_rules(frequent_itemsets,
                              metric="confidence",
                              min_threshold=params['min_confidence'])

    rules = rules.sort_values(by="lift", ascending=False)

    # Dodajemy kolumny z zestawem i progami
    rules["Zestaw"] = f"ZESTAW {i}"
    rules["min_support"] = params["min_support"]
    rules["min_confidence"] = params["min_confidence"]

    all.append(rules)  # tylko pierwsze 10

df = pd.concat(all, ignore_index=True)

df= df[[
    "Zestaw", "min_support", "min_confidence",
    "antecedents", "consequents", "support", "confidence", "lift"
]]

df


Unnamed: 0,Zestaw,min_support,min_confidence,antecedents,consequents,support,confidence,lift
0,ZESTAW 1,0.05,0.7,"(free_sulfur_dioxide_Low, fixed_acidity_High, ...","(total_sulfur_dioxide_Low, density_High)",0.052948,0.783599,5.600708
1,ZESTAW 1,0.05,0.7,"(fixed_acidity_High, residual_sugar_Medium, ch...","(total_sulfur_dioxide_Low, density_High)",0.055256,0.777056,5.553943
2,ZESTAW 1,0.05,0.7,"(total_sulfur_dioxide_Low, density_High, citri...","(free_sulfur_dioxide_Low, volatile_acidity_Hig...",0.051870,0.825980,5.498355
3,ZESTAW 1,0.05,0.7,"(free_sulfur_dioxide_Low, residual_sugar_Mediu...","(fixed_acidity_High, total_sulfur_dioxide_Low,...",0.052948,0.757709,5.469819
4,ZESTAW 1,0.05,0.7,"(total_sulfur_dioxide_Low, citric_acid_High, d...","(fixed_acidity_High, chlorides_High)",0.053563,0.935484,5.460772
...,...,...,...,...,...,...,...,...
732496,ZESTAW 3,0.01,0.3,"(chlorides_High, fixed_acidity_High, volatile_...",(quality_Medium_Quality),0.011852,0.303150,0.503081
732497,ZESTAW 3,0.01,0.3,"(volatile_acidity_High, chlorides_High, pH_Hig...",(quality_Medium_Quality),0.012775,0.301818,0.500872
732498,ZESTAW 3,0.01,0.3,"(free_sulfur_dioxide_Low, residual_sugar_Low, ...",(quality_Medium_Quality),0.011698,0.301587,0.500489
732499,ZESTAW 3,0.01,0.3,"(free_sulfur_dioxide_Low, citric_acid_Low, chl...",(quality_Medium_Quality),0.013545,0.301370,0.500128


### Porównanie zestawów reguł

In [100]:

summary_rows = []
for zestaw in df['Zestaw'].unique():
    subset = df[df['Zestaw'] == zestaw]
    num_rules = len(subset)
    avg_support = subset['support'].mean()
    min_support = subset['support'].min()
    max_support = subset['support'].max()
    avg_confidence = subset['confidence'].mean()
    min_confidence = subset['confidence'].min()
    max_confidence = subset['confidence'].max()
    avg_lift = subset['lift'].mean()
    min_lift = subset['lift'].min()
    max_lift = subset['lift'].max()

    # Liczba reguł związanych z 'Potability'
    related_potability = subset[
        subset['antecedents'].astype(str).str.contains('Potability') |
        subset['consequents'].astype(str).str.contains('Potability')
    ]
    num_potability = len(related_potability)

    # Reguły złożone
    complex_rules = subset[
        subset['antecedents'].astype(str).str.contains(',') |
        subset['consequents'].astype(str).str.contains(',')
    ]
    num_complex = len(complex_rules)

    summary_rows.append({
        'Zestaw': zestaw,
        'Liczba reguł': num_rules,
        'Średni support': avg_support,
        'Min support': min_support,
        'Max support': max_support,
        'Średnia confidence': avg_confidence,
        'Min confidence': min_confidence,
        'Max confidence': max_confidence,
        'Średni lift': avg_lift,
        'Min lift': min_lift,
        'Max lift': max_lift,
        'Reguły z Potability': num_potability,
        'Reguły złożone': num_complex
    })

# Tworzymy dataframe ze statystykami
summary_df = pd.DataFrame(summary_rows)

# Wyświetlamy wynikową tabelę
summary_df


Unnamed: 0,Zestaw,Liczba reguł,Średni support,Min support,Max support,Średnia confidence,Min confidence,Max confidence,Średni lift,Min lift,Max lift,Reguły z Potability,Reguły złożone
0,ZESTAW 1,2735,0.07035,0.050023,0.260736,0.809361,0.7,1.0,2.603872,1.162669,5.600708,0,2727
1,ZESTAW 2,28724,0.045539,0.030014,0.260736,0.677491,0.5,1.0,2.70636,0.829757,8.063966,0,28633
2,ZESTAW 3,701042,0.017856,0.010005,0.260736,0.510444,0.3,1.0,3.061886,0.499916,18.111385,0,700298
