# init

In [149]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from fim import apriori

# lettura dataset

In [116]:
# Directory that holds the input file, relative to the working directory.
path = "data/"

# Build the full file location first, then read the CSV into a DataFrame.
dataset_file = path + 'Dataset_forPM.csv'
df = pd.read_csv(dataset_file)

In [117]:
# Show column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17895 entries, 0 to 17894
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              17895 non-null  object 
 1   duration_ms       17895 non-null  int64  
 2   explicit          17895 non-null  bool   
 3   popularity        17895 non-null  int64  
 4   artists           17895 non-null  object 
 5   album_name        17895 non-null  object 
 6   danceability      17895 non-null  float64
 7   energy            17895 non-null  float64
 8   key               17895 non-null  int64  
 9   loudness          17895 non-null  float64
 10  mode              17895 non-null  bool   
 11  speechiness       17895 non-null  float64
 12  acousticness      17895 non-null  float64
 13  instrumentalness  17895 non-null  float64
 14  liveness          17895 non-null  float64
 15  valence           17895 non-null  float64
 16  tempo             17895 non-null  float6

# Feature processing

In [118]:
# Discard the 'genre_val' column before the features are recoded.
df = df.drop(columns=['genre_val'])

## explicit

In [119]:
# Replace the boolean flag with a readable item label.
df['explicit'] = df['explicit'].map({True: 'Explicit', False: 'Non_Explicit'})

## key

In [120]:
# Re-check the schema after dropping 'genre_val' and recoding 'explicit'.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17895 entries, 0 to 17894
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              17895 non-null  object 
 1   duration_ms       17895 non-null  int64  
 2   explicit          17895 non-null  object 
 3   popularity        17895 non-null  int64  
 4   artists           17895 non-null  object 
 5   album_name        17895 non-null  object 
 6   danceability      17895 non-null  float64
 7   energy            17895 non-null  float64
 8   key               17895 non-null  int64  
 9   loudness          17895 non-null  float64
 10  mode              17895 non-null  bool   
 11  speechiness       17895 non-null  float64
 12  acousticness      17895 non-null  float64
 13  instrumentalness  17895 non-null  float64
 14  liveness          17895 non-null  float64
 15  valence           17895 non-null  float64
 16  tempo             17895 non-null  float6

In [121]:
# Note names listed by position: index 0 is 'C', index 11 is 'B'.
pitch_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']

# Integer key code -> note name.
key_mapping = dict(enumerate(pitch_names))

# Add a 'key_song' column holding the note name of each row's key code.
df['key_song'] = df['key'].map(key_mapping)

In [122]:
# The numeric 'key' column is superseded by 'key_song'.
df = df.drop(columns='key')

## mode

In [123]:
# Translate the boolean 'mode' flag into a 'Major' / 'Minor' label.
df['mode_song'] = df['mode'].map({True: 'Major', False: 'Minor'})

In [124]:
# The boolean 'mode' column is superseded by 'mode_song'.
df = df.drop(columns='mode')

## time signature

In [125]:
# Label every time-signature code from 0 to 5 as '<code>_signature'.
time_signature_mapping = {code: f"{code}_signature" for code in range(6)}

# Codes outside the mapping become NaN in the new column.
df['time_signature_song'] = df['time_signature'].map(time_signature_mapping)

In [126]:
# The numeric 'time_signature' column is superseded by 'time_signature_song'.
df = df.drop(columns='time_signature')

In [127]:
# Inspect the schema after recoding key, mode and time signature.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17895 entries, 0 to 17894
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   name                 17895 non-null  object 
 1   duration_ms          17895 non-null  int64  
 2   explicit             17895 non-null  object 
 3   popularity           17895 non-null  int64  
 4   artists              17895 non-null  object 
 5   album_name           17895 non-null  object 
 6   danceability         17895 non-null  float64
 7   energy               17895 non-null  float64
 8   loudness             17895 non-null  float64
 9   speechiness          17895 non-null  float64
 10  acousticness         17895 non-null  float64
 11  instrumentalness     17895 non-null  float64
 12  liveness             17895 non-null  float64
 13  valence              17895 non-null  float64
 14  tempo                17895 non-null  float64
 15  n_beats              17895 non-null 

## processing

In [128]:
# List the distinct values of 'processing' to judge whether it can be treated as categorical.
df['processing'].unique()

array([1.27930535, 2.36741214, 3.70048309, 4.06708595, 0.9160105 ,
       1.1709531 , 1.34355828, 2.72590361, 3.3490566 , 0.75738916,
       1.73891626, 0.74811625])

In [129]:
# Turn every 'processing' value into a 'processing_<value>' item label.
df['processing_cat'] = df['processing'].map("processing_{}".format)

In [130]:
# The numeric 'processing' column is superseded by 'processing_cat'.
df = df.drop(columns='processing')

## bins

In [131]:
# Continuous features to discretise; each one is split into the same number of bins.
bins = dict.fromkeys(
    [
        'duration_ms',
        'popularity',
        'danceability',
        'energy',
        'loudness',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo',
        'n_beats',
        'n_bars',
    ],
    5,
)

In [132]:
# Replace each continuous column with '<column>_<bin index>' labels.
# NOTE(review): this overwrites the numeric columns in place, so the cell
# cannot be re-run without reloading the data first.
for column, num_bins in bins.items():
    # labels=False makes pd.cut return the integer index of the equal-width bin.
    bin_index = pd.cut(df[column], bins=num_bins, labels=False)
    df[column] = [f"{column}_{idx}" for idx in bin_index]

In [133]:
# Confirm that the discretised columns now hold labels (object dtype).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17895 entries, 0 to 17894
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   name                 17895 non-null  object
 1   duration_ms          17895 non-null  object
 2   explicit             17895 non-null  object
 3   popularity           17895 non-null  object
 4   artists              17895 non-null  object
 5   album_name           17895 non-null  object
 6   danceability         17895 non-null  object
 7   energy               17895 non-null  object
 8   loudness             17895 non-null  object
 9   speechiness          17895 non-null  object
 10  acousticness         17895 non-null  object
 11  instrumentalness     17895 non-null  object
 12  liveness             17895 non-null  object
 13  valence              17895 non-null  object
 14  tempo                17895 non-null  object
 15  n_beats              17895 non-null  object
 16  n_ba

# Apriori

In [151]:
# One transaction per row: the list of all cell values of that row.
# NOTE(review): this keeps every column, including the free-text 'name',
# 'artists' and 'album_name' — confirm they are meant to be items.
X = df.to_numpy().tolist()

In [140]:
# Preview the first rows of the fully recoded frame.
df.head()

Unnamed: 0,name,duration_ms,explicit,popularity,artists,album_name,danceability,energy,loudness,speechiness,...,liveness,valence,tempo,n_beats,n_bars,genre,key_song,mode_song,time_signature_song,processing_cat
0,Long Road,duration_ms_1,Non_Explicit,popularity_2,Funki Porcini,Hed Phone Sex,danceability_3,energy_2,loudness_2,speechiness_1,...,liveness_1,valence_3,tempo_2,n_beats_2,n_bars_1,j-dance,F,Major,4_signature,processing_1.2793053545586106
1,"Daniâl My Son, Where Did You Vanish?",duration_ms_3,Non_Explicit,popularity_0,Siavash Amini,A Trail of Laughters,danceability_0,energy_0,loudness_0,speechiness_0,...,liveness_2,valence_0,tempo_1,n_beats_2,n_bars_2,iranian,C#,Minor,4_signature,processing_2.36741214057508
2,Ondskapens Galakse,duration_ms_2,Non_Explicit,popularity_0,Kvelertak,Nattesferd,danceability_1,energy_4,loudness_3,speechiness_0,...,liveness_1,valence_2,tempo_2,n_beats_2,n_bars_2,black-metal,D,Major,4_signature,processing_3.70048309178744
3,Can't Look Away,duration_ms_2,Non_Explicit,popularity_1,The Wood Brothers,One Drop of Truth,danceability_2,energy_2,loudness_3,speechiness_0,...,liveness_0,valence_1,tempo_2,n_beats_2,n_bars_2,bluegrass,F,Major,4_signature,processing_1.2793053545586106
4,Thunderground,duration_ms_2,Non_Explicit,popularity_1,The Darkraver;DJ Vince,Happy Hardcore Top 100,danceability_3,energy_3,loudness_2,speechiness_0,...,liveness_1,valence_3,tempo_3,n_beats_2,n_bars_2,happy,G,Major,4_signature,processing_4.067085953878407


In [159]:
# Thresholds, reused by the rule-mining cell.
supp = 20  # minimum support, in percent
zmin = 2   # smallest itemset size to report

# Mine itemsets with target "c"; report="S" attaches the support as a percentage.
itemsets = apriori(
    X,
    target="c",
    supp=supp,
    zmin=zmin,
    report="S",
)

# One row per mined itemset together with its support.
pd.DataFrame(itemsets, columns=["frequent_itemset", "support"])

Unnamed: 0,frequent_itemset,support
0,"(instrumentalness_4, Non_Explicit)",20.201174
1,"(tempo_3, 4_signature)",20.117351
2,"(tempo_3, Non_Explicit)",20.374406
3,"(valence_0, Non_Explicit)",21.279687
4,"(valence_1, 4_signature)",21.693210
...,...,...
762,"(Major, Non_Explicit)",69.756915
763,"(speechiness_0, 4_signature)",69.617212
764,"(speechiness_0, 4_signature, Non_Explicit)",67.443420
765,"(speechiness_0, Non_Explicit)",73.942442


In [158]:
# Minimum rule confidence, in percent.
conf = 60

# Mine association rules; report="aScl" appends absolute support, percentage
# support, confidence and lift to each (consequent, antecedent) pair.
rules = apriori(X, target="r", supp=supp, zmin=zmin, conf=conf, report="aScl")

rule_columns = ["consequent", "antecedent", "abs_support", "%_support", "confidence", "lift"]
rules_df = pd.DataFrame(rules, columns=rule_columns)

# Display the rules from the highest lift to the lowest.
rules_df.sort_values("lift", ascending=False)

Unnamed: 0,consequent,antecedent,abs_support,%_support,confidence,lift
87,n_beats_2,"(n_bars_2, 4_signature)",3572,19.960883,0.912388,3.649349
84,n_beats_2,"(n_bars_2, 4_signature, Non_Explicit)",3399,18.994132,0.911749,3.646792
88,n_bars_2,"(n_beats_2, 4_signature)",3572,19.960883,0.856389,3.580628
85,n_bars_2,"(n_beats_2, 4_signature, Non_Explicit)",3399,18.994132,0.854879,3.574314
92,n_beats_2,"(n_bars_2,)",3715,20.759989,0.867991,3.471769
...,...,...,...,...,...,...
3388,speechiness_0,"(duration_ms_1, acousticness_0, instrumentalne...",2583,14.434199,0.665722,0.873202
817,speechiness_0,"(energy_4, duration_ms_1)",2620,14.640961,0.664132,0.871116
886,speechiness_0,"(energy_4, acousticness_0, Major)",2756,15.400950,0.661546,0.867724
976,speechiness_0,"(energy_4, instrumentalness_0, 4_signature)",2838,15.859179,0.656944,0.861689


In [161]:
# Order the rules from the most supported to the least supported, then show them.
rules_df_sorted = rules_df.sort_values("%_support", ascending=False)
rules_df_sorted

Unnamed: 0,consequent,antecedent,abs_support,%_support,confidence,lift
4327,4_signature,"(Non_Explicit,)",15340,85.722269,0.912118,1.000451
4326,Non_Explicit,"(4_signature,)",15340,85.722269,0.940239,1.000451
4325,speechiness_0,"(Non_Explicit,)",13232,73.942442,0.786776,1.031984
4324,Non_Explicit,"(speechiness_0,)",13232,73.942442,0.969875,1.031984
4318,Major,"(Non_Explicit,)",12483,69.756915,0.742240,1.003126
...,...,...,...,...,...,...
1809,acousticness_0,"(n_beats_1, n_bars_1, loudness_3, Major, Non_E...",2187,12.221291,0.600989,1.085895
1347,loudness_3,"(tempo_2, liveness_0, Major, 4_signature)",2185,12.210115,0.607113,0.997364
633,duration_ms_1,"(danceability_3, Major, speechiness_0)",2183,12.198938,0.600550,1.094272
1152,tempo_2,"(n_bars_1, duration_ms_1, acousticness_0, 4_si...",2178,12.170997,0.603659,1.290155


In [165]:
# Inspect the antecedent column: each entry is a tuple of items.
rules_df['antecedent']

0       (instrumentalness_4, Non_Explicit)
1                    (instrumentalness_4,)
2       (instrumentalness_4, Non_Explicit)
3                    (instrumentalness_4,)
4       (instrumentalness_4, Non_Explicit)
                       ...                
4323                        (4_signature,)
4324                      (speechiness_0,)
4325                       (Non_Explicit,)
4326                        (4_signature,)
4327                       (Non_Explicit,)
Name: antecedent, Length: 4328, dtype: object

In [196]:
# Boolean mask: True where 'disney' is one of the items of the antecedent tuple.
# The antecedents are tuples, not strings, so test membership explicitly
# instead of going through the string accessor.
rules_df['antecedent'].map(lambda items: 'disney' in items).astype(bool)

0       False
1       False
2       False
3       False
4       False
        ...  
4323    False
4324    False
4325    False
4326    False
4327    False
Name: antecedent, Length: 4328, dtype: bool

In [197]:
# Rules that have the item 'disney' in the antecedent.
# Antecedents are tuples of items, so check exact membership per rule;
# astype(bool) keeps the mask boolean even when there are no rules at all.
rules_with_disney_antecedent = rules_df[
    rules_df['antecedent'].map(lambda items: 'disney' in items).astype(bool)
]

# Rules that have the item 'disney' as consequent.
# The consequent is a single item, so compare for equality rather than
# substring containment (which would also match e.g. 'disney_x').
rules_with_disney_consequent = rules_df[rules_df['consequent'] == 'disney']

# NOTE(review): both results are empty in the recorded output; presumably no
# rule involving 'disney' passes the mining thresholds — confirm.

# Print the rules with 'disney' in the antecedent.
print("Regole con 'disney' tra gli antecedenti:")
print(rules_with_disney_antecedent)

# Print the rules with 'disney' as consequent.
print("\nRegole con 'disney' tra i consequent:")
print(rules_with_disney_consequent)

Regole con 'disney' tra gli antecedenti:
Empty DataFrame
Columns: [consequent, antecedent, abs_support, %_support, confidence, lift]
Index: []

Regole con 'disney' tra i consequent:
Empty DataFrame
Columns: [consequent, antecedent, abs_support, %_support, confidence, lift]
Index: []
