## Setup

### Contains useful pattern mining tools

In [1]:
%pip install mlxtend




You should consider upgrading via the 'c:\Users\Fiona Eguare\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


### Useful imports
Below are the imports that were useful in the cleaning & pre-processing, and the pattern mining

In [40]:
import pandas as pd
import numpy as np
import time
import tracemalloc
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

### Set dataframe default to not truncate the values

In [None]:
pd.set_option("display.max_colwidth", None)

## Cleaning & Preprocessing (general)
We started the cleaning the data by dropping the unneccesary columns reduce the dimensions of the dataset. The columns that were dropped didn't contribute to any improvements in regards of the classification problem so we decided to drop them. We started by converting the strings to floats by encoding them, but we then decided to drop them anyways since they didn't contribute to the algorithm. We also noticed some inconsistincies, for example some values were just missing, others conained questionmarks (?) and others said 'NaN'. We decided to replace all of the missing values with 'NaN' so it would be easier to remove all of them later.

Furthermore, we replaced the mode (Minor/Major) with the float values 0.0 and 1.0, so that all of the feature data is stored as the same type (float64). We also realised that there are five rows that only contain NaN values, so we removed those as well.

Lastly we noticed that the column 'tempo' for some reason contained lots of missing values, so we dropped that as well.

In [19]:
# Fetch Data
data = pd.read_csv('music_dataset.csv', header='infer')

# Drop unnecessary columns
columns_drop = ['instance_id', 'track_name', 'key', 'obtained_date', 'artist_name']  # popularity #duration_ms
rows_drop = ['Rap', 'Alternative']

# Drop rows based on conditions
data = data[~data['music_genre'].isin(rows_drop)]
data = data.drop(columns=columns_drop)
print(data.head(10))

# Replace '?' with NaN
data = data.applymap(lambda cell: np.NaN if str(cell) == "?" else cell)

# Replace missing values with NaN
data = data.fillna(np.NaN)

# Replace mode with binary numbers
data['mode'] = data['mode'].replace({'Minor': 0.0, 'Major': 1.0})

# Count missing values for every column in our dataset
NaN_values = data.isna().sum()
print("Missing values per column:")
print(NaN_values)

# Check to see if they are all on the same row
NaN_rows = data[data.isna().all(axis=1)]
print(NaN_rows)             

# Drop rows with all NaN values
data = data.dropna(how='all')

# Lots of missing values, drop this column
data = data.drop(columns='tempo')

   popularity  acousticness  danceability  duration_ms  energy  \
0        27.0       0.00468         0.652         -1.0   0.941   
1        31.0       0.01270         0.622     218293.0   0.890   
2        28.0       0.00306         0.620     215613.0   0.755   
3        34.0       0.02540         0.774     166875.0   0.700   
4        32.0       0.00465         0.638     222369.0   0.587   
5        47.0       0.00523         0.755     519468.0   0.731   
6        46.0       0.02890         0.572     214408.0   0.803   
7        43.0       0.02970         0.809     416132.0   0.706   
8        39.0       0.00299         0.509     292800.0   0.921   
9        22.0       0.00934         0.578     204800.0   0.731   

   instrumentalness  liveness  loudness   mode  speechiness  \
0          0.792000    0.1150    -5.201  Minor       0.0748   
1          0.950000    0.1240    -7.043  Minor       0.0300   
2          0.011800    0.5340    -4.617  Major       0.0345   
3          0.002530  

  data = data.applymap(lambda cell: np.NaN if str(cell) == "?" else cell)


Missing values per column:
popularity             5
acousticness           5
danceability           5
duration_ms            5
energy                 5
instrumentalness       5
liveness               5
loudness               5
mode                   5
speechiness            5
tempo               3984
valence                5
music_genre            5
dtype: int64
       popularity  acousticness  danceability  duration_ms  energy  \
10000         NaN           NaN           NaN          NaN     NaN   
10001         NaN           NaN           NaN          NaN     NaN   
10002         NaN           NaN           NaN          NaN     NaN   
10003         NaN           NaN           NaN          NaN     NaN   
10004         NaN           NaN           NaN          NaN     NaN   

       instrumentalness  liveness  loudness  mode  speechiness tempo  valence  \
10000               NaN       NaN       NaN   NaN          NaN   NaN      NaN   
10001               NaN       NaN       NaN   NaN   

## Pattern Mining

### Select the data to be mined (all numeric data)

In [36]:
pm_all_data = data.drop(columns='music_genre')
pm_all_data

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,mode,speechiness,valence
0,27.0,0.00468,0.652,-1.0,0.941,0.79200,0.115,-5.201,0.0,0.0748,0.759
1,31.0,0.01270,0.622,218293.0,0.890,0.95000,0.124,-7.043,0.0,0.0300,0.531
2,28.0,0.00306,0.620,215613.0,0.755,0.01180,0.534,-4.617,1.0,0.0345,0.333
3,34.0,0.02540,0.774,166875.0,0.700,0.00253,0.157,-4.498,1.0,0.2390,0.270
4,32.0,0.00465,0.638,222369.0,0.587,0.90900,0.157,-6.266,1.0,0.0413,0.323
...,...,...,...,...,...,...,...,...,...,...,...
50000,59.0,0.03340,0.913,-1.0,0.574,0.00000,0.119,-7.022,1.0,0.2980,0.330
50001,72.0,0.15700,0.709,251860.0,0.362,0.00000,0.109,-9.814,1.0,0.0550,0.113
50002,51.0,0.00597,0.693,189483.0,0.763,0.00000,0.143,-5.443,1.0,0.1460,0.395
50003,65.0,0.08310,0.782,262773.0,0.472,0.00000,0.106,-5.016,0.0,0.0441,0.354


### Functions to normalise
Can be called to normalise the given data (df) by the normalisation method specified by normType (min-max -> 'm', z-score -> 'z')

In [7]:
def dfNorm(df, normType):
  for (column_name, column) in df.transpose().iterrows():
    if normType == 'm':
      mmNormalise(df, column_name)
    elif normType == 'z':
      zNormalise(df, column_name)
  return df

# Min-Max normalisation function
def mmNormalise(df, name):
  df[name] = ((df[name] - df[name].min()) / (df[name].max() - df[name].min())) - 0.5 # -0.5 to centre it around 0, to accomodate z score in discretisation function. [-0.5,0.5 normalised]

# Z-Score normalisation function
def zNormalise(df, name):
  df[name] = (df[name] - df[name].mean()) / df[name].std()

### Functions to discretise
Can be called to create and populate low and high collums for each arrtribute

In [9]:
def stratifyData(df):
  for (column_name, column) in df.transpose().iterrows():

    new_column_name = 'Low ' + column_name
    df[new_column_name] = df[column_name].apply(isLow)

    new_column_name = 'High ' + column_name
    df[new_column_name] = df[column_name].apply(isHigh)

    df = df.drop(labels=column_name, axis=1)

  return df

# To fill the 'low...' colums
def isLow(val):
  return True if float(val) < 0 else False

# To fill the 'high...' colums
def isHigh(val):
  return True if float(val) >= 0 else False

### Functions to mine frequent patterns using above functions
- getFreq can be called to get a data frame containing all frequent itemsets with a minimum support of minSup, using the algorithm specified by pmType (A.priori -> 'a', FP-Growth -> 'f').
- getMore can be called to get a a datafram containing more interestingness data (association rules) for the frequent patterns given, with a default minimum confidence of 0.7.
- pMine takes the dataframe of data to be mined (df), the type of normalisation desired (nType) and the minimum support desired (minSup) as input, and displays the resluting frequent patterns & their association rules.

In [62]:
def getFreq(df, minSup, pmType):
  if pmType == 'a':
    return apriori(df, use_colnames=True, min_support=minSup)
  elif pmType == 'f':
    return fpgrowth(df, use_colnames=True, min_support=minSup)

def getMore(freq):
  return association_rules(freq)[['antecedents', 'consequents', 'antecedent support', 'consequent support',	'support', 'confidence', 'lift']]

In [61]:
def pMine(df, nType, minSup):
  # normalise the data
  ndf = dfNorm(df, nType)

  # stratify the data
  ndf = stratifyData(ndf)
  display(ndf)

  # get list of frequent patterns with Minimum Support = minSup and display
  freq = getFreq(ndf, minSup, 'f')
  display(freq)

  # get more pattern interestingness metrics for freq patterns (with Minimum Confidence 0.7) and display
  more = getMore(freq)
  display(more)

### Example Frequent Pattern mining outputs
The first table show the dicretised form of the data to be mined, the second shows the frequent patterns mined, and the third shows some interestingness data relating to the frequent patterns.

In [60]:
pMine(pm_all_data.copy(), 'm', 0.7)

Unnamed: 0,Low popularity,High popularity,Low acousticness,High acousticness,Low danceability,High danceability,Low duration_ms,High duration_ms,Low energy,High energy,...,Low liveness,High liveness,Low loudness,High loudness,Low mode,High mode,Low speechiness,High speechiness,Low valence,High valence
0,True,False,True,False,False,True,True,False,False,True,...,True,False,False,True,True,False,True,False,False,True
1,True,False,True,False,False,True,True,False,False,True,...,True,False,False,True,True,False,True,False,False,True
2,True,False,True,False,False,True,True,False,False,True,...,False,True,False,True,False,True,True,False,True,False
3,True,False,True,False,False,True,True,False,False,True,...,True,False,False,True,False,True,True,False,True,False
4,True,False,True,False,False,True,True,False,False,True,...,True,False,False,True,False,True,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,False,True,True,False,False,True,True,False,False,True,...,True,False,False,True,False,True,True,False,True,False
50001,False,True,True,False,False,True,True,False,True,False,...,True,False,False,True,False,True,True,False,True,False
50002,False,True,True,False,False,True,True,False,False,True,...,True,False,False,True,False,True,True,False,True,False
50003,False,True,True,False,False,True,True,False,True,False,...,True,False,False,True,True,False,True,False,True,False


Unnamed: 0,support,itemsets
0,0.999875,(Low duration_ms)
1,0.9938,(Low speechiness)
2,0.94215,(Low liveness)
3,0.92445,(High loudness)
4,0.766025,(Low instrumentalness)
5,0.993675,"(Low duration_ms, Low speechiness)"
6,0.94205,"(Low duration_ms, Low liveness)"
7,0.936875,"(Low liveness, Low speechiness)"
8,0.936775,"(Low duration_ms, Low liveness, Low speechiness)"
9,0.92435,"(Low duration_ms, High loudness)"


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
0,(Low duration_ms),(Low speechiness),0.999875,0.993800,0.993675,0.993799,0.999999
1,(Low speechiness),(Low duration_ms),0.993800,0.999875,0.993675,0.999874,0.999999
2,(Low duration_ms),(Low liveness),0.999875,0.942150,0.942050,0.942168,1.000019
3,(Low liveness),(Low duration_ms),0.942150,0.999875,0.942050,0.999894,1.000019
4,(Low liveness),(Low speechiness),0.942150,0.993800,0.936875,0.994401,1.000605
...,...,...,...,...,...,...,...
100,"(Low duration_ms, Low instrumentalness)","(High loudness, Low liveness)",0.765925,0.869225,0.701150,0.915429,1.053155
101,"(High loudness, Low instrumentalness)","(Low duration_ms, Low liveness)",0.747075,0.942050,0.701150,0.938527,0.996260
102,"(High loudness, Low liveness)","(Low duration_ms, Low instrumentalness)",0.869225,0.765925,0.701150,0.806638,1.053155
103,"(Low instrumentalness, Low liveness)","(Low duration_ms, High loudness)",0.718300,0.924350,0.701150,0.976124,1.056011


## Comparisons


### Min-Max vs. Z-Score normalisation

We can see that while both methods work, Z-Score normalisation gives rise to much less frequent patterns. This is likely due to the variation in the domain of the values, attribute to attribute, and the effect it has on what is classified as low or high. Because of this I think that Min-Max normalisation is more suitable for this dataset.

In [59]:
display('Min-Max', dfNorm(pm_all_data.copy(), 'm'), 'Z-Score', dfNorm(pm_all_data.copy(), 'z'))

print('Min-Max')
display(pMine(pm_all_data.copy(), 'm', 0.7))
print('Z-Score')
display(pMine(pm_all_data.copy(), 'z', 0.7))

'Min-Max'

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,mode,speechiness,valence
0,-0.218750,-0.495301,0.139465,-0.500000,0.441896,0.295181,-0.393642,0.323883,-0.5,-0.442916,0.265121
1,-0.177083,-0.487249,0.107081,-0.454810,0.390804,0.453815,-0.384554,0.287616,-0.5,-0.491628,0.035282
2,-0.208333,-0.496928,0.104922,-0.455365,0.255562,-0.488153,0.029450,0.335381,0.5,-0.486735,-0.164315
3,-0.145833,-0.474498,0.271157,-0.465454,0.200463,-0.497460,-0.351231,0.337724,0.5,-0.264380,-0.227823
4,-0.166667,-0.495331,0.124352,-0.453966,0.087260,0.412651,-0.351231,0.302914,0.5,-0.479341,-0.174395
...,...,...,...,...,...,...,...,...,...,...,...
50000,0.114583,-0.466466,0.421200,-0.500000,0.074237,-0.500000,-0.389602,0.288029,0.5,-0.200228,-0.167339
50001,0.250000,-0.342369,0.200993,-0.447861,-0.138144,-0.500000,-0.399700,0.233058,0.5,-0.464445,-0.386089
50002,0.031250,-0.494006,0.183722,-0.460774,0.263576,-0.500000,-0.365368,0.319118,0.5,-0.365500,-0.101815
50003,0.177083,-0.416566,0.279793,-0.445602,-0.027946,-0.500000,-0.402729,0.327525,-0.5,-0.476297,-0.143145


'Z-Score'

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,mode,speechiness,valence
0,-0.930057,-0.942231,0.609281,-1.652803,1.304673,1.648396,-0.476741,0.690479,-1.375502,-0.084666,1.184586
1,-0.672340,-0.919777,0.441458,-0.054252,1.120651,2.102344,-0.421715,0.412047,-1.375502,-0.574569,0.288730
2,-0.865627,-0.946766,0.430270,-0.073878,0.633536,-0.593186,2.084991,0.778755,0.726989,-0.525360,-0.489250
3,-0.479052,-0.884220,1.291761,-0.430782,0.435081,-0.619820,-0.219956,0.796743,0.726989,1.710918,-0.736789
4,-0.607911,-0.942315,0.530964,-0.024404,0.027348,1.984548,-0.219956,0.529497,0.726989,-0.451000,-0.528542
...,...,...,...,...,...,...,...,...,...,...,...
50000,1.131678,-0.861823,2.069342,-1.652803,-0.019560,-0.627089,-0.452285,0.415221,0.726989,2.356103,-0.501037
50001,1.969258,-0.515776,0.928145,0.191556,-0.784512,-0.627089,-0.513424,-0.006810,0.726989,-0.301186,-1.353672
50002,0.616244,-0.938619,0.838639,-0.265226,0.662402,-0.627089,-0.305551,0.653899,0.726989,0.693931,-0.245640
50003,1.518253,-0.722676,1.336514,0.271471,-0.387603,-0.627089,-0.531766,0.718443,-1.375502,-0.420381,-0.406737


Min-Max


Unnamed: 0,Low popularity,High popularity,Low acousticness,High acousticness,Low danceability,High danceability,Low duration_ms,High duration_ms,Low energy,High energy,...,Low liveness,High liveness,Low loudness,High loudness,Low mode,High mode,Low speechiness,High speechiness,Low valence,High valence
0,True,False,True,False,False,True,True,False,False,True,...,True,False,False,True,True,False,True,False,False,True
1,True,False,True,False,False,True,True,False,False,True,...,True,False,False,True,True,False,True,False,False,True
2,True,False,True,False,False,True,True,False,False,True,...,False,True,False,True,False,True,True,False,True,False
3,True,False,True,False,False,True,True,False,False,True,...,True,False,False,True,False,True,True,False,True,False
4,True,False,True,False,False,True,True,False,False,True,...,True,False,False,True,False,True,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,False,True,True,False,False,True,True,False,False,True,...,True,False,False,True,False,True,True,False,True,False
50001,False,True,True,False,False,True,True,False,True,False,...,True,False,False,True,False,True,True,False,True,False
50002,False,True,True,False,False,True,True,False,False,True,...,True,False,False,True,False,True,True,False,True,False
50003,False,True,True,False,False,True,True,False,True,False,...,True,False,False,True,True,False,True,False,True,False


Unnamed: 0,support,itemsets
0,0.999875,(Low duration_ms)
1,0.9938,(Low speechiness)
2,0.94215,(Low liveness)
3,0.92445,(High loudness)
4,0.766025,(Low instrumentalness)
5,0.993675,"(Low duration_ms, Low speechiness)"
6,0.94205,"(Low duration_ms, Low liveness)"
7,0.936875,"(Low liveness, Low speechiness)"
8,0.936775,"(Low duration_ms, Low liveness, Low speechiness)"
9,0.92435,"(Low duration_ms, High loudness)"


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
0,(Low duration_ms),(Low speechiness),0.999875,0.993800,0.993675,0.993799,0.999999
1,(Low speechiness),(Low duration_ms),0.993800,0.999875,0.993675,0.999874,0.999999
2,(Low duration_ms),(Low liveness),0.999875,0.942150,0.942050,0.942168,1.000019
3,(Low liveness),(Low duration_ms),0.942150,0.999875,0.942050,0.999894,1.000019
4,(Low liveness),(Low speechiness),0.942150,0.993800,0.936875,0.994401,1.000605
...,...,...,...,...,...,...,...
100,"(Low duration_ms, Low instrumentalness)","(High loudness, Low liveness)",0.765925,0.869225,0.701150,0.915429,1.053155
101,"(High loudness, Low instrumentalness)","(Low duration_ms, Low liveness)",0.747075,0.942050,0.701150,0.938527,0.996260
102,"(High loudness, Low liveness)","(Low duration_ms, Low instrumentalness)",0.869225,0.765925,0.701150,0.806638,1.053155
103,"(Low instrumentalness, Low liveness)","(Low duration_ms, High loudness)",0.718300,0.924350,0.701150,0.976124,1.056011


None

Z-Score


Unnamed: 0,Low popularity,High popularity,Low acousticness,High acousticness,Low danceability,High danceability,Low duration_ms,High duration_ms,Low energy,High energy,...,Low liveness,High liveness,Low loudness,High loudness,Low mode,High mode,Low speechiness,High speechiness,Low valence,High valence
0,True,False,True,False,False,True,True,False,False,True,...,True,False,False,True,True,False,True,False,False,True
1,True,False,True,False,False,True,True,False,False,True,...,True,False,False,True,True,False,True,False,False,True
2,True,False,True,False,False,True,True,False,False,True,...,False,True,False,True,False,True,True,False,True,False
3,True,False,True,False,False,True,True,False,False,True,...,True,False,False,True,False,True,False,True,True,False
4,True,False,True,False,False,True,True,False,False,True,...,True,False,False,True,False,True,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,False,True,True,False,False,True,True,False,True,False,...,True,False,False,True,False,True,False,True,True,False
50001,False,True,True,False,False,True,False,True,True,False,...,True,False,True,False,False,True,True,False,True,False
50002,False,True,True,False,False,True,True,False,False,True,...,True,False,False,True,False,True,False,True,True,False
50003,False,True,True,False,False,True,False,True,True,False,...,True,False,False,True,True,False,True,False,True,False


Unnamed: 0,support,itemsets
0,0.760325,(Low speechiness)
1,0.718075,(Low instrumentalness)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift


None

### Apriori vs. FP-Growth

Much of this section taken from reference [Github repo](https://github.com/mustafahakkoz/Classification_Clustering_Freq_Pattern_Mining/blob/main/3.frequent-pattern-miningv2.ipynb) provided on course website (repo by mustafahakkoz)

#### Function to calulate the memory used to excecute an algorithm

In [32]:
def calculate_mem_use(snapshot, key_type='lineno', func=""):
    snapshot = snapshot.filter_traces((
        tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
        tracemalloc.Filter(False, "<unknown>"),
    ))
    top_stats = snapshot.statistics(key_type)
    total = 0
    for index, stat in enumerate(top_stats, 1):
        frame = stat.traceback[0]
        if func in frame.filename or "data" in frame.filename:
            total = total + stat.size
    return total

#### To display, memory used, time taken, and output for each algorithm
We can see that while the output was the same from both algorithms, Aprior was more efficient in terms of time and memory usage. It took 1,200B, and 1.18429660797ns less to complete the same task with Apriori, compared to FP-Growth. In other words, FP-Growth took 7% more memory, and 3949% more time.

In [46]:
# normalise & discretise the data
ndf = dfNorm(pm_all_data.copy(), 'm')
ndf = stratifyData(ndf)

# Apriori
start = time.time()
tracemalloc.start()
ap = apriori(ndf, use_colnames=True, min_support=0.8)
snapshot = tracemalloc.take_snapshot()
done = time.time()
elapsed = done - start
mem_use = calculate_mem_use(snapshot, func="fpgrowth")
print("- APRIORI")
print("Mem. use : %1.f B" %(mem_use))
print("Elapsed Time : ",elapsed," ns")
display(ap)

# FP-Growth
start = time.time()
tracemalloc.start()
fp = fpgrowth(ndf, use_colnames=True, min_support=0.8)
snapshot = tracemalloc.take_snapshot()
done = time.time()
elapsed = done - start
mem_use = calculate_mem_use(snapshot, func="fpgrowth")
print("- FP-GROWTH")
print("Mem. use : %1.f B" %(mem_use))
print("Elapsed Time : ",elapsed," ns")
display(fp)

- APRIORI
Mem. use : 16626 B
Elapsed Time :  0.029993534088134766  ns


Unnamed: 0,support,itemsets
0,0.999875,(Low duration_ms)
1,0.94215,(Low liveness)
2,0.92445,(High loudness)
3,0.9938,(Low speechiness)
4,0.94205,"(Low duration_ms, Low liveness)"
5,0.92435,"(Low duration_ms, High loudness)"
6,0.993675,"(Low duration_ms, Low speechiness)"
7,0.869225,"(High loudness, Low liveness)"
8,0.936875,"(Low liveness, Low speechiness)"
9,0.918525,"(High loudness, Low speechiness)"


- FP-GROWTH
Mem. use : 17826 B
Elapsed Time :  1.2142901420593262  ns


Unnamed: 0,support,itemsets
0,0.999875,(Low duration_ms)
1,0.9938,(Low speechiness)
2,0.94215,(Low liveness)
3,0.92445,(High loudness)
4,0.993675,"(Low duration_ms, Low speechiness)"
5,0.94205,"(Low duration_ms, Low liveness)"
6,0.936875,"(Low liveness, Low speechiness)"
7,0.936775,"(Low duration_ms, Low liveness, Low speechiness)"
8,0.92435,"(Low duration_ms, High loudness)"
9,0.918525,"(High loudness, Low speechiness)"
