In [1]:
import pandas as pd
import numpy as numpy
import sys
import mlxtend

In [2]:
# Read the Last.fm listening records from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="lastfm.csv")
# Preview the first 20 rows.
data.head(n=20)

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany
5,1,schandmaul,f,Germany
6,1,edguy,f,Germany
7,1,jack johnson,f,Germany
8,1,eluveitie,f,Germany
9,1,the killers,f,Germany


In [3]:
# Keep only the records whose `country` is one of the four countries of interest.
slav_countries = ['Russian Federation', 'Belarus', 'Poland', 'Ukraine']
data_slav = data.query("country in @slav_countries")
data_slav.head(n=10)

Unnamed: 0,user,artist,sex,country
490,35,radiohead,m,Ukraine
491,35,the kooks,m,Ukraine
492,35,coldplay,m,Ukraine
493,35,nine inch nails,m,Ukraine
494,35,hot chip,m,Ukraine
495,35,black rebel motorcycle club,m,Ukraine
496,35,lcd soundsystem,m,Ukraine
497,35,kasabian,m,Ukraine
498,35,n*e*r*d,m,Ukraine
499,35,editors,m,Ukraine


In [4]:
# Print the column dtypes, non-null counts and memory usage of the filtered frame.
data_slav.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32376 entries, 490 to 289881
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user     32376 non-null  int64 
 1   artist   32376 non-null  object
 2   sex      32376 non-null  object
 3   country  32376 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.2+ MB


In [5]:
# Collapse the per-artist rows into one row per (user, country, sex) group,
# joining that group's artists into a single comma-separated string.
data_slav_by_user = (
    data_slav
    .groupby(['user', 'country', 'sex'])['artist']
    .apply(','.join)
    .reset_index()
)
data_slav_by_user

Unnamed: 0,user,country,sex,artist
0,35,Ukraine,m,"radiohead,the kooks,coldplay,nine inch nails,h..."
1,39,Russian Federation,m,"britney spears,a-ha,joss stone,christina aguil..."
2,50,Russian Federation,f,"the ataris,yann tiersen,the smashing pumpkins,..."
3,64,Poland,m,"apparat,drowning pool,armin van buuren,alexiso..."
4,82,Russian Federation,m,"p.o.d.,chimaira,arch enemy,scar symmetry,slipk..."
...,...,...,...,...
2002,19703,Poland,f,"the offspring,pearl jam,the smashing pumpkins,..."
2003,19706,Russian Federation,f,"enigma,pink floyd,the offspring,scorpions"
2004,19710,Russian Federation,f,"vnv nation,nine inch nails,kmfdm,apoptygma ber..."
2005,19712,Poland,m,"john williams,abba,vangelis,hans zimmer,céline..."


In [6]:
# Count how many grouped rows fall into each country.
data_slav_by_user.country.value_counts()

Poland                983
Russian Federation    785
Ukraine               183
Belarus                56
Name: country, dtype: int64

In [7]:
# One-hot encode the comma-separated artist strings: one 0/1 column per artist.
# NOTE(review): splitting on ',' would also split any artist name that itself
# contains a comma — confirm the data has none.
dummy_data_slav_by_user = data_slav_by_user['artist'].str.get_dummies(sep=',')

# Attach the indicator columns next to the user/country/sex/artist columns.
# NOTE(review): this rebinds `data_slav`, which until now held the filtered raw
# rows; re-running earlier cells after this one will see the wide frame instead.
data_slav = pd.concat(
    [data_slav_by_user, dummy_data_slav_by_user],
    axis=1,
)
data_slav.head(n=10)

Unnamed: 0,user,country,sex,artist,...and you will know us by the trail of dead,2pac,3 doors down,30 seconds to mars,311,36 crazyfists,...,weezer,wilco,within temptation,wolfgang amadeus mozart,wu-tang clan,yann tiersen,yeah yeah yeahs,yellowcard,zero 7,Édith piaf
0,35,Ukraine,m,"radiohead,the kooks,coldplay,nine inch nails,h...",0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,39,Russian Federation,m,"britney spears,a-ha,joss stone,christina aguil...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,50,Russian Federation,f,"the ataris,yann tiersen,the smashing pumpkins,...",0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
3,64,Poland,m,"apparat,drowning pool,armin van buuren,alexiso...",0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,82,Russian Federation,m,"p.o.d.,chimaira,arch enemy,scar symmetry,slipk...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,83,Poland,m,"black eyed peas,dido,nelly furtado",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,84,Russian Federation,m,"queen,radiohead,the beatles,louis armstrong,el...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,92,Ukraine,m,"armin van buuren,nickelback,depeche mode,enigm...",0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,113,Poland,m,"flogging molly,rammstein,nine inch nails,board...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,129,Poland,f,"adele,placebo,tricky,röyksopp,amy winehouse,ma...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
from mlxtend.frequent_patterns import apriori, association_rules

# Keep only the one-hot artist columns and cast them to bool before mining:
# mlxtend's frequent-pattern functions warn about (and run slower on) frames
# whose values are 0/1 integers rather than booleans. The mined itemsets and
# their supports are unchanged by the cast.
artist_onehot = (
    data_slav
    .drop(columns=['user', 'country', 'artist', 'sex'])
    .astype(bool)
)

# Mine every itemset that occurs in at least 1% of the rows.
frequent_itemsets = apriori(artist_onehot, min_support=0.01, use_colnames=True)

# Show the itemsets from most to least frequent.
frequent_itemsets.sort_values(by=['support'], ascending=False)

Unnamed: 0,support,itemsets
309,0.129048,(metallica)
478,0.123568,(the prodigy)
393,0.122073,(red hot chili peppers)
386,0.114599,(radiohead)
102,0.112108,(coldplay)
...,...,...
726,0.010463,"(coma, linkin park)"
724,0.010463,"(coma, koЯn)"
1187,0.010463,"(portishead, red hot chili peppers)"
721,0.010463,"(coma, disturbed)"


In [17]:
# Derive association rules from the Apriori itemsets, keeping only rules whose
# confidence is at least 0.6.
rules = association_rules(
    frequent_itemsets,
    min_threshold=0.6,
    metric="confidence",
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(above & beyond),(armin van buuren),0.021923,0.038864,0.014449,0.659091,16.958916,0.013597,2.819332
1,(ferry corsten),(armin van buuren),0.022422,0.038864,0.015446,0.688889,17.725641,0.014575,3.089366
2,(kaiser chiefs),(franz ferdinand),0.023418,0.056303,0.014449,0.617021,10.958953,0.013131,2.464098
3,(t.i.),(kanye west),0.015944,0.041854,0.012456,0.78125,18.666295,0.011789,4.380098
4,(king crimson),(pink floyd),0.017439,0.099153,0.010463,0.6,6.051256,0.008734,2.252118
5,(the pussycat dolls),(lady gaga),0.01993,0.029895,0.012456,0.625,20.90625,0.011861,2.586946
6,"(paul van dyk, above & beyond)",(armin van buuren),0.012955,0.038864,0.010962,0.846154,21.772189,0.010458,6.247384
7,"(paul van dyk, armin van buuren)",(above & beyond),0.016442,0.021923,0.010962,0.666667,30.409091,0.010601,2.93423
8,"(above & beyond, armin van buuren)",(paul van dyk),0.014449,0.030892,0.010962,0.758621,24.557286,0.010515,4.014877
9,"(myslovitz, akurat)",(happysad),0.018435,0.053812,0.01146,0.621622,11.551802,0.010468,2.500641


In [64]:
# Show the rules with lift above 3 and confidence above 0.5.
# NOTE(review): if `rules` was built with a confidence threshold of 0.6, the
# confidence condition here filters nothing further.
rules.query("lift > 3 and confidence > 0.5")

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,(above & beyond),(armin van buuren),0.021923,0.038864,0.014449,0.659091,16.958916,0.013597,2.819332,1
1,(ferry corsten),(armin van buuren),0.022422,0.038864,0.015446,0.688889,17.725641,0.014575,3.089366,1
2,(kaiser chiefs),(franz ferdinand),0.023418,0.056303,0.014449,0.617021,10.958953,0.013131,2.464098,1
3,(t.i.),(kanye west),0.015944,0.041854,0.012456,0.78125,18.666295,0.011789,4.380098,1
4,(king crimson),(pink floyd),0.017439,0.099153,0.010463,0.6,6.051256,0.008734,2.252118,1
5,(the pussycat dolls),(lady gaga),0.01993,0.029895,0.012456,0.625,20.90625,0.011861,2.586946,1
6,"(paul van dyk, above & beyond)",(armin van buuren),0.012955,0.038864,0.010962,0.846154,21.772189,0.010458,6.247384,2
7,"(paul van dyk, armin van buuren)",(above & beyond),0.016442,0.021923,0.010962,0.666667,30.409091,0.010601,2.93423,2
8,"(above & beyond, armin van buuren)",(paul van dyk),0.014449,0.030892,0.010962,0.758621,24.557286,0.010515,4.014877,2
9,"(myslovitz, akurat)",(happysad),0.018435,0.053812,0.01146,0.621622,11.551802,0.010468,2.500641,2


In [21]:
# Record how many items each rule's antecedent contains (adds a column to `rules`).
rules["antecedent_len"] = rules["antecedents"].apply(len)

# Show rules with at least two antecedent items, confidence above 0.5 and lift above 10.
rules.query("antecedent_len >= 2 and confidence > 0.5 and lift > 10")

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
6,"(paul van dyk, above & beyond)",(armin van buuren),0.012955,0.038864,0.010962,0.846154,21.772189,0.010458,6.247384,2
7,"(paul van dyk, armin van buuren)",(above & beyond),0.016442,0.021923,0.010962,0.666667,30.409091,0.010601,2.93423,2
8,"(above & beyond, armin van buuren)",(paul van dyk),0.014449,0.030892,0.010962,0.758621,24.557286,0.010515,4.014877,2
9,"(myslovitz, akurat)",(happysad),0.018435,0.053812,0.01146,0.621622,11.551802,0.010468,2.500641,2
10,"(happysad, pidżama porno)",(akurat),0.02292,0.048331,0.014449,0.630435,13.044151,0.013342,2.575105,2
11,"(happysad, akurat)",(pidżama porno),0.023916,0.050324,0.014449,0.604167,12.005569,0.013246,2.399182,2
12,"(pidżama porno, akurat)",(happysad),0.020927,0.053812,0.014449,0.690476,12.831349,0.013323,3.056916,2
13,"(happysad, red hot chili peppers)",(akurat),0.016442,0.048331,0.010463,0.636364,13.166823,0.009669,2.61709,2
14,"(akurat, red hot chili peppers)",(happysad),0.016442,0.053812,0.010463,0.636364,11.825758,0.009579,2.602018,2
16,"(ferry corsten, atb)",(armin van buuren),0.011958,0.038864,0.010463,0.875,22.514423,0.009999,7.689088,2


In [38]:
from mlxtend.frequent_patterns import fpgrowth

# Keep only the one-hot artist columns and cast them to bool before mining:
# mlxtend's frequent-pattern functions warn about (and run slower on) frames
# whose values are 0/1 integers rather than booleans. The mined itemsets and
# their supports are unchanged by the cast.
artist_onehot_fpgrowth = (
    data_slav
    .drop(columns=['user', 'country', 'artist', 'sex'])
    .astype(bool)
)

# Mine every itemset that occurs in at least 1% of the rows using FP-Growth.
items_fpgrowth = fpgrowth(artist_onehot_fpgrowth, min_support=0.01, use_colnames=True)

# Show the itemsets from most to least frequent.
items_fpgrowth.sort_values(by=['support'], ascending=False)

Unnamed: 0,support,itemsets
155,0.129048,(metallica)
108,0.123568,(the prodigy)
58,0.122073,(red hot chili peppers)
0,0.114599,(radiohead)
1,0.112108,(coldplay)
...,...,...
565,0.010463,"(portishead, red hot chili peppers)"
842,0.010463,"(pink floyd, rammstein)"
561,0.010463,"(portishead, nirvana)"
553,0.010463,"(massive attack, metallica)"


In [46]:
# Derive rules from the FP-Growth itemsets, keeping only rules whose confidence
# is at least 0.6.
rules_fpgrowth = association_rules(
    items_fpgrowth,
    min_threshold=0.6,
    metric="confidence",
)

# Record how many items each rule's antecedent contains.
rules_fpgrowth["antecedent_len"] = rules_fpgrowth["antecedents"].apply(len)
rules_fpgrowth

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,"(linkin park, koЯn)",(system of a down),0.01993,0.106129,0.012955,0.65,6.124648,0.010839,2.553918,2
1,"(björk, massive attack)",(portishead),0.018934,0.074738,0.011958,0.631579,8.450526,0.010543,2.511424,2
2,"(the doors, led zeppelin)",(pink floyd),0.017937,0.099153,0.01146,0.638889,6.443467,0.009681,2.494653,2
3,(the pussycat dolls),(lady gaga),0.01993,0.029895,0.012456,0.625,20.90625,0.011861,2.586946,1
4,"(myslovitz, pidżama porno)",(happysad),0.015446,0.053812,0.010463,0.677419,12.58871,0.009632,2.933184,2
5,"(happysad, pidżama porno)",(akurat),0.02292,0.048331,0.014449,0.630435,13.044151,0.013342,2.575105,2
6,"(happysad, akurat)",(pidżama porno),0.023916,0.050324,0.014449,0.604167,12.005569,0.013246,2.399182,2
7,"(pidżama porno, akurat)",(happysad),0.020927,0.053812,0.014449,0.690476,12.831349,0.013323,3.056916,2
8,"(happysad, red hot chili peppers)",(akurat),0.016442,0.048331,0.010463,0.636364,13.166823,0.009669,2.61709,2
9,"(akurat, red hot chili peppers)",(happysad),0.016442,0.053812,0.010463,0.636364,11.825758,0.009579,2.602018,2


In [47]:
# Show only the FP-Growth rules whose antecedent has two or more items.
rules_fpgrowth.query("antecedent_len >= 2")

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,"(linkin park, koЯn)",(system of a down),0.01993,0.106129,0.012955,0.65,6.124648,0.010839,2.553918,2
1,"(björk, massive attack)",(portishead),0.018934,0.074738,0.011958,0.631579,8.450526,0.010543,2.511424,2
2,"(the doors, led zeppelin)",(pink floyd),0.017937,0.099153,0.01146,0.638889,6.443467,0.009681,2.494653,2
4,"(myslovitz, pidżama porno)",(happysad),0.015446,0.053812,0.010463,0.677419,12.58871,0.009632,2.933184,2
5,"(happysad, pidżama porno)",(akurat),0.02292,0.048331,0.014449,0.630435,13.044151,0.013342,2.575105,2
6,"(happysad, akurat)",(pidżama porno),0.023916,0.050324,0.014449,0.604167,12.005569,0.013246,2.399182,2
7,"(pidżama porno, akurat)",(happysad),0.020927,0.053812,0.014449,0.690476,12.831349,0.013323,3.056916,2
8,"(happysad, red hot chili peppers)",(akurat),0.016442,0.048331,0.010463,0.636364,13.166823,0.009669,2.61709,2
9,"(akurat, red hot chili peppers)",(happysad),0.016442,0.053812,0.010463,0.636364,11.825758,0.009579,2.602018,2
10,"(myslovitz, akurat)",(happysad),0.018435,0.053812,0.01146,0.621622,11.551802,0.010468,2.500641,2


In [52]:
from mlxtend.frequent_patterns import fpmax

# Keep only the one-hot artist columns and cast them to bool before mining:
# mlxtend's frequent-pattern functions warn about (and run slower on) frames
# whose values are 0/1 integers rather than booleans. The mined itemsets and
# their supports are unchanged by the cast.
artist_onehot_fpmax = (
    data_slav
    .drop(columns=['user', 'country', 'artist', 'sex'])
    .astype(bool)
)

# Mine the maximal itemsets that occur in at least 1% of the rows.
fpm = fpmax(artist_onehot_fpmax, min_support=0.01, use_colnames=True)

# Show the itemsets from most to least frequent.
fpm.sort_values(by=['support'], ascending=False)

Unnamed: 0,support,itemsets
713,0.051320,([unknown])
538,0.037867,(tori amos)
537,0.037867,(clint mansell)
500,0.035874,(god is an astronaut)
501,0.035874,(boards of canada)
...,...,...
420,0.010463,"(bonobo, the cinematic orchestra)"
867,0.010463,"(u2, massive attack)"
878,0.010463,"(u2, coldplay, radiohead)"
879,0.010463,"(30 seconds to mars, slipknot)"


In [63]:
# `fpm` comes from fpmax, which keeps only maximal itemsets, so supports for the
# individual antecedents/consequents are generally missing from it. Hence
# support_only=True: only the rule support is computed and the remaining metric
# columns are left as NaN.
# With support_only=True mlxtend evaluates the threshold on support regardless
# of the `metric` argument, so name the metric explicitly: rules are kept when
# their support is at least 0.025.
rules_fpmax = association_rules(
    fpm,
    metric="support",
    min_threshold=0.025,
    support_only=True,
)
rules_fpmax

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(iron maiden),(metallica),,,0.031888,,,,
1,(metallica),(iron maiden),,,0.031888,,,,
2,(radiohead),(portishead),,,0.027404,,,,
3,(portishead),(radiohead),,,0.027404,,,,
4,(rammstein),(metallica),,,0.027902,,,,
5,(metallica),(rammstein),,,0.027902,,,,
6,(the offspring),(red hot chili peppers),,,0.025411,,,,
7,(red hot chili peppers),(the offspring),,,0.025411,,,,
8,(pink floyd),(the beatles),,,0.025411,,,,
9,(the beatles),(pink floyd),,,0.025411,,,,
