In [1]:
import pandas as pd
import numpy as numpy
import sys
import mlxtend

In [2]:
# Read the Last.fm listening records from the working directory and preview them.
data = pd.read_csv("lastfm.csv")
data.head(n=20)

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany
5,1,schandmaul,f,Germany
6,1,edguy,f,Germany
7,1,jack johnson,f,Germany
8,1,eluveitie,f,Germany
9,1,the killers,f,Germany


In [3]:
# Restrict the records to listeners from the three countries under study.
slav_countries = ['Russian Federation', 'Belarus', 'Poland']
data_slav = data[data['country'].isin(slav_countries)]
data_slav.head(n=10)

Unnamed: 0,user,artist,sex,country
547,39,britney spears,m,Russian Federation
548,39,a-ha,m,Russian Federation
549,39,joss stone,m,Russian Federation
550,39,christina aguilera,m,Russian Federation
551,39,hilary duff,m,Russian Federation
552,39,katie melua,m,Russian Federation
553,39,katy perry,m,Russian Federation
554,39,norah jones,m,Russian Federation
555,39,september,m,Russian Federation
556,39,shakira,m,Russian Federation


In [4]:
# Print column dtypes, non-null counts and memory usage of the filtered rows.
data_slav.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29609 entries, 547 to 289869
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user     29609 non-null  int64 
 1   artist   29609 non-null  object
 2   sex      29609 non-null  object
 3   country  29609 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.1+ MB


In [5]:
# Collapse the one-row-per-(user, artist) records into one row per listener:
# each listener's artists are concatenated into a single comma-separated string.
# NOTE(review): an artist name that itself contains a comma would be split apart
# by any later comma-based parsing of this column - confirm none exist.
artists_per_user = data_slav.groupby(['user', 'country', 'sex'])['artist'].agg(','.join)
data_slav_by_user = artists_per_user.reset_index()
data_slav_by_user

Unnamed: 0,user,country,sex,artist
0,39,Russian Federation,m,"britney spears,a-ha,joss stone,christina aguil..."
1,50,Russian Federation,f,"the ataris,yann tiersen,the smashing pumpkins,..."
2,64,Poland,m,"apparat,drowning pool,armin van buuren,alexiso..."
3,82,Russian Federation,m,"p.o.d.,chimaira,arch enemy,scar symmetry,slipk..."
4,83,Poland,m,"black eyed peas,dido,nelly furtado"
...,...,...,...,...
1819,19702,Russian Federation,m,"muse,finch,regina spektor,radiohead,rage again..."
1820,19703,Poland,f,"the offspring,pearl jam,the smashing pumpkins,..."
1821,19706,Russian Federation,f,"enigma,pink floyd,the offspring,scorpions"
1822,19710,Russian Federation,f,"vnv nation,nine inch nails,kmfdm,apoptygma ber..."


In [6]:
# Count rows of the per-listener table by country.
data_slav_by_user['country'].value_counts()

Poland                983
Russian Federation    785
Belarus                56
Name: country, dtype: int64

In [7]:
# Expand the comma-separated artist string into one 0/1 indicator column per artist.
dummy_data_slav_by_user = data_slav_by_user['artist'].str.get_dummies(sep=',')

# Put the listener attributes and the indicator columns side by side.
# NOTE: this rebinds `data_slav`, which previously held the filtered raw rows;
# from here on it is the one-row-per-listener encoded table.
data_slav = pd.concat(
    objs=[data_slav_by_user, dummy_data_slav_by_user],
    axis='columns',
)
data_slav.head(n=10)

Unnamed: 0,user,country,sex,artist,...and you will know us by the trail of dead,2pac,3 doors down,30 seconds to mars,311,36 crazyfists,...,weezer,wilco,within temptation,wolfgang amadeus mozart,wu-tang clan,yann tiersen,yeah yeah yeahs,yellowcard,zero 7,Édith piaf
0,39,Russian Federation,m,"britney spears,a-ha,joss stone,christina aguil...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,50,Russian Federation,f,"the ataris,yann tiersen,the smashing pumpkins,...",0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
2,64,Poland,m,"apparat,drowning pool,armin van buuren,alexiso...",0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,82,Russian Federation,m,"p.o.d.,chimaira,arch enemy,scar symmetry,slipk...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,83,Poland,m,"black eyed peas,dido,nelly furtado",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,84,Russian Federation,m,"queen,radiohead,the beatles,louis armstrong,el...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,113,Poland,m,"flogging molly,rammstein,nine inch nails,board...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,129,Poland,f,"adele,placebo,tricky,röyksopp,amy winehouse,ma...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,140,Poland,m,"blank & jones,o.s.t.r.,armin van buuren,atb,sc...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,142,Poland,m,"queens of the stone age,pantera,nine inch nail...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
from mlxtend.frequent_patterns import apriori, association_rules

# Keep only the per-artist indicator columns and cast them to bool.
# mlxtend's frequent-pattern miners expect a boolean one-hot matrix; the
# indicators here are 0/1 integers, which recent mlxtend releases treat as a
# slower, discouraged input type. The cast does not change which itemsets
# are found.
artist_matrix = (
    data_slav
    .drop(columns=['user', 'country', 'artist', 'sex'])
    .astype(bool)
)

# Mine itemsets with the 0.01 minimum-support threshold, labelled by artist
# name (use_colnames=True) instead of by column position.
frequent_itemsets = apriori(artist_matrix, min_support=0.01, use_colnames=True)

# Show the most frequently occurring itemsets first.
frequent_itemsets.sort_values(by=['support'], ascending=False)

Unnamed: 0,support,itemsets
310,0.129386,(metallica)
481,0.126096,(the prodigy)
397,0.121162,(red hot chili peppers)
99,0.112939,(coldplay)
390,0.112390,(radiohead)
...,...,...
951,0.010417,"(koЯn, pink floyd)"
960,0.010417,"(koЯn, the beatles)"
970,0.010417,"(kult, radiohead)"
977,0.010417,"(kylie minogue, madonna)"


In [9]:
# Turn the Apriori itemsets into association rules, using confidence as the
# selection metric with a 0.1 threshold.
rules = association_rules(
    frequent_itemsets,
    min_threshold=0.1,
    metric="confidence",
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(nas),(2pac),0.032895,0.032346,0.012061,0.366667,11.335593,0.010997,1.527874
1,(2pac),(nas),0.032346,0.032895,0.012061,0.372881,11.335593,0.010997,1.542141
2,(snoop dogg),(2pac),0.032895,0.032346,0.011513,0.350000,10.820339,0.010449,1.488698
3,(2pac),(snoop dogg),0.032346,0.032895,0.011513,0.355932,10.820339,0.010449,1.501558
4,(30 seconds to mars),(avril lavigne),0.055373,0.057566,0.010965,0.198020,3.439887,0.007777,1.175134
...,...,...,...,...,...,...,...,...,...
1715,"(radiohead, red hot chili peppers)",(muse),0.026316,0.097588,0.010417,0.395833,4.056180,0.007849,1.493648
1716,(muse),"(radiohead, red hot chili peppers)",0.097588,0.026316,0.010417,0.106742,4.056180,0.007849,1.090036
1717,"(nirvana, system of a down)",(red hot chili peppers),0.026316,0.121162,0.010417,0.395833,3.266968,0.007228,1.454628
1718,"(system of a down, red hot chili peppers)",(nirvana),0.029057,0.105811,0.010417,0.358491,3.388014,0.007342,1.393882


In [10]:
# Rules whose lift exceeds 3 and whose confidence exceeds 0.5.
has_high_lift = rules['lift'].gt(3)
has_high_confidence = rules['confidence'].gt(0.5)
rules.loc[has_high_lift & has_high_confidence]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
40,(a perfect circle),(tool),0.039474,0.058114,0.023026,0.583333,10.037736,0.020732,2.260526
43,(above & beyond),(armin van buuren),0.020285,0.035088,0.013158,0.648649,18.486486,0.012446,2.746289
44,(above & beyond),(atb),0.020285,0.044956,0.010417,0.513514,11.422544,0.009505,1.963146
46,(above & beyond),(ferry corsten),0.020285,0.021382,0.010965,0.540541,25.280665,0.010531,2.129934
47,(ferry corsten),(above & beyond),0.021382,0.020285,0.010965,0.512821,25.280665,0.010531,2.010994
...,...,...,...,...,...,...,...,...,...
1662,"(pink floyd, nirvana)",(metallica),0.020833,0.129386,0.011513,0.552632,4.271186,0.008818,1.946078
1688,"(pink floyd, system of a down)",(metallica),0.015899,0.129386,0.010417,0.655172,5.063705,0.008360,2.524781
1697,"(placebo, nirvana)",(muse),0.022478,0.097588,0.012061,0.536585,5.498493,0.009868,1.947311
1698,"(muse, nirvana)",(placebo),0.023575,0.103070,0.012061,0.511628,4.963879,0.009632,1.836571


In [11]:
# Record how many items sit on the left-hand side of each rule
# (adds a column to `rules` in place).
rules["antecedent_len"] = rules["antecedents"].map(len)

# Rules with at least two antecedent items, confidence above 0.5 and lift above 1.
has_multi_antecedent = rules['antecedent_len'].ge(2)
is_confident = rules['confidence'].gt(0.5)
has_positive_lift = rules['lift'].gt(1)
rules.loc[has_multi_antecedent & is_confident & has_positive_lift]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
1495,"(armin van buuren, paul van dyk)",(above & beyond),0.014254,0.020285,0.010417,0.730769,36.024948,0.010128,3.638941,2
1496,"(armin van buuren, above & beyond)",(paul van dyk),0.013158,0.029605,0.010417,0.791667,26.740741,0.010027,4.657895,2
1497,"(paul van dyk, above & beyond)",(armin van buuren),0.012061,0.035088,0.010417,0.863636,24.613636,0.009993,7.076023,2
1501,"(happysad, hey)",(akurat),0.015899,0.05318,0.010965,0.689655,12.968361,0.010119,3.050865,2
1503,"(akurat, hey)",(happysad),0.016447,0.059211,0.010965,0.666667,11.259259,0.009991,2.822368,2
1507,"(myslovitz, happysad)",(akurat),0.023026,0.05318,0.01261,0.547619,10.297496,0.011385,2.092971,2
1508,"(myslovitz, akurat)",(happysad),0.020285,0.059211,0.01261,0.621622,10.498498,0.011409,2.486372,2
1513,"(pidżama porno, happysad)",(akurat),0.025219,0.05318,0.015899,0.630435,11.854774,0.014558,2.561984,2
1514,"(pidżama porno, akurat)",(happysad),0.023026,0.059211,0.015899,0.690476,11.661376,0.014536,3.039474,2
1515,"(happysad, akurat)",(pidżama porno),0.026316,0.055373,0.015899,0.604167,10.910891,0.014442,2.386427,2


In [12]:
from mlxtend.frequent_patterns import fpgrowth

# Keep only the per-artist indicator columns and cast them to bool.
# mlxtend's frequent-pattern miners expect a boolean one-hot matrix; the
# indicators here are 0/1 integers, which recent mlxtend releases treat as a
# slower, discouraged input type. The cast does not change which itemsets
# are found.
artist_matrix = (
    data_slav
    .drop(columns=['user', 'country', 'artist', 'sex'])
    .astype(bool)
)

# Mine itemsets with FP-Growth using the 0.02 minimum-support threshold,
# labelled by artist name (use_colnames=True).
items_fpgrowth = fpgrowth(artist_matrix, min_support=0.02, use_colnames=True)

# Show the most frequently occurring itemsets first.
items_fpgrowth.sort_values(by=['support'], ascending=False)

Unnamed: 0,support,itemsets
87,0.129386,(metallica)
43,0.126096,(the prodigy)
19,0.121162,(red hot chili peppers)
20,0.112939,(coldplay)
5,0.112390,(radiohead)
...,...,...
299,0.020285,"(linkin park, the offspring)"
300,0.020285,"(the prodigy, rammstein)"
311,0.020285,"(pidżama porno, kult)"
116,0.020285,(sabaton)


In [19]:
# Turn the FP-Growth itemsets into association rules, using confidence as the
# selection metric with a 0.1 threshold.
rules_fpgrowth = association_rules(
    items_fpgrowth,
    min_threshold=0.1,
    metric="confidence",
)

# Record how many items sit on the left-hand side of each rule.
rules_fpgrowth["antecedent_len"] = rules_fpgrowth["antecedents"].map(len)
rules_fpgrowth

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,(coldplay),(radiohead),0.112939,0.112390,0.038925,0.344660,3.066635,0.026232,1.354427,1
1,(radiohead),(coldplay),0.112390,0.112939,0.038925,0.346341,3.066635,0.026232,1.357072,1
2,(radiohead),(red hot chili peppers),0.112390,0.121162,0.026316,0.234146,1.932502,0.012698,1.147527,1
3,(red hot chili peppers),(radiohead),0.121162,0.112390,0.026316,0.217195,1.932502,0.012698,1.133883,1
4,(radiohead),(nirvana),0.112390,0.105811,0.024123,0.214634,2.028459,0.012231,1.138563,1
...,...,...,...,...,...,...,...,...,...,...
171,(linkin park),(papa roach),0.107456,0.042215,0.020833,0.193878,4.592632,0.016297,1.188138,1
172,(massive attack),(radiohead),0.080044,0.112390,0.021930,0.273973,2.437688,0.012934,1.222557,1
173,(radiohead),(massive attack),0.112390,0.080044,0.021930,0.195122,2.437688,0.012934,1.142976,1
174,(the prodigy),(massive attack),0.126096,0.080044,0.020833,0.165217,2.064086,0.010740,1.102031,1


In [21]:
# Rules whose left-hand side contains at least two items.
has_multi_antecedent = rules_fpgrowth['antecedent_len'].ge(2)
rules_fpgrowth.loc[has_multi_antecedent]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
