In [1]:
import numpy as np
import pandas as pd
import mlxtend.frequent_patterns as ml

In [2]:
# Countries whose users are kept for the analysis.
countries = [
    'Germany',
    'United States',
    'Russian Federation',
    'Finland',
    'United Kingdom',
]

# Read the raw records and keep only the rows whose `country` is in the list above.
lastfm = pd.read_csv('lastfm.csv').loc[lambda frame: frame['country'].isin(countries)]

lastfm

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany
...,...,...,...,...
289904,19715,david bowie,m,United Kingdom
289905,19715,ben harper,m,United Kingdom
289906,19715,lily allen,m,United Kingdom
289907,19715,andrew bird,m,United Kingdom


In [3]:
# Collapse each user's rows into a single ';'-separated string of artist names.
# NOTE(review): assumes no artist name itself contains ';' — TODO confirm.
grouped = (
    lastfm
    .groupby('user')['artist']
    .apply(';'.join)
)

grouped

user
1        red hot chili peppers;the black dahlia murder;...
3        devendra banhart;boards of canada;cocorosie;ap...
4        tv on the radio;tool;kyuss;dj shadow;air;a tri...
5        dream theater;ac/dc;metallica;iron maiden;bob ...
7        soundgarden;stone temple pilots;buckethead;dre...
                               ...                        
19706            enigma;pink floyd;the offspring;scorpions
19709    oasis;foo fighters;kings of leon;taylor swift;...
19710    vnv nation;nine inch nails;kmfdm;apoptygma ber...
19714    misfits;type o negative;arch enemy;red hot chi...
19715    abba;james blunt;jason mraz;amy winehouse;quee...
Name: artist, Length: 6713, dtype: object

In [4]:
# Split every user's ';'-joined string back into indicator columns (one column per artist).
grouped_bin = grouped.str.get_dummies(sep=";")
# Peek at a small window of the wide indicator matrix: first 10 users, columns 100..109.
grouped_bin.iloc[:10, 100:110]

Unnamed: 0_level_0,beastie boys,beatsteaks,beck,bee gees,behemoth,beirut,belle and sebastian,ben folds,ben folds five,ben harper
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,0,0,0,0
20,0,0,1,0,0,1,0,0,0,0
22,0,0,0,0,0,0,0,0,0,0
24,0,0,0,0,0,0,0,0,0,0


In [5]:
# Mine frequent artist sets over all users; min_support = 0.05 keeps itemsets
# present for at least 5% of the rows (users).
# str.get_dummies yields 0/1 integers, while mlxtend's apriori wants a boolean
# frame: for non-bool input it issues a DeprecationWarning about worse performance
# and possible future removal of support. Casting here leaves `grouped_bin` untouched.
freq_items = ml.apriori(grouped_bin.astype(bool), min_support = 0.05, use_colnames = True)
print("Найдено %d характерных комбинаций" % len(freq_items))
print(freq_items)
# map(len) gives the size of each itemset directly; no lambda wrapper is needed.
print("Максимальная длина набора: %d" % freq_items['itemsets'].map(len).max())

Найдено 71 характерных комбинаций
     support                  itemsets
0   0.052138                   (ac/dc)
1   0.065098                     (air)
2   0.055415             (arcade fire)
3   0.065544          (arctic monkeys)
4   0.051095            (beastie boys)
..       ...                       ...
66  0.063906              (the smiths)
67  0.080888       (the white stripes)
68  0.050201               (tom waits)
69  0.054223                      (u2)
70  0.068226  (radiohead, the beatles)

[71 rows x 2 columns]
Максимальная длина набора: 2


In [19]:
# One ';'-joined artist string per user, restricted to rows where sex == 'm'.
grouped_male = (
    lastfm.loc[lastfm['sex'].eq('m')]
    .groupby('user')['artist']
    .apply(';'.join)
)
# Indicator matrix for the male subset: one column per artist.
grouped_male_bin = grouped_male.str.get_dummies(sep=";")

grouped_male

user
3        devendra banhart;boards of canada;cocorosie;ap...
4        tv on the radio;tool;kyuss;dj shadow;air;a tri...
5        dream theater;ac/dc;metallica;iron maiden;bob ...
7        soundgarden;stone temple pilots;buckethead;dre...
9        arch enemy;strapping young lad;kreator;childre...
                               ...                        
19702    muse;finch;regina spektor;radiohead;rage again...
19705    muse;neutral milk hotel;animal collective;mode...
19709    oasis;foo fighters;kings of leon;taylor swift;...
19714    misfits;type o negative;arch enemy;red hot chi...
19715    abba;james blunt;jason mraz;amy winehouse;quee...
Name: artist, Length: 4973, dtype: object

In [20]:
# Frequent artist sets among male users (itemsets present for >= 5% of them).
# apriori wants a boolean frame; the 0/1 integers from str.get_dummies trigger
# mlxtend's DeprecationWarning about worse performance, so cast at the call site.
freq_male = ml.apriori(grouped_male_bin.astype(bool), min_support = 0.05, use_colnames = True)
print("Найдено %d характерных комбинаций" % len(freq_male))
print(freq_male)
# map(len) gives the size of each itemset directly; no lambda wrapper is needed.
print("Максимальная длина набора: %d" % freq_male['itemsets'].map(len).max())

Найдено 69 характерных комбинаций
     support                  itemsets
0   0.059924                   (ac/dc)
1   0.062136                     (air)
2   0.050271              (aphex twin)
3   0.051880             (arcade fire)
4   0.059723          (arctic monkeys)
..       ...                       ...
64  0.077619       (the white stripes)
65  0.056706               (tom waits)
66  0.053087                    (tool)
67  0.058516                      (u2)
68  0.066962  (radiohead, the beatles)

[69 rows x 2 columns]
Максимальная длина набора: 2


In [23]:
# One ';'-joined artist string per user, restricted to rows where sex == 'f'.
grouped_female = (
    lastfm.loc[lastfm['sex'].eq('f')]
    .groupby('user')['artist']
    .apply(';'.join)
)
# Indicator matrix for the female subset: one column per artist.
grouped_female_bin = grouped_female.str.get_dummies(sep=";")

grouped_female

user
1        red hot chili peppers;the black dahlia murder;...
20       of montreal;the magnetic fields;devo;talking h...
32       nick cave and the bad seeds;black flag;the gos...
33       death cab for cutie;tegan and sara;kimya dawso...
42       soundtrack;groove coverage;avril lavigne;the r...
                               ...                        
19691    nine inch nails;the smashing pumpkins;mindless...
19695    the kills;bright eyes;adam green;cocorosie;dev...
19701    placebo;animal collective;crystal castles;brok...
19706            enigma;pink floyd;the offspring;scorpions
19710    vnv nation;nine inch nails;kmfdm;apoptygma ber...
Name: artist, Length: 1740, dtype: object

In [24]:
# Frequent artist sets among female users (itemsets present for >= 5% of them).
# apriori wants a boolean frame; the 0/1 integers from str.get_dummies trigger
# mlxtend's DeprecationWarning about worse performance, so cast at the call site.
freq_female = ml.apriori(grouped_female_bin.astype(bool), min_support = 0.05, use_colnames = True)
print("Найдено %d характерных комбинаций" % len(freq_female))
print(freq_female)
# map(len) gives the size of each itemset directly; no lambda wrapper is needed.
print("Максимальная длина набора: %d" % freq_female['itemsets'].map(len).max())

Найдено 106 характерных комбинаций
      support                            itemsets
0    0.073563                               (air)
1    0.088506                     (amy winehouse)
2    0.053448                 (animal collective)
3    0.065517                       (arcade fire)
4    0.082184                    (arctic monkeys)
..        ...                                 ...
101  0.063218  (the beatles, death cab for cutie)
102  0.062069    (death cab for cutie, the shins)
103  0.071839            (radiohead, the beatles)
104  0.054598              (radiohead, the shins)
105  0.050000            (the beatles, the shins)

[106 rows x 2 columns]
Максимальная длина набора: 2


In [26]:
# Frequent artist sets for users whose country is 'Russian Federation'.
# Step 1: one ';'-joined artist string per user, then one indicator column per artist.
grouped_ru = lastfm[lastfm['country'] == 'Russian Federation'].groupby('user')['artist'].apply(';'.join)
grouped_ru_bin = grouped_ru.str.get_dummies(";")

# Step 2: apriori wants a boolean frame; the 0/1 integers from str.get_dummies
# trigger mlxtend's DeprecationWarning about worse performance, so cast here.
freq_ru = ml.apriori(grouped_ru_bin.astype(bool), min_support = 0.05, use_colnames = True)
print("Найдено %d характерных комбинаций" % len(freq_ru))
print(freq_ru)
# map(len) gives the size of each itemset directly; no lambda wrapper is needed.
print("Максимальная длина набора: %d" % freq_ru['itemsets'].map(len).max())

Найдено 43 характерных комбинаций
     support                 itemsets
0   0.056051     (30 seconds to mars)
1   0.054777           (apocalyptica)
2   0.056051         (arctic monkeys)
3   0.062420          (avril lavigne)
4   0.095541               (coldplay)
5   0.104459           (depeche mode)
6   0.050955                 (enigma)
7   0.056051            (evanescence)
8   0.071338        (franz ferdinand)
9   0.052229    (god is an astronaut)
10  0.054777              (green day)
11  0.052229              (in flames)
12  0.059873      (infected mushroom)
13  0.056051                   (koЯn)
14  0.059873            (limp bizkit)
15  0.113376            (linkin park)
16  0.062420                (madonna)
17  0.063694         (marilyn manson)
18  0.070064         (massive attack)
19  0.100637              (metallica)
20  0.096815                   (moby)
21  0.124841                   (muse)
22  0.052229             (nickelback)
23  0.071338              (nightwish)
24  0.118471    

In [29]:
# Frequent artist sets for users whose country is 'United Kingdom'.
# Step 1: one ';'-joined artist string per user, then one indicator column per artist.
grouped_uk = lastfm[lastfm['country'] == 'United Kingdom'].groupby('user')['artist'].apply(';'.join)
grouped_uk_bin = grouped_uk.str.get_dummies(";")

# Step 2: apriori wants a boolean frame; the 0/1 integers from str.get_dummies
# trigger mlxtend's DeprecationWarning about worse performance, so cast here.
freq_uk = ml.apriori(grouped_uk_bin.astype(bool), min_support = 0.05, use_colnames = True)
print("Найдено %d характерных комбинаций" % len(freq_uk))
print(freq_uk)
# map(len) gives the size of each itemset directly; no lambda wrapper is needed.
print("Максимальная длина набора: %d" % freq_uk['itemsets'].map(len).max())

Найдено 115 характерных комбинаций
      support                    itemsets
0    0.087580                       (air)
1    0.074045             (amy winehouse)
2    0.062102                (aphex twin)
3    0.086783               (arcade fire)
4    0.119427            (arctic monkeys)
..        ...                         ...
110  0.056529  (radiohead, kings of leon)
111  0.060510           (muse, radiohead)
112  0.050159         (muse, the beatles)
113  0.082803    (radiohead, the beatles)
114  0.052548     (the smiths, radiohead)

[115 rows x 2 columns]
Максимальная длина набора: 2


In [30]:
# Frequent artist sets for users whose country is 'United States'.
# Step 1: one ';'-joined artist string per user, then one indicator column per artist.
grouped_us = lastfm[lastfm['country'] == 'United States'].groupby('user')['artist'].apply(';'.join)
grouped_us_bin = grouped_us.str.get_dummies(";")

# Step 2: apriori wants a boolean frame; the 0/1 integers from str.get_dummies
# trigger mlxtend's DeprecationWarning about worse performance, so cast here.
freq_us = ml.apriori(grouped_us_bin.astype(bool), min_support = 0.05, use_colnames = True)
print("Найдено %d характерных комбинаций" % len(freq_us))
print(freq_us)
# map(len) gives the size of each itemset directly; no lambda wrapper is needed.
print("Максимальная длина набора: %d" % freq_us['itemsets'].map(len).max())

Найдено 106 характерных комбинаций
      support                     itemsets
0    0.058176                        (air)
1    0.059552                (andrew bird)
2    0.079862          (animal collective)
3    0.074010                (arcade fire)
4    0.067126               (beastie boys)
..        ...                          ...
101  0.052668    (the beatles, pink floyd)
102  0.056454  (sufjan stevens, radiohead)
103  0.100172     (radiohead, the beatles)
104  0.058520       (radiohead, the shins)
105  0.055766     (the beatles, the shins)

[106 rows x 2 columns]
Максимальная длина набора: 2


In [33]:
# Frequent artist sets for users whose country is 'Finland'.
# Step 1: one ';'-joined artist string per user, then one indicator column per artist.
grouped_fi = lastfm[lastfm['country'] == 'Finland'].groupby('user')['artist'].apply(';'.join)
grouped_fi_bin = grouped_fi.str.get_dummies(";")

# Step 2: apriori wants a boolean frame; the 0/1 integers from str.get_dummies
# trigger mlxtend's DeprecationWarning about worse performance, so cast here.
freq_fi = ml.apriori(grouped_fi_bin.astype(bool), min_support = 0.05, use_colnames = True)
print("Найдено %d характерных комбинаций" % len(freq_fi))
print(freq_fi)
# map(len) gives the size of each itemset directly; no lambda wrapper is needed.
print("Максимальная длина набора: %d" % freq_fi['itemsets'].map(len).max())

Найдено 118 характерных комбинаций
      support                       itemsets
0    0.050980                      (50 cent)
1    0.103922                        (ac/dc)
2    0.056863                          (air)
3    0.060784                         (akon)
4    0.111765                     (amorphis)
..        ...                            ...
113  0.050980         (rammstein, metallica)
114  0.050980    (metallica, sonata arctica)
115  0.066667  (metallica, system of a down)
116  0.072549    (sonata arctica, nightwish)
117  0.058824  (nightwish, system of a down)

[118 rows x 2 columns]
Максимальная длина набора: 2


In [34]:
# Frequent artist sets for users whose country is 'Germany'.
# Step 1: one ';'-joined artist string per user, then one indicator column per artist.
grouped_de = lastfm[lastfm['country'] == 'Germany'].groupby('user')['artist'].apply(';'.join)
grouped_de_bin = grouped_de.str.get_dummies(";")

# Step 2: apriori wants a boolean frame; the 0/1 integers from str.get_dummies
# trigger mlxtend's DeprecationWarning about worse performance, so cast here.
freq_de = ml.apriori(grouped_de_bin.astype(bool), min_support = 0.05, use_colnames = True)
print("Найдено %d характерных комбинаций" % len(freq_de))
print(freq_de)
# map(len) gives the size of each itemset directly; no lambda wrapper is needed.
print("Максимальная длина набора: %d" % freq_de['itemsets'].map(len).max())

Найдено 82 характерных комбинаций
     support                            itemsets
0   0.065235                      (3 doors down)
1   0.050119                (30 seconds to mars)
2   0.065235                         ([unknown])
3   0.070804                             (ac/dc)
4   0.076372                               (air)
..       ...                                 ...
77  0.050915            (linkin park, die Ärzte)
78  0.056484              (rammstein, die Ärzte)
79  0.054893  (red hot chili peppers, die Ärzte)
80  0.056484       (die Ärzte, system of a down)
81  0.053302       (rammstein, system of a down)

[82 rows x 2 columns]
Максимальная длина набора: 2
