# Last FM hometask <br>
https://www.kaggle.com/ravichaubey1506/lastfm <br>
1. Выбрать данные по странам своей группы (совместно): <br>
    3530203_70101: Germany, Netherlands <br>
    3530203_70102: Belarus, Ukraine, Poland, Russian Federation<br>
    3530903_70301: Sweden, Finland, Norway, Denmark, Iceland<br>
    3530903_70302: Spain, Portugal, France, Italy, Belgium<br>
    
2. Попытаться найти полезные с точки зрения продвижения групп (или еще чего-нибудь) и нетривиальные правила, используя алгоритмы Apriori, FPGrowth, FPMax и всевозможные метрики. Хотя бы 5 правил.
3. Вывести эти правила в отдельных ячейках. 
4. Подумать, как можно было бы использовать полученные правила.

## Подготовка данных

In [1]:
import pandas as pd

In [2]:
# Load the Last.fm listening data. Judging by the preview below, each row
# pairs a user id with one artist plus that user's sex and country.
data = pd.read_csv("lastfm.csv")
data.head()

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany


In [3]:
# Countries assigned to group 3530903_70302.
countries = ['Spain', 'Portugal', 'France', 'Italy', 'Belgium']
# Collect the per-country slices first and concatenate once: growing a
# DataFrame with pd.concat inside a loop re-copies every row on each pass.
# Rows stay grouped country by country, in the order listed above.
data_countries = pd.concat(
    [data[data.country == country] for country in countries],
    ignore_index=True,
)
data_countries.country.value_counts()

Spain       9322
France      5962
Italy       5717
Belgium     3331
Portugal    2882
Name: country, dtype: int64

## Узнаем, какие артисты нравятся каждому пользователю

In [4]:
# Collapse the listening rows into one row per (user, country) and join that
# user's artists into a single comma-separated string.
data_by_user = (
    data_countries.groupby(['user', 'country'])['artist']
    .apply(",".join)
    .reset_index()
)
data_by_user

Unnamed: 0,user,country,artist
0,6,Portugal,"lily allen,kanye west,sigur rós,pink floyd,ste..."
1,12,Italy,"queen,the beatles,lynyrd skynyrd,neil young,th..."
2,36,Italy,"m.i.a.,fugazi,johnny cash,clint mansell,elton ..."
3,47,Italy,"pj harvey,radiohead,zero 7,einstürzende neubau..."
4,74,Spain,"in extremo,nine inch nails,billy talent,panic!..."
...,...,...,...
1414,19690,Italy,"pink floyd,the beatles,the white stripes,depec..."
1415,19693,Belgium,"air,simon & garfunkel,pixies,serge gainsbourg,..."
1416,19697,Belgium,"underworld,pendulum,the prodigy,sufjan stevens..."
1417,19704,Italy,"keane,pinback,coldplay,travis,calexico,doves,d..."


In [5]:
# data_by_user holds one row per (user, country) group, so these counts are
# the number of users in each country.
data_by_user['country'].value_counts()

Spain       506
France      327
Italy       304
Belgium     160
Portugal    122
Name: country, dtype: int64

## Подготовка данных для обработки алгоритмами (dummy encoding)

In [6]:
# Only the (user, artist) pairs matter for the item-set encoding.
for_dummy_enc = data_countries.drop(columns=['country','sex'])
# One-hot ("dummy") encode: one row per user, one boolean column per artist,
# True when the user has that artist in the data. pd.crosstab counts the
# (user, artist) pairs and "> 0" collapses the counts to flags. The result is
# a boolean frame, which the mlxtend frequent-pattern functions accept.
data_to_process = pd.crosstab(for_dummy_enc['user'], for_dummy_enc['artist']) > 0
data_to_process.columns

Index(['...and you will know us by the trail of dead', '2pac', '3 doors down',
       '30 seconds to mars', '311', '36 crazyfists', '44', '50 cent',
       '65daysofstatic', '[unknown]',
       ...
       'wilco', 'within temptation', 'wolfgang amadeus mozart', 'wu-tang clan',
       'yann tiersen', 'yeah yeah yeahs', 'yellowcard', 'yo la tengo',
       'zero 7', 'Édith piaf'],
      dtype='object', name='artist', length=994)

## Обработка данных различными алгоритмами

#### Apriori

In [7]:
from mlxtend.frequent_patterns import apriori, association_rules

# Mine frequent artist sets with Apriori, keeping sets with support >= 0.01.
# use_colnames=True reports artist names instead of column indices.
frequent_itemsets_apriori = apriori(data_to_process, min_support=0.01, use_colnames=True)

frequent_itemsets_apriori.head()

Unnamed: 0,support,itemsets
0,0.011276,(2pac)
1,0.020437,(3 doors down)
2,0.029598,(30 seconds to mars)
3,0.012685,(50 cent)
4,0.019732,(65daysofstatic)


#### FP-growth

In [8]:
from mlxtend.frequent_patterns import fpgrowth

# Mine frequent artist sets with FP-Growth at a higher support threshold
# (0.08) than the Apriori run above (0.01).
frequent_itemsets_fpgrowth = fpgrowth(data_to_process, min_support=0.08, use_colnames=True)

frequent_itemsets_fpgrowth.head()

Unnamed: 0,support,itemsets
0,0.174066,(muse)
1,0.130374,(pink floyd)
2,0.123326,(metallica)
3,0.096547,(depeche mode)
4,0.08809,(u2)


#### FPMax

In [9]:
from mlxtend.frequent_patterns import fpmax

# Mine itemsets with FPMax at min_support=0.05. Per the mlxtend docs, fpmax
# returns only maximal frequent itemsets -- confirm against the docs.
frequent_itemsets_fpmax = fpmax(data_to_process, min_support=0.05, use_colnames=True)

frequent_itemsets_fpmax

Unnamed: 0,support,itemsets
0,0.050035,(gorillaz)
1,0.050035,(pj harvey)
2,0.050035,(editors)
3,0.050740,(kaiser chiefs)
4,0.050740,(snow patrol)
...,...,...
67,0.054968,"(radiohead, pink floyd)"
68,0.070472,"(radiohead, the beatles)"
69,0.068358,"(muse, coldplay)"
70,0.080338,"(radiohead, muse)"


## Правила 

Дополнительные функции

In [10]:
def print_res(algorithm, answer):
    """Print every rule in ``answer`` as ``{antecedents}  ->  {consequents}``.

    Args:
        algorithm: Label printed in the header line (e.g. ``"Apriori"``).
        answer: DataFrame with ``antecedents`` and ``consequents`` columns
            whose cells are iterables of item names.
    """
    print('Algorithm: ' + algorithm)
    # Read the two columns by name rather than by position, so the output
    # does not depend on where they sit in the frame.
    for antecedents, consequents in zip(answer['antecedents'], answer['consequents']):
        print(set(antecedents), ' -> ', set(consequents))

#### 1 правило: 

In [11]:
def data_for_first_rule(data_for_this_rule, min_t):
    """Build the rule table for rule 1 from a frequent-itemset table.

    Named ``data_for_first_rule`` (like ``data_for_second_rule`` and
    ``data_for_fifth_rule``) so that it is not shadowed by the ``first_rule``
    filter defined in the next cell, which takes different arguments.

    Args:
        data_for_this_rule: Frequent-itemset DataFrame as returned by
            ``apriori`` / ``fpgrowth``.
        min_t: Minimum rule support passed to ``association_rules``.

    Returns:
        The DataFrame produced by ``association_rules``.
    """
    return association_rules(data_for_this_rule, metric="support", min_threshold=min_t)


data_first_rule_apriori = data_for_first_rule(frequent_itemsets_apriori, 0.005)
print(data_first_rule_apriori.head())
data_first_rule_fpgrowth = data_for_first_rule(frequent_itemsets_fpgrowth, 0.001)
print(data_first_rule_fpgrowth.head())

            antecedents           consequents  antecedent support  \
0  (30 seconds to mars)        (fall out boy)            0.029598   
1        (fall out boy)  (30 seconds to mars)            0.038760   
2  (30 seconds to mars)         (linkin park)            0.029598   
3         (linkin park)  (30 seconds to mars)            0.083862   
4  (30 seconds to mars)                (muse)            0.029598   

   consequent support   support  confidence      lift  leverage  conviction  
0            0.038760  0.010571    0.357143  9.214286  0.009424    1.495263  
1            0.029598  0.010571    0.272727  9.214286  0.009424    1.334302  
2            0.083862  0.014799    0.500000  5.962185  0.012317    1.832276  
3            0.029598  0.014799    0.176471  5.962185  0.012317    1.178345  
4            0.174066  0.012685    0.428571  2.462117  0.007533    1.445384  
   antecedents  consequents  antecedent support  consequent support   support  \
0  (radiohead)       (muse)         

In [12]:
def first_rule(data, sup, lift):
    """Select rules with a frequent antecedent and a high lift.

    Args:
        data: Rule DataFrame with ``antecedent support``, ``lift``,
            ``antecedents`` and ``consequents`` columns.
        sup: Minimum (inclusive) antecedent support.
        lift: Minimum (inclusive) lift.

    Returns:
        The ``antecedents`` and ``consequents`` columns of the matching rows.
    """
    frequent_antecedent = data['antecedent support'] >= sup
    strong_lift = data['lift'] >= lift
    return data.loc[frequent_antecedent & strong_lift, ['antecedents', 'consequents']]

"""
    Для популярной группы/исполнителя подберём похожие группы/артистов
"""
# Apriori rules: antecedent support >= 0.2 and lift >= 3.
print_res("Apriori", first_rule(data_first_rule_apriori,0.2,3))
# FP-Growth rules: antecedent support >= 0.05 and lift >= 1.5.
print_res("FPGrowth", first_rule(data_first_rule_fpgrowth,0.05,1.5))

Algorithm: Apriori
{'radiohead'}  ->  {'thom yorke'}
{'radiohead'}  ->  {'air', 'beck'}
{'radiohead'}  ->  {'air', 'nirvana'}
{'radiohead'}  ->  {'coldplay', 'antony and the johnsons'}
{'radiohead'}  ->  {'arcade fire', 'david bowie'}
{'radiohead'}  ->  {'death cab for cutie', 'arctic monkeys'}
{'radiohead'}  ->  {'portishead', 'beck'}
{'radiohead'}  ->  {'björk', 'muse'}
{'radiohead'}  ->  {'björk', 'placebo'}
{'radiohead'}  ->  {'björk', 'sigur rós'}
{'radiohead'}  ->  {'blur', 'coldplay'}
{'radiohead'}  ->  {'blur', 'franz ferdinand'}
{'radiohead'}  ->  {'eels', 'coldplay'}
{'radiohead'}  ->  {'joy division', 'coldplay'}
{'radiohead'}  ->  {'pj harvey', 'coldplay'}
{'radiohead'}  ->  {'portishead', 'coldplay'}
{'radiohead'}  ->  {'sufjan stevens', 'coldplay'}
{'radiohead'}  ->  {'the doors', 'coldplay'}
{'radiohead'}  ->  {'pink floyd', 'daft punk'}
{'radiohead'}  ->  {'explosions in the sky', 'mogwai'}
{'radiohead'}  ->  {'franz ferdinand', 'interpol'}
{'radiohead'}  ->  {'queens o

#### 2 правило:

In [13]:
def data_for_second_rule(data_for_this_rule, min_t):
    """Build a confidence-filtered rule table annotated with itemset sizes.

    Args:
        data_for_this_rule: Frequent-itemset DataFrame as returned by
            ``apriori`` / ``fpgrowth``.
        min_t: Minimum confidence passed to ``association_rules``.

    Returns:
        The ``association_rules`` DataFrame with two extra columns,
        ``antecedent_len`` and ``consequent_len``, holding the number of
        items on each side of the rule.
    """
    new_data = association_rules(data_for_this_rule, metric="confidence", min_threshold=min_t)
    # len is applied to each cell directly; no wrapping lambda is needed.
    new_data["antecedent_len"] = new_data["antecedents"].apply(len)
    new_data["consequent_len"] = new_data["consequents"].apply(len)
    return new_data

# Re-mine the FP-Growth itemsets with a much lower support threshold (0.008)
# for this rule. NOTE: this rebinds frequent_itemsets_fpgrowth, so cells that
# read it give different results depending on execution order.
frequent_itemsets_fpgrowth = fpgrowth(data_to_process, min_support=0.008, use_colnames=True)

# Apriori rules are kept at confidence >= 0.5.
data_second_rule_apriori = data_for_second_rule(frequent_itemsets_apriori,0.5)
print(data_second_rule_apriori.head())

# NOTE(review): the FP-Growth confidence threshold is 0.005, i.e. practically
# no filter, unlike the 0.5 used for Apriori -- confirm this is intended.
data_second_rule_fpgrowth = data_for_second_rule(frequent_itemsets_fpgrowth, 0.005)
print(data_second_rule_fpgrowth.head())

            antecedents    consequents  antecedent support  \
0  (30 seconds to mars)  (linkin park)            0.029598   
1      (65daysofstatic)       (mogwai)            0.019732   
2                (abba)      (madonna)            0.025370   
3                 (air)    (radiohead)            0.092319   
4       (amy macdonald)     (coldplay)            0.022551   

   consequent support   support  confidence       lift  leverage  conviction  \
0            0.083862  0.014799    0.500000   5.962185  0.012317    1.832276   
1            0.051445  0.010571    0.535714  10.413405  0.009556    2.043042   
2            0.063425  0.012685    0.500000   7.883333  0.011076    1.873150   
3            0.237491  0.048626    0.526718   2.217840  0.026701    1.611107   
4            0.188161  0.011980    0.531250   2.823385  0.007737    1.731924   

   antecedent_len  consequent_len  
0               1               1  
1               1               1  
2               1               1  
3 

In [14]:
def second_rule(data, antecedent_len):
    """Select confident rules that map many artists to a single artist.

    A rule is kept when its antecedent has more than ``antecedent_len``
    items, its consequent has exactly one item, and its confidence exceeds
    ``1 - mean_confidence / 2``, where the mean is taken over all of ``data``.

    Args:
        data: Rule DataFrame with ``confidence``, ``antecedent_len``,
            ``consequent_len``, ``antecedents`` and ``consequents`` columns.
        antecedent_len: Exclusive lower bound on the antecedent size.

    Returns:
        The ``antecedents`` and ``consequents`` columns of the matching rows.
    """
    confidence_cutoff = 1 - data['confidence'].mean() / 2
    long_antecedent = data['antecedent_len'] > antecedent_len
    single_consequent = data['consequent_len'] == 1
    confident = data['confidence'] > confidence_cutoff
    keep = long_antecedent & single_consequent & confident
    return data.loc[keep, ['antecedents', 'consequents']]

"""
 По нескольким группам определить наиболее вероятную группу, которая может понравиться пользователю
"""    
# Require more than 3 antecedent artists for the Apriori rules and more than
# 4 for the FP-Growth rules; second_rule also demands a single consequent and
# a confidence above its mean-based cutoff.
print_res("Apriori", second_rule(data_second_rule_apriori,3))
print_res("FPGrowth", second_rule(data_second_rule_fpgrowth,4))

Algorithm: Apriori
{'arctic monkeys', 'muse', 'oasis', 'coldplay'}  ->  {'bloc party'}
{'muse', 'bloc party', 'oasis', 'coldplay'}  ->  {'arctic monkeys'}
{'arctic monkeys', 'bloc party', 'oasis', 'coldplay'}  ->  {'muse'}
{'arctic monkeys', 'muse', 'bloc party', 'oasis'}  ->  {'coldplay'}
{'the killers', 'muse', 'arctic monkeys', 'coldplay'}  ->  {'bloc party'}
{'the killers', 'arctic monkeys', 'bloc party', 'coldplay'}  ->  {'muse'}
{'arctic monkeys', 'muse', 'bloc party', 'coldplay'}  ->  {'the killers'}
{'the killers', 'muse', 'bloc party', 'arctic monkeys'}  ->  {'coldplay'}
{'arctic monkeys', 'muse', 'the strokes', 'coldplay'}  ->  {'bloc party'}
{'muse', 'bloc party', 'the strokes', 'coldplay'}  ->  {'arctic monkeys'}
{'arctic monkeys', 'bloc party', 'the strokes', 'coldplay'}  ->  {'muse'}
{'arctic monkeys', 'muse', 'bloc party', 'the strokes'}  ->  {'coldplay'}
{'the killers', 'arctic monkeys', 'oasis', 'coldplay'}  ->  {'bloc party'}
{'the killers', 'bloc party', 'oasis', 'co

#### 3 правило:

In [15]:
def third_rule(data, group):
    """Select the above-average-confidence rules of one antecedent.

    Rules whose antecedent equals ``group`` are collected, their mean
    confidence is computed, and only the rules at or above that mean are
    returned.

    Args:
        data: Rule DataFrame with ``antecedents``, ``consequents`` and
            ``confidence`` columns; antecedents are sets of item names.
        group: The antecedent to look up: a set (or any iterable) of item
            names, or a single name given as a string.

    Returns:
        The ``antecedents`` and ``consequents`` columns of the selected
        rows; empty when no rule has ``group`` as its antecedent.
    """
    # A bare string would otherwise be split into characters by frozenset().
    if isinstance(group, str):
        group = [group]
    target = frozenset(group)
    # Compare cell by cell so the result does not depend on how pandas
    # broadcasts a set against an object column. astype(bool) keeps the mask
    # boolean even when ``data`` is empty.
    matches_group = data['antecedents'].map(lambda items: items == target).astype(bool)
    mean_confidence = data.loc[matches_group, 'confidence'].mean()
    selected = matches_group & (data['confidence'] >= mean_confidence)
    return data.loc[selected, ['antecedents', 'consequents']]
    
"""
 Для выбранной группы/артиста определить наиболее вероятные группы, которые могут понравиться пользователю
"""
# Antecedent to look up. third_rule compares it with the 'antecedents' column
# (presumably frozensets produced by mlxtend -- confirm).
group_or_artist = {'30 seconds to mars'}
print_res("Apriori", third_rule(data_second_rule_apriori,group_or_artist))
print_res("FPGrowth", third_rule(data_second_rule_fpgrowth,group_or_artist))

Algorithm: Apriori
{'30 seconds to mars'}  ->  {'linkin park'}
Algorithm: FPGrowth
{'30 seconds to mars'}  ->  {'muse'}
{'30 seconds to mars'}  ->  {'linkin park'}


#### 4 правило:

In [16]:
# Recompute the FPMax itemsets, now capping itemset size at 3 artists
# (max_len=3); the support threshold stays at 0.05.
frequent_itemsets_fpmax = fpmax(data_to_process, min_support=0.05,max_len=3, use_colnames=True)
frequent_itemsets_fpmax

Unnamed: 0,support,itemsets
0,0.050035,(gorillaz)
1,0.050035,(pj harvey)
2,0.050035,(editors)
3,0.050740,(kaiser chiefs)
4,0.050740,(snow patrol)
...,...,...
67,0.054968,"(radiohead, pink floyd)"
68,0.070472,"(radiohead, the beatles)"
69,0.068358,"(muse, coldplay)"
70,0.080338,"(radiohead, muse)"


In [17]:
def forth_rule(data_for_this_rule, min_t):
    """Build rules from frequent itemsets, filtering on rule support only.

    Args:
        data_for_this_rule: Frequent-itemset DataFrame (here: FPMax output).
        min_t: Minimum rule support passed to ``association_rules``.

    Returns:
        The DataFrame produced by ``association_rules``.
    """
    # support_only=True: per the mlxtend docs only the rule support is
    # computed. Presumably needed because the FPMax table lacks support
    # values for the sub-itemsets -- confirm against the docs.
    return association_rules(data_for_this_rule, metric="support",support_only=True, min_threshold=min_t) 

# Keep rules with support >= 0.06 and show only the columns that are filled.
data_forth_rule_fpmax = forth_rule(frequent_itemsets_fpmax,0.06)
data_forth_rule_fpmax.loc[:,['antecedents','consequents','support']]

Unnamed: 0,antecedents,consequents,support
0,(radiohead),(the beatles),0.070472
1,(the beatles),(radiohead),0.070472
2,(muse),(coldplay),0.068358
3,(coldplay),(muse),0.068358
4,(radiohead),(muse),0.080338
5,(muse),(radiohead),0.080338
6,(radiohead),(coldplay),0.083157
7,(coldplay),(radiohead),0.083157


In [18]:
"""
Определение наиболее часто встречающихся групп/артистов и похожих на них групп/артистов
"""
# Print every rule derived from the FPMax itemsets (rule support >= 0.06).
print_res("FPmax", data_forth_rule_fpmax)

Algorithm: FPmax
{'radiohead'}  ->  {'the beatles'}
{'the beatles'}  ->  {'radiohead'}
{'muse'}  ->  {'coldplay'}
{'coldplay'}  ->  {'muse'}
{'radiohead'}  ->  {'muse'}
{'muse'}  ->  {'radiohead'}
{'radiohead'}  ->  {'coldplay'}
{'coldplay'}  ->  {'radiohead'}


#### 5 правило:

In [19]:
def data_for_fifth_rule(data_for_this_rule, min_lift=2):
    """Build a lift-filtered rule table annotated with itemset sizes.

    Args:
        data_for_this_rule: Frequent-itemset DataFrame as returned by
            ``apriori`` / ``fpgrowth``.
        min_lift: Minimum lift passed to ``association_rules``. Defaults to
            2, the value previously hard-coded here.

    Returns:
        The ``association_rules`` DataFrame with two extra columns,
        ``antecedent_len`` and ``consequent_len``, holding the number of
        items on each side of the rule.
    """
    new_data = association_rules(data_for_this_rule, metric="lift", min_threshold=min_lift)
    # len is applied to each cell directly; no wrapping lambda is needed.
    new_data["antecedent_len"] = new_data["antecedents"].apply(len)
    new_data["consequent_len"] = new_data["consequents"].apply(len)
    return new_data

# Re-mine both itemset tables with min_support=0.03 for this rule.
# NOTE: this rebinds frequent_itemsets_fpgrowth and frequent_itemsets_apriori,
# so earlier cells that read them give different results if re-run afterwards.
frequent_itemsets_fpgrowth = fpgrowth(data_to_process, min_support=0.03, use_colnames=True)
frequent_itemsets_apriori = apriori(data_to_process, min_support=0.03, use_colnames=True)

# Rule tables filtered by lift (threshold set inside data_for_fifth_rule).
data_fifth_rule_apriori = data_for_fifth_rule(frequent_itemsets_apriori)
print(data_fifth_rule_apriori.head())

data_fifth_rule_fpgrowth = data_for_fifth_rule(frequent_itemsets_fpgrowth)
print(data_fifth_rule_fpgrowth.head())

        antecedents       consequents  antecedent support  consequent support  \
0             (air)  (massive attack)            0.092319            0.088090   
1  (massive attack)             (air)            0.088090            0.092319   
2             (air)       (radiohead)            0.092319            0.237491   
3       (radiohead)             (air)            0.237491            0.092319   
4       (radiohead)     (arcade fire)            0.237491            0.064834   

    support  confidence      lift  leverage  conviction  antecedent_len  \
0  0.033827    0.366412  4.159511  0.025694    1.439279               1   
1  0.033827    0.384000  4.159511  0.025694    1.473509               1   
2  0.048626    0.526718  2.217840  0.026701    1.611107               1   
3  0.048626    0.204748  2.217840  0.026701    1.141376               1   
4  0.033827    0.142433  2.196878  0.018429    1.090487               1   

   consequent_len  
0               1  
1               1  
2 

In [21]:
def fifth_rule(data,min_sup):
    """Select one-to-one rules that point at a popular consequent.

    Args:
        data: Rule DataFrame with ``antecedent_len``, ``consequent_len``,
            ``consequent support``, ``antecedents`` and ``consequents``
            columns.
        min_sup: Exclusive lower bound on the consequent's support.

    Returns:
        The ``antecedents`` and ``consequents`` columns of the rules whose
        two sides each hold exactly one item and whose consequent support is
        greater than ``min_sup``.
    """
    single_antecedent = data['antecedent_len'] == 1
    single_consequent = data['consequent_len'] == 1
    popular_consequent = data['consequent support'] > min_sup
    keep = single_antecedent & single_consequent & popular_consequent
    return data.loc[keep, ['antecedents', 'consequents']]

"""
 Определить достаточно популярную группу/артиста
"""    
# List single-artist -> single-artist rules whose consequent has support
# above 0.2, for both rule tables.
print_res("Apriori", fifth_rule(data_fifth_rule_apriori, 0.2))
print_res("FPGrowth", fifth_rule(data_fifth_rule_fpgrowth,0.2))

Algorithm: Apriori
{'air'}  ->  {'radiohead'}
{'arcade fire'}  ->  {'radiohead'}
{'beck'}  ->  {'radiohead'}
{'björk'}  ->  {'radiohead'}
{'blur'}  ->  {'radiohead'}
{'death cab for cutie'}  ->  {'radiohead'}
{'franz ferdinand'}  ->  {'radiohead'}
{'interpol'}  ->  {'radiohead'}
{'massive attack'}  ->  {'radiohead'}
{'oasis'}  ->  {'radiohead'}
{'portishead'}  ->  {'radiohead'}
{'queens of the stone age'}  ->  {'radiohead'}
{'sigur rós'}  ->  {'radiohead'}
{'the strokes'}  ->  {'radiohead'}
{'the white stripes'}  ->  {'radiohead'}
Algorithm: FPGrowth
{'sigur rós'}  ->  {'radiohead'}
{'portishead'}  ->  {'radiohead'}
{'air'}  ->  {'radiohead'}
{'björk'}  ->  {'radiohead'}
{'the strokes'}  ->  {'radiohead'}
{'arcade fire'}  ->  {'radiohead'}
{'the white stripes'}  ->  {'radiohead'}
{'queens of the stone age'}  ->  {'radiohead'}
{'death cab for cutie'}  ->  {'radiohead'}
{'beck'}  ->  {'radiohead'}
{'oasis'}  ->  {'radiohead'}
{'franz ferdinand'}  ->  {'radiohead'}
{'blur'}  ->  {'radiohe