In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import time 

In [2]:
plays = pd.read_csv('user_artists.dat', sep='\t')
artists = pd.read_csv('artists.dat', sep='\t', usecols=['id','name'])

# Merge artist and user pref data
ap = pd.merge(artists, plays, how="inner", left_on="id", right_on="artistID")
ap = ap.rename(columns={"weight": "playCount"})

# Group artist by name
artist_rank = ap.groupby(['name']) \
    .agg({'userID' : 'count', 'playCount' : 'sum'}) \
    .rename(columns={"userID" : 'totalUsers', "playCount" : "totalPlays"}) \
    .sort_values(['totalPlays'], ascending=False)

artist_rank['avgPlays'] = artist_rank['totalPlays'] / artist_rank['totalUsers']
print(artist_rank)

                    totalUsers  totalPlays     avgPlays
name                                                   
Britney Spears             522     2393140  4584.559387
Depeche Mode               282     1301308  4614.567376
Lady Gaga                  611     1291387  2113.563011
Christina Aguilera         407     1058405  2600.503686
Paramore                   399      963449  2414.659148
...                        ...         ...          ...
Morris                       1           1     1.000000
Eddie Kendricks              1           1     1.000000
Excess Pressure              1           1     1.000000
My Mine                      1           1     1.000000
A.M. Architect               1           1     1.000000

[17632 rows x 3 columns]


In [3]:
# Merge into ap matrix
ap = ap.join(artist_rank, on="name", how="inner") \
    .sort_values(['playCount'], ascending=False)

# Preprocessing
pc = ap.playCount
play_count_scaled = (pc - pc.min()) / (pc.max() - pc.min())
ap = ap.assign(playCountScaled=play_count_scaled)
print(type(ap))

# Build a user-artist rating matrix 
ratings_df = ap.pivot(index='userID', columns='artistID', values='playCountScaled')
ratings = ratings_df.fillna(0).values

# Show sparsity
sparsity = float(len(ratings.nonzero()[0])) / (ratings.shape[0] * ratings.shape[1]) * 100
print("sparsity: %.2f" % sparsity)
ap.describe()
print(ap.id)

<class 'pandas.core.frame.DataFrame'>
sparsity: 0.28
2800        72
35843      792
27302      511
8152       203
26670      498
         ...  
38688      913
32955      697
71811     4988
91319    17080
63982     3201
Name: id, Length: 92834, dtype: int64


In [4]:
from scipy.sparse import csr_matrix

# Build a sparse matrix
X = csr_matrix(ratings)

n_users, n_items = ratings_df.shape
print("rating matrix shape", ratings_df.shape)

user_ids = ratings_df.index.values
artist_names = ap.sort_values("artistID")["name"].unique()

rating matrix shape (1892, 17632)


In [5]:
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from lightfm.data import Dataset

# Build data references + train test
Xcoo = X.tocoo()
data = Dataset()
data.fit(np.arange(n_users), np.arange(n_items))
interactions, weights = data.build_interactions(zip(Xcoo.row, Xcoo.col, Xcoo.data))

print(Xcoo.data.shape)
print(type(data))
print(1892*1892)
#train, test = random_train_test_split(interactions,test_percentage=0.2)
train, test = random_train_test_split(interactions)
print(train.shape)
print(test.shape)
# Ignore that (weight seems to be ignored...)
#train = train_.tocsr()
#test = test_.tocsr()
#train[train==1] = X[train==1]
#test[test==1] = X[test==1]

# To be completed...

(92198,)
<class 'lightfm.data.Dataset'>
3579664
(1892, 17632)
(1892, 17632)


In [6]:
# Train
model = LightFM(learning_rate=0.05, loss='warp')
model.fit(train, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x7fd1f0ef2370>

In [7]:
# Evaluate
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.38, test 0.13.
AUC: train 0.96, test 0.86.


In [8]:
# Predict
scores = model.predict(0, np.arange(n_items))
top_items = artist_names[np.argsort(-scores)]
print(top_items)

['Madonna' 'Björk' 'David Bowie' ... 'Big Brotherz' 'In Mourning'
 'Celestia']


# New Travail

In [9]:
def model_01(loss):
    model = LightFM(learning_rate=0.05, loss=loss)
    return model
    

In [10]:
def train_01(model,train,epoches=10,num_threads=2):
    return model.fit(train, epochs=10, num_threads=2)

In [11]:
def evaluate(model, train,test,k=10):
    train_precision = precision_at_k(model, train, k=10).mean()
    test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

    train_auc = auc_score(model, train).mean()
    test_auc = auc_score(model, test, train_interactions=train).mean()
    return train_precision, test_precision, train_auc, test_auc

In [12]:
def predict_01(model, n_items=n_items):
    
    scores = model.predict(0, np.arange(n_items))
    top_items = artist_names[np.argsort(-scores)]
    return top_items

In [13]:
loss='warp'
model=model_01(loss)
train=train

train_01(model,train,epoches=10,num_threads=2)

<lightfm.lightfm.LightFM at 0x7fd1f0ef26d0>

In [14]:
train=train
test=test
train_precision, test_precision, train_auc, test_auc = evaluate(model, train,test,k=10)

In [15]:
train_precision

0.38397878

In [16]:
test_precision

0.13107677

In [17]:
top_items=predict_01(model, n_items=n_items)

In [18]:
top_items

array(['Radiohead', 'Depeche Mode', 'The Beatles', ...,
       'Ali Farka Touré & Toumani Diabaté', 'ヴァルナ', 'The Contortionist'],
      dtype=object)

In [19]:
loss_s=['logistic','BPR','WARP','k-OS WARP']

In [20]:
# split the data
def split_data(X):
    train, test = random_train_test_split(interactions)
    return train, test


In [21]:
# construire plusieurs modèles avec différentes valeurs de max_features
def const_modeles_recom():
    modeles = dict()
    loss_s=["logistic", "warp", "bpr", "warp-kos"]
    for losss in loss_s:
        modeles[losss] = LightFM(learning_rate=0.05, loss=losss)
    return modeles

In [22]:
def eval_model(model, train,test,k=10):
    train_precision = precision_at_k(model, train, k=10).mean()
    test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

    train_auc = auc_score(model, train).mean()
    test_auc = auc_score(model, test, train_interactions=train).mean()
    return train_precision, test_precision, train_auc, test_auc



In [23]:
def predicts(model, n_items=n_items):
    
    scores = model.predict(0, np.arange(n_items))
    top_items = artist_names[np.argsort(-scores)]
    return top_items

In [24]:
def run_process(X):
    res_eval = defaultdict(list)
    res_top_items = defaultdict(list)
    resultas, names = list(), list()
    train, test = split_data(X)
    modeles= const_modeles_recom()
    
    for name, modele in modeles.items():
        start = time.time()
        modele.fit(train, epochs=10, num_threads=2)
        fit_time = time.time() - start
        
        train_precision, test_precision , train_auc, test_auc = eval_model(modele, train,test,k=10)
        res_eval[name].append([fit_time,train_precision,
                          test_precision,
                          train_auc,
                          test_auc ])
        
        top_items = predicts(modele, n_items=n_items)
        res_top_items[name]=top_items
    return res_eval, res_top_items
        

In [25]:
res_val, res_top_items = run_process(interactions)

In [26]:
res_val

defaultdict(list,
            {'logistic': [[0.5441956520080566,
               0.20238476,
               0.069263615,
               0.88758904,
               0.80862176]],
             'warp': [[0.5018823146820068,
               0.37604663,
               0.12753469,
               0.96648985,
               0.855753]],
             'bpr': [[0.4075291156768799,
               0.38373077,
               0.12449307,
               0.85664624,
               0.78207]],
             'warp-kos': [[0.8223631381988525,
               0.32888183,
               0.11654216,
               0.8879132,
               0.8185759]]})

In [27]:
res_top_items

defaultdict(list,
            {'logistic': array(['Lady Gaga', 'Britney Spears', 'Katy Perry', ...,
                    'Le Mystère des voix Bulgares', 'Pezet-Noon', 'Die Echocords'],
                   dtype=object),
             'warp': array(['Katy Perry', 'Lady Gaga', 'Depeche Mode', ..., 'Pulling Teeth',
                    'Automatikk', 'Berlins Most Wanted'], dtype=object),
             'bpr': array(['Depeche Mode', 'Björk', 'Duran Duran', ..., 'Katy Perry', 'Ke$ha',
                    'Paramore'], dtype=object),
             'warp-kos': array(['David Bowie', 'The Beatles', 'Michael Jackson', ..., 'GZA/Genius',
                    'Jadakiss', 'All For Nothing'], dtype=object)})

In [28]:
from itertools import product 
import copy 
# construire plusieurs modèles avec différentes valeurs de max_features

def const_modeles_recom_grid(param_grid,base_model):
    
    #modeles= dict()
    modeles= defaultdict(object)
    # name of the model
    
    keys, values = zip(*param_grid.items())
    for v in product(*values):
        params = dict(zip(keys, v))
        this_model = copy.deepcopy(base_model)
        name = "-".join([str(x) for x in v])
        for k, v in params.items():
            setattr(this_model, k, v)
        modeles[name]= this_model
    
    return modeles

In [29]:
# define space search
space = dict()
#space['LightFM__learning_rate'] = [0.05, 0.1, 0.5, 0.7, 0.8, 0.9]
#space['LightFM__loss'] = ["logistic", "warp", "bpr", "warp-kos"]
space['learning_rate'] = [0.05, 0.1]
space['loss'] = ["warp", "bpr"]

#### base model
base_model=LightFM()

In [30]:
from itertools import product 
import copy 
# construire plusieurs modèles avec différentes valeurs de max_features

    
modeles = dict()
    # name of the model
    
keys, values = zip(*space.items())


for v in product(*values):
    print(v)
    params = dict(zip(keys, v))
    print(params)
    this_model = copy.deepcopy(base_model)
    
    name = "-".join([str(x) for x in v])
    for k, v in params.items():
        
        setattr(this_model, k, v)
    modeles[name]= this_model
    print(modeles)
 

(0.05, 'warp')
{'learning_rate': 0.05, 'loss': 'warp'}
{'0.05-warp': <lightfm.lightfm.LightFM object at 0x7fd1a1e0e3a0>}
(0.05, 'bpr')
{'learning_rate': 0.05, 'loss': 'bpr'}
{'0.05-warp': <lightfm.lightfm.LightFM object at 0x7fd1a1e0e3a0>, '0.05-bpr': <lightfm.lightfm.LightFM object at 0x7fd1a1e0e3d0>}
(0.1, 'warp')
{'learning_rate': 0.1, 'loss': 'warp'}
{'0.05-warp': <lightfm.lightfm.LightFM object at 0x7fd1a1e0e3a0>, '0.05-bpr': <lightfm.lightfm.LightFM object at 0x7fd1a1e0e3d0>, '0.1-warp': <lightfm.lightfm.LightFM object at 0x7fd1a1e0e850>}
(0.1, 'bpr')
{'learning_rate': 0.1, 'loss': 'bpr'}
{'0.05-warp': <lightfm.lightfm.LightFM object at 0x7fd1a1e0e3a0>, '0.05-bpr': <lightfm.lightfm.LightFM object at 0x7fd1a1e0e3d0>, '0.1-warp': <lightfm.lightfm.LightFM object at 0x7fd1a1e0e850>, '0.1-bpr': <lightfm.lightfm.LightFM object at 0x7fd1a1e0e9d0>}


In [31]:
modeles=const_modeles_recom_grid(space,base_model)

In [32]:
modeles

defaultdict(object,
            {'0.05-warp': <lightfm.lightfm.LightFM at 0x7fd1a1e0a340>,
             '0.05-bpr': <lightfm.lightfm.LightFM at 0x7fd1a1e0a310>,
             '0.1-warp': <lightfm.lightfm.LightFM at 0x7fd1a1e0a3d0>,
             '0.1-bpr': <lightfm.lightfm.LightFM at 0x7fd1a1e0ad30>})

In [33]:
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import make_scorer
import copy 

def run_gridCV(modele, train, test, param_grid):
    
    keys, values = zip(*param_grid.items())
    for v in itertools.product(*values):
        params = dict(zip(keys, v))
        this_model = copy.deepcopy(base_model)
        for k, v in params.items():
            setattr(this_model, k, v)
        
        
                start = time.time()
        modele.fit(train, epochs=10, num_threads=2)
        fit_time = time.time() - start
        
        train_precision, test_precision , train_auc, test_auc = eval_model(modele, train,test,k=10)
        res_eval[name].append([fit_time,train_precision,
                          test_precision,
                          train_auc,
                          test_auc ])
        
        top_items = predicts(modele, n_items=n_items)
        res_top_items[name]=top_items
    return res_eval, res_top_items
        

        # define space search
    space = dict()
    #space['LightFM__learning_rate'] = [0.05, 0.1, 0.5, 0.7, 0.8, 0.9]
    #space['LightFM__loss'] = ["logistic", "warp", "bpr", "warp-kos"]
    space['learning_rate'] = [0.05, 0.1]
    space['loss'] = ["warp", "bpr"]
        
    train_error = make_scorer(train_score(modele,train),greater_is_better=False)    
        # define search
    modele_search = GridSearchCV(modele, space,scoring=train_error)
    
        # execute search
    start = time.time()
    result=modele_search.fit(train,epochs=10, num_threads=2)
    fit_tim = time.time() - start
        
    # get the best performing model fit on the whole training set
    best_model = result.best_estimator_
        
    return best_model

IndentationError: unexpected indent (<ipython-input-33-21598ea5edfd>, line 16)

In [None]:

def train_score(model, train):
    train_auc = auc_score(model, train).mean()
    return train_auc



def run_gridCV(modele,X):
    
    
    
    train, test = split_data(X)
        

        # define space search
    space = dict()
    #space['LightFM__learning_rate'] = [0.05, 0.1, 0.5, 0.7, 0.8, 0.9]
    #space['LightFM__loss'] = ["logistic", "warp", "bpr", "warp-kos"]
    space['learning_rate'] = [0.05, 0.1]
    space['loss'] = ["warp", "bpr"]
        
    train_error = make_scorer(train_score(modele,train),greater_is_better=True)    
        # define search
    modele_search = GridSearchCV(modele, space,scoring=train_error)
    
        # execute search
    start = time.time()
    result=modele_search.fit(train,epochs=10, num_threads=2)
    fit_tim = time.time() - start
        
    # get the best performing model fit on the whole training set
    best_model = result.best_estimator_
        
    return best_model
    
        
    

In [None]:
modele=LightFM()
LightFM().get_params().keys()


In [None]:
X=interactions
best=run_gridCV(modele,X)

In [None]:
# evaluate model on the hold out dataset
        yhat = best_model.predict(test)
    # evaluate the model
        acc = accuracy_score(y_test, yhat)
    # store the result
        outer_results.append(acc)
    # report progress
    
        print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
    print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))