In [1]:
import pandas as pd
import scipy.sparse as sp
import numpy as np
import implicit


In [2]:
def normalize_data(score):
    return (score - (-100)) / (100 - (-100))

def convert_to_original(score):
    """Convert normalized scores back to the original scale."""
    return (score * (100 - (-100))) + (-100)

In [3]:
file = "tournesol_dataset/individual_criteria_scores.csv"
df = pd.read_csv(file)
df.head()

Unnamed: 0,public_username,video,criteria,score,uncertainty,voting_right
0,0,cCi9Iwj55EU,backfire_risk,-69.58,98.44,0.8
1,0,cCi9Iwj55EU,better_habits,-77.54,98.61,0.8
2,0,cCi9Iwj55EU,diversity_inclusion,-75.38,98.48,0.8
3,0,cCi9Iwj55EU,engaging,-75.38,98.48,0.8
4,0,cCi9Iwj55EU,entertaining_relaxing,-79.25,98.57,0.8


In [4]:
df = df[df['criteria'] == 'largely_recommended']

#count number of line per user and add a column with the count
df['count'] = df.groupby('public_username')['public_username'].transform('count')

#sort by count
df = df.sort_values(by=['count'], ascending=False)

#create a new df with only the username and the count
df2 = df[['public_username', 'count']].drop_duplicates()
print(len(df2))

#count the number of user with more than 20 videos
len(df2[df2['count'] > 20])

dfML = df[df['count'] > 20]
dfML = dfML[['public_username', 'video', 'score']]
print(dfML)

6257
       public_username        video  score
214839        lpfaucon  mI8b8Zjq8-Y -74.77
215746        lpfaucon  OmB7fmi8JwY  62.38
215925        lpfaucon  Ovhec5snyk8  75.59
215918        lpfaucon  oUjfxRQd6Uc -58.73
215917        lpfaucon  OTIGYRa_WXY -26.88
...                ...          ...    ...
175088      KinoxKlark  Zbj8pMfK9Ek   0.18
175089      KinoxKlark  ZKhqVny4xSA  83.22
150871     Humainlamba  -3Dn7coSFQc -83.70
175073      KinoxKlark  CX7cKDfhkDs  91.04
46331          astroJR  NLwqzgniqpA  88.20

[35069 rows x 3 columns]


In [5]:
# Get unique usernames and videos
users = dfML['public_username'].unique()
videos = dfML['video'].unique()

print(len(users))
print(len(videos))

# Create an empty matrix
matrix = pd.DataFrame(index=users, columns=videos)

# Fill in matrix with scores
for i, row in dfML.iterrows():
    matrix.loc[row['public_username'], row['video']] = normalize_data(row['score'])
    
matrix = matrix.fillna(0)
# Print matrix
print(matrix)


373
15650
                   mI8b8Zjq8-Y  OmB7fmi8JwY  Ovhec5snyk8  oUjfxRQd6Uc  \
lpfaucon               0.12615       0.8119      0.87795      0.20635   
aidjango               0.13610       0.0000      0.00000      0.00000   
white                  0.00000       0.0000      0.00000      0.00000   
emmanuel.chambost      0.00000       0.0000      0.00000      0.00000   
le_science4all         0.00000       0.0000      0.00000      0.00000   
...                        ...          ...          ...          ...   
Pohoua                 0.00000       0.0000      0.00000      0.00000   
sam                    0.00000       0.0000      0.00000      0.00000   
Helm                   0.00000       0.0000      0.00000      0.00000   
KinoxKlark             0.00000       0.0000      0.00000      0.00000   
astroJR                0.00000       0.0000      0.00000      0.00000   

                   OTIGYRa_WXY  otFdQ-sfr4w  ot0vsLHwLO0  OSk0AUT4_8o  \
lpfaucon               0.36560      0.17

In [6]:
min_value = dfML['score'].min()
max_value = dfML['score'].max()

dfML['score']  = (dfML['score'] - min_value + 0.01) / (0.01 + max_value - min_value)

In [7]:
# Create a dictionary to map usernames to user ids
user_dict = {user_id: i for i, user_id in enumerate(dfML['public_username'].unique())}

print(user_dict)

# Create a dictionary to map videos to item ids
item_dict = {item_id: i for i, item_id in enumerate(dfML['video'].unique())}

# Create an empty matrix for scores
num_users = len(user_dict)
num_items = len(item_dict)
scores = np.empty((num_users, num_items), dtype=np.float32)
scores.fill(np.nan)


# Fill in matrix with scores
for i, row in dfML.iterrows():
    user_id = user_dict[row['public_username']]
    item_id = item_dict[row['video']]
    score = row['score']
    scores[user_id, item_id] = score

print(scores)


{'lpfaucon': 0, 'aidjango': 1, 'white': 2, 'emmanuel.chambost': 3, 'le_science4all': 4, 'biscuissec': 5, 'Zekk': 6, 'ThugFou': 7, 'Pierre M': 8, 'alexandrerfst': 9, 'Sciencecool': 10, 'thib': 11, 'bastianlouis': 12, 'pedro_rouge': 13, 'amatissart': 14, 'Pardel': 15, 'Fungus-Bob': 16, 'mserranorichez': 17, 'ABX': 18, 'LautreFrançois': 19, 'Guillaume': 20, 'Guigui220D': 21, 'jrmouraz': 22, 'from_france': 23, 'gbreteau': 24, 'Amaresh': 25, 'joleenj': 26, 'FalafelAuxCarottes': 27, 'strangery': 28, 'NatNgs': 29, 'Mutre': 30, 'Loulalie': 31, 'Jumo004': 32, 'CB': 33, 'tesseract': 34, 'DexterIzzie': 35, 'umzhefmibqevkjnslj': 36, 'Helios-火リオス': 37, 'Foebus': 38, 'leDéfaillant': 39, 'Dorsan': 40, 'samuelnihoul1': 41, 'KronosIII': 42, 'Intégralecurviligne': 43, 'Everyatis': 44, 'MiaAnge': 45, 'user7123': 46, 'megaLUS13': 47, 'F.Marchal-Bornert': 48, 'Antoine_Jamelot': 49, 'Arthur': 50, 'Riri': 51, 'wareita': 52, 'Bertrand Seguy': 53, 'DimLight12': 54, 'AntoineJ': 55, 'Marlene': 56, 'tl': 57, 'mag

In [8]:
# Fill NaN values with zeros
scores = np.nan_to_num(scores, nan=0.0)
print(scores)

[[0.11351015 0.82534903 0.8939119  ... 0.         0.         0.        ]
 [0.12383869 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.1993564  0.90216434 0.23548035]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [14]:
from scipy.sparse import csr_matrix

def train_test_split(ratings, split_count, fraction=None):
    """
    Split recommendation data into train and test sets
    
    Params
    ------
    ratings : scipy sparse csr_matrix
        Interactions between users and items.
        
    split_count : int
        Number of user-item-interactions per user to move
        from training to test set.
        
    fractions : float
        Fraction of users to split off some of their
        interactions into test set. If None, then all 
        users are considered.
    """
    # Note the ordering
    train = ratings.copy().tocoo()
    test = csr_matrix(train.shape)
    
    if fraction:
        try:
            user_index = np.random.choice(
                np.where(np.bincount(train.row) >= split_count * 2)[0], 
                replace=False,
                size=np.int32(np.floor(fraction * train.shape[0]))
            ).tolist()
        except:
            print(('Not enough users with > {} '
                  'interactions for fraction of {}')\
                  .format(2*split_count, fraction))
            raise
    else:
        user_index = range(train.shape[0])
        
    train = train.tolil()

    for user in user_index:
        test_interactions = np.random.choice(ratings.getrow(user).indices, 
                                            size=split_count, 
                                            replace=False)
        train[user, test_interactions] = 0.
        # These are just 1.0 right now
        test[user, test_interactions] = ratings[user, test_interactions]
   
    
    # Test and training are truly disjoint
    assert(train.multiply(test).nnz == 0)
    return train.tocsr(), test.tocsr(), user_index

In [15]:
# Transpose the matrix to get an item-user interaction matrix
#item_user_matrix = scores.T

# Convert the transposed matrix to a sparse CSR matrix
item_user_data = sp.csr_matrix(scores)

train_item_user_data, test_item_user_data, user_index = train_test_split(item_user_data, 5, fraction=0.2)

  self._set_arrayXarray_sparse(i, j, x)


In [16]:
from sklearn.model_selection import ParameterGrid
from implicit.evaluation import precision_at_k
from implicit.als import AlternatingLeastSquares

param_grid = {
    'factors': [100, 200, 300],
    'regularization': [0.01, 0.1, 1.0],
    'iterations': [20, 40, 60]
}

# Initialize variables to store the best parameters and the best precision
best_params = None
best_precision = 0.0

for params in ParameterGrid(param_grid):
    # Initialize ALS model with the current parameters
    model = AlternatingLeastSquares(
        factors=params['factors'],
        regularization=params['regularization'],
        iterations=params['iterations']
    )
    
    # Fit the model to the training interaction matrix
    model.fit(train_item_user_data)
    
    # Evaluate the model using precision at K
    precision = precision_at_k(model, train_item_user_data, test_item_user_data, K=5)
    
    # Print the parameters and precision for reference
    print(params, precision)
    
    # Update the best parameters if the current precision is higher
    if precision > best_precision:
        best_params = params
        best_precision = precision

# Print the best parameters and precision
print("Best Parameters:", best_params)
print("Best Precision:", best_precision)


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 100, 'iterations': 20, 'regularization': 0.01} 0.02702702702702703


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 100, 'iterations': 20, 'regularization': 0.1} 0.04594594594594595


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 100, 'iterations': 20, 'regularization': 1.0} 0.03513513513513514


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 100, 'iterations': 40, 'regularization': 0.01} 0.032432432432432434


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 100, 'iterations': 40, 'regularization': 0.1} 0.043243243243243246


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 100, 'iterations': 40, 'regularization': 1.0} 0.03783783783783784


  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 100, 'iterations': 60, 'regularization': 0.01} 0.03513513513513514


  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 100, 'iterations': 60, 'regularization': 0.1} 0.04054054054054054


  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 100, 'iterations': 60, 'regularization': 1.0} 0.04054054054054054


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 200, 'iterations': 20, 'regularization': 0.01} 0.021621621621621623


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 200, 'iterations': 20, 'regularization': 0.1} 0.021621621621621623


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 200, 'iterations': 20, 'regularization': 1.0} 0.021621621621621623


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 200, 'iterations': 40, 'regularization': 0.01} 0.021621621621621623


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 200, 'iterations': 40, 'regularization': 0.1} 0.021621621621621623


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 200, 'iterations': 40, 'regularization': 1.0} 0.01891891891891892


  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 200, 'iterations': 60, 'regularization': 0.01} 0.021621621621621623


  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 200, 'iterations': 60, 'regularization': 0.1} 0.024324324324324326


  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 200, 'iterations': 60, 'regularization': 1.0} 0.024324324324324326


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 300, 'iterations': 20, 'regularization': 0.01} 0.008108108108108109


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 300, 'iterations': 20, 'regularization': 0.1} 0.008108108108108109


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 300, 'iterations': 20, 'regularization': 1.0} 0.016216216216216217


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 300, 'iterations': 40, 'regularization': 0.01} 0.010810810810810811


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 300, 'iterations': 40, 'regularization': 0.1} 0.008108108108108109


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 300, 'iterations': 40, 'regularization': 1.0} 0.013513513513513514


  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 300, 'iterations': 60, 'regularization': 0.01} 0.008108108108108109


  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 300, 'iterations': 60, 'regularization': 0.1} 0.008108108108108109


  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

{'factors': 300, 'iterations': 60, 'regularization': 1.0} 0.021621621621621623
Best Parameters: {'factors': 100, 'iterations': 20, 'regularization': 0.1}
Best Precision: 0.04594594594594595


In [17]:
model = implicit.als.AlternatingLeastSquares(factors=100, regularization=0.1, iterations=20)
    
# Fit the model to the interaction matrix
model.fit(item_user_data)

  0%|          | 0/20 [00:00<?, ?it/s]

In [18]:
# Get recommendations for a user
userid = 0
recommended_items = model.recommend(userid, user_items=item_user_data[userid])

# Print recommended items
print(recommended_items[1])

[0.00985612 0.0081868  0.00808352 0.00623772 0.00583398 0.00533208
 0.00515628 0.00506449 0.0048351  0.00468449]


In [73]:
# Save the trained model to a file
model.save('tournesol_model_v1')

<implicit.cpu.als.AlternatingLeastSquares at 0x7fe5665645e0>

In [95]:
#load the model .npz
model = implicit.als.AlternatingLeastSquares(factors=50, regularization=0.01)
model.load("tournesol_model_v1.npz")

<implicit.cpu.als.AlternatingLeastSquares at 0x7fbd1db79220>