<a href="https://colab.research.google.com/github/amadousysada/content-recommendation/blob/main/analyse-exploratoire.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
import time
init_time = time.time()

import os
import scipy
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
import sklearn
# Report the versions of the main libraries imported above, one per line.
print(
    "Versions:",
    f"  - Numpy        =>: {np.__version__}",
    f"  - Pandas   =>: {pd.__version__}",
    f"  - sklearn   =>: {sklearn.__version__}",
    sep="\n",
)

Versions:
  - Numpy        =>: 2.0.2
  - Pandas   =>: 2.2.2
  - sklearn   =>: 1.6.1


In [3]:
# Root folder holding the dataset files. The default is the Google Drive
# location used when the notebook runs in Colab; set the CONTENT_RECO_DATA_DIR
# environment variable to run the notebook against another copy of the data.
base_path = os.environ.get(
    "CONTENT_RECO_DATA_DIR",
    "/content/drive/MyDrive/data/p10_content_recommendation",
)

# Individual dataset artefacts, all resolved relative to `base_path`.
ARTICLE_EMBEDDINGS_FILE = os.path.join(base_path, "articles_embeddings.pickle")
ARTICLE_METADATA_FILE = os.path.join(base_path, "articles_metadata.csv")
CLICKS_SAMPLES_FILE = os.path.join(base_path, "clicks_sample.csv")
CLICKS_FOLDER_PATH = os.path.join(base_path, "clicks")

In [4]:
# Echo the resolved dataset paths.
print(f"ARTICLE_EMBEDDINGS_FILE: {ARTICLE_EMBEDDINGS_FILE}")
print(f"ARTICLE_METADATA_FILE: {ARTICLE_METADATA_FILE}")
print(f"CLICKS_FOLDER_PATH: {CLICKS_FOLDER_PATH}")

ARTICLE_EMBEDDINGS_FILE: /content/drive/MyDrive/data/p10_content_recommendation/articles_embeddings.pickle
ARTICLE_METADATA_FILE: /content/drive/MyDrive/data/p10_content_recommendation/articles_metadata.csv
CLICKS_FOLDER_PATH: /content/drive/MyDrive/data/p10_content_recommendation/clicks


In [5]:
# Load the article embeddings from the pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# when it comes from a trusted source.
embeddings = pd.read_pickle(ARTICLE_EMBEDDINGS_FILE)
# Last expression of the cell: display the shape of the loaded object.
embeddings.shape

(364047, 250)

In [6]:
# Read the article metadata CSV into a DataFrame.
articles_metadata = pd.read_csv(ARTICLE_METADATA_FILE)

In [7]:
# Preview the first 10 rows of the article metadata.
articles_metadata.head(10)

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
0,0,0,1513144419000,0,168
1,1,1,1405341936000,0,189
2,2,1,1408667706000,0,250
3,3,1,1408468313000,0,230
4,4,1,1407071171000,0,162
5,5,1,1407413929000,0,196
6,6,1,1409896802000,0,203
7,7,1,1412559620000,0,154
8,8,1,1414351550000,0,209
9,9,1,1412526792000,0,181


In [8]:
# Summarise the metadata columns: dtypes, non-null counts and memory usage.
articles_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364047 entries, 0 to 364046
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   article_id     364047 non-null  int64
 1   category_id    364047 non-null  int64
 2   created_at_ts  364047 non-null  int64
 3   publisher_id   364047 non-null  int64
 4   words_count    364047 non-null  int64
dtypes: int64(5)
memory usage: 13.9 MB


In [9]:
# Number of distinct category ids; dropna=False mirrors Series.unique(), which keeps NaN as a value.
print(f"Nombre total de category: {articles_metadata['category_id'].nunique(dropna=False)}")

Nombre total de category: 461


In [10]:
# Read the click sample CSV into a DataFrame.
clicks_sample = pd.read_csv(CLICKS_SAMPLES_FILE)

In [11]:
# Summarise the click sample columns: dtypes, non-null counts and memory usage.
clicks_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1883 entries, 0 to 1882
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   user_id              1883 non-null   int64
 1   session_id           1883 non-null   int64
 2   session_start        1883 non-null   int64
 3   session_size         1883 non-null   int64
 4   click_article_id     1883 non-null   int64
 5   click_timestamp      1883 non-null   int64
 6   click_environment    1883 non-null   int64
 7   click_deviceGroup    1883 non-null   int64
 8   click_os             1883 non-null   int64
 9   click_country        1883 non-null   int64
 10  click_region         1883 non-null   int64
 11  click_referrer_type  1883 non-null   int64
dtypes: int64(12)
memory usage: 176.7 KB


In [12]:
# Preview the first 10 click rows.
clicks_sample.head(10)

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,20,2
1,0,1506825423271737,1506825423000,2,68866,1506826858020,4,3,20,1,20,2
2,1,1506825426267738,1506825426000,2,235840,1506827017951,4,1,17,1,16,2
3,1,1506825426267738,1506825426000,2,96663,1506827047951,4,1,17,1,16,2
4,2,1506825435299739,1506825435000,2,119592,1506827090575,4,1,17,1,24,2
5,2,1506825435299739,1506825435000,2,30970,1506827120575,4,1,17,1,24,2
6,3,1506825442704740,1506825442000,2,236065,1506827536942,4,3,2,1,21,1
7,3,1506825442704740,1506825442000,2,236294,1506827566942,4,3,2,1,21,1
8,4,1506825528135741,1506825528000,2,48915,1506826927593,4,1,17,1,17,1
9,4,1506825528135741,1506825528000,2,44488,1506826957593,4,1,17,1,17,1


In [13]:
# Article id of the first click shown in the sample preview.
# NOTE(review): `i` is not referenced by any later visible cell — confirm it is still needed.
i=157541
# Every clicked article id of the sample, one entry per click (repeats are kept).
# NOTE(review): `article_ids` is not referenced by any later visible cell either.
article_ids = clicks_sample.click_article_id.tolist()

In [14]:
def get_article_profile(article_id):
    """Return the entry of the notebook-level ``embeddings`` indexed by ``article_id``."""
    return embeddings[article_id]

def get_article_profiles(ids):
    """Stack the profiles of several articles into a single array.

    Args:
        ids: iterable of article ids, each one looked up with
            ``get_article_profile``.

    Returns:
        The result of ``np.vstack`` over the individual profiles, in the
        order of ``ids`` (repeated ids produce repeated rows).
    """
    stacked_rows = []
    for article_id in ids:
        stacked_rows.append(get_article_profile(article_id))
    return np.vstack(stacked_rows)

def build_users_profile(person_id, interactions_indexed_df):
    """Build the L2-normalised profile of one user from the articles they clicked.

    The profile is the unweighted mean of the clicked articles' profiles,
    divided by its Euclidean norm so it can be compared with cosine similarity.

    Args:
        person_id: value matched against the ``user_id`` column.
        interactions_indexed_df: click DataFrame exposing the ``user_id`` and
            ``click_article_id`` columns.

    Returns:
        A 2-D array with a single row holding the user profile.

    Raises:
        ValueError: if ``person_id`` has no click in ``interactions_indexed_df``.
    """
    # Filter the frame handed in by the caller rather than the notebook-level
    # `clicks_sample`, so the function honours its own argument.
    clicks_person_df = interactions_indexed_df.loc[interactions_indexed_df.user_id == person_id]
    if clicks_person_df.empty:
        # Without this guard np.vstack fails on an empty list with an opaque message.
        raise ValueError(f"No clicks found for user {person_id!r}")

    user_article_profiles = get_article_profiles(clicks_person_df['click_article_id'])

    # Uniform weights: every clicked article contributes equally to the mean.
    n_clicks = user_article_profiles.shape[0]
    w = np.ones((n_clicks, 1), dtype=float) / n_clicks
    user_profile = (user_article_profiles * w).sum(axis=0, keepdims=True)

    # L2 normalisation (useful for cosine); the epsilon avoids a division by zero.
    user_profile = user_profile / (np.linalg.norm(user_profile) + 1e-12)
    return user_profile

def build_users_profiles():
    """Build one profile per distinct ``user_id`` of the notebook-level ``clicks_sample``.

    Returns:
        A dict mapping each user id to the value returned by
        ``build_users_profile`` for that user.
    """
    return {
        person_id: build_users_profile(person_id, clicks_sample)
        for person_id in clicks_sample.user_id.unique()
    }

In [15]:
# Build the profile of every user present in the click sample.
user_profiles = build_users_profiles()

In [16]:
# Number of users for which a profile was built.
len(user_profiles)

707

In [17]:
# Take the profile built for user id 50 as a worked example.
myprofile = user_profiles[50]
# Last expression of the cell: display the shape of that profile.
myprofile.shape

(1, 250)

In [18]:
# Show the profile of user 50 as a single-column table, one row per vector component.
pd.DataFrame(user_profiles[50].flatten().tolist(), columns=['vector'])

Unnamed: 0,vector
0,-0.101656
1,-0.167328
2,-0.082715
3,-0.054706
4,-0.042612
...,...
245,0.099549
246,-0.087196
247,-0.025725
248,0.025121


In [19]:
class ContentBasedRecommender:
    """Recommend the articles whose embedding is most similar to a user's profile.

    The class reads three notebook-level objects: ``articles_metadata``
    (source of the article ids), ``embeddings`` (item vectors) and
    ``user_profiles`` (one profile per user id).
    """

    MODEL_NAME = 'Content-Based'

    # Smallest number of nearest items retrieved before the ignore list is applied.
    _MIN_CANDIDATES = 1000

    def __init__(self, items_df=None):
        """Store the article ids and the optional metadata frame.

        Args:
            items_df: DataFrame with an ``article_id`` column, merged into the
                recommendations in verbose mode. May be ``None`` when verbose
                mode is never used.
        """
        # Position i of `item_ids` is used as the article id of column i of the
        # similarity matrix computed against `embeddings`.
        self.item_ids = articles_metadata.article_id.unique()
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of the model."""
        return self.MODEL_NAME

    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        """Return up to ``topn`` ``(article_id, score)`` pairs, highest score first."""
        if topn <= 0:
            # A `[-0:]` slice would select every item instead of none.
            return []
        # Cosine similarity between the user profile and all item profiles.
        cosine_similarities = cosine_similarity(user_profiles[person_id], embeddings)
        # Indices of the `topn` most similar items (ascending similarity).
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        # Pair each index with its article id and order by decreasing similarity.
        similar_items = sorted(
            [(self.item_ids[i], cosine_similarities[0, i]) for i in similar_indices],
            key=lambda x: -x[1],
        )
        return similar_items

    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Recommend the ``topn`` articles closest to the profile of ``user_id``.

        Args:
            user_id: key of the user in ``user_profiles``.
            items_to_ignore: iterable of article ids to exclude (typically the
                articles already seen). ``None`` means nothing is excluded.
            topn: maximum number of recommendations returned.
            verbose: when true, article metadata from ``items_df`` is joined
                to the result.

        Returns:
            DataFrame with ``article_id`` and ``score`` columns, plus
            ``category_id``, ``created_at_ts``, ``publisher_id`` and
            ``words_count`` in verbose mode.

        Raises:
            ValueError: in verbose mode when no ``items_df`` was supplied.
        """
        # A set gives constant-time membership tests and matches on values for
        # any iterable (list, array, Series).
        ignored = set(items_to_ignore) if items_to_ignore is not None else set()

        # Fetch enough candidates for `topn` results to survive the filtering.
        n_candidates = max(self._MIN_CANDIDATES, topn + len(ignored))
        similar_items = self._get_similar_items_to_user_profile(user_id, topn=n_candidates)

        # Drop the items the user has already interacted with.
        similar_items_filtered = [item for item in similar_items if item[0] not in ignored]

        recommendations_df = pd.DataFrame(
            similar_items_filtered, columns=['article_id', 'score']
        ).head(topn)

        if verbose:
            if self.items_df is None:
                raise ValueError('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(
                self.items_df, how='left', on='article_id'
            )[['article_id', 'score', 'category_id', 'created_at_ts', 'publisher_id', 'words_count']]

        return recommendations_df

# Instantiate the recommender with the metadata frame, which verbose mode needs to enrich results.
content_based_recommender_model = ContentBasedRecommender(articles_metadata)

In [20]:
# Articles user 50 clicked in the sample; passed as the ignore list so they are filtered out.
already_seen = clicks_sample.loc[clicks_sample.user_id==50, 'click_article_id'].tolist()
# Top-10 recommendations for user 50, joined with the article metadata (verbose=True).
content_based_recommender_model.recommend_items(user_id=50, items_to_ignore=already_seen, topn=10, verbose=True)

Unnamed: 0,article_id,score,category_id,created_at_ts,publisher_id,words_count
0,94422,0.808407,209,1470513987000,0,181
1,89932,0.785339,199,1513612286000,0,159
2,90272,0.779438,199,1516551895000,0,177
3,94255,0.777987,209,1466021096000,0,151
4,95216,0.77733,209,1486224882000,0,156
5,90735,0.774066,199,1514986582000,0,159
6,91087,0.773101,199,1512846396000,0,158
7,90275,0.772017,199,1516622525000,0,165
8,89871,0.768874,199,1514458363000,0,225
9,90172,0.767919,199,1511515999000,0,174


In [46]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

# Reduce the embeddings to 128 principal components.
# random_state is pinned; presumably this keeps the result repeatable if a
# randomized solver is selected — confirm against the scikit-learn PCA docs.
pca = PCA(n_components=128, random_state=42)
# Fit the PCA on the full embedding matrix and project it in one step.
reduced_embeddings = pca.fit_transform(embeddings)

In [44]:
# Check the shape of the reduced matrix.
reduced_embeddings.shape

(364047, 128)

In [47]:
# Rescale every row (axis=1) of the reduced embeddings to unit norm.
# NOTE(review): no `norm` argument is given, so scikit-learn's default applies — presumably L2; confirm.
norm_reduced_embeddings = normalize(reduced_embeddings, axis=1)

In [48]:
# Persist the normalised, reduced embeddings as a .npy file inside the data folder.
np.save(os.path.join(base_path, "norm_reduced_embeddings.npy"), norm_reduced_embeddings)