<div>
<table border="1" style="border-collapse: collapse; width: 100%;">
<tbody>
<tr>
<td style="width: 100%; text-align: center;" colspan="4">
<h1><em><span style="color: #808080;"><strong>PROJET 9 - </strong></span></em><em><span style="color: #808080;"><strong>Parcours Ing&eacute;nieur IA</strong></span></em></h1>
</td>
</tr>
<tr style="text-align: center;">
<td colspan="4" style="width: 100%;"><img src="https://user.oc-static.com/upload/2019/10/24/15719078448088_Capture%20d%E2%80%99e%CC%81cran%202019-10-24%20a%CC%80%2010.50.32.png" alt="Logo My Content"></td>
</tr>
<tr>
<td style="width: 100%; text-align: center;" colspan="4">
<h1><span style="color: #808080;"><strong><em>Réalisez une application de recommandation de contenu</em></strong></span></h1>
</td>
</tr>
<tr>
<td style="width: 50%; text-align: center;" colspan="2"><img src="https://consent.trustarc.com/get?name=oc_logo.png" width="250px" /></td>
<td style="width: 50%; text-align: center;" colspan="2"><img src="Microsoft-Azure-Logo.png" width="250px" /></td>
</tr>
</tbody>
</table>
</div>

<hr /><h1 style="text-align: center;"><span style="color: #666699;"> <em><strong>Modélisation des système de recommandation</strong></em></span></h1><hr />

In [135]:
import logging
import random
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from pandas.api.types import is_numeric_dtype
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import implicit
from implicit import evaluation

from scipy import sparse
pd.options.plotting.backend = "plotly"

In [3]:
DATA_PATH = "../../data/news-portal-user-interactions-by-globocom"

<h2 style="text-align: left;"><span style="color: #666699;"> <em><strong>Chargement des articles</strong></em></span></h2>

In [8]:
articles = pd.concat(
    [
        pd.read_csv(
            Path(DATA_PATH, "articles_metadata.csv"),
            parse_dates=["created_at_ts"],
            date_parser=lambda x: datetime.fromtimestamp(int(x) / 1000),
            dtype={
                "article_id": "category",
                "category_id": "category",
                "publisher_id": "category",
                "words_count": "int",
            },
        ),
        pd.DataFrame(
            pd.read_pickle(Path(DATA_PATH, "articles_embeddings.pickle")),
            columns=["embedding_" + str(i) for i in range(250)],
        ),
    ],
    axis=1,
)

articles = articles.astype({"created_at_ts": "datetime64[ns]"})

articles_sample = articles.sample(frac=0.01, random_state=1)

<h2 style="text-align: left;"><span style="color: #666699;"> <em><strong>Chargement des metadonnées articles</strong></em></span></h2>

In [7]:
articles_metadata = pd.read_csv(
    Path(DATA_PATH, "articles_metadata.csv"),
    parse_dates=["created_at_ts"],
    date_parser=lambda x: datetime.fromtimestamp(int(x) / 1000),
    dtype={
        "article_id": "category",
        "category_id": "category",
        "publisher_id": "category",
        "words_count": "int",
    },
)

articles_metadata = articles_metadata.astype({"created_at_ts": "datetime64[ns]"})

<h2 style="text-align: left;"><span style="color: #666699;"> <em><strong>Chargement des interactions Clicks</strong></em></span></h2>

In [10]:
clicks = pd.concat(
    [
        pd.read_csv(
            click_file_path,
            parse_dates=["session_start", "click_timestamp"],
            date_parser=lambda x: datetime.fromtimestamp(int(int(x) / 1000)),
            dtype={
                "user_id": "category",
                "session_id": "category",
                "session_size": "int",
                "click_article_id": "category",
                "click_environment": "category",
                "click_deviceGroup": "category",
                "click_os": "category",
                "click_country": "category",
                "click_region": "category",
                "click_referrer_type": "category",
            },
        ).replace(
            {
                "click_environment": {
                    "1": "1 - Facebook Instant Article",
                    "2": "2 - Mobile App",
                    "3": "3 - AMP (Accelerated Mobile Pages)",
                    "4": "4 - Web",
                },
                "click_deviceGroup": {
                    "1": "1 - Tablet",
                    "2": "2 - TV",
                    "3": "3 - Empty",
                    "4": "4 - Mobile",
                    "5": "5 - Desktop",
                },
                "click_os": {
                    "1": "1 - Other",
                    "2": "2 - iOS",
                    "3": "3 - Android",
                    "4": "4 - Windows Phone",
                    "5": "5 - Windows Mobile",
                    "6": "6 - Windows",
                    "7": "7 - Mac OS X",
                    "8": "8 - Mac OS",
                    "9": "9 - Samsung",
                    "10": "10 - FireHbbTV",
                    "11": "11 - ATV OS X",
                    "12": "12 - tvOS",
                    "13": "13 - Chrome OS",
                    "14": "14 - Debian",
                    "15": "15 - Symbian OS",
                    "16": "16 - BlackBerry OS",
                    "17": "17 - Firefox OS",
                    "18": "18 - Android",
                    "19": "19 - Brew MP",
                    "20": "20 - Chromecast",
                    "21": "21 - webOS",
                    "22": "22 - Gentoo",
                    "23": "23 - Solaris",
                },
            }
        )
        for click_file_path in tqdm(
            sorted(Path(DATA_PATH, "clicks").glob("clicks_hour_*.csv"))
        )
    ],
    sort=False,
    ignore_index=True,
    verify_integrity=True,
)

clicks = clicks.astype(
    {"session_start": "datetime64[ns]", "click_timestamp": "datetime64[ns]"}
)

100%|██████████| 385/385 [01:19<00:00,  4.85it/s]


<h2 style="text-align: left;"><span style="color: #666699;"> <em><strong>Fusion des DataFrames Clicks et Articles Metadata</strong></em></span></h2>

In [21]:
df_merged = clicks.merge(articles_metadata, left_on='click_article_id', right_on='article_id')#, how='right')

In [22]:
columns_to_drop = ['click_environment', 'click_deviceGroup', 'click_os', 'click_referrer_type', 'publisher_id', 'click_article_id', 'created_at_ts', 'session_id', 'session_start', 'session_size'] 

In [23]:
df_merged = df_merged.drop(columns=columns_to_drop)

<hr /><h1 style="text-align: center;"><span style="color: #666699;"> <em><strong>Content-based Filtering</strong></em></span></h1>

<h2 style="text-align: left;"><span style="color: #666699;"> <em><strong>Modèle basé sur la popularité des articles</strong></em></span></h2>

In [42]:
#most popular articles
def most_popular_article():
    most_popular_articles = df_merged['article_id'].value_counts().index.to_list()
    return most_popular_articles

In [49]:
#articles already read for a user
def article_readed(user_id):
    articles_readed = df_merged[df_merged['user_id']==user_id]['article_id'].unique().tolist()
    return articles_readed

In [50]:
# Top-n recommandations based on articles popularity
def recommandation_popular(user_id, top_n):
    most_popular_articles = most_popular_article()
    articles_readed = article_readed(user_id)
    recommandation_popular = [i for i in most_popular_articles if i not in set(articles_readed)]
    top_n = recommandation_popular[:top_n]
    
    return top_n

In [76]:
%%time
recommandation_popular(1, 5)

CPU times: user 532 ms, sys: 0 ns, total: 532 ms
Wall time: 529 ms


['160974', '272143', '336221', '234698', '123909']

<h2 style="text-align: left;"><span style="color: #666699;"> <em><strong>Modèle basé sur la catégorie des articles </strong></em></span></h2>

In [56]:
def popular_category(user_id):
    populars_category = df_merged[df_merged['user_id']==user_id].groupby(['category_id']).size().sort_values(ascending=False).index[0]
    return populars_category

In [61]:
def popular_articles(category_id):
    populars_articles = df_merged[df_merged['category_id']==category_id].groupby(['article_id']).size().sort_values(ascending=False).index.to_list()
    return populars_articles

In [100]:
def recommendations_category(user_id, top_n):
    if user_id in df_merged['user_id'].values:
        populars_category = popular_category(user_id)
        populars_articles = popular_articles(populars_category)
        articles_readed = article_readed(user_id)    
        popular_articles_for_category = [i for i in populars_articles if i not in set(articles_readed)]
        top_n = popular_articles_for_category[:top_n]
        return top_n

In [96]:
%%time
recommendations_category('1', 5)

CPU times: user 1.08 s, sys: 3.7 ms, total: 1.08 s
Wall time: 1.08 s


['160974', '162655', '158536', '156560', '160417']

<h2 style="text-align: left;"><span style="color: #666699;"> <em><strong>Modèle basé sur la similarité entre les articles </strong></em></span></h2>

In [111]:
def agg(articles):
    return articles.groupby(lambda x: True).agg(
        {
            col: "mean"
            if is_numeric_dtype(articles.dtypes[col])
            else lambda x: x.mode()[0]
            for col in articles.columns
        }
    )


def interest(user_id, clicks, articles):
    user_id = str(user_id)

    all_article_ids = clicks.query("user_id == @user_id")["click_article_id"]
    interest = agg(
        articles.query("article_id in @all_article_ids")
    ).drop(["article_id"], axis=1)

    return interest


def prepare(articles, category_id):
    articles_copy = articles.drop(["article_id", "similarity"], axis=1, errors="ignore")
    articles_copy["category_id"] = articles_copy["category_id"].apply(
        lambda x: category_id if int(x) == category_id else 0
    )
    articles_copy["created_at_ts"] = articles_copy["created_at_ts"].apply(
        lambda x: x.value
    )

    return articles_copy


def similar_articles(interest, articles, n=10):
    category_id = interest["category_id"].iloc[0]

    scaler = StandardScaler()
    articles_std = scaler.fit_transform(prepare_for_scale(articles, category_id))
    interest_std = scaler.transform(prepare_for_scale(interest, category_id))

    articles = articles.copy()
    articles["similarity"] = cosine_similarity(interest_std, articles_std)[0]

    return (
        articles.sort_values("similarity", ascending=False).iloc[:n],
        scaler,
        articles_std,
        interest_std,
    )

In [121]:
def recommendations_similar(user_id, top_n):

    interests = interest(user_id, clicks, articles)
    category_id = interests["category_id"].iloc[0]

    similar, scaler, articles_std, interest_std = similar_articles(
        interests, articles, n=5
    )

    articles_sample_std = scaler.transform(prepare(articles_sample, category_id))
    similar_std = scaler.transform(
        prepare(similar, category_id)
    )

    return similar.article_id.values.tolist()

In [123]:
%%time
recommendations_similar(1, 5)

CPU times: user 11 s, sys: 4.49 s, total: 15.5 s
Wall time: 13.9 s


['284844', '346278', '345484', '238038', '285228']

<hr /><h1 style="text-align: center;"><span style="color: #666699;"> <em><strong>Collaborative-based Filtering</strong></em></span></h1>

<h2 style="text-align: left;"><span style="color: #666699;"><em><strong>Création **Implicit Ratings**</strong></em></span></h2>

In [129]:
ratings = (
    clicks.reset_index()
    .groupby(["user_id", "click_article_id"])
    .agg(
        rating=("index", "count"),
    )
    .reset_index()
)
ratings.shape

(2950710, 3)

<h2 style="text-align: left;"><span style="color: #666699;"><em><strong>Création **Implicit Ratings CSR Matrix**</strong></em></span></h2>

In [128]:
ratings_sparse = sparse.csr_matrix(
    (
        ratings["rating"],
        (ratings["user_id"].astype("int"), ratings["click_article_id"].astype("int")),
    ),
)
ratings_sparse.shape

(322897, 364047)

<h2 style="text-align: left;"><span style="color: #666699;"><em><strong>Création Train Set & Test Set</strong></em></span></h2>

In [137]:
train_set, test_set = evaluation.train_test_split(ratings_sparse, train_percentage=0.8, random_state=None)

<hr /><h1 style="text-align: center;"><span style="color: #666699;"> <em><strong>Model based</strong></em></span></h1>

<h2 style="text-align: center;"><span style="color: #666699;"><em><strong>Alternating Least Squares Model (GPU)</strong></em></span></h2>


In [139]:
model = implicit.gpu.als.AlternatingLeastSquares()
model.fit(train_set, show_progress = True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [140]:
result = evaluation.ranking_metrics_at_k(model, train_set, test_set, K=10,
                         show_progress=True, num_threads=1)

  0%|          | 0/210272 [00:00<?, ?it/s]

In [206]:
model.recommend(16280, ratings_sparse[16280], N=5)[0].tolist()

[237524, 156355, 58580, 284547, 123757]

In [207]:
bench = pd.DataFrame(columns=pd.DataFrame(result.items())[0])
bench.index.name = 'Modèle'

In [208]:
def append_result():
    name = str(model).split(" ")[0].split(".")[-1:][0]
    bench.loc[name]= result.values()

In [209]:
append_result()
bench

Unnamed: 0_level_0,precision,map,ndcg,auc
Modèle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AlternatingLeastSquares,0.141625,0.085233,0.123204,0.590079


<h2 style="text-align: center;"><span style="color: #666699;"><em><strong>Bayesian Personalized Ranking Model (GPU)</strong></em></span></h2>


In [210]:
model = implicit.gpu.bpr.BayesianPersonalizedRanking()
model.fit(train_set, show_progress = True)

  0%|          | 0/100 [00:00<?, ?it/s]

In [212]:
model.recommend(16280, ratings_sparse[16280], N=5)[0].tolist()

[234047, 235689, 240233, 79851, 235028]

In [211]:
result = evaluation.ranking_metrics_at_k(model, train_set, test_set, K=10, num_threads=1)
append_result()
bench

  0%|          | 0/210272 [00:00<?, ?it/s]

Unnamed: 0_level_0,precision,map,ndcg,auc
Modèle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AlternatingLeastSquares,0.141625,0.085233,0.123204,0.590079
BayesianPersonalizedRanking,0.119299,0.079672,0.11229,0.583227


<h2 style="text-align: center;"><span style="color: #666699;"><em><strong>Logistic Matrix Factorization Model</strong></em></span></h2>


In [247]:
model = implicit.cpu.lmf.LogisticMatrixFactorization()
model.fit(train_set, show_progress = True)

  0%|          | 0/30 [00:00<?, ?it/s]

In [248]:
model.recommend(16280, ratings_sparse[16280], N=5)[0].tolist()

[168784, 31278, 172992, 96560, 284096]

In [249]:
result = evaluation.ranking_metrics_at_k(model, train_set, test_set, K=10, num_threads=1)
append_result()
bench

  0%|          | 0/210272 [00:00<?, ?it/s]

Unnamed: 0_level_0,precision,map,ndcg,auc
Modèle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AlternatingLeastSquares,0.141625,0.085233,0.123204,0.590079
BayesianPersonalizedRanking,0.119299,0.079672,0.11229,0.583227
LogisticMatrixFactorization,0.028232,0.010633,0.018802,0.515468
CosineRecommender,0.207101,0.063243,0.126449,0.635363
ItemItemRecommender,0.176671,0.060964,0.114564,0.614395
TFIDFRecommender,0.216351,0.06607,0.132075,0.641347
BM25Recommender,0.025482,0.009987,0.018954,0.520581


<hr /><h1 style="text-align: center;"><span style="color: #666699;"> <em><strong>Memory based</strong></em></span></h1>

<h2 style="text-align: center;"><span style="color: #666699;"><em><strong>Nearest Neighbours with Cosine Recommender Model</strong></em></span></h2>


In [240]:
model = implicit.nearest_neighbours.CosineRecommender(K=10, num_threads=1)
model.fit(train_set.astype('double'), show_progress = True)

  0%|          | 0/364047 [00:00<?, ?it/s]

In [241]:
model.recommend(16280, train_set.astype('double'), N=5)[0].tolist()

[235689, 96755, 160940, 337735, 87231]

In [217]:
result = evaluation.ranking_metrics_at_k(model, train_set.astype('double'), test_set.astype('double'), K=10, num_threads=1)
append_result()
bench

  0%|          | 0/210272 [00:00<?, ?it/s]

Unnamed: 0_level_0,precision,map,ndcg,auc
Modèle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AlternatingLeastSquares,0.141625,0.085233,0.123204,0.590079
BayesianPersonalizedRanking,0.119299,0.079672,0.11229,0.583227
LogisticMatrixFactorization,0.027707,0.010459,0.018509,0.515394
CosineRecommender,0.207101,0.063243,0.126449,0.635363


<h2 style="text-align: center;"><span style="color: #666699;"><em><strong>Nearest Neighbours with Item/Item Recommender Model</strong></em></span></h2>


In [252]:
model = implicit.nearest_neighbours.ItemItemRecommender(K=10)
model.fit(train_set.astype('double'))

  0%|          | 0/364047 [00:00<?, ?it/s]

In [253]:
model.recommend(16280, train_set.astype('double'), N=5)[0].tolist()

[160940, 124749, 293114, 272143, 160974]

In [219]:
result = evaluation.ranking_metrics_at_k(model, train_set.astype('double'), test_set.astype('double'), K=10, num_threads=1)
append_result()
bench

  0%|          | 0/210272 [00:00<?, ?it/s]

Unnamed: 0_level_0,precision,map,ndcg,auc
Modèle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AlternatingLeastSquares,0.141625,0.085233,0.123204,0.590079
BayesianPersonalizedRanking,0.119299,0.079672,0.11229,0.583227
LogisticMatrixFactorization,0.027707,0.010459,0.018509,0.515394
CosineRecommender,0.207101,0.063243,0.126449,0.635363
ItemItemRecommender,0.176671,0.060964,0.114564,0.614395


<h2 style="text-align: center;"><span style="color: #666699;"><em><strong>Nearest Neighbours with TF/IDF Recommender Model</strong></em></span></h2>


In [254]:
model = implicit.nearest_neighbours.TFIDFRecommender(K=10, num_threads=1)
model.fit(train_set.astype('double'), show_progress = True)

  0%|          | 0/364047 [00:00<?, ?it/s]

In [255]:
model.recommend(16280, train_set.astype('double'), N=5)[0].tolist()

[158906, 96755, 337735, 160940, 87231]

In [221]:
result = evaluation.ranking_metrics_at_k(model, train_set.astype('double'), test_set.astype('double'), K=10, num_threads=1)
append_result()
bench

  0%|          | 0/210272 [00:00<?, ?it/s]

Unnamed: 0_level_0,precision,map,ndcg,auc
Modèle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AlternatingLeastSquares,0.141625,0.085233,0.123204,0.590079
BayesianPersonalizedRanking,0.119299,0.079672,0.11229,0.583227
LogisticMatrixFactorization,0.027707,0.010459,0.018509,0.515394
CosineRecommender,0.207101,0.063243,0.126449,0.635363
ItemItemRecommender,0.176671,0.060964,0.114564,0.614395
TFIDFRecommender,0.216351,0.06607,0.132075,0.641347


<h2 style="text-align: center;"><span style="color: #666699;"><em><strong>Conclusion</strong></em></span></h2>


<span style="color: #666699;"><em><strong>Le meilleur modèle au vue des resultats de la Précision et du MAP Mean Average Precision est le modèle AlternatingLeastSquares</strong></em></span>
