# Введение

In [1]:
import random
import torch
import numpy as np
import pandas as pd
import math

random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True

## Загрузка и предобработка данных

Загрузим данные и проведем предобраотку данных как на семинаре.

In [2]:
!wget -q -N https://www.dropbox.com/s/z8syrl5trawxs0n/articles.zip?dl=0 -O articles.zip
!unzip -o -q articles.zip

In [3]:
articles_df = pd.read_csv('articles/shared_articles.csv')
articles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED']
articles_df.head(2)

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en


In [4]:
interactions_df = pd.read_csv('articles/users_interactions.csv')
interactions_df.head(2)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US


In [5]:
interactions_df.personId = interactions_df.personId.astype(str)
interactions_df.contentId = interactions_df.contentId.astype(str)
articles_df.contentId = articles_df.contentId.astype(str)

In [6]:
# зададим словарь определяющий силу взаимодействия
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 2.5, 
   'FOLLOW': 3.0,
   'COMMENT CREATED': 4.0,  
}

interactions_df['eventStrength'] = interactions_df.eventType.apply(lambda x: event_type_strength[x])

Оставляем только тех пользователей, которые произамодействовали более чем с пятью статьями.

In [7]:
users_interactions_count_df = (
    interactions_df
    .groupby(['personId', 'contentId'])
    .first()
    .reset_index()
    .groupby('personId').size())
print('# users:', len(users_interactions_count_df))

users_with_enough_interactions_df = \
    users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['personId']]
print('# users with at least 5 interactions:',len(users_with_enough_interactions_df))

# users: 1895
# users with at least 5 interactions: 1140


Оставляем только те взаимодействия, которые относятся к отфильтрованным пользователям.

In [8]:
interactions_from_selected_users_df = interactions_df.loc[np.in1d(interactions_df.personId,
            users_with_enough_interactions_df)]

In [9]:
print('# interactions before:', interactions_df.shape)
print('# interactions after:', interactions_from_selected_users_df.shape)

# interactions before: (72312, 9)
# interactions after: (69868, 9)


Объединяем все взаимодействия пользователя по каждой статье и сглажиываем полученный результат, взяв от него логарифм.

In [10]:
def smooth_user_preference(x):
    return math.log(1+x, 2)
    
interactions_full_df = (
    interactions_from_selected_users_df
    .groupby(['personId', 'contentId']).eventStrength.sum()
    .apply(smooth_user_preference)
    .reset_index().set_index(['personId', 'contentId'])
)
interactions_full_df['last_timestamp'] = (
    interactions_from_selected_users_df
    .groupby(['personId', 'contentId'])['timestamp'].last()
)
        
interactions_full_df = interactions_full_df.reset_index()
interactions_full_df.head(5)

Unnamed: 0,personId,contentId,eventStrength,last_timestamp
0,-1007001694607905623,-5065077552540450930,1.0,1470395911
1,-1007001694607905623,-6623581327558800021,1.0,1487240080
2,-1007001694607905623,-793729620925729327,1.0,1472834892
3,-1007001694607905623,1469580151036142903,1.0,1487240062
4,-1007001694607905623,7270966256391553686,1.584963,1485994324


Разобьём выборку на обучение и контроль по времени.

In [11]:
from sklearn.model_selection import train_test_split

split_ts = 1475519530
interactions_train_df = interactions_full_df.loc[interactions_full_df.last_timestamp < split_ts].copy()
interactions_test_df = interactions_full_df.loc[interactions_full_df.last_timestamp >= split_ts].copy()

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

interactions_train_df

# interactions on Train set: 29329
# interactions on Test set: 9777


Unnamed: 0,personId,contentId,eventStrength,last_timestamp
0,-1007001694607905623,-5065077552540450930,1.0,1470395911
2,-1007001694607905623,-793729620925729327,1.0,1472834892
6,-1032019229384696495,-1006791494035379303,1.0,1469129122
7,-1032019229384696495,-1039912738963181810,1.0,1459376415
8,-1032019229384696495,-1081723567492738167,2.0,1464054093
...,...,...,...,...
39099,997469202936578234,9112765177685685246,2.0,1472479493
39100,998688566268269815,-1255189867397298842,1.0,1474567164
39101,998688566268269815,-401664538366009049,1.0,1474567449
39103,998688566268269815,6881796783400625893,1.0,1474567675


Для удобства подсчёта качества запишем данные в формате, где строка соответствует пользователю, а столбцы будут истинными метками и предсказаниями в виде списков.

In [12]:
interactions = (
    interactions_train_df
    .groupby('personId')['contentId'].agg(lambda x: list(x))
    .reset_index()
    .rename(columns={'contentId': 'true_train'})
    .set_index('personId')
)

interactions['true_test'] = (
    interactions_test_df
    .groupby('personId')['contentId'].agg(lambda x: list(x))
)

# заполнение пропусков пустыми списками
interactions.loc[pd.isnull(interactions.true_test), 'true_test'] = [
    list() for x in range(len(interactions.loc[pd.isnull(interactions.true_test), 'true_test']))]

interactions.head(1)

Unnamed: 0_level_0,true_train,true_test
personId,Unnamed: 1_level_1,Unnamed: 2_level_1
-1007001694607905623,"[-5065077552540450930, -793729620925729327]","[-6623581327558800021, 1469580151036142903, 72..."


## Библиотека LightFM

Для рекомендации Вы будете пользоваться библиотекой [LightFM](https://making.lyst.com/lightfm/docs/home.html), в которой реализованы популярные алгоритмы. Для оценивания качества рекомендации, как и на семинаре, будем пользоваться метрикой *precision@10*.

In [13]:
!pip install lightfm
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

Collecting lightfm
[?25l  Downloading https://files.pythonhosted.org/packages/5e/fe/8864d723daa8e5afc74080ce510c30f7ad52facf6a157d4b42dec83dfab4/lightfm-1.16.tar.gz (310kB)
[K     |█                               | 10kB 16.1MB/s eta 0:00:01[K     |██▏                             | 20kB 14.6MB/s eta 0:00:01[K     |███▏                            | 30kB 10.1MB/s eta 0:00:01[K     |████▎                           | 40kB 8.1MB/s eta 0:00:01[K     |█████▎                          | 51kB 5.2MB/s eta 0:00:01[K     |██████▍                         | 61kB 4.9MB/s eta 0:00:01[K     |███████▍                        | 71kB 5.5MB/s eta 0:00:01[K     |████████▌                       | 81kB 5.9MB/s eta 0:00:01[K     |█████████▌                      | 92kB 6.2MB/s eta 0:00:01[K     |██████████▋                     | 102kB 5.1MB/s eta 0:00:01[K     |███████████▋                    | 112kB 5.1MB/s eta 0:00:01[K     |████████████▊                   | 122kB 5.1MB/s eta 0:00:01[K

## Задание 1. (2 балла)

Модели в LightFM работают с разреженными матрицами. Создайте разреженные матрицы `data_train` и `data_test` (размером количество пользователей на количество статей), такие что на пересечении строки пользователя и столбца статьи стоит сила их взаимодействия, если взаимодействие было, и стоит ноль, если взаимодействия не было.

In [14]:
from scipy.sparse import csr_matrix

In [15]:
content_unique = interactions_full_df['contentId'].unique()
person_unique = interactions_full_df['personId'].unique()


In [16]:
data_train = pd.pivot_table(
    interactions_train_df,
    values='eventStrength',
    index='personId',
    columns='contentId',
    fill_value=0
)

data_test = pd.pivot_table(
    interactions_test_df,
    values='eventStrength',
    index='personId',
    columns='contentId',
    fill_value=0
)

In [17]:
data_train = pd.DataFrame(
    data_train,
    index=person_unique,
    columns=content_unique, 
).fillna(0)

data_test = pd.DataFrame(
    data_test,
    index=person_unique,
    columns=content_unique, 
).fillna(0)

In [18]:
data_train.head()

Unnamed: 0,-5065077552540450930,-6623581327558800021,-793729620925729327,1469580151036142903,7270966256391553686,8729086959762650511,-1006791494035379303,-1039912738963181810,-1081723567492738167,-1111518890369033396,-1114438937697017987,-1137602700803601559,-1151034582628982912,-1249582672736761858,-1254906787526072320,-1313614305945895108,-1415040208471067980,-1453783314552286835,-1470592927114056630,-1556169727291354289,-158184760257182670,-1590585250246572231,-1622037268576555626,-1634742667970363668,-1637159115260338032,-1654063646246197191,-1706114177222872702,-1730766821655383888,-1737937277055036780,-1753606726398516179,-1901742495252324928,-1917202688559171732,-1981734999963962468,-1995591062742965408,-2038869595290705317,-205193648629294862,-2061825422128752184,-2081760549863309770,-2083103312491589695,-2097626568191556277,...,-8686523832043452855,6240076106289531207,-9007594455502730692,4804093434821394840,2633033854118851671,8326017498198914888,-4351117979148287331,-5820943153819992582,-2377881752614744441,-2976309714446243509,-9033346036688923648,6075137928366965854,-2577148567706202814,-5581910915745827384,-6245524727898921842,-7905002887579197656,-7732246497578572511,7345394467579731315,-5482295111885355605,-6215634663594744290,-6742146164569753679,2572662123697831414,6716649347760033969,-3528088210002754978,286115624311598644,-7292285110016212249,-86767468210285959,5929055844564382383,6800965548226529308,-5345231991504930502,-6180514744351188891,-894273917655637623,1719976830095479814,7083316110921342538,-3113913063173722290,4106497696154898573,-8464215556093549753,-8202212195240926680,5518462222339671372,5937899505996968869
-1007001694607905623,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-1032019229384696495,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,3.584963,2.70044,2.0,1.0,2.321928,2.584963,0.0,5.247928,1.0,3.807355,2.807355,1.0,2.807355,1.0,2.0,2.321928,1.0,1.0,1.807355,2.321928,3.459432,1.0,2.0,2.0,3.0,2.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-108842214936804958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.321928,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-1119397949556155765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-1130272294246983140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
data_train = csr_matrix(data_train.values)
data_test = csr_matrix(data_test.values)

In [20]:
data_train.shape

(1140, 2984)

In [21]:
data_test.shape

(1140, 2984)

## Задание 2. (1 балл)

Обучите модель LightFM с `loss='warp'` и посчитайте *precision@10* на тесте.

In [22]:
model = LightFM(loss='warp')
model.fit(data_train)

<lightfm.lightfm.LightFM at 0x7f76ac595890>

In [23]:
precision_train = precision_at_k(model, data_train, k=10).mean()
precision_test = precision_at_k(model, data_test, k=10).mean()

print('Train:{}, Test:{}'.format(precision_train, precision_test))

Train:0.12329137325286865, Test:0.002851323690265417


## Задание 3. (3 балла)

При вызове метода `fit` LightFM позволяет передавать в `item_features` признаковое описание объектов. Воспользуемся этим. Будем получать признаковое описание из текста статьи в виде [TF-IDF](https://ru.wikipedia.org/wiki/TF-IDF) (можно воспользоваться `TfidfVectorizer` из scikit-learn). Создайте матрицу `feat` размером количесвто статей на размер признакового описание и обучите LightFM с `loss='warp'` и посчитайте precision@10 на тесте.

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
item = interactions_full_df[['contentId']].drop_duplicates()
item_1 = articles_df[['contentId', 'text', 'title', 'lang']].drop_duplicates()
df = item.merge(item_1, on='contentId', how='left').fillna('')
df['final'] = df.apply(lambda x: str(x['title']) + ' ' + str(x['text']), axis=1)



In [28]:
# Ваш код здесь
feat = TfidfVectorizer().fit_transform(df['final'])

model_tf_idf = LightFM(loss='warp')
model_tf_idf.fit(data_train, item_features=feat)

<lightfm.lightfm.LightFM at 0x7f76ab960710>

In [29]:
precision_train = precision_at_k(model_tf_idf, data_train, k=10, item_features=feat).mean()
precision_test = precision_at_k(model_tf_idf, data_test, k=10, item_features=feat).mean()

print('Train:{}, Test:{}'.format(precision_train, precision_test))

Train:0.12985610961914062, Test:0.0047861505299806595


## Задание 4. (2 балла)

В задании 3 мы использовали сырой текст статей. В этом задании необходимо сначала сделать предобработку текста (привести к нижнему регистру, убрать стоп слова, привести слова к номральной форме и т.д.), после чего обучите модель и оценить качество на тестовых данных.

In [30]:
import nltk
import string

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [31]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [32]:
nltk.download("stopwords")
from nltk.corpus import stopwords
stopwords_english = stopwords.words("english")
stopwords_port = stopwords.words("portuguese")

stopwords = stopwords_english + stopwords_port

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [33]:
df['lang'].value_counts()

en    2148
pt     822
         8
la       2
ja       2
es       2
Name: lang, dtype: int64

In [34]:
def process_text(text):
  wordnet_lemmatizer = WordNetLemmatizer()
  change_word = [wordnet_lemmatizer.lemmatize(word) for word in word_tokenize(text.lower()) if word not in string.punctuation]
  change_word = [word for word in change_word if (word not in stopwords) and word.isalpha()]
  return ' '.join(change_word)

In [35]:
df['final_preprocessing'] = df['final'].apply(process_text)

In [36]:
df.head()

Unnamed: 0,contentId,text,title,lang,final,final_preprocessing
0,-5065077552540450930,A AXA se manteve na liderança do ranking de ma...,Ranking das maiores seguradoras da Europa - 20...,pt,Ranking das maiores seguradoras da Europa - 20...,ranking maiores seguradoras europa sonho segur...
1,-6623581327558800021,"About a decade ago, a handful of Google's most...","Spanner, the Google Database That Mastered Tim...",en,"Spanner, the Google Database That Mastered Tim...",spanner google database mastered time open eve...
2,-793729620925729327,"Posted by Sam Thorogood , Developer Programs E...",Closure Compiler in JavaScript,en,Closure Compiler in JavaScript Posted by Sam T...,closure compiler javascript posted sam thorogo...
3,1469580151036142903,This is one of the great discussions among dev...,Don't document your code. Code your documentat...,en,Don't document your code. Code your documentat...,document code code documentation one great dis...
4,7270966256391553686,We are excited to announce the release of .NET...,Announcing .NET Core 1.0,en,Announcing .NET Core 1.0 We are excited to ann...,announcing core excited announce release core ...


In [145]:
feat_preprocessing = TfidfVectorizer().fit_transform(df['final_preprocessing'])

In [146]:
model_preprocessing = LightFM(loss='warp')
model_preprocessing.fit(data_train, item_features=feat_preprocessing)

<lightfm.lightfm.LightFM at 0x7f7663786f90>

In [147]:
precision_train = precision_at_k(model_preprocessing, data_train, k=10, item_features=feat_preprocessing).mean()
precision_test = precision_at_k(model_preprocessing, data_test, k=10, item_features=feat_preprocessing).mean()

print('Train:{}, Test:{}'.format(precision_train, precision_test))

Train:0.1403777003288269, Test:0.0046843173913657665


Улучшилось ли качество предсказания?

Немного хуже

## Задание 5. (2 балла)

Подберите гиперпараметры модели LightFM (`n_components` и др.) для улучшения качества модели.

In [61]:
model_preprocessing_better = LightFM(
                                      loss='warp', 
                                      learning_rate=0.005, 
                                      no_components=6)
model_preprocessing_better.fit(
                                data_train, 
                                item_features=feat_preprocessing,  
                                epochs=40,
                               num_threads=4)

<lightfm.lightfm.LightFM at 0x7f76a15ea4d0>

In [62]:
precision_train = precision_at_k(model_preprocessing_better, data_train, k=10, item_features=feat_preprocessing).mean()
precision_test = precision_at_k(model_preprocessing_better, data_test, k=10, item_features=feat_preprocessing).mean()

print('Train:{}, Test:{}'.format(precision_train, precision_test))

Train:0.14289569854736328, Test:0.004989816807210445


## Бонусное задание. (3 балла)

Выше мы использовали достаточно простое представление текста статьи в виде TF-IDF. В этом задании Вам нужно представить текст статьи (можно вместе с заголовком) в виде эмбеддинга полученного с помощью рекуррентной сети или трансформера (можно использовать любую предобученную модель, которая Вам нравится). Обучите модель с ипользованием этих эмеддингов и сравните результаты с предыдущими.

https://github.com/UKPLab/sentence-transformers - будем использовать отсюда код

In [134]:
!pip install -U sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/14/9d/abacb6f7bb63df39285c55bb51b6403a7fd93ac2aea48b01f6215175446c/sentence-transformers-1.1.1.tar.gz (81kB)
[K     |████████████████████████████████| 81kB 2.8MB/s 
[?25hCollecting transformers<5.0.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/b0/9e/5b80becd952d5f7250eaf8fc64b957077b12ccfe73e9c03d37146ab29712/transformers-4.6.0-py3-none-any.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 8.7MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 

In [135]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

HBox(children=(FloatProgress(value=0.0, max=305584576.0), HTML(value='')))




In [137]:
sentence_embeddings = model.encode(df['final'])

Теперь сделаем разряженную матрицу и обучим на LightFM

In [138]:
feat_trans = csr_matrix(sentence_embeddings)

In [141]:
model_trans = LightFM(loss='warp', learning_rate=0.005, no_components=50)
model_trans.fit(data_train, item_features=feat_trans, num_threads=4, epochs=30)

<lightfm.lightfm.LightFM at 0x7f7667dfc310>

In [142]:
precision_train = precision_at_k(model_trans, data_train, k=10, item_features=feat_trans).mean()
precision_test = precision_at_k(model_trans, data_test, k=10, item_features=feat_trans).mean()

print('Train:{}, Test:{}'.format(precision_train, precision_test))

Train:0.1419064849615097, Test:0.008655804209411144


Стало немного лучше:)