# Laboratorium 4 - rekomendacje dla portali informacyjnych

## Przygotowanie

 * pobierz i wypakuj dataset: https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip
   * więcej możesz poczytać tutaj: https://learn.microsoft.com/en-us/azure/open-datasets/dataset-microsoft-news
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab4`
 * zainstaluj potrzebne biblioteki:
 `pip install nltk sklearn`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

import codecs
from collections import defaultdict # mozesz uzyc zamiast zwyklego slownika, rozwaz wplyw na czas obliczen
import math
import re
from string import punctuation

import nltk
nltk.download('stopwords')
nltk.download('rslp')

from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

# mozesz uzyc do obliczania najbardziej podobnych tekstow zamiast liczenia "na piechote"
# ale pamietaj o dostosowaniu formatu danych
from sklearn.neighbors import NearestNeighbors

# na potrzeby wizualizacji
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from math import sqrt

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Main_User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\Main_User\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [2]:
# definiujemy potrzebne zmienne

PATH = './data'
STOPWORDS = set(stopwords.words('english'))

In [3]:
# wczytujemy metadane artykułów

def parse_news_entry(entry):
    news_id, category, subcategory, title, abstract = entry.split('\t')[:5]
    return {
        'news_id': news_id,
        'category': category,
        'subcategory': subcategory,
        'title': title,
        'abstract': abstract
    }

def get_news_metadata():
    with codecs.open(f'{PATH}/news.tsv', 'r', 'UTF-8') as f:
        raw = [x for x in f.read().split('\n') if x]
        parsed_entries = [parse_news_entry(entry) for entry in raw]
        return {x['news_id']: x for x in parsed_entries}

news = get_news_metadata()
news_ids = sorted(list(news.keys()))
news_indices = {x[1]: x[0] for x in enumerate(news_ids)}
print(len(news))

51282


In [4]:
def parse_history_entry(entry):
    _id, user_id, _time, history, _impressions = entry.split('\t')
    history = [x for x in history.split() if x]
    return user_id, history

def get_users_history():
    with codecs.open(f'{PATH}/behaviors.tsv', 'r', 'UTF-8') as f:
        lines = [x for x in f.read().split('\n') if x]
        entries = [parse_history_entry(x) for x in lines]
        return dict(entries)

users_history = get_users_history()
test_users = 'U53231', 'U89744', 'U10045', 'U92486', 'U70879'
print(len(users_history))

50000


## Część 2. - TF-IDF

In [5]:
# normalizujemy teksty na potrzeby dalszego przetwarzania

def preprocess_text(text):
    # zamieniamy wszystkie ciagi bialych znakow na pojedyncze spacje
    text = " ".join(text.split())
    # usuwamy znaki interpunkcyjne
    text = re.sub('\.', '', text)
    text = re.sub('!', '', text)
    text = re.sub("\?", '', text)
    text = re.sub(',', '', text)
    text = re.sub('-', '', text)
    # usuwamy wszystkie liczby
    text = re.sub(r'\d+', '', text)
    # podmieniamy wszystkie wielkie litery
    text = text.lower()
    # dzielimy na tokeny
    tokens = text.split()
    # usuwamy stopwords
    tokens = [x for x in tokens if x not in STOPWORDS]
    return tokens


def stem_texts(corpus):
    stemmer = PorterStemmer() # przetestuj rozne stemmery
    return [[stemmer.stem(word) for word in preprocess_text(text)] for text in corpus]

texts = [news[news_id]['abstract'] for news_id in news_ids]
stemmed_texts = stem_texts(texts)

In [6]:
# porownajmy teksty przed i po przetworzeniu

print(texts[2] + '\n')
print(' '.join(stemmed_texts[2]))

"I think we have a really good team, and a team that can really do some special, good things because that group is very close in there." - Brian Schmetzer

"i think realli good team team realli special good thing group close there" brian schmetzer


In [7]:
# tworzymy liste wszystkich slow w korpusie

def get_all_words_sorted(corpus):
    result = []
    for i in range(len(corpus)):
        for word in corpus[i]:
            if word not in result:
                result.append(word)
    return sorted(result)

wordlist = get_all_words_sorted(stemmed_texts)
word_indices = {x[1]: x[0] for x in enumerate(wordlist)}
print(len(wordlist))

55603


In [None]:
# obliczamy liczbe tekstow, w ktorych wystapilo kazde ze slow
# pamietaj, ze jesli slowo wystapilo w danym tekscie wielokrotnie, to liczymy je tylko raz

def get_document_frequencies(corpus, wordlist):
    # return {word -> count}
    return {word: len([1 for text in corpus if word in text]) for word in wordlist}

document_frequency = get_document_frequencies(stemmed_texts, wordlist)

In [None]:
# obliczamy liczbe wystapien kazdego slowa w kazdym tekscie

def get_term_frequencies(corpus, news_indices):
    # return {news_id -> {word -> count}}
    return {news_id: {word: text.count(word) for word in wordlist} for news_id, text in zip(news_indices, corpus)}

term_frequency = get_term_frequencies(stemmed_texts, news_indices)

In [None]:
# sprawdzmy wyniki

term_frequency[news_ids[2]]

In [None]:
# obliczamy metryke tf_idf

def calculate_tf_idf(term_frequency, document_frequency, corpus_size):
    # return {news_id -> {word -> tf_idf}}
    return {news_id: {word: term_frequency[news_id][word] * math.log(corpus_size/document_frequency[word]) for word in wordlist} for news_id in news_ids}

tf_idf = calculate_tf_idf(term_frequency, document_frequency, len(news_ids))

In [None]:
# sprawdzmy wyniki

tf_idf[news_ids[2]]

## Część 3. - Podobieństwo tekstów

In [None]:
# obliczmy odleglosc miedzy dwoma artykulami
# przetestuj rozne metryki odleglosci i wybierz najlepsza

def calculate_distance(tf_idf, id1, id2):
    tf_idf1 = tf_idf[id1]
    tf_idf2 = tf_idf[id2]
    sum = 0
    for i in range(len(wordlist)):
        if wordlist[i] not in tf_idf1 or wordlist[i] not in tf_idf2:
            continue
        else:
            sum += tf_idf1[wordlist[i]] * tf_idf2[wordlist[i]]
    magnitude1 = sqrt(sum([x**2 for x in tf_idf1.values()]))
    magnitude2 = sqrt(sum([x**2 for x in tf_idf2.values()]))
    return sum/(magnitude1*magnitude2)

calculate_distance(tf_idf, news_ids[2], news_ids[1])

In [None]:
# funkcja pomocnicza do wyswietlania artykulow
def print_news_entry(n_id, corpus):
    print(f'id: {n_id}\n\ttitle: {corpus[n_id]["title"]}\n\ttext: {corpus[n_id]["abstract"]}')

print_news_entry('N42782', news)

In [None]:
# wyznaczmy k najpodobniejszych tekstow do danego
# pamietaj o odpowiedniej kolejnosci sortowania w zaleznosci od wykorzystanej metryki
# pamietaj, zeby wsrod podobnych tekstow nie bylo danego

def get_k_most_similar_news(tf_idf, n_id, k):
    distances = [(calculate_distance(tf_idf, n_id, x), x) for x in news_ids if x != n_id]
    distances.sort(key=lambda x: x[0], reverse=True)
    return [x[1] for x in distances[:k]]

def print_k_most_similar_news(tf_idf, n_id, k, corpus):
    similar = get_k_most_similar_news(tf_idf, n_id, k)
    print_news_entry(n_id, corpus)
    print(f'\n{k} most similar:')
    for s_id in similar:
       print_news_entry(s_id, corpus)

print_k_most_similar_news(tf_idf, news_ids[42337], 5, news)

## Część 4. - Profile użytkowników

In [None]:
# oblicz srednia z wektorow tf-idf artykulow o zadanych id-kach
def calculate_average_vector(tf_idf, news_ids: list[str]) -> dict[str, float]:
    return {word: sum([tf_idf[n_id][word] for n_id in news_ids])/len(news_ids) for word in wordlist}

# wykorzystaj powyzsza funkcje, by policzyc wektor kazdego uzytkownika
def calculate_users_vectors(tf_idf, users_history) -> dict[str, list]:
    return {user_id: calculate_average_vector(tf_idf, news_ids) for user_id, news_ids in users_history.items()}

user_vectors = calculate_users_vectors(tf_idf, users_history)


In [None]:
# sprawdz wyliczony profil dla przykladowego uzytkownika
print(sorted([(k,v) for k,v in user_vectors[test_users[0]].items() if v], key=lambda x: -x[1]))

In [None]:
# skorzystajmy ze znanej juz biblioteki, by to lepiej zwizualizowac
def plot_vector(tf_idf_vector):
    wordcloud = WordCloud(random_state=42, background_color='black', colormap='Set2')
    wordcloud.generate_from_frequencies(frequencies=tf_idf_vector)
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

plot_vector(user_vectors[test_users[0]])

## Część 5. - Rekomendacje dla użytkowników

In [None]:
# wykorzystujac wektory tresci i profile uzytkownikow,
#   wygeneruj liste k artykulow najlepiej dopasowanych do uzytkownika
#   pamietaj o odsianiu artykulow, ktore uzytkownik juz kliknal

def recommend(tf_idf, user_id, news, users_history, k):
    user_vector = user_vectors[user_id]
    clicked = users_history[user_id]
    distances = [(sum([user_vector[word]*tf_idf[news_id][word] for word in wordlist])
                  / (sqrt(sum([x**2 for x in tf_idf[news_id].values()]))
                     * sqrt(sum([x**2 for x in user_vector.values()]))),
                  news_id) for news_id in news_ids if news_id not in clicked]
    distances.sort(key=lambda x: x[0], reverse=True)
    return [x[1] for x in distances[:k]]

# dla wybranego uzytkownika, korzystajac z juz zaimplementowanych funkcji,
#   pokaz jego historie, profil (wordcloud) i rekomendacje
user_id = test_users[0]
print(f'User: {user_id}')
print('History:')
for n_id in users_history[user_id]:
    print_news_entry(n_id, news)
print('\nProfile:')
wc = WordCloud(random_state=42, background_color='black', colormap='Set2')
wc.generate_from_frequencies(frequencies=user_vectors[user_id], stopwords=STOPWORDS)
plt.figure()
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()
print('\nRecommendations:')
for n_id in recommend(tf_idf, user_id, news, users_history, 5):
    print_news_entry(n_id, news)

## Część 6. - Ocena jakości

In [None]:
# jaccard index to metryka podobienstwa zbiorow, lekko ja zmodyfikujemy
# przeciecie wektorow to minimum po kazdej wspolrzednej
# unia wektorow to maksimum po kazdej wspolrzednej
# jaccard index to iloraz sum tych dwoch wartosci

def jaccard(v1, v2):
    intersection = sum([min(v1[word], v2[word]) for word in wordlist])
    union = sum([max(v1[word], v2[word]) for word in wordlist])
    return intersection/union


In [None]:
# dla kazdego uzytkownika wygeneruj k-elementowa rekomendacje
# policz jaccard index miedzy wektorem uzytkownika a srednim wektorem elementow z rekomendacji
# porownaj wyniki dla dwoch roznych k i dwoch roznych metryk podobienstwa

for k in [5, 10]:
    for user_id in test_users:
        print(f'User: {user_id}')
        recommendations = recommend(tf_idf, user_id, news, users_history, k)
        mean_vector_of_recomm = calculate_average_vector(tf_idf, recommendations)

        jaccard_result = jaccard(user_vectors[user_id], mean_vector_of_recomm)
        print(f'Jaccard index: {jaccard_result}')

        sum = 0
        for i in range(len(wordlist)):
            if wordlist[i] not in user_vectors[user_id] or wordlist[i] not in mean_vector_of_recomm:
                continue
            else:
                sum += user_vectors[user_id][wordlist[i]] * mean_vector_of_recomm[wordlist[i]]
        magnitude1 = sqrt(sum([x**2 for x in user_vectors[user_id].values()]))
        magnitude2 = sqrt(sum([x**2 for x in mean_vector_of_recomm.values()]))
        print(f'Cosine similarity: {sum/(magnitude1*magnitude2)}')