In [1]:
from __future__ import division, print_function
import math, random
from collections import defaultdict, Counter
from scipy import spatial
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from operator import itemgetter, attrgetter
from pprint import pprint
from collections import OrderedDict as od
import numpy as np

In [2]:
# темы повторяющиеся у разных пользователей
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

unique_interests = sorted({interest for users_interests in users_interests for interest in users_interests})

print(unique_interests)
print(len(unique_interests))
print(len(users_interests))

['Big Data', 'C++', 'Cassandra', 'HBase', 'Hadoop', 'Haskell', 'Java', 'Mahout', 'MapReduce', 'MongoDB', 'MySQL', 'NoSQL', 'Postgres', 'Python', 'R', 'Spark', 'Storm', 'artificial intelligence', 'databases', 'decision trees', 'deep learning', 'libsvm', 'machine learning', 'mathematics', 'neural networks', 'numpy', 'pandas', 'probability', 'programming languages', 'regression', 'scikit-learn', 'scipy', 'statistics', 'statsmodels', 'support vector machines', 'theory']
36
15


# Популярность темы
Простой подход  - рекомендуем пользователю самые популярные темы

In [3]:
popular_interests = Counter(
    interest for user_interests in users_interests for interest in user_interests
).most_common()
print('Popular Interests')
print(popular_interests)

Popular Interests
[('Python', 4), ('R', 4), ('Big Data', 3), ('HBase', 3), ('Java', 3), ('statistics', 3), ('regression', 3), ('probability', 3), ('Hadoop', 2), ('Cassandra', 2), ('MongoDB', 2), ('Postgres', 2), ('scikit-learn', 2), ('statsmodels', 2), ('pandas', 2), ('machine learning', 2), ('libsvm', 2), ('C++', 2), ('neural networks', 2), ('deep learning', 2), ('artificial intelligence', 2), ('Spark', 1), ('Storm', 1), ('NoSQL', 1), ('scipy', 1), ('numpy', 1), ('decision trees', 1), ('Haskell', 1), ('programming languages', 1), ('mathematics', 1), ('theory', 1), ('Mahout', 1), ('MapReduce', 1), ('databases', 1), ('MySQL', 1), ('support vector machines', 1)]


In [4]:
def most_popular_new_interests(user_interests, max_results=5):
    suggestions = [(interest, frequency)
                   for interest, frequency in popular_interests
                   if interest not in user_interests
                  ]
    return suggestions[:max_results]

print('Most popular New Interests')
print('already like:', ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres'])
print(most_popular_new_interests(['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']))

Most popular New Interests
already like: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
[('Python', 4), ('R', 4), ('Big Data', 3), ('Java', 3), ('statistics', 3)]


Такой подход применим для пользователей, о которых ничего не известно. Так называемый "холодный старт". Почему он
плохо работает для всех?

In [5]:
# составим матрицу интересов пользователей

v = DictVectorizer(sparse=False)

d = [{item: 1 for item in ui} for ui in users_interests]
user_interest_matrix = v.fit_transform(d)

user_interest_matrix

array([[1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1.,
        0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
        1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0.,

In [6]:
# User-based  Collaborative Filtering
#
#
user_similarities = cosine_similarity(user_interest_matrix, user_interest_matrix)

# схожесть первого пользователя
user_similarities[0]


array([1.        , 0.3380617 , 0.        , 0.        , 0.        ,
       0.15430335, 0.        , 0.        , 0.18898224, 0.56694671,
       0.        , 0.        , 0.        , 0.16903085, 0.        ])

In [177]:
def most_similar_users_to(user_id):
    user = user_similarities[user_id]
    
    pairs = [(other_user_id, similarity)
             for other_user_id, similarity in enumerate(user)
             if user_id != other_user_id and similarity > 0]
    
    return sorted(pairs, key=lambda x: x[1], reverse=True)

def user_based_suggestions(user_id, include_current_interests=False):
    suggestions = defaultdict(float)
    for other_user_id, similarity in most_similar_users_to(user_id):
         for interest in users_interests[other_user_id]:
            suggestions[interest] += similarity
#     print(suggestions.items())
    suggestions = sorted(suggestions.items(),
                        key=lambda weight: weight,
                         reverse=True
                        )
    suggestions = od(suggestions)
    
    if include_current_interests:
        return suggestions
    else:
        return [(suggestions, weight)
                for suggestion, weight in suggestions
                if suggestion not in unique_interests[user_id]
               ]

print('User based similarity')
print('most similar to 0')
print(users_interests[0])
print(users_interests[9])
print(most_similar_users_to(0))

print('Suggestions for 0')
print(user_based_suggestions(0))

User based similarity
most similar to 0
['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']
['Hadoop', 'Java', 'MapReduce', 'Big Data']
[(9, 0.5669467095138407), (1, 0.3380617018914066), (8, 0.1889822365046136), (13, 0.1690308509457033), (5, 0.1543033499620919)]
Suggestions for 0


ValueError: too many values to unpack (expected 2)