In [155]:
import pandas as pd

df_house = pd.read_csv('../data/house_sales_subset_normed.csv',index_col=0)
df = df_house[['SqFtTotLiving_norm','SqFtLot_norm','AdjSalePrice_norm']].iloc[:10]
print(df.head())

   SqFtTotLiving_norm  SqFtLot_norm  AdjSalePrice_norm
0            0.349914      0.654181          -0.798237
3            1.225407      0.442910          -0.309217
4           -0.394254      0.443470          -0.811184
5           -1.258803     -1.685468          -0.414055
7           -0.241043      2.133356          -0.629701


In [156]:
# using euclidean distance
from sklearn.metrics.pairwise import euclidean_distances

# calculate all pairwise distances between houses
dists = euclidean_distances(df)
dists.shape

(10, 10)

In [157]:
# find houses similar to this house
query_idx = 5
df.iloc[query_idx]

SqFtTotLiving_norm   -1.193141
SqFtLot_norm         -0.478846
AdjSalePrice_norm    -1.039064
Name: 8, dtype: float64

In [158]:
# these are all distance scores for query house
[f'{x:0.1f}' for x in dists[query_idx]]

['1.9', '2.7', '1.2', '1.4', '2.8', '0.0', '2.0', '2.1', '1.0', '0.4']

In [160]:
# find indexes of best scores (for distances, want ascending)
best_idxs_asc = np.argsort(dists[query_idx])
best_idxs_asc

array([5, 9, 8, 2, 3, 0, 6, 7, 1, 4])

In [161]:
# the top 10 recommendations
list(zip(best_idxs_asc,sorted(dists[query_idx])))

[(5, 0.0),
 (9, 0.36454193975111393),
 (8, 0.96489046966839),
 (2, 1.2412959364891165),
 (3, 1.3604716337327696),
 (0, 1.929447464433964),
 (6, 1.9964107048046749),
 (7, 2.0989248372129903),
 (1, 2.6891791893333212),
 (4, 2.810278966827491)]

In [110]:
# what if these were similarities instead of differences?
# would need to reverse this
best_idxs_desc = best_idxs_asc[::-1]

In [119]:
# from Data Science from Scratch by Joel Grus
#https://github.com/joelgrus/data-science-from-scratch.git

users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

In [120]:
# interests of user0
users_interests[0]

['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']

In [121]:
# want a sorted list of unique interests (here using set)
unique_interests = sorted({interest 
                           for user_interests in users_interests
                           for interest in user_interests})

In [122]:
unique_interests[:5]

['Big Data', 'C++', 'Cassandra', 'HBase', 'Hadoop']

In [123]:
# have user interests and unique interests (columns)

from sklearn.preprocessing import MultiLabelBinarizer
# Transform between lists of strings and fixed length lists of ints

In [124]:
mlb = MultiLabelBinarizer(classes=unique_interests)

In [125]:
user_interest_matrix = mlb.fit_transform(users_interests)

In [126]:
user_interest_matrix[0]

array([1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [190]:
mlb.classes_[np.where(user_interest_matrix[0])].tolist()

['Big Data', 'Cassandra', 'HBase', 'Hadoop', 'Java', 'Spark', 'Storm']

In [188]:
sorted(users_interests[0])

['Big Data', 'Cassandra', 'HBase', 'Hadoop', 'Java', 'Spark', 'Storm']

In [191]:
# do the positions of 1s match the user0 interests?
assert mlb.classes_[np.where(user_interest_matrix[0])].tolist() == sorted(users_interests[0])

In [128]:
# need to find the similarities (1-distance) between users based on interests

from sklearn.metrics.pairwise import cosine_similarity

In [129]:
user_similarities = cosine_similarity(user_interest_matrix)

In [192]:
user_similarities[0]

array([1.        , 0.3380617 , 0.        , 0.        , 0.        ,
       0.15430335, 0.        , 0.        , 0.18898224, 0.56694671,
       0.        , 0.        , 0.        , 0.16903085, 0.        ])

In [196]:
# what users does user0 share interests with?
np.nonzero(user_similarities[0])[0]

array([ 0,  1,  5,  8,  9, 13])

In [197]:
# return a sorted list of users based on similarity
# skip not query user and similarity == 0
def most_similar_users_to(query_idx):
    users_scores = [(idx,sim) 
                    for idx,sim in enumerate(user_similarities[query_idx]) 
                    if idx != query_idx and sim > 0]
    return sorted(users_scores, key=lambda x:x[1])

In [198]:
most_similar_users_to(0)

[(5, 0.1543033499620919),
 (13, 0.1690308509457033),
 (8, 0.1889822365046136),
 (1, 0.3380617018914066),
 (9, 0.5669467095138407)]

In [199]:
most_similar_users_to(5)

[(0, 0.1543033499620919),
 (2, 0.1666666666666667),
 (9, 0.20412414523193154),
 (11, 0.20412414523193154),
 (10, 0.2357022603955159),
 (3, 0.36514837167011077),
 (12, 0.4714045207910318)]

In [133]:
# return the interests of other users
# weight is sum of similarity to other users

from collections import defaultdict

def user_based_suggestions(idx):
    suggestions = defaultdict(float)
    
    # iterate over interests of similar users
    for other_idx, sim in most_similar_users_to(idx):
        for interest in users_interests[other_idx]:
            suggestions[interest] += sim
            
    # sort suggestions based on weight
    suggestions = sorted(suggestions.items(),
                        key=lambda x:x[1],
                        reverse=True)
    
    # return only new interests
    return [(suggestion,weight)
            for suggestion,weight in suggestions
            if suggestion not in users_interests[idx]]

In [134]:
# reminder: original interests
users_interests[0]

['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']

In [135]:
# top 5 new recommended interests
user_based_suggestions(0)[:5]

[('MapReduce', 0.5669467095138407),
 ('Postgres', 0.50709255283711),
 ('MongoDB', 0.50709255283711),
 ('NoSQL', 0.3380617018914066),
 ('neural networks', 0.1889822365046136)]