In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# https://docs.scipy.org/doc/scipy/reference/spatial.distance.html
from scipy.spatial import distance
import seaborn as sns

## Useful Python functions

- [create pairwise distances](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html#scipy.spatial.distance.pdist)
- [convert pairwise distances to distance matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.squareform.html#scipy.spatial.distance.squareform)
- [fill missing values in a dataframe](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html)
- [calculate weighted averages](https://docs.scipy.org/doc/numpy/reference/generated/numpy.average.html)
- [plot a heatmap](https://seaborn.pydata.org/generated/seaborn.heatmap.html)
- [pd.Series/pd.DataFrame to add column/index to a numpy array](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html)

In [2]:
# Load the ratings table; index_col=0 makes the first CSV column (userId) the row index,
# leaving one column per movie.
R = pd.read_csv('R.csv', index_col=0)
# Display the frame. Blank cells in the rendered table are missing ratings
# (presumably NaN after loading — confirm with R.isna().sum()).
R

Unnamed: 0_level_0,2001: A Space Odyssey,Ace Ventura: Pet Detective,Austin Powers: The Spy Who Shagged Me,"Big Lebowski, The",Dumb & Dumber,E.T. the Extra-Terrestrial,Four Weddings and a Funeral,Star Wars: Episode I - The Phantom Menace,Titanic
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
68,1.5,2.5,3.5,0.5,3.5,1.5,1.0,3.0,3.0
608,3.0,3.5,3.0,4.0,3.0,2.5,3.0,4.0,2.0
414,5.0,2.0,4.0,5.0,3.0,4.0,4.0,4.0,4.0
307,3.5,4.0,3.5,4.0,2.5,3.0,3.0,4.0,2.0
599,5.0,2.5,3.5,5.0,3.5,4.0,2.5,3.0,3.0
600,4.0,3.0,2.5,3.5,3.5,2.0,1.0,1.0,3.0
45,4.0,4.0,4.0,5.0,5.0,5.0,4.0,4.0,4.0
330,1.5,0.5,3.5,5.0,4.0,4.0,,3.0,3.5
590,5.0,3.0,2.5,4.0,2.0,4.5,,2.5,3.5
474,4.0,1.0,3.0,3.5,,4.0,3.0,4.0,4.0


## Cosine similarity and other similarity/distance metrics

In [3]:
# Two toy rating vectors used to try out the similarity metrics in the following cells.
x = np.asarray([2, 1, 1, 1, 1])
y = np.asarray([5, 4, 5, 4, 4])
# Quick sanity check: total of the second vector.
y.sum()

22

In [4]:


def cosim(x, y):
    """Return the cosine similarity of two equally-shaped numeric vectors.

    Computed as ``sum(x * y) / (sqrt(sum(x**2)) * sqrt(sum(y**2)))``.

    Parameters
    ----------
    x, y : array-like
        Numeric vectors of the same shape (lists, tuples or NumPy arrays).

    Returns
    -------
    float
        The cosine of the angle between ``x`` and ``y``. The result is ``nan``
        when either vector has zero norm or contains ``nan``.
    """
    # Work on float arrays: plain Python sequences become usable and small
    # integer dtypes (e.g. uint8) can no longer overflow in the products below.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    dot_product = np.sum(x * y)
    x_norm = np.sqrt(np.sum(x ** 2))
    y_norm = np.sqrt(np.sum(y ** 2))
    return dot_product / (x_norm * y_norm)

# x and y are the toy vectors defined in an earlier cell.
# Reference value: scipy returns the cosine *distance*, so similarity = 1 - distance.
solution = 1 - distance.cosine(x, y)

print(cosim(x, y))

# Compare with a floating-point tolerance instead of equality after rounding to
# two decimals, which would accept results that differ by almost 0.01.
assert np.isclose(cosim(x, y), solution)

0.9642857142857142


In [5]:
# Correlation similarity: scipy returns the correlation *distance*, so subtract it from 1.
1-distance.correlation(x,y)

0.6123724356957945

In [6]:
# Cosine similarity via scipy (1 - cosine distance), for comparison with the hand-written cosim.
1-distance.cosine(x,y)

0.9642857142857143

## Step 0 - Similarity/ Distance Matrices (from scratch)

In [7]:
# Square user-by-user matrix of zeros, labelled with the userIds of R on both axes.
# Built in one step so UU is only ever bound to a DataFrame.
UU = pd.DataFrame(0.0, index=R.index, columns=R.index)
UU

userId,68,608,414,307,599,600,45,330,590,474,480,19,483,489,380
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
330,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
590,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Fill the user-user matrix with the hand-written cosine similarity, one pair at a time.
for u in UU.index:
    for v in UU.columns:
        # Compare only the movies both users rated; a missing rating would
        # otherwise turn the whole similarity into NaN.
        # NOTE(review): assumes the userIds in R are unique, so exactly two rows are selected.
        pair = R.loc[[u, v]].dropna(axis=1)
        UU.loc[u, v] = cosim(pair.iloc[0].to_numpy(), pair.iloc[1].to_numpy())

## Step I - Similarity/ Distance Matrices


Calculate a user-user distance matrix using one of the distance metrics that [scipy offers](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html#scipy.spatial.distance.pdist)

- choose a value to fill in the missing ratings (e.g. item-averages or user-averages)
- first use the function `pdist` to calculate pairwise distances and then use the function `squareform` to convert the list into a symmetric distance matrix

## Step II -  Neighborhood

use the distance matrix to find the **5 most similar users for user with id 608**

- similarities and distances are two views of the same information: the larger the distance, the less similar two objects are (e.g. cosine similarity = 1 - cosine distance).

- convert the distance matrix into a pandas dataframe
- select the row for the active user
- apply the `sort_values()` function
- post the distance metric and your ranking of user ids in slack

## Cosine distance

## Euclidean distance

## Pearson distance

## Jaccard Distance

Hint: convert your ratings to a boolean matrix first (e.g. by only looking at ratings above a certain threshold)