In [23]:
import numpy as np
import pandas as pd

# Setting up and preprocessing data

In [24]:
n = np.NaN  # setting non scores to np.Nan for better integration with numpy
rows_users_ = ['u1', 'u2', 'u3', 'u4', 'u5', 'u6', 'u7']
columns_series = ['Vikings', 'Breaking Bad', 'The Sopranos', 'Westworld',
                  'Game of Thrones', 'The Witcher', 'Skam', 'Les bureau']
all_users = np.array([
    [5, 4, 1, n, 3, 4, 1],
    [5, 3, 3, 3, 4, 5, 1],
    [n, 3, 3, n, n, 2, n],
    [3, 4, 2, n, 4, 5, n],
    [4, 5, 2, n, 5, 3, 2],
    [n, 2, 3, 5, 5, 2, 1],
    [n, 3, 4, n, 1, 2, n],
    [n, 3, 1, n, n, 3, n],
])
all_users = all_users.T  # transform to get users as rows and series as columns

In [25]:
# using pandas df for visualization
df_all_users = pd.DataFrame(all_users, columns=columns_series, index=rows_users_)
df_all_users

Unnamed: 0,Vikings,Breaking Bad,The Sopranos,Westworld,Game of Thrones,The Witcher,Skam,Les bureau
u1,5.0,5.0,,3.0,4.0,,,
u2,4.0,3.0,3.0,4.0,5.0,2.0,3.0,3.0
u3,1.0,3.0,3.0,2.0,2.0,3.0,4.0,1.0
u4,,3.0,,,,5.0,,
u5,3.0,4.0,,4.0,5.0,5.0,1.0,
u6,4.0,5.0,2.0,5.0,3.0,2.0,2.0,3.0
u7,1.0,1.0,,,2.0,1.0,,


In [26]:
# calculating means for all users
users_mean = np.nanmean(all_users, axis=0).reshape(1, 8)
users_mean

array([[3.        , 3.42857143, 2.66666667, 3.6       , 3.5       ,
        3.        , 2.5       , 2.33333333]])

In [27]:
# calculating variance for all users
all_users_variance = np.empty(all_users.shape)
for i, user_scores in enumerate(all_users):
    all_users_variance[i] = user_scores - users_mean[0, i]
all_users_variance

array([[ 2.        ,  2.        ,         nan,  0.        ,  1.        ,
                nan,         nan,         nan],
       [ 0.57142857, -0.42857143, -0.42857143,  0.57142857,  1.57142857,
        -1.42857143, -0.42857143, -0.42857143],
       [-1.66666667,  0.33333333,  0.33333333, -0.66666667, -0.66666667,
         0.33333333,  1.33333333, -1.66666667],
       [        nan, -0.6       ,         nan,         nan,         nan,
         1.4       ,         nan,         nan],
       [-0.5       ,  0.5       ,         nan,  0.5       ,  1.5       ,
         1.5       , -2.5       ,         nan],
       [ 1.        ,  2.        , -1.        ,  2.        ,  0.        ,
        -1.        , -1.        ,  0.        ],
       [-1.5       , -1.5       ,         nan,         nan, -0.5       ,
        -1.5       ,         nan,         nan]])

In [28]:
# initializing active user
active_user = np.array([3, 2, 3, 0, 4, 1, 0, 5])
active_user

array([3, 2, 3, 0, 4, 1, 0, 5])

In [29]:
active_user_mean = np.nanmean(active_user)
active_user_mean

2.25

In [30]:
active_users_variance = np.zeros((1, 8))
for i, score in enumerate(active_user):
    active_users_variance[0, i] = score - active_user_mean
active_users_variance

array([[ 0.75, -0.25,  0.75, -2.25,  1.75, -1.25, -2.25,  2.75]])

# Solving assignment
- Estimate the score this person is likely to give Westworld and Skam
- Solve the problem using Python and the corrcoef-function from Numpy

In [31]:
correlations = []
for other_users_variance in all_users_variance:
    temp_active = []
    temp_other = []
    for i in range(len(active_users_variance[0])):
        if not np.isnan(active_users_variance[0, i]) and not np.isnan(other_users_variance[i]):
            temp_active.append(active_users_variance[0, i])
            temp_other.append(other_users_variance[i])
    correlations_matrix = np.corrcoef(np.array(temp_active), np.array(temp_other))
    correlation = correlations_matrix[0][1]
    correlations.append(correlation)
correlations

[0.560611910581388,
 0.2765957446808511,
 -0.6432041541566332,
 -0.9999999999999999,
 0.43386091563731227,
 -0.09124485970087104,
 0.7745966692414834]

In [37]:
def estimate_score(kappa,average_vote_for_a, product_index):
    sum_correlation_variance = 0
    j = product_index  # index of product we want to predict the vote for
    for i in range(len(all_users_variance)):
        if not np.isnan(all_users_variance[i][j]):
            sum_correlation_variance += all_users_variance[i][j] * correlations[i]
    p_a_j = average_vote_for_a + kappa * sum_correlation_variance
    return p_a_j

In [38]:
westworld = 3
estimate_score(1,active_user_mean,westworld)

2.8712982191008702

In [39]:
skam = 6
estimate_score(1,active_user_mean,skam)

0.280445998297429