In [1]:
import dataprep
import pandas as pd

import numpy as np
from scipy import sparse as sp
from scipy.sparse.linalg import norm
import sklearn.preprocessing as pp

from sklearn import svm
from sklearn.metrics import precision_score

In [75]:
used_features = ["engaging_user_id", "tweet_id"]
target_features = ["retweet", "reply", "like", "retweet_with_comment"]
# reply, like, retweet_with_comment
imported_data = dataprep.import_data(source_features=used_features, target_features=target_features, nrows=50000)

In [76]:
# create dictionarys for accessing interacted tweets for a user and the other way around
user_ratings = {}
tweets_users = {}

for row_id in range(0, len(imported_data)):
    row = imported_data.iloc[row_id]
    if row["engaging_user_id"] in user_ratings:
        user_ratings[row["engaging_user_id"]].append(row["tweet_id"])
    else:
        user_ratings[row["engaging_user_id"]] = [row["tweet_id"]]
        
    if row["tweet_id"] in tweets_users:
        tweets_users[row["tweet_id"]].append(row["engaging_user_id"])
    else:
        tweets_users[row["tweet_id"]] = [row["engaging_user_id"]]

In [77]:
INTERACTION_CATEGORY = "retweet"

filtered_data = imported_data[imported_data[INTERACTION_CATEGORY] == 1]

In [84]:
tweetIds = filtered_data["tweet_id"].unique()
tweetIds.sort()
userIds = filtered_data["engaging_user_id"].unique()
userIds.sort()

m = userIds.size
n = tweetIds.size
numRatings = len(imported_data)


## create internal ids for movies and users, that have consecutive indexes starting from 0
tweetId_to_tweetIDX = dict(zip(tweetIds, range(0, tweetIds.size)))
tweetIDX_to_tweetId = dict(zip(range(0, tweetIds.size), tweetIds))

userId_to_userIDX = dict(zip(userIds, range(0, userIds.size )))
userIDX_to_userId = dict(zip(range(0, userIds.size), userIds))

## drop timestamps
data = pd.concat([filtered_data['engaging_user_id'].map(userId_to_userIDX), filtered_data['tweet_id'].map(tweetId_to_tweetIDX), filtered_data[INTERACTION_CATEGORY]], axis=1)
data.columns = ['engaging_user_id', 'tweet_id', 'interaction']



R = sp.csr_matrix((data.interaction, (data.engaging_user_id, data.tweet_id)))
R_dok = R.todok()

In [96]:
data

Unnamed: 0,engaging_user_id,tweet_id,interaction
20,511,271,1
90,296,467,1
193,118,579,1
199,404,579,1
212,779,579,1
...,...,...,...
49666,180,356,1
49778,637,72,1
49868,467,512,1
49892,185,399,1


In [5]:
# begin test

In [58]:
def compute_pairwise_user_similarity(u_id, v_id):
    u = R[u_id,:].copy()
    v = R[v_id,:].copy()
    
    # using the formula on slide 25 of slide deck 2

    
    # calculate sqrt of sum of (r_ui - mean(r_u))^2
    u_denominator = np.sqrt(sum(u.data))
    
    # calculate sqrt of sum of (r_vi - mean(r_v))^2
    v_denominator = np.sqrt(sum(v.data))
    
    denominator = u_denominator * v_denominator
    
    
    #calculate numerator
    numerator = 0
    def calculate_nth_summand(index):
        if (u_id, index) in R_dok and (v_id, index) in R_dok:
            return 1
        else:
            return 0
        
    numerator = np.array([calculate_nth_summand(t) for t in range(0,n)]).sum()
         
    
    if denominator == 0:
        similarity = 0.;
    else:
        similarity = numerator/denominator
    
    return similarity

In [59]:
def compute_user_similarities(u_id):
    '''
    Much faster matrix-based approach
    '''
    
    R_copy = R.copy()

    u = R[u_id,:].copy()
    
    return R_copy.dot(u.T).toarray()[:,0]

In [60]:
## default values
k = 5

def create_user_neighborhood(u_id, i_id):
    nh = {} ## the neighborhood dict with (user id: similarity) entries
    ## nh should not contain u_id and only include users that have rated i_id; there should be at most k neighbors
    uU = compute_user_similarities(u_id)
    uU_copy = uU.copy() ## so that we can modify it, but also keep the original
    
    sort_index = np.flip(np.argsort(uU_copy))
    
    print(sort_index)
    
    taken = 0
    for i in sort_index:
        if i == u_id:
            continue
        if R_dok[i, i_id] != 0:
            print(uU_copy[i])
            nh[i] = uU_copy[i]
            taken += 1
            if (taken >= k):
                break
    
    return nh

In [61]:
## a default value

def predict_rating(u_id, i_id):
    
    if (u_id, i_id) in R_dok:
        print("user", u_id, "has rated item", i_id, "with", R[u_id, i_id])
    else:
        print("user", u_id, "has not rated item", i_id)
    
    
    nh = create_user_neighborhood(u_id, i_id)
    print(nh)
    
    neighborhood_weighted_avg = 0.

    numerator = np.array([nh[x] * (R[x, i_id]) for x in nh]).sum()

    denominator = np.array([abs(nh[x]) for x in nh]).sum()
    
    neighborhood_weighted_avg = numerator / denominator
    

    prediction = neighborhood_weighted_avg
    print(f'prediction {prediction:.4f}')
        
    return prediction

In [62]:
predict_rating(11189, 5121)

user 11189 has not rated item 5121
[ 1684 11189 11754 ...  7831  7830     0]
0
{1093: 0}
prediction nan


  neighborhood_weighted_avg = numerator / denominator


nan

In [192]:
sims = compute_user_similarities(410839)
sims[sims != 0]

array([], dtype=int64)

In [193]:
compute_pairwise_user_similarity(410839, 64765)

0.0

In [None]:
#end test

In [97]:
def compute_pairwise_user_similarity(u_id, v_id):
    u = filtered_data[filtered_data["engaging_user_id"] == u_id].copy()
    v = filtered_data[filtered_data["engaging_user_id"] == v_id].copy()
    
    # using the formula on slide 25 of slide deck 2

    
    # calculate sqrt of sum of (r_ui - mean(r_u))^2
    u_denominator = np.sqrt(sum(u[INTERACTION_CATEGORY]))
    
    # calculate sqrt of sum of (r_vi - mean(r_v))^2
    v_denominator = np.sqrt(sum(v[INTERACTION_CATEGORY]))
    
    denominator = u_denominator * v_denominator
    
    
    items = user_ratings[u_id] + user_ratings[v_id]
    items = set(items)
    
    
    numerator = 0
    def calculate_nth_summand(item):
        if item in user_ratings[u_id] and item in user_ratings[v_id]:
            return 1
        else:
            return 0
        
    numerator = np.array([calculate_nth_summand(i) for i in items]).sum()
         
    
    if denominator == 0:
        similarity = 0.;
    else:
        similarity = numerator/denominator
    
    return similarity

In [98]:
compute_pairwise_user_similarity("1958C5A1AF21AE7DFA1B709F5D45BB9C", "5D2073E67B1B5100EF1BE49DCCAF607F")

1.0

In [99]:
def compute_user_similarities(u_id):
    '''
    Much faster matrix-based approach
    '''
    
    similarities = []
    
    for userId in userIds:
        similarities.append(compute_pairwise_user_similarity(u_id, userId))

    
    
    return similarities

In [100]:
def compute_faster(u_id):

    u = R[u_id,:].copy()
    
    return R.dot(u.T).toarray()[:,0]

In [166]:
test_result = compute_faster(2)
test_result

array([0, 0, 0, ..., 0, 0, 0])

In [101]:
## default values
k = 5

def create_user_neighborhood(u_id, tweet_id):
    nh = {} ## the neighborhood dict with (user id: similarity) entries
    ## nh should not contain u_id and only include users that have rated i_id; there should be at most k neighbors
    uU = compute_user_similarities(u_id)
    
    sort_index = np.flip(np.argsort(uU))
    
    taken = 0
    for i in sort_index:
        index_userid = userIds[i]
        if index_userid == u_id:
            continue
        if index_userid in tweets_users[tweet_id]:
            nh[index_userid] = uU[i]
            taken += 1
            if (taken >= k):
                break
    
    return nh

In [104]:
create_user_neighborhood("F472536F8260041FC3C246C1AD1EDF02", "7876E60F62B4F3627A109A60691BDF4D")

KeyError: '7876E60F62B4F3627A109A60691BDF4D'

In [103]:
compute_pairwise_user_similarity("5D2073E67B1B5100EF1BE49DCCAF607F", "1958C5A1AF21AE7DFA1B709F5D45BB9C")

1.0

In [127]:
def predict_rating(u_id, tweet_id):
    
    if tweet_id in user_ratings[u_id]:
        print("user", u_id, "has rated item", tweet_id)
    else:
        print("user", u_id, "has not rated item", tweet_id)
    
    
    nh = create_user_neighborhood(u_id, tweet_id)
    neighborhood_weighted_avg = 0.
    
    return np.array([nh[x] for x in nh]).mean()

In [176]:
prediction = predict_rating(410839, 119903)

if prediction > 0.5:
    print("user will interact")
else:
    print("user will not interact")

KeyError: 410839

In [16]:
tweetId_to_tweetIDX["7876E60F62B4F3627A109A60691BDF4D"]

5122

In [17]:
userId_to_userIDX["F472536F8260041FC3C246C1AD1EDF02"]

11189

In [37]:
R_dok[0,1]

0

In [43]:
len(compute_user_similarities(4))

11755

In [57]:
R.shape

(11755, 10791)

In [74]:
(11189, 5122) in R_dok

False