In [1]:
import dataprep
import pandas as pd

import numpy as np
from scipy import sparse as sp
from scipy.sparse.linalg import norm
import sklearn.preprocessing as pp

from sklearn import svm
from sklearn.metrics import precision_score

In [14]:
used_features = ["engaging_user_id", "tweet_id"]
target_features = ["retweet", "reply", "like", "retweet_with_comment"]
# reply, like, retweet_with_comment
imported_data = dataprep.import_data(source_features=used_features, target_features=target_features, nrows=500000)

In [15]:
# create dictionarys for accessing interacted tweets for a user and the other way around
user_ratings = {}
tweets_users = {}

for row_id in range(0, len(imported_data)):
    row = imported_data.iloc[row_id]
    if row["engaging_user_id"] in user_ratings:
        user_ratings[row["engaging_user_id"]].append(row["tweet_id"])
    else:
        user_ratings[row["engaging_user_id"]] = [row["tweet_id"]]
        
    if row["tweet_id"] in tweets_users:
        tweets_users[row["tweet_id"]].append(row["engaging_user_id"])
    else:
        tweets_users[row["tweet_id"]] = [row["engaging_user_id"]]

In [72]:
INTERACTION_CATEGORY = "reply"

filtered_data = imported_data[imported_data[INTERACTION_CATEGORY] == 1]

In [73]:
tweetIds = filtered_data["tweet_id"].unique()
tweetIds.sort()
userIds = filtered_data["engaging_user_id"].unique()
userIds.sort()

m = userIds.size
n = tweetIds.size
numRatings = len(imported_data)


## create internal ids for movies and users, that have consecutive indexes starting from 0
tweetId_to_tweetIDX = dict(zip(tweetIds, range(0, tweetIds.size)))
tweetIDX_to_tweetId = dict(zip(range(0, tweetIds.size), tweetIds))

userId_to_userIDX = dict(zip(userIds, range(0, userIds.size )))
userIDX_to_userId = dict(zip(range(0, userIds.size), userIds))

## drop timestamps
data = pd.concat([filtered_data['engaging_user_id'].map(userId_to_userIDX), filtered_data['tweet_id'].map(tweetId_to_tweetIDX), filtered_data[INTERACTION_CATEGORY]], axis=1)
data.columns = ['engaging_user_id', 'tweet_id', 'interaction']



R = sp.csr_matrix((data.interaction, (data.engaging_user_id, data.tweet_id)))
R_dok = R.todok()

In [18]:
data

Unnamed: 0,engaging_user_id,tweet_id,interaction
20,5520,3291,1
90,3090,5716,1
193,1083,7125,1
199,4205,7125,1
212,8699,7125,1
...,...,...,...
499946,2446,5505,1
499947,4484,5505,1
499948,9192,5505,1
499954,10075,321,1


In [7]:
# begin test

In [8]:
def compute_pairwise_user_similarity(u_id, v_id):
    u = R[u_id,:].copy()
    v = R[v_id,:].copy()
    
    # using the formula on slide 25 of slide deck 2

    
    # calculate sqrt of sum of (r_ui - mean(r_u))^2
    u_denominator = np.sqrt(sum(u.data))
    
    # calculate sqrt of sum of (r_vi - mean(r_v))^2
    v_denominator = np.sqrt(sum(v.data))
    
    denominator = u_denominator * v_denominator
    
    
    #calculate numerator
    numerator = 0
    def calculate_nth_summand(index):
        if (u_id, index) in R_dok and (v_id, index) in R_dok:
            return 1
        else:
            return 0
        
    numerator = np.array([calculate_nth_summand(t) for t in range(0,n)]).sum()
         
    
    if denominator == 0:
        similarity = 0.;
    else:
        similarity = numerator/denominator
    
    return similarity

In [9]:
def compute_user_similarities(u_id):
    '''
    Much faster matrix-based approach
    '''
    
    R_copy = R.copy()

    u = R[u_id,:].copy()
    
    return R_copy.dot(u.T).toarray()[:,0]

In [65]:
## default values
k = 5

def create_user_neighborhood(u_id, i_id):
    nh = {} ## the neighborhood dict with (user id: similarity) entries
    ## nh should not contain u_id and only include users that have rated i_id; there should be at most k neighbors
    uU = compute_user_similarities(u_id)
    uU_copy = uU.copy() ## so that we can modify it, but also keep the original
    sort_index = np.flip(np.argsort(uU_copy))
    
    taken = 0
    
    # select the top-k other users that have rated the target item.
    # usually, the neighborhood will be very small and often only contain users with a similarity of 0
    # this is because for most tweets, the user will not have another user that has previously interacted with a common tweet.
    for i in sort_index:
        if i == u_id:
            continue
        if R_dok[i, i_id] != 0:
            nh[i] = uU_copy[i]
            taken += 1
            if (taken >= k):
                break
    
    return nh

In [66]:
## a default value

def predict_rating(u_id, i_id):
    
    #if (u_id, i_id) in R_dok:
    #    print("user", u_id, "has rated item", i_id, "with", R[u_id, i_id])
    #else:
    #    print("user", u_id, "has not rated item", i_id)
    
    
    nh = create_user_neighborhood(u_id, i_id)
    
    neighborhood_weighted_avg = 0.

    numerator = np.array([nh[x] * (R[x, i_id]) for x in nh]).sum()

    denominator = np.array([abs(nh[x]) for x in nh]).sum()
    
    if denominator != 0:
        neighborhood_weighted_avg = numerator / denominator
    else:
        neighborhood_weighted_avg = 0

    prediction = neighborhood_weighted_avg
        
    return prediction

In [67]:
predict_rating(11189, 5121)

0

In [68]:
predict_rating(11189, 5122)

1.0

In [None]:
#end test

In [71]:
prediction = predict_rating(410839, 119903)

if prediction > 0.5:
    print("user will interact")
else:
    print("user will not interact")

IndexError: row index (410839) out of range

In [26]:
tweetId_to_tweetIDX["7876E60F62B4F3627A109A60691BDF4D"]

5122

In [27]:
userId_to_userIDX["F472536F8260041FC3C246C1AD1EDF02"]

11189

In [41]:
userIDX_to_userId[1684]

'26B36E25FA9F0A4B591AA46A2E87D780'

In [29]:
len(compute_user_similarities(4))

11755

In [45]:
compute_pairwise_user_similarity(11189, 1684)

0.7071067811865475

In [None]:
(11189, 5122) in R_dok

In [53]:
print(R_dok[1684])

  (0, 5121)	1


In [32]:
print(R_dok)

  (4731, 0)	1
  (6078, 1)	1
  (10465, 2)	1
  (2696, 3)	1
  (6864, 4)	1
  (2409, 5)	1
  (9705, 5)	1
  (7936, 6)	1
  (10951, 7)	1
  (10871, 8)	1
  (4978, 9)	1
  (1532, 10)	1
  (4442, 11)	1
  (6719, 12)	1
  (6643, 13)	1
  (442, 14)	1
  (6308, 14)	1
  (11068, 15)	1
  (2808, 16)	1
  (7916, 17)	1
  (2529, 18)	1
  (11509, 19)	1
  (68, 20)	1
  (5917, 21)	1
  (8697, 22)	1
  :	:
  (11024, 10767)	1
  (3405, 10768)	1
  (7507, 10769)	1
  (9693, 10770)	1
  (4752, 10771)	1
  (6546, 10772)	1
  (6942, 10773)	1
  (10021, 10774)	1
  (3618, 10775)	1
  (10103, 10776)	1
  (2157, 10777)	1
  (7754, 10777)	1
  (9703, 10778)	1
  (5875, 10779)	1
  (680, 10780)	1
  (11417, 10781)	1
  (8660, 10782)	1
  (2704, 10783)	1
  (9577, 10784)	1
  (4732, 10785)	1
  (3588, 10786)	1
  (5412, 10787)	1
  (1128, 10788)	1
  (3981, 10789)	1
  (11728, 10790)	1
