In [98]:
import nltk
from nltk import FreqDist, word_tokenize
import string
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
%matplotlib inline

In [99]:
stream_content_df = pd.read_csv("data/stream_content.csv", header=0, encoding="ISO-8859-1")
stream_content_df.head()

Unnamed: 0,StreamID,Content
0,163,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...
1,419,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...
2,507,"wBYKUgUyGWc\r\nA team of world-class drivers, ..."
3,199,Castrol EDGE is Castrol?s flagship power bran...
4,201,"Charles ?Cheers? Wakefield, Castrol?s founder,..."


## Pre-process the content to remove stop words, punctutations and lemmatization

In [100]:
def preprocess(tokens):
    
    # TODO: remove random sequences that contain with more than one caps and small or combination of letters and numbers
    
    tokens_nop = [t for t in tokens if t not in string.punctuation]
    tokens_nop = [t.lower() for t in tokens_nop]
    wnl = nltk.WordNetLemmatizer()
    stop = stopwords.words('english')
    tokens_nostop = [t for t in tokens_nop if t not in stop]
    tokens_lem = [wnl.lemmatize(t) for t in tokens_nostop]
    tokens_clean = [t for t in tokens_lem if len(t) >= 3] 
    return tokens_clean

stream_content_df['Content_processed'] = stream_content_df['Content'].map(word_tokenize)
stream_content_df['Content_processed'] = stream_content_df.Content_processed.apply(preprocess)
stream_content_df.head()

Unnamed: 0,StreamID,Content,Content_processed
0,163,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...,"[txmak2kzay4, nmeujebo1ac, eeutxfhp3go, castro..."
1,419,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...,"[txmak2kzay4, nmeujebo1ac, eeutxfhp3go, castro..."
2,507,"wBYKUgUyGWc\r\nA team of world-class drivers, ...","[wbykuguygwc, team, world-class, driver, power..."
3,199,Castrol EDGE is Castrol?s flagship power bran...,"[castrol, edge, castrol, flagship, power, bran..."
4,201,"Charles ?Cheers? Wakefield, Castrol?s founder,...","[charles, cheer, wakefield, castrol, founder, ..."


### Generate the TFIDF vectors for the streams

In [101]:
stream_content_df['Content_processed'] = stream_content_df['Content_processed'].apply(lambda x: " ".join(x))
stream_content_df.head()

Unnamed: 0,StreamID,Content,Content_processed
0,163,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...,txmak2kzay4 nmeujebo1ac eeutxfhp3go castrol la...
1,419,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...,txmak2kzay4 nmeujebo1ac eeutxfhp3go castrol la...
2,507,"wBYKUgUyGWc\r\nA team of world-class drivers, ...",wbykuguygwc team world-class driver powered ca...
3,199,Castrol EDGE is Castrol?s flagship power bran...,castrol edge castrol flagship power brand pcos...
4,201,"Charles ?Cheers? Wakefield, Castrol?s founder,...",charles cheer wakefield castrol founder entrep...


In [102]:
all_streams_cleaned_text = stream_content_df['Content_processed']
all_streams_tfidf_vectorizer = TfidfVectorizer(min_df = 2)
all_streams_tfidf = all_streams_tfidf_vectorizer.fit_transform(all_streams_cleaned_text)
all_streams_tfidf

<97x928 sparse matrix of type '<class 'numpy.float64'>'
	with 6122 stored elements in Compressed Sparse Row format>

In [103]:
token_values = {all_streams_tfidf_vectorizer.vocabulary_[token]: token for token in all_streams_tfidf_vectorizer.vocabulary_}


In [104]:
similarities = cosine_similarity(all_streams_tfidf)

In [105]:
def linear(x, total_steps):
    step_value = 1/total_steps
    return 1 - (x * step_value)

In [106]:
def constant(x, total_steps):
    return 1

In [107]:
def get_similar_streams_based_on_history(viewed_streams, weight_pattern = linear, max_similar_streams = 10, max_viewed_streams_to_consider = 5):
    """
        viewed_streams: list of stream IDs that have been viewed by the current user. The streams at a lower ID has been viewed more recently. So stream with ID 0 is the last viewed stream.
        weight_pattern: The weight pattern to weight the contributions due to the stream history
    max_similar_streams: The maximum number of similar streams to return
    max_viewed_streams_to_consider: The maximum number of viewed streams to consider for the recommendation 
    
    Returns: A list of stream ID, score pairs
    
    """
    
    if viewed_streams:
        num_viewed_streams = len(viewed_streams)
        
        # set the max viewed streams to consider 
        max_viewed_streams_to_consider = min(max_viewed_streams_to_consider, num_viewed_streams)
        
        # create an array of 0's
        similarity_sum = np.zeros(similarities.shape[0])
    
        for x, viewed_stream in enumerate(viewed_streams):
            stream_index = stream_content_df[stream_content_df["StreamID"]==viewed_stream].index[0]
            weight_factor = weight_pattern(x, max_viewed_streams_to_consider)
            # print(weight_factor)
            similarity_sum = similarity_sum + (weight_factor * np.array(similarities[stream_index]))
        
        
        #print(similarity_sum)
        stream_ids = stream_content_df["StreamID"]
        
        # concatenate the stream ID and the similarity score sum as pairs
        stream_similarity = list(zip(stream_ids, similarity_sum))
        
        # print(stream_similarity)
        
        # sort the stream similarity on the score in descending order
        stream_similarity.sort(key = lambda x: x[1], reverse = True)
        
        # print(stream_similarity)
        
        # candidate streams are those which have greater than 0 score and they have not been viewed before
        candidate_streams = [x for x in stream_similarity if (x[1] > 0) and (x[0] not in viewed_streams)]
        
        num_candidate_streams = len(candidate_streams)
        
        return candidate_streams[:min(num_candidate_streams, max_similar_streams)]
            
            

In [108]:
get_similar_streams_based_on_history([1498, 163, 507, 201])

[(2265, 1.2620715824847637),
 (199, 1.2362178356915718),
 (2405, 1.1823430221582885),
 (2380, 1.1786054440773466),
 (2373, 1.1267495184586085),
 (2030, 1.1267286220099606),
 (2036, 1.1140552834250859),
 (419, 1.1055542339583908),
 (1658, 1.071461114797631),
 (2104, 1.0561773007430364)]

In [109]:
print(stream_content_df[stream_content_df["StreamID"] == 163])

print(stream_content_df[stream_content_df["StreamID"] == 419])

   StreamID                                            Content  \
0       163  TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...   

                                   Content_processed  
0  txmak2kzay4 nmeujebo1ac eeutxfhp3go castrol la...  
   StreamID                                            Content  \
1       419  TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...   

                                   Content_processed  
1  txmak2kzay4 nmeujebo1ac eeutxfhp3go castrol la...  


### Read in the content views for the users

In [110]:
content_views_per_user_scaled_df = pd.read_csv("data/content_views_per_user_scaled.csv", header=0)
new_column_names = ["UserID"]
new_column_names.extend(content_views_per_user_scaled_df.columns[1:])
content_views_per_user_scaled_df.columns = new_column_names
content_views_per_user_scaled_df.head()

Unnamed: 0,UserID,163,167,171,172,173,178,179,184,185,...,1657,1658,1659,1660,1661,1662,1665,1668,1670,1677
0,245,1.0,1.0,1.0,1.0,1.0,0.920755,0.2,0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,246,0.040816,0.0,0.0,0.036145,0.0,0.415094,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,247,0.29932,0.116279,0.276423,0.457831,0.479042,0.633962,0.4,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,248,0.014577,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,249,0.119534,0.736434,0.113821,0.373494,0.125749,1.0,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
features_df = content_views_per_user_scaled_df.drop("UserID", axis=1)
from sklearn.metrics.pairwise import euclidean_distances
user_similarities = euclidean_distances(features_df.values)
user_similarities.shape

(219, 219)

In [145]:
def get_similar_streams_based_on_other_users(current_user_id, max_similar_streams = 10, max_users_to_consider = 5):
    """
        current_user_id: The ID of the current user.
        max_similar_streams: The maximum number of similar streams to return
    max_users_to_consider: The maximum number of users to consider
    
    Returns: A list of stream IDs based on similar users who have watched the most
    
    """
    
    if current_user_id:
        userid_index = (content_views_per_user_scaled_df[content_views_per_user_scaled_df["UserID"] == current_user_id].index[0])
        distance_current_user_other_users = user_similarities[userid_index]
        distance_current_user_other_users = np.squeeze(distance_current_user_other_users)
        #print(distance_current_user_other_users)
        
        user_ids = content_views_per_user_scaled_df["UserID"].values
        #print(type(user_ids))
        #print(user_ids.shape)
        #print(distance_current_user_other_users.shape)
        
        # concatenate the user ID and the similarity score sum as pairs
        user_similarity = list(zip(user_ids, distance_current_user_other_users))
        
        #print(user_similarity)
        
        # sort the user similarity on the score in ascending order
        user_similarity.sort(key = lambda x: x[1])
        
        #print(user_similarity)
        
        # candidate users are those which are other users closest to the current user
        candidate_users = [x for x in user_similarity if (x[0] != current_user_id)]
        
        num_candidate_users = len(candidate_users)
        candidate_users = candidate_users[:min(num_candidate_users, max_similar_streams)]
        print(type(candidate_users[0]))
        
        # create an array of 0's equal to the number of columns other than UserID (# of streams)
        stream_views_sum = np.zeros(features_df.shape[1])
        
        # get the sum of the views
        for x, candidate_user in enumerate(candidate_users):
            
            # convert the distance to a similarity measure
            weight_factor = 1/ (1 + candidate_user[1])
            
            print(weight_factor)
            stream_views_for_user = content_views_per_user_scaled_df[content_views_per_user_scaled_df["UserID"] == candidate_user[0]].values
            
            # remove the first column (UserID)
            stream_views_for_user = np.squeeze(stream_views_for_user)[1:]
            stream_views_for_user = np.array(stream_views_for_user)
            stream_views_sum = stream_views_sum + (weight_factor * stream_views_for_user)
        
        
        stream_ids = features_df.columns
        
        # concatenate the stream ID and the view score sum as pairs
        streams_with_sum_views = list(zip(stream_ids, stream_views_sum))
        
        
        # sort the stream views on the sum in descending order
        streams_with_sum_views.sort(key = lambda x: x[1], reverse = True)
        
        print(streams_with_sum_views)
        
        # candidate streams are those which have greater than 0 views
        candidate_streams = [x for x in streams_with_sum_views if (x[1] > 0)]
        
        num_candidate_streams = len(candidate_streams)
        
        return candidate_streams[:min(num_candidate_streams, max_similar_streams)]
            
        
            
            

In [146]:
get_similar_streams_based_on_other_users(249)

<class 'tuple'>
0.520338714702191
0.49663781431739906
0.48471358346532145
0.44271397037408805
0.4334183535265351
0.43167773547306765
0.4251087988150735
0.4231966427681995
0.4229501585584985
0.42282706099849626
[('178', 0.8080360403904144), ('173', 0.581175006652959), ('179', 0.4044057654374205), ('172', 0.39167484614116027), ('171', 0.3014641422302069), ('163', 0.2781202762282926), ('167', 0.1803998681335533), ('204', 0.17738402776074624), ('202', 0.16600114774012653), ('200', 0.06001040256335238), ('198', 0.0367641676056403), ('203', 0.03659858558531212), ('205', 0.032652052214421246), ('199', 0.0310426347069407), ('201', 0.026121702240877922), ('206', 0.025521389203562245), ('230', 0.017270817212888742), ('229', 0.013224895086506235), ('228', 0.0010782248019504893), ('184', 0.0), ('185', 0.0), ('186', 0.0), ('188', 0.0), ('189', 0.0), ('190', 0.0), ('191', 0.0), ('192', 0.0), ('217', 0.0), ('218', 0.0), ('219', 0.0), ('220', 0.0), ('223', 0.0), ('231', 0.0), ('232', 0.0), ('233', 0.0

[('178', 0.8080360403904144),
 ('173', 0.581175006652959),
 ('179', 0.4044057654374205),
 ('172', 0.39167484614116027),
 ('171', 0.3014641422302069),
 ('163', 0.2781202762282926),
 ('167', 0.1803998681335533),
 ('204', 0.17738402776074624),
 ('202', 0.16600114774012653),
 ('200', 0.06001040256335238)]

### Small application to get the streams to recommend for a user given their UserID and their previous history of stream views, keeping all other parameters as default

In [152]:
from ipywidgets import widgets
from IPython.display import display

In [166]:
def get_recommended_stream_ids(userid, stream_view_history_concatenated, max_num_streams_required=5):
    
    userid = int(userid)
    stream_view_history = stream_view_history_concatenated.split(",")
    stream_view_history = [int(x.strip()) for x in stream_view_history]
    similar_streams_based_on_history = get_similar_streams_based_on_history(stream_view_history)
    
    streams_based_on_other_users = get_similar_streams_based_on_other_users(userid)
    c = {}
    for similar_stream_history in similar_streams_based_on_history:
        c[int(similar_stream_history[0])] += similar_stream_history[1]
        
    for similar_stream_user in streams_based_on_other_users:
        c[int(similar_stream_user[0])] += similar_stream_user[1]    
        
    count_values = [(k, c[k]) for k in c]
    count_values.sort(key=lambda x: x[1], reverse=True)
    
    return count_values[:min(len(count_values, max_num_streams_required))]

In [167]:
def button_on_click(b):
    user_id = userid_text.value
    stream_view_history = user_history_text.value
    recommendations = get_recommended_stream_ids(user_id, stream_view_history)
    print(recommendations)


In [168]:
userid_text = widgets.Text(description="UserID")
user_history_text = widgets.Text(description="Previous stream views (separated by commas)")
button = widgets.Button(description="Get stream recommendations")
display(userid_text)
display(user_history_text)
display(button)

button.on_click(button_on_click)

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

<class 'tuple'>
0.6950275922811304
0.6905658383683766
0.6825810944727624
0.6824043553171352
0.6823624653779445
0.6823619351588348
0.6823606677614377
0.6823540094490234
0.6823397496452992
0.6823363775661535
[('171', 0.12920274626258677), ('172', 0.09173579804461637), ('173', 0.08697120584011783), ('178', 0.07310014153062859), ('163', 0.07037118623888154), ('167', 0.04807978835977079), ('204', 0.038012798442295964), ('203', 0.03090189391361589), ('199', 0.015796850470931184), ('201', 0.008747947486445223), ('206', 0.006808032252311862), ('229', 0.00426462343528312), ('228', 0.002732372230932488), ('548', 0.002369459567073386), ('363', 0.001317298586412042), ('179', 0.0), ('184', 0.0), ('185', 0.0), ('186', 0.0), ('188', 0.0), ('189', 0.0), ('190', 0.0), ('191', 0.0), ('192', 0.0), ('198', 0.0), ('200', 0.0), ('202', 0.0), ('205', 0.0), ('217', 0.0), ('218', 0.0), ('219', 0.0), ('220', 0.0), ('223', 0.0), ('230', 0.0), ('231', 0.0), ('232', 0.0), ('233', 0.0), ('236', 0.0), ('237', 0.0), 

KeyError: 2265

## Input:
### UserID: 246
### Previous: 1498, 163, 507, 201
