In [10]:
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk import FreqDist, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
%matplotlib inline

In [4]:
# Load the stream table (StreamID / Content columns); the first CSV line is the header.
stream_content_df = pd.read_csv(
    "data/stream_content.csv",
    encoding="ISO-8859-1",
    header=0,
)
stream_content_df.head()

Unnamed: 0,StreamID,Content
0,163,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...
1,419,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...
2,507,"wBYKUgUyGWc\r\nA team of world-class drivers, ..."
3,199,Castrol EDGE is Castrol?s flagship power bran...
4,201,"Charles ?Cheers? Wakefield, Castrol?s founder,..."


## Pre-process the content: remove stop words and punctuation, then lemmatize

In [9]:
def preprocess(tokens):
    """Normalise one document's tokens before TF-IDF vectorisation.

    Steps, in order: drop punctuation-only tokens, lower-case, drop English
    stop words, lemmatize with WordNet, and drop tokens shorter than three
    characters.

    tokens: iterable of string tokens (e.g. the output of ``word_tokenize``).

    Returns: a list of cleaned, lower-cased, lemmatized tokens.
    """

    # TODO: drop random ID-like tokens (mixed upper/lower case, or letters mixed
    # with digits), e.g. "TXmAk2KZAy4"

    # Check character by character: ``t in string.punctuation`` is a substring test,
    # so multi-character punctuation tokens such as "..." or "``" would slip through.
    punctuation_chars = set(string.punctuation)
    wnl = nltk.WordNetLemmatizer()
    # A set gives constant-time membership tests; the NLTK list is scanned per token.
    stop = set(stopwords.words('english'))

    tokens_clean = []
    for t in tokens:
        if all(ch in punctuation_chars for ch in t):
            continue  # punctuation-only (or empty) token
        t = t.lower()
        if t in stop:
            continue
        t = wnl.lemmatize(t)
        if len(t) >= 3:
            tokens_clean.append(t)
    return tokens_clean

# Tokenise each stream's raw text, then clean the resulting token lists.
stream_content_df['Content_processed'] = (
    stream_content_df['Content']
    .map(word_tokenize)
    .apply(preprocess)
)
stream_content_df.head()

Unnamed: 0,StreamID,Content,Content_processed
0,163,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...,"[txmak2kzay4, nmeujebo1ac, eeutxfhp3go, castro..."
1,419,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...,"[txmak2kzay4, nmeujebo1ac, eeutxfhp3go, castro..."
2,507,"wBYKUgUyGWc\r\nA team of world-class drivers, ...","[wbykuguygwc, team, world-class, driver, power..."
3,199,Castrol EDGE is Castrol?s flagship power bran...,"[castrol, edge, castrol, flagship, power, bran..."
4,201,"Charles ?Cheers? Wakefield, Castrol?s founder,...","[charles, cheer, wakefield, castrol, founder, ..."


### Generate the TFIDF vectors for the streams

In [11]:
# TfidfVectorizer expects one string per document, so join each token list with spaces.
# Values that are already strings are passed through untouched: this cell overwrites its
# own input column, and re-running it would otherwise call " ".join on a string and
# split the text into single characters.
stream_content_df['Content_processed'] = stream_content_df['Content_processed'].apply(
    lambda x: x if isinstance(x, str) else " ".join(x)
)
stream_content_df.head()

Unnamed: 0,StreamID,Content,Content_processed
0,163,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...,txmak2kzay4 nmeujebo1ac eeutxfhp3go castrol la...
1,419,TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...,txmak2kzay4 nmeujebo1ac eeutxfhp3go castrol la...
2,507,"wBYKUgUyGWc\r\nA team of world-class drivers, ...",wbykuguygwc team world-class driver powered ca...
3,199,Castrol EDGE is Castrol?s flagship power bran...,castrol edge castrol flagship power brand pcos...
4,201,"Charles ?Cheers? Wakefield, Castrol?s founder,...",charles cheer wakefield castrol founder entrep...


In [14]:
# Fit TF-IDF on the cleaned text; min_df=2 drops terms that occur in fewer than two documents.
all_streams_tfidf_vectorizer = TfidfVectorizer(min_df=2)
all_streams_cleaned_text = stream_content_df['Content_processed']
all_streams_tfidf = all_streams_tfidf_vectorizer.fit_transform(
    all_streams_cleaned_text
)
all_streams_tfidf

<97x928 sparse matrix of type '<class 'numpy.float64'>'
	with 6122 stored elements in Compressed Sparse Row format>

In [16]:
# Invert the fitted vocabulary (token -> column index) into column index -> token.
token_values = {
    column_index: token
    for token, column_index in all_streams_tfidf_vectorizer.vocabulary_.items()
}


In [20]:
# Pairwise cosine similarity between the TF-IDF rows of all streams.
similarities = cosine_similarity(X=all_streams_tfidf)

In [30]:
def linear(x, total_steps):
    """Linearly decaying weight for the x-th position in a viewing history.

    x: zero-based position (0 = most recent).
    total_steps: number of positions over which the weight decays to zero.

    Returns: 1 at x == 0, decreasing by 1/total_steps per step and clamped at 0,
    so positions at or beyond total_steps contribute nothing.

    Raises: ZeroDivisionError if total_steps is 0.
    """
    step_value = 1/total_steps
    # Clamp at zero: for x > total_steps the raw value is negative, which would
    # subtract similarity instead of merely ignoring an old view.
    return max(0, 1 - (x * step_value))

In [31]:
def constant(x, total_steps):
    """Uniform weight pattern: every history position gets the same weight.

    Both arguments are ignored; they are accepted so this function can be used
    wherever a (position, total_steps) weight pattern is expected.

    Returns: 1.
    """
    uniform_weight = 1
    return uniform_weight

In [45]:
def get_similar_streams_based_on_history(viewed_streams, weight_pattern = linear, max_similar_streams = 10, max_viewed_streams_to_consider = 5):
    """
    Recommend streams whose content is similar to the streams a user has viewed.

    viewed_streams: list of stream IDs viewed by the current user, most recent first
        (position 0 is the last viewed stream).
    weight_pattern: callable (position, total_steps) -> weight, used to weight the
        contribution of each viewed stream.
    max_similar_streams: The maximum number of similar streams to return.
    max_viewed_streams_to_consider: The maximum number of (most recent) viewed streams
        to consider for the recommendation.

    Returns: A list of (stream ID, score) pairs sorted by descending score. Streams that
        were already viewed or whose score is not positive are excluded. The list is
        empty when there is no history or when either limit is not positive.

    Raises: IndexError if a considered stream ID is not present in stream_content_df.
    """
    if not viewed_streams or max_similar_streams <= 0 or max_viewed_streams_to_consider <= 0:
        return []

    # Only the most recent views contribute. Weight patterns such as `linear` are scaled
    # to this count and would give older positions a zero or negative weight.
    recent_streams = viewed_streams[:max_viewed_streams_to_consider]
    total_steps = len(recent_streams)

    stream_ids = stream_content_df["StreamID"]
    stream_id_values = stream_ids.values

    # Accumulate the weighted similarity of every stream to each considered view.
    similarity_sum = np.zeros(similarities.shape[0])
    for x, viewed_stream in enumerate(recent_streams):
        # Rows of `similarities` follow the row order of stream_content_df, so resolve
        # the stream by position rather than by DataFrame index label.
        positions = np.flatnonzero(stream_id_values == viewed_stream)
        if positions.size == 0:
            raise IndexError("stream ID %r not found in stream_content_df" % (viewed_stream,))
        weight_factor = weight_pattern(x, total_steps)
        similarity_sum = similarity_sum + (weight_factor * np.asarray(similarities[positions[0]]))

    # Candidates have a positive score and were never viewed (checked against the full
    # history, not only the considered part).
    already_viewed = set(viewed_streams)
    candidate_streams = [
        (stream_id, score)
        for stream_id, score in zip(stream_ids, similarity_sum)
        if score > 0 and stream_id not in already_viewed
    ]

    # Highest score first; the sort is stable, so ties keep DataFrame order.
    candidate_streams.sort(key = lambda pair: pair[1], reverse = True)

    return candidate_streams[:max_similar_streams]
            
            

In [48]:
# Example: recommendations for a four-stream history, listed most recent first.
get_similar_streams_based_on_history(viewed_streams=[1498, 163, 507, 201])

[(2265, 1.2620715824847637),
 (199, 1.2362178356915718),
 (2405, 1.1823430221582885),
 (2380, 1.1786054440773466),
 (2373, 1.1267495184586085),
 (2030, 1.1267286220099606),
 (2036, 1.1140552834250859),
 (419, 1.1055542339583908),
 (1658, 1.071461114797631),
 (2104, 1.0561773007430364)]

In [49]:
# Show streams 163 and 419 side by side to compare their content. A single filtered
# frame as the cell's last expression gets the notebook's rich table display, which
# print() does not.
stream_content_df[stream_content_df["StreamID"].isin([163, 419])]

   StreamID                                            Content  \
0       163  TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...   

                                   Content_processed  
0  txmak2kzay4 nmeujebo1ac eeutxfhp3go castrol la...  
   StreamID                                            Content  \
1       419  TXmAk2KZAy4\r\nNMeUjebo1Ac\r\nEEuTxFhp3go\r\nC...   

                                   Content_processed  
1  txmak2kzay4 nmeujebo1ac eeutxfhp3go castrol la...  


### Read in the content views for the users

In [62]:
# Load the per-user content-view table and rename its first column to "UserID";
# all other column names are kept as read from the file.
content_views_per_user_scaled_df = pd.read_csv(
    "data/content_views_per_user_scaled.csv",
    header=0,
)
new_column_names = ["UserID", *content_views_per_user_scaled_df.columns[1:]]
content_views_per_user_scaled_df.columns = new_column_names
content_views_per_user_scaled_df.head()

Unnamed: 0,UserID,163,167,171,172,173,178,179,184,185,...,1657,1658,1659,1660,1661,1662,1665,1668,1670,1677
0,245,1.0,1.0,1.0,1.0,1.0,0.920755,0.2,0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,246,0.040816,0.0,0.0,0.036145,0.0,0.415094,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,247,0.29932,0.116279,0.276423,0.457831,0.479042,0.633962,0.4,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,248,0.014577,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,249,0.119534,0.736434,0.113821,0.373494,0.125749,1.0,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# Pairwise Euclidean distance between users, computed over every column except UserID.
# NOTE: despite its name, user_similarities holds distances - smaller means more similar.
# euclidean_distances is imported in the notebook's import cell.
features_df = content_views_per_user_scaled_df.drop(columns="UserID")
user_similarities = euclidean_distances(features_df.values)
user_similarities.shape

(219, 219)

In [90]:
def get_similar_streams_based_on_other_users(current_user_id, max_similar_streams = 10, max_users_to_consider = 5):
    """
    Find the users whose view profile is closest to the current user's.

    current_user_id: The ID of the current user.
    max_similar_streams: The maximum number of (user ID, distance) pairs to return.
    max_users_to_consider: Accepted for interface compatibility; currently unused.

    Returns: A list of (user ID, distance) pairs for the other users, nearest first.
        The list is empty when current_user_id is None or is not present in
        content_views_per_user_scaled_df.

    NOTE(review): despite its name, this function does not yet turn the nearest users
    into stream recommendations - confirm the intended final step.
    """
    # Compare against None explicitly so that a user ID of 0 is still looked up.
    if current_user_id is None:
        return []

    user_ids = content_views_per_user_scaled_df["UserID"].values
    matching_rows = np.flatnonzero(user_ids == current_user_id)
    if matching_rows.size == 0:
        return []

    # Distances from the current user to every user (first match if the ID repeats).
    distance_current_user_other_users = user_similarities[matching_rows[0]]

    # Ascending distance; a stable sort keeps table order among equally distant users.
    nearest_first = np.argsort(distance_current_user_other_users, kind="stable")

    # Candidate users are all users other than the current one, closest first.
    candidate_users = [
        (user_ids[i], distance_current_user_other_users[i])
        for i in nearest_first
        if user_ids[i] != current_user_id
    ]

    return candidate_users[:max(max_similar_streams, 0)]
        
            
            

In [91]:
# Example: the users closest to user 245.
get_similar_streams_based_on_other_users(current_user_id=245)

<class 'numpy.ndarray'>
(219,)
(219,)
[(245, 0.0), (246, 3.064629336020015), (247, 2.6085794880011854), (248, 3.183672908270569), (249, 2.5247593626807907), (254, 2.7249418240280234), (267, 4.178094970263231), (268, 3.528968017880467), (269, 2.6715976334901765), (277, 3.1500195537837183), (278, 3.030577763914333), (337, 3.221455997641786), (343, 3.2972527445965314), (344, 3.2281338720828496), (345, 3.258845843331141), (349, 3.192551812597635), (351, 3.1906944800725476), (400, 5.142507829430353), (408, 3.1939842225912485), (413, 4.2549583732155405), (416, 5.613463065729484), (419, 3.5690058515364607), (420, 3.9503445006664957), (421, 3.183010423837408), (447, 3.1957226303292945), (539, 3.2803429257839807), (552, 4.460612831546688), (553, 3.426843168813564), (554, 3.8844709084568816), (556, 3.7289608647210235), (566, 3.1154435011172086), (573, 3.637799159209524), (574, 3.4948212162466463), (575, 3.3009256202260038), (576, 3.7970884266841893), (577, 3.273117613016375), (580, 3.45894431186

[(249, 2.5247593626807907),
 (247, 2.6085794880011854),
 (269, 2.6715976334901765),
 (254, 2.7249418240280234),
 (278, 3.030577763914333),
 (246, 3.064629336020015),
 (566, 3.1154435011172086),
 (277, 3.1500195537837183),
 (421, 3.183010423837408),
 (248, 3.183672908270569)]