# Homework 4 - Recommendation systems and clustering everywhere


## 1. Recommendation system


In [43]:
#import necessary libraries
import pandas as pd
import numpy as np 
from itertools import permutations
import warnings


In [2]:
#open data
df = pd.read_csv('vodclickstream_uk_movies_03.csv')
df = df.rename(columns={'Unnamed: 0': 'row_id'})
df.head()

Unnamed: 0,row_id,datetime,duration,title,genres,release_date,movie_id,user_id
0,58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe
1,58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510
2,58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
3,58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6
4,58777,2017-01-01 19:16:37,0.0,The SpongeBob SquarePants Movie,"Animation, Action, Adventure, Comedy, Family, ...",2004-11-19,a80d6fc2aa,a57c992287


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671736 entries, 0 to 671735
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   row_id        671736 non-null  int64  
 1   datetime      671736 non-null  object 
 2   duration      671736 non-null  float64
 3   title         671736 non-null  object 
 4   genres        671736 non-null  object 
 5   release_date  671736 non-null  object 
 6   movie_id      671736 non-null  object 
 7   user_id       671736 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 41.0+ MB


#### 1.1 Looking at the data, you can see that there is data available for each user for the movies the user clicked on. Gather the title and genre of the maximum top 10 movies that each user clicked on regarding the number of clicks.



In [5]:
df1 = df.copy()

# dropping unnecessary columns
df1 = df1.drop(columns=['datetime', 'row_id', 'duration', 'release_date'])

# number of clicks for each movie_id and user_id pair in df
clicks_count = df.groupby(['movie_id', 'user_id']).size().reset_index(name='number_of_clicks')

# merging this count back into df1
df1 = pd.merge(df1, clicks_count, on=['movie_id', 'user_id'], how='left')

# dropping duplicate (movie_id, user_id) pair keeping entry with the highest number_of_clicks
df1 = df1.sort_values('number_of_clicks', ascending=False).drop_duplicates(subset=['movie_id', 'user_id'])

# top 10 for each user
df1 = df1.sort_values(['user_id', 'number_of_clicks'], ascending=[True, False])
df1 = df1.groupby('user_id').head(10)

df1.head(20)

Unnamed: 0,title,genres,movie_id,user_id,number_of_clicks
223802,Hannibal,"Crime, Drama, Thriller",9bfee795ff,00004e2862,1
103059,Looper,"Action, Drama, Sci-Fi, Thriller",4718f9963c,000052a0a0,9
102839,Jumanji,"Adventure, Comedy, Family, Fantasy",4fa0b092d6,000052a0a0,3
84304,Frailty,"Crime, Drama, Thriller",7314699c23,000052a0a0,3
84324,Resident Evil,"Action, Horror, Sci-Fi",6275614f9a,000052a0a0,2
109326,The Big Lebowski,"Comedy, Crime, Sport",d601124c11,000052a0a0,1
108638,The SpongeBob Movie: Sponge Out of Water,"Animation, Action, Adventure, Comedy, Family, ...",96debad268,000052a0a0,1
109109,Ant-Man,"Action, Adventure, Comedy, Sci-Fi",59d313ed8b,000052a0a0,1
105025,Drive Angry,"Action, Fantasy, Thriller",fcabde6e42,000052a0a0,1
106664,The Nice Guys,"Action, Comedy, Crime, Mystery, Thriller",f254d41c3d,000052a0a0,1


In [6]:
# top 10 movies for all users as a list

top10_per_user = df1.groupby('user_id').agg(top_movies=('title', list), top_genres=('genres', list)).reset_index()
top10_per_user.head(10)

Unnamed: 0,user_id,top_movies,top_genres
0,00004e2862,[Hannibal],"[Crime, Drama, Thriller]"
1,000052a0a0,"[Looper, Jumanji, Frailty, Resident Evil, The ...","[Action, Drama, Sci-Fi, Thriller, Adventure, C..."
2,000090e7c8,[Mute],"[Mystery, Sci-Fi, Thriller]"
3,000118a755,"[From Dusk till Dawn (franchise), The Omen]","[NOT AVAILABLE, Horror]"
4,000296842d,[Black Mirror: Bandersnatch],"[Drama, Mystery, Sci-Fi, Thriller]"
5,0002aab109,"[The Iron Lady, King Cobra, 127 Hours]","[Biography, Drama, Comedy, Crime, Drama, Biogr..."
6,0002abf14f,[Fifty Shades Darker],"[Drama, Romance]"
7,0002d1c4b1,[Hot Bot],"[Comedy, Sci-Fi]"
8,000499c2b6,[Flushed Away],"[Animation, Adventure, Comedy, Family, Fantasy]"
9,00051f0e1f,[Ant-Man],"[Action, Adventure, Comedy, Sci-Fi]"


### 1.2 Minhash Signatures

Using the movie genre and user_ids, try to implement your min-hash signatures so that users with similar interests in a genre appear in the same bucket.

In [7]:
#splitting genres by comma
df['genres'] = df['genres'].apply(lambda x: x.split(', '))

# flattenning the list, finding unique genres
df = df.explode('genres')
unique_users = df['user_id'].unique()

# number of unique genres
unique_genres = df['genres'].unique()
num_unique_genres = len(unique_genres)

print("unique genres:", unique_genres)
print("number of unique genres:", num_unique_genres)

unique genres: ['Comedy' 'Drama' 'Romance' 'Fantasy' 'Horror' 'Mystery' 'Thriller'
 'Action' 'Animation' 'Adventure' 'Family' 'History' 'War' 'Musical'
 'Biography' 'Sport' 'Documentary' 'Sci-Fi' 'Crime' 'Music'
 'NOT AVAILABLE' 'Western' 'News' 'Short' 'Film-Noir' 'Reality-TV'
 'Talk-Show']
number of unique genres: 27


In [8]:
len(unique_users)

161918

In [10]:
# creating user-genre matrix
user_genre_matrix = pd.DataFrame(np.zeros((len(unique_users), len(unique_genres)), dtype=int), index=unique_users, columns=unique_genres)

for _, row in df.iterrows():
    user_genre_matrix.loc[row['user_id'], row['genres']] = 1

user_genre_matrix.head()

Unnamed: 0,Comedy,Drama,Romance,Fantasy,Horror,Mystery,Thriller,Action,Animation,Adventure,...,Sci-Fi,Crime,Music,NOT AVAILABLE,Western,News,Short,Film-Noir,Reality-TV,Talk-Show
1dea19f6fe,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
544dcbc510,1,1,1,1,1,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7cbcc791bf,1,0,1,1,0,0,1,1,1,1,...,0,1,0,0,0,0,0,0,0,0
ebf43c36b6,1,1,0,1,0,0,1,1,1,1,...,1,1,0,0,0,0,0,0,0,0
a57c992287,1,1,0,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,0,0,0


In [12]:
# number of permutations
num_permutations = 111

# initializing the minhash signature matrix
signature_matrix = np.full((len(unique_users), num_permutations), np.inf)

# generating minhash signatures for each user
for i in range(num_permutations):

    # random permutations of genre indices
    permuted_indices = np.random.permutation(num_unique_genres)

    # for each user
    for user_index, user_id in enumerate(unique_users):
        # for each genre
        for genre_index in permuted_indices:
            
            if user_genre_matrix.at[user_id, unique_genres[genre_index]] == 1:
                signature_matrix[user_index, i] = min(signature_matrix[user_index, i], genre_index)
                break

signature_df = pd.DataFrame(signature_matrix, index=unique_users)
signature_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,101,102,103,104,105,106,107,108,109,110
1dea19f6fe,0.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,2.0,0.0,...,1.0,2.0,0.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0
544dcbc510,3.0,3.0,2.0,1.0,2.0,1.0,5.0,2.0,3.0,6.0,...,6.0,3.0,5.0,3.0,5.0,5.0,1.0,5.0,9.0,3.0
7cbcc791bf,10.0,3.0,2.0,0.0,8.0,6.0,8.0,2.0,3.0,18.0,...,6.0,7.0,9.0,7.0,7.0,10.0,2.0,8.0,9.0,3.0
ebf43c36b6,10.0,3.0,17.0,1.0,17.0,1.0,14.0,17.0,17.0,18.0,...,6.0,7.0,9.0,7.0,7.0,10.0,17.0,17.0,9.0,3.0
a57c992287,10.0,3.0,15.0,23.0,21.0,20.0,5.0,23.0,20.0,15.0,...,23.0,15.0,5.0,7.0,16.0,10.0,17.0,23.0,11.0,23.0


### 1.3 Locality-Sensitive Hashing (LSH)

Recommend at most five movies given a user_id, use the following procedure:

1. Identify the two most similar users to this user.
2. If these two users have any movies in common, recommend those movies based on the total number of clicks by these users.
3. If there are no more common movies, try to propose the most clicked movies by the most similar user first, followed by the other user.

In [13]:
num_bands = 11
bucket_size = num_permutations // num_bands
buckets = {}

for band in range(num_bands):

    # each user
    for user_id in signature_df.index:

        band_signature = tuple(signature_df.loc[user_id, band * bucket_size:(band + 1) * bucket_size])

        if band_signature not in buckets:
            buckets[band_signature] = []

        buckets[band_signature].append(user_id)


In [26]:
#finding jaccard similiarity between to users
def jaccard_similarity(user1, user2):

    # creating sets of genres liked by each user
    set1 = set(signature_df.columns[signature_df.loc[user1] == 1])
    set2 = set(signature_df.columns[signature_df.loc[user2] == 1])

    # calculating the intersection and union of the two sets
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))

    return intersection / union if union != 0 else 0

# function returns two most similar users
def similar_users(target_user_id):
    
    target_signature = signature_df.loc[target_user_id]
    
    # initialize a list to hold similarity scores of other users
    users_similarity = []

    # iterating through each bucket to find the target user
    for band_signature, users in buckets.items():
        if target_user_id in users:

            # comparing the target user with each user in the same bucket
            for user in users:
                if user != target_user_id:
                    similarity = jaccard_similarity(target_user_id, user)
                    users_similarity.append((user, similarity))
            break

    return sorted(users_similarity, key=lambda x: x[1], reverse=True)[:2]

# to collect information about movie clicks by each user
def movie_clicks(df):
    return df.groupby(['user_id', 'movie_id'])['number_of_clicks'].sum().unstack(fill_value=0)


# main function that returns recommended movie IDs 
def recommend_movies(user_id, aggregated_clicks):

    most_similar_users = similar_users(user_id)  
    similar_user_id = most_similar_users[0][0] if most_similar_users else None

    # dictionary to store recommended movies
    recommended_movies = {}

    # iterating over the movies clicked by similar users
    for similar_user, _ in most_similar_users:
        user_movies = aggregated_clicks.loc[similar_user]
        for movie, clicks in user_movies.items():
            if clicks > 0:
                recommended_movies[movie] = recommended_movies.get(movie, 0) + clicks

    # sorting the movies based on total clicks and recommending top 5
    sorted_movies = sorted(recommended_movies.items(), key=lambda x: x[1], reverse=True)
    top_recommended_movie_ids = [movie for movie, _ in sorted_movies][:5]

    return similar_user_id, top_recommended_movie_ids

# to get top 5 movie titles clicked by a given user
def movie_titles(user_id, df):

    user_movies = df[df['user_id'] == user_id].sort_values(by='number_of_clicks', ascending=False)
    top_movies = user_movies['title'].head(5).tolist()

    return top_movies


Here is the example for user b43b4d8024

In [28]:
given_user_id = "b43b4d8024"
clicks = movie_clicks(df1)

most_similar_user_id, recommended_movie_ids = recommend_movies(given_user_id, clicks)

given_user_movie_titles = movie_titles(given_user_id, df1)
recommended_movie_titles = df1[df1['movie_id'].isin(recommended_movie_ids)]['title'].unique().tolist()

print(f"Given User ID: {given_user_id}")
print(f"Most Similar User ID: {most_similar_user_id}")
print(f"Movies Clicked by Given User: {given_user_movie_titles}")
print(f"Recommended Movies: {recommended_movie_titles}")

Given User ID: b43b4d8024
Most Similar User ID: 0224dab8a5
Movies Clicked by Given User: ['Fury', 'Guardians of the Galaxy', 'Filth', 'The Road to El Dorado', 'Fight Club']
Recommended Movies: ['S.W.A.T.', 'Halo 4: Forward Unto Dawn', 'I Love You, Man', 'Kid Cannabis', 'Snitch']


### 2. Grouping Users together!



#### 2.1 Getting your data + feature engineering

In [36]:
# function to calculate the favorite genre
def favorite_genre(users_df):
    return users_df.groupby('genres')['duration'].sum().idxmax()

# function to count total clicks
def total_clicks_count(users_df):
    return users_df.shape[0]

# function to count number of unique movies watched
def unique_movies_watched(users_df):
    return users_df['movie_id'].nunique()

# function to find the most watched movie
def favourite_movie(users_df):
    return users_df.groupby('movie_id')['duration'].sum().idxmax()

# function to find the most active month
def most_active_month(users_df):
    return users_df['datetime'].dt.month.mode()[0]

In [39]:
df1 = df.copy()

# convert to datetime format
df1['datetime'] = pd.to_datetime(df1['datetime'])

# group by user_id
grouped = df1.groupby('user_id')

# calculate favorite genre
favorite_genre = grouped.apply(lambda x: x.groupby('genres')['duration'].sum().idxmax())

# calculate total clicks count
total_clicks_count = grouped.size()

# calculate number of unique movies watched
unique_movies_watched = grouped['movie_id'].nunique()

# calculate favorite movie
favourite_movie = grouped.apply(lambda x: x.groupby('movie_id')['duration'].sum().idxmax())

# calculate most active month
most_active_month = grouped['datetime'].apply(lambda x: x.dt.month.mode()[0])

# combine all features
user_features_df = pd.DataFrame({
    'favorite_genre': favorite_genre,
    'total_clicks_count': total_clicks_count,
    'number_of_unique_movies_watched': unique_movies_watched,
    'favourite_movie': favourite_movie,
    'most_active_month': most_active_month
}).reset_index()

In [41]:
user_features_df.head()

Unnamed: 0,user_id,favorite_genre,total_clicks_count,number_of_unique_movies_watched,favourite_movie,most_active_month
0,00004e2862,Crime,3,1,9bfee795ff,12
1,000052a0a0,Action,92,11,f254d41c3d,6
2,000090e7c8,Mystery,3,1,eb72fbc6ee,3
3,000118a755,Horror,4,2,24c4c7425d,6
4,000296842d,Drama,32,1,e847f14da5,12
