In [1]:
import duckdb
import os
import pandas as pd
import urllib.request 
from zipfile import ZipFile 
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances


from matplotlib import pyplot as plt

# Let pandas render up to 500 rows / 500 columns before truncating a displayed frame.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Larger MovieLens dataset

For this homework assignment, we are going to use the large MovieLens dataset (32 million ratings), as it is more complete, has more interesting associations, and will work better for recommendations.

Remember to change the path here to wherever you want to save this locally

By the end of this homework, you will have created the simple recommender that I showed at the start of class

In [3]:
# Download the MovieLens 32M archive (once) and unpack it next to the archive.
#
# `path` is the local data directory. Set the MOVIELENS_DIR environment variable to
# override the default below without editing the notebook. os.path.join(..., "")
# guarantees a trailing separator because later cells build paths as `path + '...'`.
filename = "ml-32m.zip"
path = os.path.join(
    os.environ.get(
        "MOVIELENS_DIR",
        "/Users/yashwanth/Documents/GWU/Sem 3/Data Mining/Class 3/Assignment/MovieLens/",
    ),
    "",
)
url = "https://files.grouplens.org/datasets/movielens/ml-32m.zip"
archive_path = path + filename

# urlretrieve does not create missing directories, so make sure the target exists.
os.makedirs(path, exist_ok=True)

if os.path.isfile(archive_path):
    print(f'file already downloaded: {filename}')
else:
    print(f'downloading file: {filename}')
    # Download to a temporary name and rename afterwards, so an interrupted download
    # is never mistaken for a complete archive on the next run.
    partial_path = archive_path + ".part"
    urllib.request.urlretrieve(url, filename=partial_path)
    os.replace(partial_path, archive_path)

with ZipFile(archive_path, 'r') as zip_file:
    files = zip_file.namelist()
    for f in files: print(f)

    # Only (re-)extract members that are missing or whose on-disk size differs from
    # the archive entry; re-running the cell then does not rewrite the large CSVs.
    pending = []
    for member in zip_file.infolist():
        target = os.path.join(path, member.filename)
        if member.is_dir():
            up_to_date = os.path.isdir(target)
        else:
            up_to_date = os.path.isfile(target) and os.path.getsize(target) == member.file_size
        if not up_to_date:
            pending.append(member)

    zip_file.extractall(path=path, members=pending)
    print("Extracted")

file already downloaded: ml-32m.zip
ml-32m/
ml-32m/tags.csv
ml-32m/links.csv
ml-32m/README.txt
ml-32m/checksums.txt
ml-32m/ratings.csv
ml-32m/movies.csv
Extracted


In [4]:
## All we will use is the ratings and movies tables, so we will create views of those.
# The views read straight from the extracted CSV files. CREATE OR REPLACE keeps this
# cell re-runnable: a plain CREATE fails with "already exists" on the second execution.

extracted_path = path+'ml-32m/'

duckdb.sql(f'CREATE OR REPLACE TEMPORARY VIEW ratings AS (SELECT * FROM "{extracted_path}ratings.csv")')
duckdb.sql(f'CREATE OR REPLACE TEMPORARY VIEW movies AS (SELECT * FROM "{extracted_path}movies.csv")')

In [5]:
# As in the in-class walkthrough, we limit the analysis to ratings that are exactly 5 stars.
# The filtered rows are stored in a table (high_ratings_base) built from the `ratings` view.

query = '''
CREATE OR REPLACE TABLE high_ratings_base AS
(
      SELECT
         *
      FROM 
         ratings
      WHERE
          rating = 5
)
'''
duckdb.sql(query)

In [6]:
# Show the 20 users with the most 5-star ratings, largest count first.
duckdb.sql('SELECT UserId, COUNT(*) AS num_high_rated FROM high_ratings_base GROUP BY UserId ORDER BY num_high_rated DESC LIMIT 20')

┌────────┬────────────────┐
│ userId │ num_high_rated │
│ int64  │     int64      │
├────────┼────────────────┤
│  87007 │           5525 │
│ 103925 │           2674 │
│  82050 │           1849 │
│ 132423 │           1739 │
│  71601 │           1695 │
│  91496 │           1265 │
│ 113165 │           1244 │
│ 100346 │           1195 │
│ 163021 │           1194 │
│  30910 │           1169 │
│ 131675 │           1117 │
│ 123623 │            974 │
│  31361 │            941 │
│  44117 │            940 │
│ 176458 │            933 │
│  91449 │            906 │
│  37512 │            900 │
│  10202 │            883 │
│ 135791 │            859 │
│ 141034 │            844 │
├────────┴────────────────┤
│ 20 rows       2 columns │
└─────────────────────────┘

# Problem 1:

Take a look at the query above. There exist some users who rate many hundreds or thousands of movies as 5-stars. I'm not sure whether these are critics (I wouldn't expect such high reviews), bots, trolls, etc. This certainly occurred, to a lesser degree, in the small movie data set, but it was not as prominent or problematic.

But, whatever these users are, they will cause us some problems now that we are looking at this full data.

If user 87007 rated 5525 movies as 5 stars, then that user has 5525*5524/2 = 15 million pairwise combinations of movies. Too many of these kinds of over-active and enthusiastic critics and we will be overwhelmed with uninteresting combinations of movies

Question 1a: Create a new table called high_ratings that filters the high_ratings_base table so that it only includes reviews by users who have reviewed at most 25 different movies as 5-stars.

Question 1b: Count how many total ratings exist, how many 5-star ratings exist, and how many 5-stars by non-critics exist.

This initial filtering down of the data reduces the number of ratings by quite a lot and should make our pattern finding much faster and more interesting

In [8]:
# Question 1a
# Keep only the 5-star ratings made by users who have at most 25 rows in
# high_ratings_base, i.e. drop the very prolific 5-star raters.

duckdb.sql('''
CREATE OR REPLACE TABLE high_ratings AS
(
    SELECT *
    FROM 
        high_ratings_base
    WHERE UserId IN 
    (
        SELECT 
            UserId
        FROM 
            high_ratings_base
        GROUP BY 
            UserId
        HAVING 
            COUNT(*) <= 25
    )
)
''')

In [9]:
# Question 1b
# Three row counts, printed in order: all ratings, all 5-star ratings, and the
# 5-star ratings that remain in high_ratings after the Question 1a filter.

# Total number of ratings in the `ratings` view
query_total_ratings = '''
SELECT COUNT(*) AS total_ratings FROM ratings
'''
total_ratings = duckdb.sql(query_total_ratings)
print(total_ratings)

# Total number of 5-star ratings (same predicate as high_ratings_base)
query_5_star_ratings = '''
SELECT COUNT(*) AS total_5_star_ratings FROM ratings WHERE rating = 5
'''
total_5_star_ratings = duckdb.sql(query_5_star_ratings)
print(total_5_star_ratings)

# Total number of 5-star ratings by "non-critics", i.e. the rows of high_ratings
query_non_critic_5_star_ratings = '''
SELECT COUNT(*) AS total_non_critic_5_star_ratings FROM high_ratings
'''
total_non_critic_5_star_ratings = duckdb.sql(query_non_critic_5_star_ratings)
print(total_non_critic_5_star_ratings)

┌───────────────┐
│ total_ratings │
│     int64     │
├───────────────┤
│      32000204 │
└───────────────┘

┌──────────────────────┐
│ total_5_star_ratings │
│        int64         │
├──────────────────────┤
│              4596577 │
└──────────────────────┘

┌─────────────────────────────────┐
│ total_non_critic_5_star_ratings │
│              int64              │
├─────────────────────────────────┤
│                         1407551 │
└─────────────────────────────────┘



# Finding length 2 itemsets

The following two code blocks are taken from the walkthrough notebook in class.

In [11]:
min_support = 5 # We want there to be at least 5 users who rated both movies as 5-stars.

# Step 1: frequent 1-itemsets -- movies with at least `min_support` 5-star ratings
# in high_ratings.
query = f'''
CREATE OR REPLACE TABLE frequent_items_1 AS
(

    SELECT
        movieId
        , COUNT(*) AS frequency
    FROM
        high_ratings
    GROUP BY
        movieId
    HAVING 
        frequency >= {min_support}
) 
'''

duckdb.sql(query)


# Step 2: candidate 2-itemsets -- every unordered pair of frequent movies.
# The join condition m.movieId < m2.movieId emits each pair exactly once, smaller id
# first. It replaces the previous FULL OUTER JOIN ... ON 1=1 followed by a WHERE
# filter, which produced the same rows but described them as an outer cross product.
query2 = '''
CREATE OR REPLACE TABLE candidate_items_2 AS
(
    SELECT
        [m.movieId, m2.movieId] AS candidate
        , m.frequency AS movie1_frequency
        , m2.frequency AS movie2_frequency
    FROM
        frequent_items_1 m
    INNER JOIN
        frequent_items_1 m2
    ON
        m.movieId < m2.movieId

)
'''

duckdb.sql(query2)


# Peek at a random sample of 20 candidate pairs (the sample differs from run to run).
duckdb.sql('''SELECT * FROM candidate_items_2 USING SAMPLE 20''')

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌─────────────────┬──────────────────┬──────────────────┐
│    candidate    │ movie1_frequency │ movie2_frequency │
│     int64[]     │      int64       │      int64       │
├─────────────────┼──────────────────┼──────────────────┤
│ [4345, 4626]    │               16 │               10 │
│ [1610, 50851]   │             1213 │               19 │
│ [42004, 95307]  │               32 │                6 │
│ [943, 7055]     │               23 │                7 │
│ [1648, 49394]   │               53 │                5 │
│ [34338, 70183]  │               21 │               49 │
│ [942, 990]      │              148 │               23 │
│ [4921, 6777]    │               23 │               45 │
│ [71211, 93840]  │                7 │              292 │
│ [664, 26163]    │                9 │                9 │
│ [43, 202393]    │              147 │                8 │
│ [3977, 223876]  │              209 │               39 │
│ [2153, 33145]   │               12 │               20 │
│ [2114, 59995

In [12]:
# Count the support of every candidate pair: self-join high_ratings on UserId to get
# the movie pairs each user rated 5 stars, match them to candidate_items_2
# (DuckDB list indexing is 1-based), and keep pairs seen for at least `min_support`
# users. `min_support` comes from the previous cell.
query = f'''
CREATE OR REPLACE TABLE frequent_items_2 AS
(
    
    SELECT
        cl.candidate AS itemset, COUNT(*) AS frequency
    FROM
        high_ratings m
    INNER JOIN
        high_ratings m2
    USING (UserId)

    INNER JOIN
        candidate_items_2 cl
    ON
        m.movieId = cl.candidate[1] 
    AND
        m2.movieId = cl.candidate[2]
        
        
   GROUP BY ALL

   HAVING frequency >= {min_support}

)
'''

duckdb.sql(query)

# Display the resulting frequent 2-itemsets.
duckdb.sql('SELECT * FROM frequent_items_2')

┌──────────────────┬───────────┐
│     itemset      │ frequency │
│     int64[]      │   int64   │
├──────────────────┼───────────┤
│ [17, 79]         │        17 │
│ [2406, 2628]     │         8 │
│ [34405, 116797]  │        19 │
│ [58559, 68954]   │       607 │
│ [608, 805]       │       189 │
│ [3753, 3793]     │        45 │
│ [177593, 177765] │        32 │
│ [1079, 1080]     │       138 │
│ [742, 891]       │        10 │
│ [364, 1356]      │        62 │
│      ·           │         · │
│      ·           │         · │
│      ·           │         · │
│ [150, 68358]     │        26 │
│ [1060, 5225]     │         5 │
│ [1967, 6807]     │         6 │
│ [2000, 6874]     │         6 │
│ [265, 3044]      │         6 │
│ [1653, 6774]     │         5 │
│ [47, 4016]       │         6 │
│ [260, 3512]      │        13 │
│ [1704, 179817]   │         5 │
│ [3418, 4022]     │         7 │
├──────────────────┴───────────┤
│ ? rows             2 columns │
└──────────────────────────────┘

# Question 2

In class, we created rules using the 3-itemsets. For this, given our downstream use, we will stop here at k=2.

We want to create rules similar to those from the in-class demo.

Question 2: Modify the query from the Class 3 class notebook that creates the rules table. Please create the following columns:

antecedent: this can be simpler than before. No need to make this a list - a single movieId is fine
consequent: also, no need for a list
a_title: the title of the antecedent movie -- this was not in the example
c_title: the title of the consequent movie -- this was not in the example
frequency: the number of times that the pattern occurs (# of users that rated both movies as 5-stars) -- this was in the example
user_count: the total number of users that rated at least 1 movie as 5-stars -- this was in the example
antecedent_frequency: the number of users that rated the antecedent movie as 5-stars -- this was in the example
consequent_frequency: the number of users that rated the consequent movie as 5-stars -- this was in the example
support: percent of all users that rated these 2 movies as 5-stars -- this was in the example
confidence: Of all users that rated the antecedent movie as 5-stars, what percent rated the consequent 5-stars? -- this was in the example
PMI: the pointwise mutual information: this should be calculable from the other numbers that you have here. -- this was not in the example
Please include a filter of 5 for minimum confidence and 1 for minimum PMI

One difference of PMI from the confidence measurement is that it is symmetric: PMI(A,B) = PMI(B,A) whereas confidence(A=>B) is not necessarily equal to confidence(B=>A)

Make sure that you include both A => B and B => A in the rules set

In [14]:
min_confidence = 5  # minimum confidence, in percent (compared against confidence * 100)
min_pmi = 1         # minimum pointwise mutual information, in bits (log base 2)

# Build the association-rule table from the frequent 2-itemsets.
#
# For a pair (A, B) with pair frequency f, item frequencies fA and fB, and N users:
#   support    = f / N
#   confidence = f / fA                    (confidence of A => B, stored as a fraction)
#   PMI        = log2( f * N / (fA * fB) )
#
# N is computed once in the `totals` subquery instead of four identical scalar
# subqueries, ratios use DOUBLE rather than 32-bit FLOAT, and the filters reuse the
# computed columns instead of repeating the expressions.
#
# NOTE(review): only the itemset[1] => itemset[2] direction is emitted, so
# antecedent < consequent in every row, and the confidence filter is applied in that
# direction only. The assignment text asks for both A => B and B => A. The lookup
# helpers further down match a movie in either column, so adding the reversed rows
# here would make them return each partner twice unless they are changed too.
query = f'''
CREATE OR REPLACE TABLE rules AS
(
    SELECT
        *
    FROM
    (
        SELECT
            fi2.itemset[1] AS antecedent,
            fi2.itemset[2] AS consequent,
            m1.title AS a_title,
            m2.title AS c_title,
            fi2.frequency AS frequency,
            totals.user_count AS user_count,
            m1f.frequency AS antecedent_frequency,
            m2f.frequency AS consequent_frequency,
            CAST(fi2.frequency AS DOUBLE) / totals.user_count AS support,
            CAST(fi2.frequency AS DOUBLE) / m1f.frequency AS confidence,
            LOG2((CAST(fi2.frequency AS DOUBLE) * totals.user_count) / (m1f.frequency * m2f.frequency)) AS PMI
        FROM
            frequent_items_2 fi2
        CROSS JOIN
            (SELECT COUNT(DISTINCT UserId) AS user_count FROM high_ratings) totals
        JOIN
            frequent_items_1 m1f
        ON
            fi2.itemset[1] = m1f.movieId
        JOIN
            frequent_items_1 m2f
        ON
            fi2.itemset[2] = m2f.movieId
        JOIN
            movies m1
        ON
            m1.movieId = fi2.itemset[1]
        JOIN
            movies m2
        ON
            m2.movieId = fi2.itemset[2]
    ) scored
    WHERE
        scored.confidence * 100 >= {min_confidence}
    AND
        scored.PMI >= {min_pmi}
)
'''
duckdb.sql(query)
duckdb.sql('SELECT * FROM rules ORDER BY support DESC').df()

Unnamed: 0,antecedent,consequent,a_title,c_title,frequency,user_count,antecedent_frequency,consequent_frequency,support,confidence,PMI
0,4993,7153,"Lord of the Rings: The Fellowship of the Ring,...","Lord of the Rings: The Return of the King, The...",8118,139798,11711,11425,0.058070,0.693194,3.084410
1,4993,5952,"Lord of the Rings: The Fellowship of the Ring,...","Lord of the Rings: The Two Towers, The (2002)",7745,139798,11711,10078,0.055401,0.661344,3.197535
2,5952,7153,"Lord of the Rings: The Two Towers, The (2002)","Lord of the Rings: The Return of the King, The...",7609,139798,10078,11425,0.054429,0.755011,3.207647
3,318,527,"Shawshank Redemption, The (1994)",Schindler's List (1993),7127,139798,29070,15738,0.050981,0.245167,1.122855
4,260,1196,Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back...,6302,139798,13922,10107,0.045079,0.452665,2.646432
...,...,...,...,...,...,...,...,...,...,...,...
30281,243642,263007,Zack Snyder's Justice League (2021),Spider-Man: No Way Home (2021),5,139798,12,221,0.000036,0.416667,8.042047
30282,252710,287699,Attack On Titan (2013),Oppenheimer (2023),5,139798,38,81,0.000036,0.131579,7.827135
30283,255335,260667,Shang-Chi and the Legend of the Ten Rings (2021),Encanto (2021),5,139798,26,43,0.000036,0.192308,9.288208
30284,260667,269638,Encanto (2021),Turning Red (2022),5,139798,43,9,0.000036,0.116279,10.818722


# Question 3:

Question 3a: Write a function that takes a movieId and returns the top (up to) 30 movies associated with that movieId, ranked by highest PMI

Question 3b, 3c: Run your helper function with the IDs for Inception and Interstellar (function calls below)

In [16]:
def find_movie_from_title(title):
    """Return the rows of `movies` whose title contains `title`, ignoring case.

    Args:
        title: Text to search for. SQL LIKE wildcards (`%`, `_`) inside it keep their
            wildcard meaning, as before.

    Returns:
        The DuckDB relation produced by `duckdb.sql` for the lookup query.
    """
    # Double single quotes so titles such as "Schindler's List" do not terminate the
    # SQL string literal (which previously caused a parser error / SQL injection).
    pattern = title.upper().replace("'", "''")
    return duckdb.sql(f"""SELECT * FROM movies WHERE UPPER(title) LIKE '%{pattern}%'""")

def find_recommendations_from_one_pmi(movie_id, top_n=30):
    """Return the movies most strongly associated with `movie_id`, ranked by PMI.

    A rule matches when `movie_id` appears on either side; the movie on the other side
    of the rule is reported as the recommendation.

    Args:
        movie_id: MovieLens movieId to look up (anything `int()` accepts).
        top_n: Maximum number of rows to return. Defaults to 30.

    Returns:
        pandas.DataFrame with columns `recommended_movie_id`,
        `recommended_movie_title` and `PMI`, highest PMI first.

    Raises:
        ValueError: If `movie_id` or `top_n` is not an integer value, or `top_n` is
            negative.
    """
    # Coerce to int before interpolating so only a number can reach the SQL text.
    movie_id = int(movie_id)
    top_n = int(top_n)
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    query = f'''
    SELECT
        CASE
            WHEN antecedent = {movie_id} THEN consequent
            ELSE antecedent
        END AS recommended_movie_id,
        CASE
            WHEN antecedent = {movie_id} THEN c_title
            ELSE a_title
        END AS recommended_movie_title,
        PMI
    FROM
        rules
    WHERE
        antecedent = {movie_id} OR consequent = {movie_id}
    -- the id tie-break makes the ranking deterministic when PMI values are equal
    ORDER BY PMI DESC, recommended_movie_id
    LIMIT {top_n}
    '''
    return duckdb.sql(query).df()

In [17]:
find_recommendations_from_one_pmi(79132)  # Inception

Unnamed: 0,recommended_movie_id,recommended_movie_title,PMI
0,70336,G.I. Joe: The Rise of Cobra (2009),2.861488
1,74458,Shutter Island (2010),2.83479
2,109487,Interstellar (2014),2.722203
3,91529,"Dark Knight Rises, The (2012)",2.686486
4,48780,"Prestige, The (2006)",2.635469
5,69640,Public Enemies (2009),2.474465
6,64030,Transporter 3 (2008),2.474465
7,51412,Next (2007),2.392003
8,58299,Horton Hears a Who! (2008),2.392003
9,75341,Remember Me (2010),2.392003


In [18]:
find_recommendations_from_one_pmi(109487)  # Interstellar

Unnamed: 0,recommended_movie_id,recommended_movie_title,PMI
0,101864,Oblivion (2013),3.387566
1,134130,The Martian (2015),3.274112
2,94864,Prometheus (2012),3.204302
3,164179,Arrival (2016),3.169431
4,104841,Gravity (2013),3.113022
5,103253,Elysium (2013),3.091827
6,85414,Source Code (2011),3.0555
7,95875,Total Recall (2012),2.943436
8,116797,The Imitation Game (2014),2.927324
9,115713,Ex Machina (2015),2.912171


# Question 4: A simple recommender - part 1

We are going to create a simple recommendation system that takes in a list of movieIds and returns a list of recommended movies.

One way to think of the rules table that we created is that it is a sparse representation of a matrix (well, sort of a tensor). Ignoring the other fields, the following 3 fields form a sparse matrix:

antecedent, consequent, PMI

where antecedent, consequent are the x,y index of the matrix and PMI is the value of that element. Think of this as the movie-movie-PMI matrix

What we'd like to do is to think of a list of movieIds as a feature vector, and to take the dot product of that feature vector with our movie-movie-PMI matrix.

If I took each movie in my feature-vector/list individually, I could use find_recommendations_from_one_pmi to find the list of movies that are associated with that. But, in order to combine the multiple lists of recommended movies together, we can take a simple sum of the PMIs calculated one by one.

You do not need to implement this as a series of calls to find_recommendations_from_one_pmi - in my answer, I just wrote a new query to express this.

Question 4a Implement this dot product as a function below

In [20]:
def make_recommendation_from_list(list_of_ids, top_n=30):
    """Recommend movies for a list of seed movies by summing rule PMIs.

    Every rule with a seed movie on either side contributes its PMI to the movie on
    the other side; the contributions are summed per recommended movie. Movies that
    are themselves in the seed list are excluded from the result.

    Args:
        list_of_ids: Iterable of MovieLens movieIds (anything `int()` accepts).
            Duplicates are ignored.
        top_n: Maximum number of rows to return. Defaults to 30.

    Returns:
        pandas.DataFrame with columns `recommended_movie_id`,
        `recommended_movie_title` and `total_pmi`, highest total first. An empty
        frame with those columns is returned when `list_of_ids` is empty.

    Raises:
        ValueError: If an id or `top_n` is not an integer value, or `top_n` is
            negative.
    """
    top_n = int(top_n)
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Coerce to int (only numbers may reach the SQL text) and drop duplicates while
    # keeping the caller's order.
    seed_ids = list(dict.fromkeys(int(i) for i in list_of_ids))
    if not seed_ids:
        # "IN ()" is not valid SQL, and no seeds means no recommendations anyway.
        return pd.DataFrame(
            columns=["recommended_movie_id", "recommended_movie_title", "total_pmi"]
        )

    list_string = ",".join(str(i) for i in seed_ids)

    query = f'''
    WITH movie_pmi AS (
        SELECT
            CASE
                WHEN antecedent IN ({list_string}) THEN consequent
                ELSE antecedent
            END AS recommended_movie_id,
            SUM(PMI) AS aggregated_pmi
        FROM
            rules
        WHERE
            antecedent IN ({list_string}) OR consequent IN ({list_string})
        GROUP BY
            recommended_movie_id
    )

    SELECT
        mp.recommended_movie_id,
        m.title AS recommended_movie_title,
        mp.aggregated_pmi AS total_pmi
    FROM
        movie_pmi mp
    LEFT JOIN
        movies m ON mp.recommended_movie_id = m.movieId
    WHERE
        mp.recommended_movie_id NOT IN ({list_string})
    -- the id tie-break makes the ranking deterministic when totals are equal
    ORDER BY
        total_pmi DESC, mp.recommended_movie_id
    LIMIT {top_n};
    '''

    return duckdb.sql(query).df()


Question 4b: Test it out below!

In [22]:
make_recommendation_from_list([109487, 79132, 91529])  # Interstellar, Inception, Dark Knight Rises

Unnamed: 0,recommended_movie_id,recommended_movie_title,total_pmi
0,74458,Shutter Island (2010),8.342725
1,48780,"Prestige, The (2006)",8.113781
2,134130,The Martian (2015),8.016859
3,33794,Batman Begins (2005),7.939315
4,116797,The Imitation Game (2014),7.925484
5,106782,"Wolf of Wall Street, The (2013)",7.808566
6,58559,"Dark Knight, The (2008)",7.500728
7,70336,G.I. Joe: The Rise of Cobra (2009),7.303914
8,99114,Django Unchained (2012),7.286417
9,112552,Whiplash (2014),6.73052


# Summary

This recommendation system works OK! It's clearly not perfect, but think about what information it is using and not using:

It is only using information from our length 2 association rules
This is a type of technique known as item-based collaborative filtering
We have a couple of refinements: we only include item-item information when it is above a certain frequency and above a certain PMI.
Another refinement could be to only include the top-N items for each item (filter on rank of the consequent, not just the PMI)
It is not using any information like the genre, actors, directors, year of release, etc.
Not so bad!

I encourage you to try a few of your favorite movies out and see what kinds of results you get.