In [2]:
import os
import re
from collections import Counter

import duckdb
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

# Raise pandas' display limits so frames of up to 500 rows / 500 columns are
# rendered in full instead of being truncated.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)

In [3]:
# Directory holding the exported MovieLens database. Set the
# MOVIELENS_EXPORT_DIR environment variable to run the notebook on another
# machine; the author's original absolute path is kept as the default.
path_to_export = os.environ.get(
    "MOVIELENS_EXPORT_DIR",
    "/Users/yashwanth/Documents/GWU/Sem 3/Data Mining/Class 2/Class Material/MovieLensExport",
)
# Load the export through DuckDB's default connection.
duckdb.sql(f'''IMPORT DATABASE '{path_to_export}' ''')
# Show which tables are now available.
duckdb.sql('SHOW TABLES')

┌──────────────┐
│     name     │
│   varchar    │
├──────────────┤
│ date_dim     │
│ driver       │
│ driver2      │
│ genre_dim    │
│ links        │
│ movie_dim    │
│ movies       │
│ ratings      │
│ ratings_fact │
│ tags         │
│ user_dim     │
│ user_matrix  │
├──────────────┤
│   12 rows    │
└──────────────┘

# Most frequently reviewed movies

In [20]:
# Top 10 most-reviewed movies: count the rows of ratings_fact per movieId,
# join the counts to movie_dim to get the title, and sort by the count.
query = '''
WITH rating_count AS
( SELECT movieId, COUNT(*) AS num_reviews FROM ratings_fact GROUP BY movieId)

SELECT
    m.movieId
    ,m.title
    , r.num_reviews
FROM
    rating_count r
INNER JOIN
    movie_dim m
USING (movieId)
ORDER BY 
    num_reviews DESC
LIMIT 10'''

# .df() materialises the DuckDB result as a pandas DataFrame for display.
duckdb.sql(query).df()

Unnamed: 0,movieId,title,num_reviews
0,356,Forrest Gump (1994),329
1,318,"Shawshank Redemption, The (1994)",317
2,296,Pulp Fiction (1994),307
3,593,"Silence of the Lambs, The (1991)",279
4,2571,"Matrix, The (1999)",278
5,260,Star Wars: Episode IV - A New Hope (1977),251
6,480,Jurassic Park (1993),238
7,110,Braveheart (1995),237
8,589,Terminator 2: Judgment Day (1991),224
9,527,Schindler's List (1993),220


So, lots of blockbusters from the 90s.

# How many total movies are there in this data set?

In [23]:
# Number of rows in movie_dim.
duckdb.sql('SELECT COUNT(*) AS num_movie FROM movie_dim')

┌───────────┐
│ num_movie │
│   int64   │
├───────────┤
│      9742 │
└───────────┘

How many are rated at least 5 times?

In [26]:
# Count the movies that have at least 5 rows in ratings_fact: the CTE keeps
# one row per qualifying movieId and the outer query counts those rows.
query = '''
WITH ratings_count AS
    (
    SELECT 
        COUNT(*) AS num_ratings 
        , movieId
    FROM 
        ratings_fact
    GROUP BY movieId
    HAVING num_ratings >=5
)
SELECT COUNT(*) FROM ratings_count
'''
duckdb.sql(query)


┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│         3650 │
└──────────────┘

Looking at itemsets of length 2: pairs of movies rated by the same user

In [29]:
# Top 10 movie pairs rated by the same user.
# co_rated self-joins ratings_fact on userId, so each output row pairs two
# ratings made by one user; `r.movieId < r2.movieId` keeps every unordered pair
# exactly once and drops a movie paired with itself. GROUP BY ALL groups by the
# non-aggregated columns (movie1, movie2), so num_both counts the joined rows
# per pair.
# The two LEFT JOINs to movie_dim attach the titles of movie1 and movie2.
query = '''

WITH co_rated AS
(
  SELECT
     r.movieId AS movie1
     ,r2.movieId AS movie2
     , COUNT(*) AS num_both
  FROM 
     ratings_fact r
  INNER JOIN
     ratings_fact r2
  USING (userId)
  WHERE r.movieId < r2.movieId
  GROUP BY ALL
)


SELECT
    m1.movieId
    , m1.title
    , m2.movieId
    , m2.title
    , cr.num_both
FROM
    co_rated cr
LEFT JOIN
    movie_dim m1
ON 
    cr.movie1 = m1.movieId
LEFT JOIN
    movie_dim m2
ON 
    cr.movie2 = m2.movieId
ORDER BY 
    num_both DESC
LIMIT 10
'''

# .df() materialises the DuckDB result as a pandas DataFrame for display.
duckdb.sql(query).df()

Unnamed: 0,movieId,title,movieId_1,title_1,num_both
0,318,"Shawshank Redemption, The (1994)",356,Forrest Gump (1994),231
1,296,Pulp Fiction (1994),356,Forrest Gump (1994),230
2,296,Pulp Fiction (1994),318,"Shawshank Redemption, The (1994)",222
3,296,Pulp Fiction (1994),593,"Silence of the Lambs, The (1991)",207
4,318,"Shawshank Redemption, The (1994)",593,"Silence of the Lambs, The (1991)",199
5,356,Forrest Gump (1994),593,"Silence of the Lambs, The (1991)",199
6,356,Forrest Gump (1994),480,Jurassic Park (1993),198
7,356,Forrest Gump (1994),2571,"Matrix, The (1999)",194
8,260,Star Wars: Episode IV - A New Hope (1977),1196,Star Wars: Episode V - The Empire Strikes Back...,190
9,260,Star Wars: Episode IV - A New Hope (1977),2571,"Matrix, The (1999)",183


So, we can already see that the pair of movies most often reviewed by the same user consists of the 2 most frequently reviewed movies. Does this make the pair interesting in any way? We'll get to that in a bit.

In [32]:
# Count how many distinct movie pairs were rated by at least one common user.
# co_rated is the same CTE as in the previous query: a self-join of
# ratings_fact on userId, one row per (movie1 < movie2) pair.
query = '''
WITH co_rated AS
(
  SELECT
     r.movieId AS movie1
     ,r2.movieId AS movie2
     , COUNT(*) AS num_both
  FROM 
     ratings_fact r
  INNER JOIN
     ratings_fact r2
  USING (userId)
  WHERE r.movieId < r2.movieId
  GROUP BY ALL
)

SELECT COUNT(*) FROM co_rated'''
duckdb.sql(query)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│     13157672 │
└──────────────┘

In [34]:
# Same co_rated pairs as above, but count only the pairs with num_both >= 5.
query = '''
WITH co_rated AS
(
  SELECT
     r.movieId AS movie1
     ,r2.movieId AS movie2
     , COUNT(*) AS num_both
  FROM 
     ratings_fact r
  INNER JOIN
     ratings_fact r2
  USING (userId)
  WHERE r.movieId < r2.movieId
  GROUP BY ALL
)

SELECT COUNT(*) FROM co_rated WHERE num_both >=5'''
duckdb.sql(query)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      1293963 │
└──────────────┘

# OK, let's build some association rules

If we are thinking about building something like a movie recommender system, then we might want to narrow our view of associations to only those movies that are "highly rated" by the same user. So, let's narrow down the ratings to only 5-star ratings: movies that people loved.

Let's create a table of just those ratings, so it is simpler to use downstream when we have a lot of joins

In [56]:
# Materialise the 5-star ratings as their own table, high_ratings, which the
# later itemset queries read from. CREATE OR REPLACE makes the cell safe to
# re-run.
query = '''
CREATE OR REPLACE TABLE high_ratings AS
(
      SELECT
         *
      FROM 
         ratings_fact
      WHERE
      rating =5
)
'''
duckdb.sql(query)

In [58]:
# Minimum absolute support (a row count, not a percentage); this cell and the
# later frequent-itemset cells interpolate it into their HAVING clauses.
min_support = 5 # We want there to be at least 5 users who rated both movies as 5-stars.

# Frequent 1-itemsets: movies with at least min_support rows in high_ratings.
query = f'''
CREATE OR REPLACE TABLE frequent_items_1 AS
(

    SELECT
        movieId
        , COUNT(*) AS frequency
    FROM
        high_ratings
    GROUP BY
        movieId
    HAVING frequency >= {min_support}
) 
'''

duckdb.sql(query)


# Candidate 2-itemsets: every unordered pair of frequent single movies, stored
# as a two-element list with the smaller movieId first.
# The pairing condition is the join condition of an INNER JOIN, rather than a
# FULL OUTER JOIN ON 1=1 (a cross product) filtered afterwards in WHERE; the
# resulting rows are the same. The string has no placeholders, so it is a
# plain literal rather than an f-string.
query2 = '''
CREATE OR REPLACE TABLE candidate_items_2 AS
(
    SELECT
        [m.movieId, m2.movieId] AS candidate
        , m.frequency AS movie1_frequency
        , m2.frequency AS movie2_frequency
    FROM
        frequent_items_1 m
    INNER JOIN
        frequent_items_1 m2
    ON
        m.movieId < m2.movieId

)
'''

duckdb.sql(query2)


# Peek at 20 sampled candidates; no seed is given, so the rows shown can
# differ between runs.
duckdb.sql('''SELECT * FROM candidate_items_2 USING SAMPLE 20''')

┌─────────────────┬──────────────────┬──────────────────┐
│    candidate    │ movie1_frequency │ movie2_frequency │
│     int64[]     │      int64       │      int64       │
├─────────────────┼──────────────────┼──────────────────┤
│ [339, 1201]     │               12 │               24 │
│ [1958, 2951]    │                5 │                5 │
│ [923, 2580]     │               22 │               10 │
│ [2968, 7022]    │                8 │                7 │
│ [1244, 4308]    │                9 │               11 │
│ [509, 1748]     │                9 │                7 │
│ [1645, 55247]   │                6 │                9 │
│ [802, 4878]     │                5 │               25 │
│ [1035, 2011]    │               21 │               10 │
│ [1183, 1242]    │                9 │               12 │
│ [308, 1953]     │                5 │                6 │
│ [4995, 69122]   │               21 │                9 │
│ [58998, 68954]  │                5 │               25 │
│ [163, 2968] 

We now have our list of candidate movie pairs.

Each movieId in each of these pairs on this list has been rated 5 stars at least 5 times

Now, we want to find all pairs of movies that are both rated 5 stars by at least 5 different users.

In [61]:
# Frequent 2-itemsets: for every candidate pair, count the joined rows in which
# one user has a high_ratings row for both movies, and keep the pairs that
# reach min_support.
# The self-join of high_ratings on UserId yields every (movie, movie) pair per
# user; the join to candidate_items_2 keeps only rows whose two movieIds match
# a candidate positionally. GROUP BY ALL groups by the candidate list.
query = f'''
CREATE OR REPLACE TABLE frequent_items_2 AS
(
    
    SELECT
        cl.candidate AS itemset
        , COUNT(*) AS frequency
    FROM
        high_ratings m
    INNER JOIN
        high_ratings m2
    USING (UserId)

    INNER JOIN
        candidate_items_2 cl
    ON
        m.movieId = cl.candidate[1] -- index starts at 1
    AND
        m2.movieId = cl.candidate[2]
        
        
   GROUP BY ALL

   HAVING frequency >= {min_support}

)
'''

duckdb.sql(query)

# Display the resulting frequent pairs.
duckdb.sql('SELECT * FROM frequent_items_2')

┌───────────────┬───────────┐
│    itemset    │ frequency │
│    int64[]    │   int64   │
├───────────────┼───────────┤
│ [318, 1968]   │         8 │
│ [858, 1500]   │         5 │
│ [904, 1968]   │         5 │
│ [541, 555]    │         6 │
│ [527, 6377]   │         6 │
│ [4886, 6377]  │         6 │
│ [296, 6377]   │         5 │
│ [541, 8874]   │         5 │
│ [1270, 8874]  │         5 │
│ [1200, 8874]  │         6 │
│      ·        │         · │
│      ·        │         · │
│      ·        │         · │
│ [1259, 1584]  │         6 │
│ [1240, 1584]  │         5 │
│ [110, 1584]   │         7 │
│ [2571, 33794] │         7 │
│ [457, 736]    │         5 │
│ [296, 1219]   │        11 │
│ [527, 1219]   │        10 │
│ [1198, 33794] │         5 │
│ [260, 33493]  │         5 │
│ [260, 733]    │         8 │
├───────────────┴───────────┤
│   6557 rows (20 shown)    │
└───────────────────────────┘

For generating k=3 candidates, we can use the following property:

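if {A, B} is frequent, then {A, B, C} can only be frequent if {B, C} (and likewise {A, C}) is also frequent. The query below builds candidates by chaining a frequent pair [A, B] with a frequent pair [B, C] that starts with the same movie B.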

In [64]:
# Candidate 3-itemsets [A, B, C]: chain a frequent pair [A, B] (the prefix)
# with a frequent pair [B, C] (the suffix) that starts with the same movie B.
# The chaining condition is the join condition of an INNER JOIN, rather than a
# FULL OUTER JOIN ON 1=1 (a cross product) filtered afterwards in WHERE; the
# resulting rows are the same. The string has no placeholders, so it is a
# plain literal rather than an f-string.
query = '''

CREATE OR REPLACE TABLE candidate_items_3 AS
(
    SELECT
        [m.itemset[1], m.itemset[2], m2.itemset[2]] AS candidate
        , m.frequency AS prefix_frequency
        , m2.frequency AS suffix_frequency
    FROM
        frequent_items_2 m
    INNER JOIN
        frequent_items_2 m2
    ON
        m.itemset[2] = m2.itemset[1]
)
'''
duckdb.sql(query)

# Display the candidate triples.
duckdb.sql('SELECT * FROM candidate_items_3')

┌─────────────────────┬──────────────────┬──────────────────┐
│      candidate      │ prefix_frequency │ suffix_frequency │
│       int64[]       │      int64       │      int64       │
├─────────────────────┼──────────────────┼──────────────────┤
│ [110, 318, 1968]    │               35 │                8 │
│ [47, 318, 1968]     │               23 │                8 │
│ [165, 318, 1968]    │                7 │                8 │
│ [265, 318, 1968]    │                8 │                8 │
│ [11, 318, 1968]     │                6 │                8 │
│ [288, 318, 1968]    │                5 │                8 │
│ [260, 318, 1968]    │               37 │                8 │
│ [246, 318, 1968]    │                6 │                8 │
│ [17, 318, 1968]     │                8 │                8 │
│ [272, 318, 1968]    │                5 │                8 │
│        ·            │                · │                · │
│        ·            │                · │                · │
│       

In [66]:
# Frequent 3-itemsets: for every candidate triple, count the joined rows in
# which one user has a high_ratings row for all three movies, and keep the
# triples that reach min_support.
# high_ratings is joined to itself twice on UserId (three copies in total), and
# the three movieIds are matched positionally against the candidate list.
query = f'''
CREATE OR REPLACE TABLE frequent_items_3 AS
(
    
    SELECT
        cl.candidate AS itemset
        , COUNT(*) AS frequency
    FROM
        high_ratings m
    INNER JOIN
        high_ratings m2
    USING (UserId)

    INNER JOIN
        high_ratings m3
    USING (UserId)

    INNER JOIN
        candidate_items_3 cl
    ON
        m.movieId = cl.candidate[1] -- index starts at 1
    AND
        m2.movieId = cl.candidate[2]
    AND
        m3.movieId = cl.candidate[3]
        
        
   GROUP BY ALL

   HAVING frequency >= {min_support}

)
'''

# %time reports the time each statement takes.
%time duckdb.sql(query)
%time duckdb.sql('SELECT COUNT(*) FROM frequent_items_3')



FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

CPU times: user 10.7 s, sys: 127 ms, total: 10.8 s
Wall time: 6.24 s
CPU times: user 127 μs, sys: 4 μs, total: 131 μs
Wall time: 132 μs


┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        17773 │
└──────────────┘

In [68]:
# Show the frequent 3-itemsets, most frequent first.
duckdb.sql('SELECT * FROM frequent_items_3 ORDER BY frequency DESC')

┌────────────────────┬───────────┐
│      itemset       │ frequency │
│      int64[]       │   int64   │
├────────────────────┼───────────┤
│ [260, 1196, 1210]  │        41 │
│ [4993, 5952, 7153] │        36 │
│ [260, 1196, 2571]  │        36 │
│ [260, 1196, 1198]  │        31 │
│ [260, 296, 1196]   │        29 │
│ [260, 1210, 2571]  │        29 │
│ [1196, 1210, 2571] │        28 │
│ [2571, 4993, 7153] │        26 │
│ [50, 296, 318]     │        25 │
│ [260, 858, 1196]   │        25 │
│        ·           │         · │
│        ·           │         · │
│        ·           │         · │
│ [1080, 1199, 1222] │         5 │
│ [923, 924, 1196]   │         5 │
│ [111, 260, 2571]   │         5 │
│ [1036, 1089, 2571] │         5 │
│ [47, 1136, 1240]   │         5 │
│ [260, 356, 778]    │         5 │
│ [608, 1208, 3996]  │         5 │
│ [608, 2762, 2858]  │         5 │
│ [904, 1204, 1208]  │         5 │
│ [1080, 1213, 2716] │         5 │
├────────────────────┴───────────┤
│ ? rows (>9999 rows

# What did we find?
Let's see a bit about which movies these are. I'll look at the top couple of frequent 3-itemsets.

In [71]:
# Look up the movie_dim rows for the most frequent 3-itemset, [260, 1196, 1210].
duckdb.sql('SELECT * FROM movie_dim WHERE movieId IN (260, 1196, 1210)')

┌─────────┬────────────────────────┬────────────────────────────────────────────────┬───────────────┬─────────┬────────┐
│ movieId │         title          │                extracted_title                 │ year_released │ imdbId  │ tmdbId │
│  int64  │        varchar         │                    varchar                     │    varchar    │ varchar │ int64  │
├─────────┼────────────────────────┼────────────────────────────────────────────────┼───────────────┼─────────┼────────┤
│     260 │ Star Wars: Episode I…  │ Star Wars: Episode IV - A New Hope             │ 1977          │ 0076759 │     11 │
│    1196 │ Star Wars: Episode V…  │ Star Wars: Episode V - The Empire Strikes Back │ 1980          │ 0080684 │   1891 │
│    1210 │ Star Wars: Episode V…  │ Star Wars: Episode VI - Return of the Jedi     │ 1983          │ 0086190 │   1892 │
└─────────┴────────────────────────┴────────────────────────────────────────────────┴───────────────┴─────────┴────────┘

In [73]:
# Look up the movie_dim rows for the second most frequent 3-itemset, [4993, 5952, 7153].
duckdb.sql('SELECT * FROM movie_dim WHERE movieId IN (4993, 5952, 7153)')

┌─────────┬──────────────────────┬──────────────────────────────────────────────────┬───────────────┬─────────┬────────┐
│ movieId │        title         │                 extracted_title                  │ year_released │ imdbId  │ tmdbId │
│  int64  │       varchar        │                     varchar                      │    varchar    │ varchar │ int64  │
├─────────┼──────────────────────┼──────────────────────────────────────────────────┼───────────────┼─────────┼────────┤
│    4993 │ Lord of the Rings:…  │ Lord of the Rings: The Fellowship of the Ring,…  │ 2001          │ 0120737 │    120 │
│    5952 │ Lord of the Rings:…  │ Lord of the Rings: The Two Towers, The           │ 2002          │ 0167261 │    121 │
│    7153 │ Lord of the Rings:…  │ Lord of the Rings: The Return of the King, The   │ 2003          │ 0167260 │    122 │
└─────────┴──────────────────────┴──────────────────────────────────────────────────┴───────────────┴─────────┴────────┘

# Converting from frequent itemsets to association rules

That's pretty good! We've found two of the trilogies that have the strongest fan bases.

But, we'd like to turn these into rules, like:

{"Star Wars: Episode IV - A New Hope" , "Star Wars: Episode V - The Empire Strikes Back"} => "Star Wars: Episode VI - Return of the Jedi", support: x%, confidence y%

We might also want to include all patterns of length 2, as well.

But, for simplicity, let's only build rules with length-2 antecedents (two movies implying a third).

In [76]:
# Minimum confidence, in percent: the SQL below compares it against
# 100 * itemset frequency / antecedent frequency.
min_conf = 1 

# Build the `rules` table from the frequent 3-itemsets.
# Each of the three UNIONed SELECTs turns an itemset [a, b, c] into one rule,
# differing only in which element becomes the consequent:
#   1. [a, b] => [c]
#   2. [b, c] => [a]
#   3. [a, c] => [b]
# In every branch:
#   * user_count is the number of distinct users in high_ratings (one-row CTE,
#     attached to every rule through LEFT JOIN ... ON 1=1);
#   * f1 is the antecedent pair's row in frequent_items_2, f2 the consequent
#     movie's row in frequent_items_1;
#   * support    = 100 * itemset frequency / user_count;
#   * confidence = 100 * itemset frequency / antecedent frequency.
# NOTE(review): although f1 is LEFT JOINed, the WHERE clause filters on
# f1.frequency, so a rule whose antecedent had no match would be dropped.
query = f'''

CREATE OR REPLACE TABLE rules AS 
(

    WITH user_count AS
    
    ( SELECT COUNT(DISTINCT UserId) AS user_count FROM high_ratings)
    
   
    SELECT 
        3 AS k
        , item.itemset[1:2] AS antecedent
        , [item.itemset[3]] AS consequent
        , item.frequency
        , user_count
        , f1.frequency AS antecedent_frequency
        , f2.frequency AS consequent_frequency
        , 100*item.frequency/user_count AS support
        , 100*item.frequency/f1.frequency AS confidence
    
    FROM 
        frequent_items_3 item
    LEFT JOIN
        user_count
    ON 1=1
    
    LEFT JOIN
        frequent_items_2 f1
    ON
        item.itemset[1:2] = f1.itemset
    
    LEFT JOIN
        frequent_items_1 f2
    ON 
        item.itemset[3] = f2.movieId
    
    WHERE  100*item.frequency/f1.frequency >= {min_conf}

UNION 

    SELECT 
        3 AS k
        , item.itemset[2:3] AS antecedent
        , [item.itemset[1]] AS consequent
        , item.frequency
        , user_count
        , f1.frequency AS antecedent_frequency
        , f2.frequency AS consequent_frequency
        , 100*item.frequency/user_count AS support
        , 100*item.frequency/f1.frequency AS confidence
    
    FROM 
        frequent_items_3 item
    LEFT JOIN
        user_count
    ON 1=1
    
    LEFT JOIN
        frequent_items_2 f1
    ON
        item.itemset[2:3] = f1.itemset
    
    LEFT JOIN
        frequent_items_1 f2
    ON 
        item.itemset[1] = f2.movieId
    
    WHERE  100*item.frequency/f1.frequency >= {min_conf}

UNION

    SELECT 
        3 AS k
        , [item.itemset[1],item.itemset[3]] AS antecedent
        , [item.itemset[2]] AS consequent
        , item.frequency
        , user_count
        , f1.frequency AS antecedent_frequency
        , f2.frequency AS consequent_frequency
        , 100*item.frequency/user_count AS support
        , 100*item.frequency/f1.frequency AS confidence
    
    FROM 
        frequent_items_3 item
    LEFT JOIN
        user_count
    ON 1=1
    
    LEFT JOIN
        frequent_items_2 f1
    ON
        [item.itemset[1],item.itemset[3]] = f1.itemset
    
    LEFT JOIN
        frequent_items_1 f2
    ON 
        item.itemset[2] = f2.movieId
    
    WHERE  100*item.frequency/f1.frequency >= {min_conf}

)



    
'''

duckdb.sql(query)

# Show the rules, highest support first.
duckdb.sql('SELECT * FROM rules ORDER BY support DESC')

┌───────┬───────────────┬────────────┬───┬──────────────────────┬────────────────────┬────────────────────┐
│   k   │  antecedent   │ consequent │ … │ consequent_frequency │      support       │     confidence     │
│ int32 │    int64[]    │  int64[]   │   │        int64         │       double       │       double       │
├───────┼───────────────┼────────────┼───┼──────────────────────┼────────────────────┼────────────────────┤
│     3 │ [1196, 1210]  │ [260]      │ … │                  104 │  7.155322862129145 │   89.1304347826087 │
│     3 │ [260, 1196]   │ [1210]     │ … │                   65 │  7.155322862129145 │  66.12903225806451 │
│     3 │ [260, 1210]   │ [1196]     │ … │                   80 │  7.155322862129145 │   80.3921568627451 │
│     3 │ [1196, 2571]  │ [260]      │ … │                  104 │  6.282722513089006 │  85.71428571428571 │
│     3 │ [260, 2571]   │ [1196]     │ … │                   80 │  6.282722513089006 │               80.0 │
│     3 │ [4993, 5952]  │ [7

Let's create a few helper functions, then try out a simplistic recommender!

In [79]:
def find_movie_from_title(title):
    """Find movies whose title contains ``title``, ignoring case.

    Parameters
    ----------
    title : str
        Text to search for in ``movie_dim.title``. It is upper-cased and
        wrapped in ``%...%`` for a SQL ``LIKE`` match, so ``%`` and ``_``
        inside it still behave as SQL wildcards.

    Returns
    -------
    The object returned by ``duckdb.sql`` for the lookup query, selecting all
    columns of the matching ``movie_dim`` rows.
    """
    # Double every single quote so a title such as "Schindler's List" cannot
    # end the SQL string literal early (a syntax error at best, SQL injection
    # at worst).
    pattern = title.upper().replace("'", "''")
    return duckdb.sql(f"""SELECT * FROM movie_dim WHERE UPPER(title) LIKE '%{pattern}%'""")


def find_recommendations_from_two(id1, id2):
    """Recommend movies from the ``rules`` table given two movie ids.

    Selects the rules whose two-element antecedent equals the given pair and
    returns their consequents, joined to ``movie_dim`` for the title and
    ordered by confidence, highest first.

    Parameters
    ----------
    id1, id2 : int (or anything ``int()`` accepts)
        The two movieIds, in any order. They are sorted ascending because the
        query matches them positionally against ``antecedent[1]`` and
        ``antecedent[2]``.

    Returns
    -------
    The object returned by ``duckdb.sql`` with the columns ``movie1``,
    ``movie2``, ``movie3``, ``recommendation``, ``support`` and ``confidence``.

    Raises
    ------
    ValueError, TypeError
        If an id cannot be converted to ``int``.
    """
    # Coerce to int before sorting and interpolating: this keeps arbitrary
    # text out of the SQL and lets "79132" and 79132 be mixed without
    # sorted() failing on a str/int comparison.
    sorted_ids = sorted([int(id1), int(id2)])
    query = f"""
                SELECT
                    r.antecedent[1] AS movie1
                    , r.antecedent[2] AS movie2
                    , r.consequent[1] AS movie3
                    , m.title AS recommendation
                    , r.support
                    , r.confidence
                FROM
                    rules r
                LEFT JOIN
                    movie_dim m
                ON 
                    r.consequent[1] = m.movieId
                WHERE
                    r.antecedent[1] = {sorted_ids[0]}
                AND
                    r.antecedent[2] = {sorted_ids[1]}
                ORDER BY confidence DESC
                """
    return duckdb.sql(query)
                

In [81]:
# Look up the movieIds of two example titles; the cell displays both results as a tuple.
find_movie_from_title('Inception'), find_movie_from_title('Interstellar')

(┌─────────┬──────────────────┬─────────────────┬───────────────┬─────────┬────────┐
 │ movieId │      title       │ extracted_title │ year_released │ imdbId  │ tmdbId │
 │  int64  │     varchar      │     varchar     │    varchar    │ varchar │ int64  │
 ├─────────┼──────────────────┼─────────────────┼───────────────┼─────────┼────────┤
 │   79132 │ Inception (2010) │ Inception       │ 2010          │ 1375666 │  27205 │
 └─────────┴──────────────────┴─────────────────┴───────────────┴─────────┴────────┘,
 ┌─────────┬─────────────────────┬─────────────────┬───────────────┬─────────┬────────┐
 │ movieId │        title        │ extracted_title │ year_released │ imdbId  │ tmdbId │
 │  int64  │       varchar       │     varchar     │    varchar    │ varchar │ int64  │
 ├─────────┼─────────────────────┼─────────────────┼───────────────┼─────────┼────────┤
 │  109487 │ Interstellar (2014) │ Interstellar    │ 2014          │ 0816692 │ 157336 │
 └─────────┴─────────────────────┴───────────────

In [83]:
# Recommendations for a user who loved both Inception (79132) and Interstellar (109487).
find_recommendations_from_two(79132, 109487)

┌────────┬────────┬────────┬──────────────────────────────────────────────────┬────────────────────┬───────────────────┐
│ movie1 │ movie2 │ movie3 │                  recommendation                  │      support       │    confidence     │
│ int64  │ int64  │ int64  │                     varchar                      │       double       │      double       │
├────────┼────────┼────────┼──────────────────────────────────────────────────┼────────────────────┼───────────────────┤
│  79132 │ 109487 │   7153 │ Lord of the Rings: The Return of the King, The…  │ 1.0471204188481675 │ 54.54545454545455 │
│  79132 │ 109487 │   2571 │ Matrix, The (1999)                               │ 1.0471204188481675 │ 54.54545454545455 │
│  79132 │ 109487 │   4993 │ Lord of the Rings: The Fellowship of the Ring,…  │ 1.0471204188481675 │ 54.54545454545455 │
│  79132 │ 109487 │  68954 │ Up (2009)                                        │ 0.8726003490401396 │ 45.45454545454545 │
│  79132 │ 109487 │    527 │ Sch