In [1]:
# Installing necessary dependent packages
!pip install -q fsspec
!pip install -q ipython-autotime
!pip install -q pyspark
!pip install -q s3fs

# Loading autotime for the notebook
%load_ext autotime

[K     |████████████████████████████████| 1.6 MB 3.9 MB/s 
[K     |████████████████████████████████| 281.3 MB 44 kB/s 
[K     |████████████████████████████████| 199 kB 63.5 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 64 kB 2.0 MB/s 
[K     |████████████████████████████████| 8.6 MB 18.3 MB/s 
[K     |████████████████████████████████| 140 kB 23.6 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
requests 2.23.0 requires urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1, but you have urllib3 1.26.12 which is incompatible.[0m
[?25htime: 884 µs (started: 2022-08-23 08:52:41 +00:00)


# Content-based Recommendations with PCA
## Similar movies have similar tags. How well is this similarity captured with PCA?

In [2]:
# SparkSession
from pyspark.sql import SparkSession
from seaborn import heatmap

import pandas as pd

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .master("local")
    .appName("PCA_ML_Lib")
    .getOrCreate()
)

time: 10.6 s (started: 2022-08-23 08:52:41 +00:00)


In [3]:
# Read the tag-relevance table with pandas and hand it straight to Spark, so that
# `df` only ever refers to the Spark DataFrame (never to the intermediate pandas frame).
df = spark.createDataFrame(
    pd.read_csv("s3a://sparkdemonstration/movielens-tag-relevance.csv", header="infer")
)

time: 5min 1s (started: 2022-08-23 08:52:52 +00:00)


In [4]:
import random

# Preview the title next to four distinct, randomly chosen tag columns.
# A locally seeded generator keeps the preview identical across re-runs.
rng = random.Random(42)
tag_columns = [c for c in df.columns if c != 'title']
colsToShow = ['title'] + rng.sample(tag_columns, min(4, len(tag_columns)))

# Back-tick quoting stops Spark from parsing a "." inside a tag name as
# struct-field access (the dotted names are only renamed in a later cell).
df.select(*["`" + c + "`" for c in colsToShow]).show()

+--------------------+------------------+------------------+------------------+------------------+
|               title|      heartwarming|            ballet|             ocean|              pulp|
+--------------------+------------------+------------------+------------------+------------------+
|    Toy Story (1995)|0.8122499999999999|             0.004|0.0797499999999999|             0.143|
|      Jumanji (1995)|           0.33525|0.0052499999999999|0.0732499999999999|           0.26275|
|Grumpier Old Men ...|           0.09025|0.0037499999999999|           0.04125|0.1392499999999999|
|Waiting to Exhale...|              0.31|            0.0045|             0.021|           0.18925|
|Father of the Bri...|            0.4865|            0.0035|           0.04125|             0.151|
|         Heat (1995)|            0.0635|             0.004|           0.03675|           0.20575|
|      Sabrina (1995)|           0.23625|0.0047499999999999|           0.04625|            0.1185|
| Tom and 

In [5]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

time: 174 ms (started: 2022-08-23 08:58:04 +00:00)


In [6]:
# Replace "." with "_" in every column name; names without a dot are unchanged.
newCols = [c.replace('.', '_') for c in df.columns]

# Rename all columns positionally in one step instead of chaining one
# `withColumnRenamed` (and therefore one extra projection) per dotted column.
df = df.toDF(*newCols)

time: 799 ms (started: 2022-08-23 08:58:04 +00:00)


In [7]:
# Every column except the title goes into a single vector column named 'features'.
feature_columns = [name for name in newCols if name != 'title']
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Standardize 'features' into 'normFeats'; withMean=True additionally centres each feature.
scaler = StandardScaler(inputCol='features', outputCol='normFeats', withMean=True)

# Assemble first, then fit the scaler on the assembled frame and apply it.
df = assembler.transform(df)
scalerModel = scaler.fit(df)
df = scalerModel.transform(df)

time: 24.4 s (started: 2022-08-23 08:58:05 +00:00)


## PCA

In [8]:
# Keep only the standardized feature column and expose it as an RDD of rows.
rdd = df.select(df['normFeats']).rdd

time: 346 ms (started: 2022-08-23 08:58:29 +00:00)


In [9]:
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.linalg import Vectors

time: 41.2 ms (started: 2022-08-23 08:58:30 +00:00)


In [10]:
# Turn each element of `rdd` into an mllib dense vector; RowMatrix is built from these next.
# NOTE(review): every element is a one-field Row (it comes from `df.select('normFeats').rdd`),
# so this relies on Vectors.dense coercing the Row itself. Presumably
# `Vectors.fromML(row.normFeats)` would be the explicit spelling -- confirm before changing.
vectors = rdd.map(Vectors.dense)

time: 5.72 ms (started: 2022-08-23 08:58:30 +00:00)


In [11]:
# Wrap the RDD of vectors as a distributed row-oriented matrix.
matrix = RowMatrix(rows=vectors)

time: 164 ms (started: 2022-08-23 08:58:30 +00:00)


## Get the PCs

In [12]:
# Compute the first 500 principal components of the standardized matrix ...
pc = matrix.computePrincipalComponents(k=500)
# ... and multiply the matrix by them to get each row's coordinates in PC space.
matrix_reduced = matrix.multiply(matrix=pc)

time: 1min 15s (started: 2022-08-23 08:58:30 +00:00)


## Nearest Neighbour Search in PC space

In [13]:
import numpy as np

# Collect the projected rows on the driver and stack them into one 2-D array
# (one row per movie, one column per principal component).
X = np.array(matrix_reduced.rows.map(np.array).collect())

time: 25.6 s (started: 2022-08-23 08:59:46 +00:00)


In [14]:
# Pull the movie titles to the driver as a pandas frame.
# NOTE(review): these are later paired with the rows of `X` purely by position --
# this assumes both collections come back in the same row order; confirm.
titles = df.select(df['title']).toPandas()

time: 4.47 s (started: 2022-08-23 09:00:11 +00:00)


In [15]:
import pandas as pd

# Label each row of the PC-space coordinates with its movie title.
pdf = pd.DataFrame(data=X, index=pd.Index(titles['title']))

time: 5.03 ms (started: 2022-08-23 09:00:16 +00:00)


In [16]:
# Show the first five movies with their principal-component coordinates.
pdf.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),5.580545,-18.610977,-16.135425,-10.331208,11.372613,12.75103,-3.018331,-6.520022,8.494472,-6.138467,...,1.253834,-0.097425,-0.810692,-0.710021,0.116611,-0.089386,-0.244684,0.246042,-0.294096,-0.445163
Jumanji (1995),-11.108312,-13.98895,-7.307447,-9.619463,2.110947,5.069314,0.301305,-7.396269,4.811784,-0.296702,...,-0.311983,0.371603,0.728868,0.54526,-1.490517,0.211687,0.778517,-0.415525,-0.500738,0.372158
Grumpier Old Men (1995),-9.606716,1.298515,-4.212114,2.36377,1.961123,1.262194,-1.321139,0.344805,0.539581,-1.326395,...,0.042183,0.004842,-0.713675,0.68396,0.393586,0.619775,-0.084641,-0.034188,-0.127041,0.06334
Waiting to Exhale (1995),-8.162652,6.345987,-5.327036,0.797458,0.42299,-2.181263,-3.5103,-1.65222,0.060993,2.596061,...,-0.480278,0.623731,0.046855,-0.388345,-0.026437,-0.022832,0.466444,0.289981,-0.533415,-0.344706
Father of the Bride Part II (1995),-10.634055,2.242504,-8.047581,0.584347,3.230809,-1.263154,-4.077159,-2.192635,0.611344,-0.085841,...,-0.176579,-0.209784,-0.34811,0.080846,0.547523,0.737323,-0.301655,0.399188,0.327135,0.084268


time: 95.1 ms (started: 2022-08-23 09:00:16 +00:00)


In [17]:
from sklearn.neighbors import NearestNeighbors

time: 578 ms (started: 2022-08-23 09:00:16 +00:00)


In [18]:
def nearest_titles(title, n_pcs, n_neighbors=5):
    """Return the titles nearest to `title` in the space of the first `n_pcs` PCs.

    Args:
        title: Index label in `pdf` of the movie to query.
        n_pcs: Number of leading principal-component columns of `X` / `pdf` to use.
        n_neighbors: Number of titles to return; since the query movie is part of
            the fitted data it normally appears in the result itself.

    Returns:
        List of titles taken from `pdf.index`, in the order `kneighbors` returns them.

    Raises:
        KeyError: if `title` is not present in `pdf.index`.
    """
    # `.loc[[label]]` always yields a 2-D frame, so the query keeps shape (1, n_pcs)
    # even when a title occurs more than once in the index (the first match is used).
    query = pdf.loc[[title]].to_numpy()[:1, :n_pcs]
    model = NearestNeighbors(n_neighbors=n_neighbors).fit(X[:, :n_pcs])
    hits = model.kneighbors(query, return_distance=False)
    return pdf.index[hits.ravel()].tolist()


n_pcs = 2
nearest_titles('Toy Story (1995)', n_pcs)

['Toy Story (1995)',
 'The Punisher: Dirty Laundry (2012)',
 "Empire of Dreams: The Story of the 'Star Wars' Trilogy (2004)",
 'Ten Commandments, The (1956)',
 'Blood of Heroes, The (Salute of the Jugger, The) (1989)']

time: 29.4 ms (started: 2022-08-23 09:00:17 +00:00)


<table>
    <tr>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/1/13/Toy_Story.jpg" alt="Toy Story (1995)"></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/c/cf/Poster_for_Dirty_Laundry.jpg" alt="The Punisher: Dirty Laundry (2012)"></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/e/ef/Empire_of_Dreams.png" alt="Empire of Dreams: The Story of the 'Star Wars' Trilogy (2004)"></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/commons/thumb/d/df/The_Ten_Commandments_%281956_film_poster%29.jpg/313px-The_Ten_Commandments_%281956_film_poster%29.jpg" alt="The Ten Commandments (1956)"></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/2/2a/Bloodofheroes.jpg" alt="The Blood of Heroes (1989)"></td>
    </tr>
</table>

## Increase the number of Principal Components

In [19]:
n_pcs = 10
# Fit the neighbour index on the first `n_pcs` principal-component coordinates only.
nn = NearestNeighbors().fit(X[:, :n_pcs])
# `.loc[[label]]` always yields a 2-D frame, so the query keeps shape (1, n_pcs)
# even when a title occurs more than once in the index (the first match is used).
query = pdf.loc[['Toy Story (1995)']].to_numpy()[:1, :n_pcs]
neighbors = nn.kneighbors(query, return_distance=False)
pdf.index[neighbors.ravel()].tolist()

['Toy Story (1995)',
 'Finding Nemo (2003)',
 'Monsters, Inc. (2001)',
 "Bug's Life, A (1998)",
 'Ratatouille (2007)']

time: 30.6 ms (started: 2022-08-23 09:00:17 +00:00)


<table>
    <tr>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/1/13/Toy_Story.jpg" alt="Toy Story (1995)"></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/2/29/Finding_Nemo.jpg" alt="Finding Nemo (2003)"></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/6/63/Monsters_Inc.JPG" alt="Monsters, Inc. (2001)"></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/c/cc/A_Bug%27s_Life.jpg" alt="A Bug's Life (1998)"></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/5/50/RatatouillePoster.jpg" alt="Ratatouille (2007)"></td>
    </tr>
</table>

In [20]:
n_pcs = 100
# Fit the neighbour index on the first `n_pcs` principal-component coordinates only.
nn = NearestNeighbors().fit(X[:, :n_pcs])
# `.loc[[label]]` always yields a 2-D frame, so the query keeps shape (1, n_pcs)
# even when a title occurs more than once in the index (the first match is used).
query = pdf.loc[['Toy Story (1995)']].to_numpy()[:1, :n_pcs]
neighbors = nn.kneighbors(query, return_distance=False)
pdf.index[neighbors.ravel()].tolist()

['Toy Story (1995)',
 'Monsters, Inc. (2001)',
 'Toy Story 2 (1999)',
 "Bug's Life, A (1998)",
 'Toy Story 3 (2010)']

time: 26.6 ms (started: 2022-08-23 09:00:17 +00:00)


In [21]:
n_pcs = 500
# Fit the neighbour index on the first `n_pcs` principal-component coordinates only.
nn = NearestNeighbors().fit(X[:, :n_pcs])
# `.loc[[label]]` always yields a 2-D frame, so the query keeps shape (1, n_pcs)
# even when a title occurs more than once in the index (the first match is used).
query = pdf.loc[['Toy Story (1995)']].to_numpy()[:1, :n_pcs]
neighbors = nn.kneighbors(query, return_distance=False)
pdf.index[neighbors.ravel()].tolist()

['Toy Story (1995)',
 'Toy Story 2 (1999)',
 'Monsters, Inc. (2001)',
 'Toy Story 3 (2010)',
 "Bug's Life, A (1998)"]

time: 47.8 ms (started: 2022-08-23 09:00:17 +00:00)


In [22]:
n_pcs = 10
# Fit the neighbour index on the first `n_pcs` principal-component coordinates only.
nn = NearestNeighbors().fit(X[:, :n_pcs])
# `.loc[[label]]` always yields a 2-D frame, so the query keeps shape (1, n_pcs)
# even when a title occurs more than once in the index (the first match is used).
query = pdf.loc[['Avengers, The (2012)']].to_numpy()[:1, :n_pcs]
neighbors = nn.kneighbors(query, return_distance=False)
pdf.index[neighbors.ravel()].tolist()

['Avengers, The (2012)',
 'X-Men (2000)',
 'Iron Man (2008)',
 'X-Men: First Class (2011)',
 'X2: X-Men United (2003)']

time: 61.3 ms (started: 2022-08-23 09:00:17 +00:00)


### Graded Questions

In [23]:
n_pcs = 300
# Fit the neighbour index on the first `n_pcs` principal-component coordinates only.
nn = NearestNeighbors().fit(X[:, :n_pcs])
# `.loc[[label]]` always yields a 2-D frame, so the query keeps shape (1, n_pcs)
# even when a title occurs more than once in the index (the first match is used).
query = pdf.loc[['Interstellar (2014)']].to_numpy()[:1, :n_pcs]
neighbors = nn.kneighbors(query, return_distance=False)
pdf.index[neighbors.ravel()].tolist()

['Interstellar (2014)',
 'The Martian (2015)',
 'Sunshine (2007)',
 'Gravity (2013)',
 'Arrival (2016)']

time: 42.6 ms (started: 2022-08-23 09:00:17 +00:00)


time: 61.9 ms (started: 2022-08-23 09:00:17 +00:00)
