# Building a Recommendation System for Podcasts

Final Project - APAM Senior Seminar - Fall 2022 - October 10

Yamini Ananth, Jafar Vohra, Kathy Wang, Abhiram Kolluri

In [1]:
#installing modules
!pip install stop-words
!pip install pyspark

Collecting stop-words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
Building wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25ldone
[?25h  Created wheel for stop-words: filename=stop_words-2018.7.23-py3-none-any.whl size=32916 sha256=c2c8a20da4dd9a7b1c021b21274091be7f408bb6853e70297b3e3345e93af2df
  Stored in directory: /Users/abhiramkolluri/Library/Caches/pip/wheels/eb/03/0d/3bd31c983789aeb0b4d5e2ca48590288d9db1586cf5f225062
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 132 kB/s eta 0:00:013███████              | 158.4 MB 6.5 MB/s eta 0:00:19     |█████████████████████████       | 220.0 MB 454 kB/s eta 0:02:16
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 25.2

In [1]:
#imports for text pre-processing & data manipulation

import itertools
import re
import nltk
import pandas as pd
import numpy as np

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from stop_words import get_stop_words

#imports for collaborative filtering model

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

#utilities for text processing

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abhiramkolluri/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abhiramkolluri/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/abhiramkolluri/nltk_data...


True

# Data Pre-Processing

Data scraped from Apple Podcasts using BeautifulSoup

Scripts for web scraping attributed to [Siddharth Kumaran](https://github.com/siddgood/podcast-recommendation-engine/blob/master/scripts/get_podcast_info.py)

Scraped data includes: 
* Title (text)
* Producer (text)
* Description (text)
* 6 Recent Episode Titles (text)
* 6 Recent Episode Descriptions (text)

Pre-processing included:
* Filtered out URLs and special characters
* Tokenized (separated each word into its own string)
* Removed stop-words (common words like articles, pronouns etc)
* Lemmatized (removed endings from words, so ‘like’ and ‘likes’ and ‘likely’ would all be converted to ‘lik’)


In [3]:
podcasts_df_orig = pd.read_pickle('https://github.com/yaminivibha/podcast-recs/blob/main/data/data/pickle_files/english_podcasts_detailed_cleaned.pkl?raw=true')

In [4]:
# Combining all text data into one column for downstream analysis

podcasts_df = podcasts_df_orig
podcasts_df['text'] = podcasts_df[['title', 'producer', 'genre', 'description', 'episode_titles', 'episode_descriptions']].apply(lambda x: ' '.join(x), axis=1)
podcasts_df = podcasts_df.drop(columns=['genre', 'description', 'num_episodes', 'rating', 'num_reviews', 'link', 'episode_titles', 'episode_descriptions'])
podcasts_df['idx'] = list(range(podcasts_df.shape[0]))

In [5]:
# Creating stopwords list & tokenizer

stop = get_stop_words('en')
stop = [re.sub(r'([^\s\w]|_)+', '', x) for x in stop]
tokenizer = RegexpTokenizer(r'\w+')

In [6]:
# Creating helper functions to remove stop words 
# and lemmatize tokenized sentences

def remove_stop(text, stop):
    return [word for word in text if word not in stop ]

def lemmatize(text, l_stemmer):
    return [l_stemmer.lemmatize(word) for word in text]

In [7]:
def preprocess_text(text):
    # remove mixed alphanumeric, URLS, stop words
    text = re.sub(r"""(?x) \b(?=\w*\d)\w+\s*""","", text)
    re.sub(r"http\S+", "", text)
    text = re.sub(r'([^\s\w]|_)+', '', text)
    text = tokenizer.tokenize(text.lower())
    text = remove_stop(text, stop)
    text = lemmatize(text, WordNetLemmatizer())
    
    new_text = ' '.join(text)
    return new_text

In [8]:
podcasts_df['text'] = podcasts_df['text'].map(preprocess_text)
podcasts_df = podcasts_df.query('text !=""')

# Preparing Utilities for Recommendation

Collection of helper functions

In [9]:
def get_title_from_index(index):
    """get title of podcast from index of podcast
        parameters:
            index: (int)
        returns:
            title (string)
        raises:
            ValueError: index not in podcasts_df['idx']
    """
    return podcasts_df[podcasts_df.idx == index]["title"].values[0]

def get_index_from_title(title):
    """get index of podcast from title of podcast
        parameters:
            title: (string)
        returns:
            index (int)
        raises:
            ValueError: string not in podcasts_df['title']
    """
    return podcasts_df[podcasts_df.title == title]["idx"].values[0]

In [10]:
def recommend(podcast_title, sim_matrix, number_recs=5, pretty_print=True):
    """given a podcast title & a similarity matrix, return n most similar podcasts
        parameters:
            podcast_title: (str) must be in podcasts_tf['title]
            sim_matrix: (np.array) similarity matrix
            number_recs: (int) how many recommendations do you want per title?
        returns:
            recommendations: (list[str]) list of n most similar podcasts 
                            according to the similarity matrix
    """

    podcast_id = get_index_from_title(podcast_title)
    similar_podcasts =  list(enumerate(sim_matrix[podcast_id]))
    sorted_similar_podcast = sorted(similar_podcasts,key=lambda x:x[1],reverse=True)
    
    recommendations = [get_title_from_index(sorted_similar_podcast[i][0]) for i in range(number_recs+2)]
    
    ### formatting for pretty printing ###
    if pretty_print:
      print("If you liked {}, try: ".format(podcast_title))
      for i in recommendations[1:]:
          print("     {}".format(i))
    
    return recommendations[1:]

In [11]:
# Podcasts we'll use to validate results
sample_podcasts = ['The Daily', "Murder, etc.",'This American Life', 'Call Her Daddy', 'The Joe Rogan Experience']

# Bag of Words + Cosine Similarity

Here, we use the bag of words model to encode the podcast text and use that to generate a cosine similarity matrix.

In [12]:
cv = CountVectorizer()
cv_matrix = cv.fit_transform(podcasts_df["text"])
cv_cosine_sim = cosine_similarity(cv_matrix)

In [13]:
# Let's look at how many words are in our vocabulary:
cv_matrix.shape

(4303, 95787)

In [14]:
# Let's examine our cosine similarity matrix
cv_cosine_sim

array([[1.        , 0.0667489 , 0.02251472, ..., 0.00647609, 0.02846787,
        0.03677664],
       [0.0667489 , 1.        , 0.11828514, ..., 0.02986087, 0.09000942,
        0.11970001],
       [0.02251472, 0.11828514, 1.        , ..., 0.17455451, 0.17043338,
        0.12398095],
       ...,
       [0.00647609, 0.02986087, 0.17455451, ..., 1.        , 0.21086224,
        0.1287162 ],
       [0.02846787, 0.09000942, 0.17043338, ..., 0.21086224, 1.        ,
        0.13933709],
       [0.03677664, 0.11970001, 0.12398095, ..., 0.1287162 , 0.13933709,
        1.        ]])

In [15]:
sample_podcasts

['The Daily',
 'Murder, etc.',
 'This American Life',
 'Call Her Daddy',
 'The Joe Rogan Experience']

In [16]:
for i in sample_podcasts:
    recs = recommend(i, cv_cosine_sim)
    print('\n')

If you liked The Daily, try: 
     Impeachment Inquiry: Updates from The Washington Post
     Impeachment: A Daily Podcast
     The Takeaway
     Article II: Inside Impeachment
     The Daily 202's Big Idea
     The 11th Hour with Brian Williams


If you liked Murder, etc., try: 
     Criminology
     Murderville
     Unsolved Murders: True Crime Stories
     Murder Minute
     Don't Talk to Strangers
     True Crime All The Time Unsolved


If you liked This American Life, try: 
     The Stoop Storytelling Series
     The Story Home Children's Audio Stories
     Spooky Boo's Scary Story Time
     The Story Behind
     This is the Gospel Podcast
     1001 Heroes, Legends, Histories & Mysteries Podcast


If you liked Call Her Daddy, try: 
     Stiff Socks
     Two Judgey Girls
     NAKED with Catt Sadler
     Slay Girl Slay
     Hot Marriage. Cool Parents.
     Safe For Work


If you liked The Joe Rogan Experience, try: 
     The Creative Penn Podcast For Writers
     1001 Classic Short 

In [20]:
#Try it yourself! 
your_podcast = "The Jordan B. Peterson Podcast" #Replace this with a podcast of your choice!
recs = recommend(your_podcast, cv_cosine_sim)

If you liked The Jordan B. Peterson Podcast, try: 
     Jordan Peterson Archive
     Jordan Peterson Interviews & Speeches
     AMERICA'S DOCTOR: The Dr. Oz Podcast
     Adulting
     She Podcast
     The Skinny Confidential Him & Her Podcast


# TFIDF + Cosine Similarity 

Here, we use tf-idf to encode the podcast text and use that to generate a cosine similarity matrix.

In [22]:
tf = TfidfVectorizer()
tf_matrix = tf.fit_transform(podcasts_df["text"])
tf_cosine_sim = cosine_similarity(tf_matrix)

In [23]:
# Let's examine our tf_idf cosine similarity matrix!

tf_cosine_sim

array([[1.        , 0.01419719, 0.00750197, ..., 0.00185515, 0.00643431,
        0.00463919],
       [0.01419719, 1.        , 0.03029042, ..., 0.00772847, 0.01713156,
        0.02497313],
       [0.00750197, 0.03029042, 1.        , ..., 0.0502345 , 0.042034  ,
        0.03182914],
       ...,
       [0.00185515, 0.00772847, 0.0502345 , ..., 1.        , 0.04325737,
        0.01595114],
       [0.00643431, 0.01713156, 0.042034  , ..., 0.04325737, 1.        ,
        0.0208546 ],
       [0.00463919, 0.02497313, 0.03182914, ..., 0.01595114, 0.0208546 ,
        1.        ]])

In [24]:
for i in sample_podcasts:
    recs = recommend(i, tf_cosine_sim)
    print('\n')

If you liked The Daily, try: 
     Impeachment Inquiry: Updates from The Washington Post
     The 11th Hour with Brian Williams
     The Daily 202's Big Idea
     Article II: Inside Impeachment
     Impeachment: A Daily Podcast
     The Takeaway


If you liked Murder, etc., try: 
     Murder Minute
     Criminology
     Murderville
     Unsolved Murders: True Crime Stories
     Don't Talk to Strangers
     True Crime All The Time Unsolved


If you liked This American Life, try: 
     Experimental Brewing
     1A
     Through the Looking Glass: A LOST Retrospective
     The Grave Talks | Haunted, Paranormal & Supernatural
     Darkness Prevails Podcast | TRUE Horror Stories
     BeerSmith Home and Beer Brewing Podcast


If you liked Call Her Daddy, try: 
     hey, girl.
     Girls Night with Stephanie May Wilson
     Stiff Socks
     Fierce Girls
     Becoming Something with Jonathan Pokluda
     Two Judgey Girls


If you liked The Joe Rogan Experience, try: 
     MILLION DOLLAR LIFE LE

In [54]:
#Try it yourself! 
your_podcast = "Making Sense with Sam Harris" #Replace this with a podcast of your choice!
recs = recommend(your_podcast, tf_cosine_sim)

If you liked Making Sense with Sam Harris, try: 
     It's Been a Minute with Sam Sanders
     The Kevin Rose Show
     The Good Parts with Andy Grammer
     The Peter Attia Drive
     Science Salon
     Common Sense with Dan Carlin


# Compare results of the two models

We want to see whether or not the models tend to agree,
and what amount of the total body of podcasts are ever actually recommended (do we solve the long tail problem)?

In [32]:
def print_compare(pod, num_recs=5):
    """for a given podcast and number of recommendations
        print the recommendations from both tf-idf and cv
        unique to tf-idf
        and unique to cv
    """

    tf_idf_recs = recommend(pod, tf_cosine_sim, num_recs, pretty_print=False)
    cv_recs = recommend(pod, cv_cosine_sim, num_recs, pretty_print=False)

    both = list(set(tf_idf_recs).intersection(set(cv_recs)))
    unique_to_tf = list(set(tf_idf_recs).difference(set(cv_recs)))
    unique_to_cv = list(set(cv_recs).difference(set(tf_idf_recs)))
    print("Recs for {}: ".format(pod))
    
    print("    Recommended by both tf-idf and cv:")
    for i in both: print("         {}".format(i))

    print("    Uniqely recommended by tf-idf:")
    for i in unique_to_tf: print("         {}".format(i))

    print("    Uniqely recommended by cv:")
    for i in unique_to_cv: print("         {}".format(i))
    print('\n')

In [None]:
for pod in sample_podcasts: print_compare(pod) 

Recs for The Daily: 
    Recommended by both tf-idf and cv:
         Impeachment Inquiry: Updates from The Washington Post
         Article II: Inside Impeachment
         The 11th Hour with Brian Williams
         Impeachment: A Daily Podcast
         The Daily 202's Big Idea
         The Takeaway
    Uniqely recommended by tf-idf:
    Uniqely recommended by cv:


Recs for Murder, etc.: 
    Recommended by both tf-idf and cv:
         True Crime All The Time Unsolved
         Murderville
         Criminology
         Unsolved Murders: True Crime Stories
         Don't Talk to Strangers
         Murder Minute
    Uniqely recommended by tf-idf:
    Uniqely recommended by cv:


Recs for This American Life: 
    Recommended by both tf-idf and cv:
    Uniqely recommended by tf-idf:
         Through the Looking Glass: A LOST Retrospective
         1A
         The Grave Talks | Haunted, Paranormal & Supernatural
         Darkness Prevails Podcast | TRUE Horror Stories
         Experimental B

In [49]:
# Try it yourself!

your_podcast = "Happier with Gretchen Rubin" #Replace this with your podcast 
print_compare(your_podcast)

Recs for Happier with Gretchen Rubin: 
    Recommended by both tf-idf and cv:
         Do The Thing, with Whole30's Melissa Urban
         Achieve Your Goals with Hal Elrod
    Uniqely recommended by tf-idf:
         Kalyn’s Coffee Talk
         Family Ghosts
         Side Hustle School
         Happy Hour with Gretchen Geraghty
    Uniqely recommended by cv:
         minimal-ish: realistic minimalism
         The Edge of Sleep
         This is Love
         Better Than Happy




In [34]:
def coverage(model_name, sim_matrix, num_recs=10):
    """Track what % of the overall library of podcasts
        was ever actually recommended, when we serve
        10 recs for each podcast in the library

        parameters:
          model_name: (str) either 'tf-idf' or 'cv'
                    should correspond to the passed sim_matrix 
          sim_matrix: (np.array) an item-item similarity matrix
          num_recs: how many recs for each item in library?
        returns:
          indices: (np.array) recommended podcast indices
    """
    indices = np.argpartition(sim_matrix, -num_recs, axis=1)[:,-num_recs:]
    
    #calculating coverage:
    recommended = set(list(itertools.chain(*indices)))
    coverage = (len(recommended)/indices.shape[0])*100

    print("Stats for {} Model with {} recs".format(model_name, num_recs))
    print("    Coverage: {} %".format(coverage))
    
    return indices

In [35]:
cv_recs_10 = coverage("CountVectorizer", cv_cosine_sim, 5)
tf_idf_recs_10 = coverage("tf-idf", tf_cosine_sim, 5)

Stats for CountVectorizer Model with 5 recs
    Coverage: 100.0 %
Stats for tf-idf Model with 5 recs
    Coverage: 100.0 %


# Implementing Collaborative Filtering

## Generating Fake User Ratings

We want to create users that have preferences.
Each of them randomly rates between 5-20 randomly selected podcasts on a scale from 1-5. 
This is a non-realistic way to generate fake user ratings (as most users like similar things, and have a pattern to how they rate things). 

In [36]:
def generate_user_ratings(users_count):
    """generates fake user ratings
      parameters:
        users_count: (int) how many fake users to generate
      returns:
        users: (pd.DataFrame) table of user, podcast, & rating    
    """
    
    user_ratings = []
    for idx, user in enumerate(np.arange(0,users_count)):
        ratings = []
        quantity_rated = np.random.randint(5,21)
        reviewed = set()
        
        for i in np.arange(quantity_rated):
            podcast =  np.random.randint(0, podcasts_df.shape[0])
            title = get_title_from_index(podcast)
            
            # don't want the same user to review 
            # the same podcast multiple times
            while (podcast in reviewed):
                podcast =  np.random.randint(0, podcasts_df.shape[0]+1)
            reviewed.add(podcast)

            rating = np.random.randint(1,6)
            ratings.append([idx, podcast, rating, title])
        
        user_df = pd.DataFrame(ratings, \
                          columns=['user_id', 'podcast_idx', 'rating', 'podcast_title'])
        user_ratings.append(user_df)
    return pd.concat(user_ratings)

In [50]:
def checkUserProfile(user_idx, pretty_print=True):
  """For a given user id, create a profile including 3 attributes

    parameters:
      user_id: (int) user id
      print: (boolean) whether or not printed outcomes are desired
    
    returns:
      user_profile: (dict) contains 3 attributes of a user profile
  """
  user_id=user_idx
  user_reviews = usr.query('user_id==@user_id') \
          .sort_values('rating', ascending=False)
  
  user_profile = {'no_reviews' : user_reviews.shape[0], 
                  'top_5_shows' : user_reviews['podcast_title'].iloc[:5].to_list(), 
                  'ave_rating' : user_reviews['rating'].mean() }
  
  
  #### formatting for pretty printing ###  
  if pretty_print:
    print(f"User #{user_id} Profile:")
    print(f"{user_profile['no_reviews']} reviews")
    print(f"Mean rating: {user_profile['ave_rating']} stars")
    print(f"Top 5 shows:")
    
    for show in user_profile['top_5_shows']:
      print(f"       {show}")
    print("                ..            ")
  
  return user_profile

In [51]:
num_users = 1000
usr = generate_user_ratings(num_users)

In [52]:
#investigate a random user!
my_random_user = np.random.randint(0, num_users)
profile = checkUserProfile(my_random_user)

User #613 Profile:
11 reviews
Mean rating: 3.4545454545454546 stars
Top 5 shows:
       Encyclopedia Womannica
       The Late-Round Podcast
       The World of Phil Hendrie
       The SelfWork Podcast
       French Podcast
                ..            


## Implement ALS

Now that we have our user rating data, we can implement collaborative filtering to generate recommendations based on user similarity. We specifically used the pyspark implementation of ALS Matrix Factorization with root mean squared error. 

We used a pyspark implementation of ALS code as published by [Jeffrey Chiang](https://github.com/chiang9/Medium_blog/blob/main/ALS_model/movielen%20ALS.ipynb)

In [53]:
sc = SparkContext.getOrCreate()
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
     \
    .getOrCreate()

In [None]:
df = spark.createDataFrame(usr)

In [None]:
train, test = df.randomSplit([0.7,0.3],111)

In [None]:
# we use the cross validator to tune the hyperparameters
als = ALS(
         userCol="user_id", 
         itemCol="podcast_idx",
         ratingCol="rating", 
         coldStartStrategy="drop" 
)

param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10, 100]) \
            .addGrid(als.regParam, [.1]) \
            .addGrid(als.maxIter, [10]) \
            .build()

evaluator = RegressionEvaluator(
           metricName="rmse", 
           labelCol="rating", 
           predictionCol="prediction")

cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3, parallelism = 6)
model = cv.fit(train)

In [None]:
best_model = model.bestModel

print(f"# of latent factors Used = {best_model._java_obj.parent().getRank()}")

# of latent factors Used = 100


In [None]:
prediction = best_model.transform(test)
rmse = evaluator.evaluate(prediction)
print(f'RMSE = {rmse}')

# we can get the user latent factors and item latent factors from the model
user_latent_features = best_model.userFactors
item_latent_features = best_model.itemFactors

RMSE = 2.5522706293182815


In [None]:
user_recs = best_model.recommendForAllUsers(3)
user_recs_pandas = user_recs.toPandas()

In [None]:
def checkUserRecommendations(user_row_idx, pretty_print=True):
  """Print each user's profile
    and recommended future podcasts/predicted ratings

    parameters:
      user_row: (int) index of row from user_recs dataframe
      print: (boolean) whether or not printed outcomes are desired
    
    returns:
      user_recs: (list) recommended podcast titles
  """
  user_row = user_recs_pandas.iloc[user_row_idx]
  user_id = user_row['user_id']
  user_profile = checkUserProfile(user_id)
  
  user_recs=[]
  for rec in user_row['recommendations']:
    rec_idx = rec.__getitem__('podcast_idx')
    rec_title = get_title_from_index(rec_idx)
    user_recs.append(rec_title)

  #### formatting for pretty printing ###  
  if pretty_print: 
    print("We recommend the following: ")
    for rec_title in user_recs:
      print(f"       {rec_title}")
  print("\n")
  return user_recs

In [None]:
# Checking out the profiles & recommendations 
# for 10 random users

for i in np.random.randint(0, len(user_recs_pandas), 10):
  checkUserRecommendations(i)

User #343 Profile:
10 reviews
Mean rating: 3.6 stars
Top 5 shows:
       Voddie Baucham on SermonAudio
       Magic: The Gathering Drive to Work Podcast
       True Crime All The Time
       Scam Goddess
       Kubernetes Podcast from Google
                ..            
We recommend the following: 
       Impeachment Today
       SOFREP Radio
       Planet Money


User #898 Profile:
15 reviews
Mean rating: 3.4 stars
Top 5 shows:
       Our Portland with Sarah Iannarone
       Radiolab
       Kalila Stormfire's Economical Magick Services
       Regenerative Agriculture Podcast
       Noodle Loaf
                ..            
We recommend the following: 
       Wonders of the World
       Weeknight Kitchen with Melissa Clark
       CuriosiD


User #17 Profile:
15 reviews
Mean rating: 3.0 stars
Top 5 shows:
       River to River
       Stanford Steve & The Bear
       The Unexpectables
       Directionally Challenged
       The Allusionist
                ..            
We recommend th

In [None]:
#try it yourself!
my_random_user = np.random.randint(0, len(user_recs_pandas))
recs = checkUserRecommendations(my_random_user)

User #207 Profile:
15 reviews
Mean rating: 2.2 stars
Top 5 shows:
       Attention HellMart Shoppers!
       The Brilliant Idiots
       Newt's World
       The 2 Robbies
       The Stronger By Science Podcast
                ..            
We recommend the following: 
       The Brilliant Idiots
       Attention HellMart Shoppers!
       Newt's World


