## Importing Libraries

In [8]:
from IPython.display import clear_output
import pandas as pd

In [56]:
# Switch for the TMDB review collection further down in the notebook:
#   True  -> check the API token, fetch the reviews from TMDB and write them
#            to movie_reviews.json
#   False -> skip the API calls and load the reviews from movie_reviews.json
collect_data_reviews = True

## Downloading data

In [4]:
# Downloading the dataset from GitHub
# 1. clone the repository that holds the data files
!git clone https://github.com/acorreal7/youtube-comments-analysis
# 2. move its recommender_systems/data directory into the working directory
!mv youtube-comments-analysis/recommender_systems/data .
# 3. delete the rest of the clone, only the data directory is kept
!rm -rf youtube-comments-analysis
# Hide the shell output of the commands above.
# NOTE(review): this also hides any error printed by git/mv.
clear_output()

## Exploring the dataset

In [6]:
# Locations of the three dataset files read in this cell
data_path = 'data/u.data'
genre_path = 'data/u.genre'
item_path = 'data/u.item'

# Ratings: tab-separated file without a header row
u_data = pd.read_csv(
    data_path,
    sep='\t',
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# Genre lookup: pipe-separated file without a header row
u_genre = pd.read_csv(
    genre_path,
    sep='|',
    header=None,
    names=['genre', 'genre_id'],
)

# Movie metadata: pipe-separated, latin-1 encoded, no header row.
# The first five columns describe the movie; they are followed by one
# column per row of u_genre, named genre_0, genre_1, ...
u_item = pd.read_csv(
    item_path,
    sep='|',
    header=None,
    encoding='latin-1',
    names=[
        'movie_id',
        'movie_title',
        'release_date',
        'video_release_date',
        'IMDb_URL',
        *(f'genre_{i}' for i in range(len(u_genre))),
    ],
)


In [7]:
# Preview the first rows of the ratings table
u_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [9]:
# (rows, columns) of the ratings table
u_data.shape

(100000, 4)

In [12]:
# Show the genre lookup table (genre name and its id)
u_genre

Unnamed: 0,genre,genre_id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4
5,Comedy,5
6,Crime,6
7,Documentary,7
8,Drama,8
9,Fantasy,9


In [13]:
# Preview the first rows of the movie metadata table
u_item.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,genre_0,genre_1,genre_2,genre_3,genre_4,...,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [14]:
# (rows, columns) of the movie metadata table
u_item.shape

(1682, 24)

## Getting movie reviews from TMDB

In [53]:
# get api-token from colab secrets
from google.colab import userdata
import requests
import json
import re

In [18]:
# Verify that the TMDB bearer token is accepted before collecting reviews

if collect_data_reviews:
    # The token is read from the Colab secrets, it is never written in the notebook
    api_token = userdata.get('API_TMDB_TOKEN')
    url = "https://api.themoviedb.org/3/authentication"

    # Ask for a JSON answer and authenticate with the bearer token
    headers = {"accept": "application/json"}
    headers["Authorization"] = f"Bearer {api_token}"

    response = requests.get(url, headers=headers)

    # Print the raw body returned by the authentication endpoint
    print(response.text)

{"success":true,"status_code":1,"status_message":"Success."}


In [34]:
def get_movie_reviews(movie_title):
    """Return the review texts TMDB holds for the first search hit of a title.

    The title is sent to the TMDB movie search endpoint, the first search
    result is taken as the matching movie, and that movie's reviews endpoint
    is queried once (no pagination is followed).

    Args:
        movie_title: Title to search for, e.g. "Toy Story".

    Returns:
        A list with the ``content`` string of every review in the reviews
        response. The list is empty when the search has no results or when
        the reviews response carries no ``results`` entry.
    """
    reviews = []
    # The API key is read from the Colab secrets on every call.
    api_key = userdata.get('API_TMDB_API_KEY')

    # Hand the query to requests through `params` so it gets URL-encoded.
    # Interpolating the raw title into the URL breaks the query string for
    # titles that contain characters such as '&', '#', '+' or '?'.
    search_response = requests.get(
        'https://api.themoviedb.org/3/search/movie',
        params={'api_key': api_key, 'query': movie_title},
    )
    search_data = search_response.json()

    search_results = search_data.get('results') or []
    if not search_results:
        return reviews

    # The first search result is assumed to be the requested movie.
    movie_id = search_results[0]['id']

    # Get movie reviews
    reviews_response = requests.get(
        f'https://api.themoviedb.org/3/movie/{movie_id}/reviews',
        params={'api_key': api_key},
    )
    reviews_data = reviews_response.json()

    # A response without a 'results' key (e.g. an error payload) is treated
    # as "no reviews" instead of raising KeyError, which matches how a failed
    # search is handled above.
    for review in reviews_data.get('results', []):
        reviews.append(review['content'])
    return reviews


In [54]:
# helper that strips the release year from a movie title
def remove_year(title):
    """Return `title` without a trailing "(YYYY)" suffix.

    Only a four-digit number in parentheses at the end of the string is
    removed, together with any whitespace directly in front of it. Titles
    without such a suffix are returned unchanged.
    """
    year_suffix = re.compile(r"\s*\(\d{4}\)$")
    return year_suffix.sub("", title)

In [57]:
# Fill u_item['reviews'] with one list of review texts per movie, either by
# querying TMDB for every title or by reloading a previously saved run
if not collect_data_reviews:
  # Reuse the reviews stored by an earlier collection run
  with open('movie_reviews.json', 'r') as f:
    movie_reviews = json.load(f)
    u_item['reviews'] = movie_reviews
else:
  movie_reviews = []
  for processed, full_title in enumerate(u_item['movie_title'], start=1):
    # Search by the bare title, without the "(YYYY)" suffix
    movie_reviews.append(get_movie_reviews(remove_year(full_title)))
    if processed % 100 != 0:
      continue
    # Every 100 movies: save what has been collected so far and report progress
    with open('movie_reviews.json', 'w') as f:
      json.dump(movie_reviews, f)
    print(f'Processed {processed} movies')
  # Final save with the complete list
  with open('movie_reviews.json', 'w') as f:
    json.dump(movie_reviews, f)
  u_item['reviews'] = movie_reviews

Processed 100 movies
Processed 200 movies
Processed 300 movies
Processed 400 movies
Processed 500 movies
Processed 600 movies
Processed 700 movies
Processed 800 movies
Processed 900 movies
Processed 1000 movies
Processed 1100 movies
Processed 1200 movies
Processed 1300 movies
Processed 1400 movies
Processed 1500 movies
Processed 1600 movies


In [58]:
# Movies for which at least one review was collected
a = u_item.loc[u_item['reviews'].apply(len) > 0]
a.shape

(779, 25)

In [60]:
# First rows of the movies that have at least one review
a.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,genre_0,genre_1,genre_2,genre_3,genre_4,...,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,reviews
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,[This movie came out when I was three. Now I'm...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,[Really solid entry into the series with Brosn...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,[I know that no movie is perfect but for my mo...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,"[Sigourney Weaver is ""Helen"", a psychologist w..."
6,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,[Really good sci-fi thriller with wonderful pe...


In [59]:
# Movies for which no review was collected
b = u_item.loc[u_item['reviews'].apply(len) == 0]
b.shape

(903, 25)

In [61]:
# Last rows of the movies that have no review
b.tail()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,genre_0,genre_1,genre_2,genre_3,genre_4,...,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,reviews
1676,1677,Sweet Nothing (1995),20-Sep-1996,,http://us.imdb.com/M/title-exact?Sweet%20Nothi...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,[]
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
1681,1682,Scream of Stone (Schrei aus Stein) (1991),08-Mar-1996,,http://us.imdb.com/M/title-exact?Schrei%20aus%...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]


In [52]:

# NOTE(review): this repeats the remove_year helper already defined earlier in
# the notebook; consider keeping a single copy. The stored output of this cell
# ("B. Monkey") cannot come from a bare definition, so a call was presumably
# part of the cell when it was last run.
def remove_year(title):
    """Strip a trailing "(YYYY)" suffix, and the whitespace before it, from `title`."""
    return re.sub(r"\s*\(\d{4}\)$", "", title)


B. Monkey
