In [1]:
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
!wget https://files.grouplens.org/datasets/tag-genome-2021/genome_2021.zip
!unzip genome_2021.zip

--2024-07-09 12:30:56--  https://files.grouplens.org/datasets/tag-genome-2021/genome_2021.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1928028583 (1.8G) [application/zip]
Saving to: ‘genome_2021.zip’


2024-07-09 12:31:17 (91.4 MB/s) - ‘genome_2021.zip’ saved [1928028583/1928028583]

Archive:  genome_2021.zip
   creating: movie_dataset_public_final/
  inflating: movie_dataset_public_final/.DS_Store  
  inflating: __MACOSX/movie_dataset_public_final/._.DS_Store  
   creating: movie_dataset_public_final/scores/
  inflating: movie_dataset_public_final/readme.txt  
  inflating: __MACOSX/movie_dataset_public_final/._readme.txt  
   creating: movie_dataset_public_final/processed/
   creating: movie_dataset_public_final/raw/
  inflating: __MACOSX/movie_dataset_public_final/._raw  
   creating: movie_dataset_public_final/p

In [4]:
origin_folder = "./movie_dataset_public_final/raw"
target_folder = "./drive/MyDrive/bc/taggenome"

In [5]:
# movies.json sample:
# {"title": "Toy Story (1995)", "directedBy": "John Lasseter", "starring": "Tim Allen, Tom Hanks, Don Rickles, Jim Varney, John Ratzenberger, Wallace Shawn, Laurie Metcalf, John Morris, R. Lee Ermey, Annie Potts", "avgRating": 3.89146, "imdbId": "0114709", "item_id": 1}
movies = pd.read_json(f"{origin_folder}/metadata_updated.json", lines=True)
print(movies.shape)
movies.head()

(84661, 6)


Unnamed: 0,title,directedBy,starring,avgRating,imdbId,item_id
0,Toy Story (1995),John Lasseter,"Tim Allen, Tom Hanks, Don Rickles, Jim Varney,...",3.89146,114709,1
1,Jumanji (1995),Joe Johnston,"Jonathan Hyde, Bradley Pierce, Robin Williams,...",3.26605,113497,2
2,Grumpier Old Men (1995),Howard Deutch,"Jack Lemmon, Walter Matthau, Ann-Margret , Sop...",3.17146,113228,3
3,Waiting to Exhale (1995),Forest Whitaker,"Angela Bassett, Loretta Devine, Whitney Housto...",2.86824,114885,4
4,Father of the Bride Part II (1995),Charles Shyer,"Steve Martin, Martin Short, Diane Keaton, Kimb...",3.0762,113041,5


In [6]:
# ratings.json sample:
# {"item_id": 5, "user_id": 997206, "rating": 3.0}
ratings = pd.read_json(f"{origin_folder}/ratings.json", lines=True)
print(f'Unique users: {ratings.user_id.nunique()}\nUnique items: {ratings.item_id.nunique()}')
ratings.shape

Unique users: 247383
Unique items: 67873


(28490116, 3)

In [7]:
# Several rows in ratings are duplicates
ratings = ratings.drop_duplicates(subset=['user_id', 'item_id'])
print(f'Unique users: {ratings.user_id.nunique()}\nUnique items: {ratings.item_id.nunique()}')
ratings.shape

Unique users: 247383
Unique items: 67873


(28249191, 3)

In [8]:
# tag_count.json sample:
# {"item_id": 1, "tag_id": 86963, "num": 4}
tag_count = pd.read_json(f"{origin_folder}/tag_count.json", lines=True)
tag_count.shape

(212704, 3)

In [9]:
def extract_year_from_title(movies, ratings):
    """ Extracts the realease year from title and creates a new column containing the year. Removes movies that do not have release year available from movies and ratings """
    movies["title"] = movies["title"].str.strip()
    movies["year"] = movies["title"].str.extract(r"\((\d{4})\)$")
    movies = movies.dropna(subset=["year"])
    movies["year"] = movies["year"].astype(int)
    movies = movies[["title", "year", "directedBy", "starring", "avgRating", "imdbId", "item_id"]]
    ratings = ratings[ratings["item_id"].isin(movies["item_id"])]

    return movies, ratings

# Call the function
movies, ratings = extract_year_from_title(movies, ratings)
print(f'Movies: {movies.shape}')
print(f'Ratings: {ratings.shape}')
movies.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["year"] = movies["year"].astype(int)


Movies: (83992, 7)
Ratings: (28201816, 3)


Unnamed: 0,title,year,directedBy,starring,avgRating,imdbId,item_id
0,Toy Story (1995),1995,John Lasseter,"Tim Allen, Tom Hanks, Don Rickles, Jim Varney,...",3.89146,114709,1
1,Jumanji (1995),1995,Joe Johnston,"Jonathan Hyde, Bradley Pierce, Robin Williams,...",3.26605,113497,2
2,Grumpier Old Men (1995),1995,Howard Deutch,"Jack Lemmon, Walter Matthau, Ann-Margret , Sop...",3.17146,113228,3
3,Waiting to Exhale (1995),1995,Forest Whitaker,"Angela Bassett, Loretta Devine, Whitney Housto...",2.86824,114885,4
4,Father of the Bride Part II (1995),1995,Charles Shyer,"Steve Martin, Martin Short, Diane Keaton, Kimb...",3.0762,113041,5


In [10]:
def get_active_users(movies_df, ratings_df, min_year):
    """ Returns a list of user_ids that have rated movies from min_year """

    item_ids = movies_df[movies_df["year"] >= min_year]["item_id"]
    user_ids = ratings_df[ratings_df["item_id"].isin(item_ids)]["user_id"].unique()
    return user_ids

def filter_active_users(movies_df, ratings_df, tag_count_df, min_year):
    """ Filters out all users who haven't rated movies from at least min_year
    Returns a new DataFrame with only ratings from active users and removes movies aren't rated anymore """

    user_ids = get_active_users(movies_df, ratings_df, min_year)
    ratings_df = ratings_df[ratings_df["user_id"].isin(user_ids)]

    # Remove movies that are not rated by any active users
    item_ids = ratings_df["item_id"].unique()
    movies_df = movies_df[movies_df["item_id"].isin(item_ids)]

    # Remove tags that are not related to any active users
    tag_count_df = tag_count_df[tag_count_df["item_id"].isin(item_ids)]

    return ratings_df, movies_df, tag_count_df

ratings, movies, tag_count = filter_active_users(movies, ratings, tag_count, min_year=2015)
print(f'Movies: {movies.shape}')
print(f'Ratings: {ratings.shape}')
print(f'Unique users: {ratings.user_id.nunique()}\nUnique items: {ratings.item_id.nunique()}')

Movies: (66325, 7)
Ratings: (10076619, 3)
Unique users: 43380
Unique items: 66325


In [11]:
def remove_unpopular_items(ratings_df, movies_df, min_ratings):
    """ Removes unpopular movies with fewer than min_ratings. """
    ratings_per_item = ratings_df.groupby("item_id").size()
    items_to_keep = ratings_per_item[ratings_per_item >= min_ratings].index
    ratings_df = ratings_df[ratings_df["item_id"].isin(items_to_keep)]
    movies_df = movies_df[movies_df["item_id"].isin(items_to_keep)]
    return ratings_df, movies_df

ratings, movies = remove_unpopular_items(ratings, movies, min_ratings=20)
print(f'Movies: {movies.shape}')
print(f'Ratings: {ratings.shape}')
print(f'Unique users: {ratings.user_id.nunique()}\nUnique items: {ratings.item_id.nunique()}')

Movies: (17807, 7)
Ratings: (9853772, 3)
Unique users: 43368
Unique items: 17807


In [12]:
import numpy as np

# Take 1000 users for testing. The users must have rated at least 25 ratings
user_counts = ratings['user_id'].value_counts()
users_with_100_plus = user_counts[user_counts >= 25].index

selected_users = np.random.choice(users_with_100_plus, size=1000, replace=False)

ratings_test = ratings[ratings['user_id'].isin(selected_users)]

ratings_train = ratings[~ratings['user_id'].isin(selected_users)]

# You now have ratings_train and ratings_test as required
print("Training set size:", ratings_train.shape)
print("Testing set size:", ratings_test.shape)
print(f'Train - unique users: {ratings_train.user_id.nunique()}\nUnique items: {ratings_train.item_id.nunique()}')
print(f'Test - unique users: {ratings_test.user_id.nunique()}\nUnique items: {ratings_test.item_id.nunique()}')

Training set size: (9573225, 3)
Testing set size: (280547, 3)
Train - unique users: 42368
Unique items: 17807
Test - unique users: 1000
Unique items: 14504


In [13]:
train_contains_all_movies = ratings_train.item_id.nunique() == ratings.item_id.nunique()
assert train_contains_all_movies

In [14]:
no_users_in_common = set(ratings_train.user_id.unique()).isdisjoint(set(ratings_test.user_id.unique()))
assert no_users_in_common

In [15]:
for i, row in movies.iterrows():
    """ Converts IMDB id to proper format - "tt", followed by a sequence of at least 7 digits long, padded with zeros if necessary
    Secondly, movies starting with "The", "A", "An" have the articles at the end of the title, so we move them back to the beginning
    e.g. "Matrix, The" -> "The Matrix" """
    imdb_id = f'tt{str(row["imdbId"]).zfill(7)}'
    movies.at[i, "imdbId"] = imdb_id


    title = row["title"][:-7]
    year = row["title"][-6:]

    if title.endswith(", The"):
        title = "The " + title[:-5]
    elif title.endswith(", A"):
        title = "A " + title[:-3]
    elif title.endswith(", An"):
        title = "An " + title[:-4]

    movies.at[i, "title"] = title + " " + year

movies.head()

Unnamed: 0,title,year,directedBy,starring,avgRating,imdbId,item_id
0,Toy Story (1995),1995,John Lasseter,"Tim Allen, Tom Hanks, Don Rickles, Jim Varney,...",3.89146,tt0114709,1
1,Jumanji (1995),1995,Joe Johnston,"Jonathan Hyde, Bradley Pierce, Robin Williams,...",3.26605,tt0113497,2
2,Grumpier Old Men (1995),1995,Howard Deutch,"Jack Lemmon, Walter Matthau, Ann-Margret , Sop...",3.17146,tt0113228,3
3,Waiting to Exhale (1995),1995,Forest Whitaker,"Angela Bassett, Loretta Devine, Whitney Housto...",2.86824,tt0114885,4
4,Father of the Bride Part II (1995),1995,Charles Shyer,"Steve Martin, Martin Short, Diane Keaton, Kimb...",3.0762,tt0113041,5


In [16]:
# Add popularity column to movies based on the number of ratings
ratings_count = ratings_train.groupby('item_id').size().reset_index(name='popularity')
movies_with_popularity = pd.merge(movies, ratings_count, on="item_id")
movies_sorted = movies_with_popularity.sort_values(by='popularity', ascending=False)
movies_sorted = movies_sorted[['title', 'year', 'popularity', 'directedBy', 'starring', 'avgRating', 'imdbId', 'item_id']]

movies_sorted.head()

Unnamed: 0,title,year,popularity,directedBy,starring,avgRating,imdbId,item_id
2117,The Matrix (1999),1999,28693,"Andy Wachowski, Larry Wachowski","Laurence Fishburne, Keanu Reeves, Hugo Weaving...",4.15952,tt0133093,2571
288,The Shawshank Redemption (1994),1994,27201,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",4.41985,tt0111161,318
10961,Inception (2010),2010,26153,Christopher Nolan,"Leonardo DiCaprio, Ken Watanabe, Joseph Gordon...",4.17404,tt1375666,79132
325,Forrest Gump (1994),1994,24417,Robert Zemeckis,"Tom Hanks, Gary Sinise, Mykelti Williamson, Ro...",4.06633,tt0109830,356
5856,The Lord of the Rings: The Return of the King ...,2003,23997,Peter Jackson,"Sean Astin, Ian McKellen, Viggo Mortensen, Eli...",4.10393,tt0167260,7153


In [18]:
ratings_train.to_json(f"{target_folder}/ratings.json", orient="records", lines=True)
ratings_test.to_json(f"{target_folder}/ratings_test.json", orient="records", lines=True)
movies_sorted.to_json(f"{target_folder}/movies.json", orient="records", lines=True)
tag_count.to_json(f"{target_folder}/tag_count.json", orient="records", lines=True)