###  Created by Luis A. Sanchez-Perez (l.alejandro.2011@gmail.com).
<p><span style="color:green"><b>Copyright &#169;</b> Do not distribute or use without authorization from author.</span></p>

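Generates a movie feature matrix of size `[movies x genres]` in which every genre is a k-hot encoded column (1 if the movie has that genre, 0 otherwise). The movie title and release year are kept as additional columns.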

In [1]:
import collections
import os
import pathlib
import re

import pandas as pd

In [2]:
# Root folder holding the datasets. It can be overridden through the
# DATASETS_DIR environment variable so the notebook runs on other machines;
# when the variable is unset or empty the original location is used.
DATASETS = pathlib.Path(
    os.environ.get('DATASETS_DIR') or '/media/alejand/DatasetsT7/datasets'
)

In [3]:
# Reads the raw movies CSV into a frame indexed by movieId
movies = pd.read_csv(
    DATASETS.joinpath('recommender/movies/ml-20m/movies.csv'),
    index_col='movieId',
)
movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [4]:
# Turns each pipe-delimited genres string into a list holding one str per genre
movies['genres'] = movies['genres'].map(lambda raw: str.split(raw, sep='|'))
movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
2,Jumanji (1995),"[Adventure, Children, Fantasy]"
3,Grumpier Old Men (1995),"[Comedy, Romance]"
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
5,Father of the Bride Part II (1995),[Comedy]


In [5]:
# Flattens the per-movie genre lists and tallies how often each genre occurs
entries = [genre for genres in movies['genres'] for genre in genres]
genres_to_count = collections.Counter(entries)

In [6]:
# Displays how many times each genre appears across all movies
genres_to_count

Counter({'Adventure': 2329,
         'Animation': 1027,
         'Children': 1139,
         'Comedy': 8374,
         'Fantasy': 1412,
         'Romance': 4127,
         'Drama': 13344,
         'Action': 3520,
         'Crime': 2939,
         'Thriller': 4178,
         'Horror': 2611,
         'Mystery': 1514,
         'Sci-Fi': 1743,
         'IMAX': 196,
         'Documentary': 2471,
         'War': 1194,
         'Musical': 1036,
         'Western': 676,
         'Film-Noir': 330,
         '(no genres listed)': 246})

In [7]:
# Builds the two-way mapping between each genre and its position,
# following the order in which genres appear in the counter
index_to_genres = list(genres_to_count)
genres_to_index = {
    genre: position for position, genre in enumerate(index_to_genres)
}

In [8]:
# Displays the movies frame to inspect the genre lists
movies

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
2,Jumanji (1995),"[Adventure, Children, Fantasy]"
3,Grumpier Old Men (1995),"[Comedy, Romance]"
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
5,Father of the Bride Part II (1995),[Comedy]
...,...,...
131254,Kein Bund für's Leben (2007),[Comedy]
131256,"Feuer, Eis & Dosenbier (2002)",[Comedy]
131258,The Pirates (2014),[Adventure]
131260,Rentun Ruusu (2001),[(no genres listed)]


In [9]:
# Shows only the movies whose genre list holds the "(no genres listed)" placeholder
mask = movies['genres'].map(lambda genres: '(no genres listed)' in genres).tolist()
movies[mask]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
83773,Away with Words (San tiao ren) (1999),[(no genres listed)]
83829,Scorpio Rising (1964),[(no genres listed)]
84768,Glitterbug (1994),[(no genres listed)]
86493,"Age of the Earth, The (A Idade da Terra) (1980)",[(no genres listed)]
87061,Trails (Veredas) (1978),[(no genres listed)]
...,...,...
131082,Playground (2009),[(no genres listed)]
131108,The Fearless Four (1997),[(no genres listed)]
131166,WWII IN HD (2009),[(no genres listed)]
131172,Closed Curtain (2013),[(no genres listed)]


In [10]:
# Adds one zero-filled column per genre, in the same order as index_to_genres
movies = movies.assign(**dict.fromkeys(index_to_genres, 0))
movies.head()

Unnamed: 0_level_0,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Jumanji (1995),"[Adventure, Children, Fantasy]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men (1995),"[Comedy, Romance]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Father of the Bride Part II (1995),[Comedy],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Regex capturing a four-digit year written as " (YYYY)" inside the title.
# At least one digit is required so that an empty " ()" is not mistaken for a year.
EXTRACT_YEAR = re.compile(r'\s\(([0-9]{4})\)')
# Splits the raw title into a clean title and an integer year (0 when no year is found).
# Storing every year as int keeps the column homogeneous instead of mixing
# str years with the int fallback.
# NOTE: this cell overwrites movies['title'] with the year-less title, so running
# it a second time finds no year and resets the whole year column to 0.
year = []
title = []
for entry in movies['title']:
    result = EXTRACT_YEAR.search(entry)
    if result:
        # Group 1 holds only the digits; everything before the match is the title
        year.append(int(result.group(1)))
        title.append(entry[:result.start()])
    else:
        year.append(0)
        title.append(entry)
movies['year'] = year
movies['title'] = title

In [12]:
# K-hot encodes genres: sets to 1 the column of every genre listed for a movie.
# A writable copy is requested explicitly because, with pandas Copy-on-Write,
# to_numpy() may return a read-only view that cannot be assigned into.
values_per_movie = movies[index_to_genres].to_numpy(copy=True)
genres_per_movie = movies['genres'].to_numpy()
for row, genres in zip(values_per_movie, genres_per_movie):
    # row is a view of one line of values_per_movie, so this updates the matrix
    row[[genres_to_index[value] for value in genres]] = 1
movies[index_to_genres] = values_per_movie
# The raw genre lists are no longer needed once they are encoded
movies = movies.drop(columns=['genres'])

In [13]:
# Previews the first rows of the encoded movies frame
movies.head()

Unnamed: 0_level_0,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed),year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
2,Jumanji,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
3,Grumpier Old Men,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
4,Waiting to Exhale,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1995
5,Father of the Bride Part II,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


In [14]:
# Saves the encoded movies frame as CSV, creating the output folder first
# because to_csv fails when the target directory does not exist
output_path = pathlib.Path('data/ml-20m/encoded_movies.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
movies.to_csv(output_path)