In [1]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics 

data_path = {
    'ratings': 'ratings.txt',
    'movies': 'netflix_titles.csv'
}

def load_data(data: str) -> pd.DataFrame:
    df = pd.read_csv(data_path[data], \
                        sep=',', \
                        engine='python')

    return df

In [2]:

def movie_data_treatment(df_movies: pd.DataFrame) -> pd.DataFrame:
    # crea columna de id
    df_movies['idMovie'] = [i for i in range(1, len(df_movies) + 1)]
    # coge las primeras 3952 para tener las mismas que en el otro dataset
    movies_id = [i for i in range(0, 3952)]
    df_movies = df_movies.iloc[movies_id]
    
    return df_movies

def create_rating_matrix(df_movies: pd.DataFrame, df_ratings: pd.DataFrame) -> pd.DataFrame:
    # merge info 
    df = pd.merge(df_ratings, df_movies, on='idMovie', how='left')

    # creates rating matrix
    ratings_matrix = df.pivot( index = "idUser", columns = "idMovie", values = "rating")
    # 0 if film not rated
    ratings_matrix.fillna( 0, inplace = True ) 
    # drops possible duplicated columns
    ratings_matrix = ratings_matrix.loc[:, ~ratings_matrix.columns.duplicated()]
    
    return ratings_matrix




In [3]:


movies = load_data('movies')
ratings = load_data('ratings')
movies = movie_data_treatment(movies)
movies.info()

cols = ['idMovie', 'type', 'director', 'cast',  \
        'release_year', 'listed_in']
movies = movies[cols]


<class 'pandas.core.frame.DataFrame'>
Index: 3952 entries, 0 to 3951
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       3952 non-null   object
 1   type          3952 non-null   object
 2   title         3952 non-null   object
 3   director      2565 non-null   object
 4   cast          3573 non-null   object
 5   country       3413 non-null   object
 6   date_added    3952 non-null   object
 7   release_year  3952 non-null   int64 
 8   rating        3952 non-null   object
 9   duration      3952 non-null   object
 10  listed_in     3952 non-null   object
 11  description   3952 non-null   object
 12  idMovie       3952 non-null   int64 
dtypes: int64(2), object(11)
memory usage: 432.2+ KB


In [4]:
movies

Unnamed: 0,idMovie,type,director,cast,release_year,listed_in
0,1,Movie,Kirsten Johnson,,2020,Documentaries
1,2,TV Show,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",2021,"International TV Shows, TV Dramas, TV Mysteries"
2,3,TV Show,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",2021,"Crime TV Shows, International TV Shows, TV Act..."
3,4,TV Show,,,2021,"Docuseries, Reality TV"
4,5,TV Show,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",2021,"International TV Shows, Romantic TV Shows, TV ..."
...,...,...,...,...,...,...
3947,3948,Movie,Fernando Ayllón,Ricardo Quevedo,2019,Stand-Up Comedy
3948,3949,Movie,"Rocky Soraya, Anggy Umbara","Luna Maya, Herjunot Ali, Teuku Rifnu Wikana, V...",2018,"Horror Movies, International Movies"
3949,3950,TV Show,,"Bassel Khayyat, Daniella Rahme, Bassam Moughne...",2018,"International TV Shows, Romantic TV Shows, TV ..."
3950,3951,Movie,Shelly Chopra Dhar,"Anil Kapoor, Sonam Kapoor, Rajkummar Rao, Juhi...",2019,"Comedies, Dramas, International Movies"


In [5]:


## -- EXPANDE LOS DIRECTORES --
directores_unicos = set()
for directores in movies['director']:
    directores_unicos.update(str(directores).split(', ')[:1])

# Crear un diccionario para mapear los directores a columnas con 0 o 1
directores_dict = {}
for director in directores_unicos:
    directores_dict[director] = movies['director'].apply(lambda x: 1 if director in str(x).split(', ') else 0)

# Concatenar el DataFrame original con las nuevas columnas para cada director
df = pd.concat([movies, pd.DataFrame(directores_dict)], axis=1)

# Eliminar la columna original de directores
df.drop(columns=['director'], inplace=True)
df.to_csv('expanded_directors.csv', index=False, sep='|', encoding='utf_8_sig')



OSError: Cannot save file into a non-existent directory: 'tmp'

In [None]:


## -- EXPANDE CASTING --

actores_unicos = set()
for actores in movies['cast']:
    actores_unicos.update(str(actores).split(', ')[:1])

# Crear un diccionario para mapear los directores a columnas con 0 o 1
actores_dict = {}
for actor in actores_unicos:
    actores_dict[actor] = movies['cast'].apply(lambda x: 1 if actor in str(x).split(', ') else 0)
len(actores_dict.keys())
# Concatenar el DataFrame original con las nuevas columnas para cada director
df = pd.concat([df, pd.DataFrame(actores_dict)], axis=1)

# Eliminar la columna original de directores
df.drop(columns=['cast'], inplace=True)
df.to_csv('expanded_director_cast.csv', index=False, sep='|', encoding='utf_8_sig')



In [7]:

## -- EXPANDE GENRES --
genres_unicos = set()
for genre in movies['listed_in']:
    genres_unicos.update(str(genre).split(', ')[:3])

# Crear un diccionario para mapear los directores a columnas con 0 o 1
genre_dict = {}
for genre in genres_unicos:
    genre_dict[genre] = movies['listed_in'].apply(lambda x: 1 if actor in str(x).split(', ') else 0)
len(genre_dict.keys())
# Concatenar el DataFrame original con las nuevas columnas para cada director
df = pd.concat([df, pd.DataFrame(genre_dict)], axis=1)

# Eliminar la columna original de directores
df.drop(columns=['listed_in'], inplace=True)
df.to_csv('expanded_director_cast_country_genres.csv', index=False, sep='|', encoding='utf_8_sig')

data_merged = pd.merge(df, ratings, how='inner', on='idMovie')

NameError: name 'actor' is not defined