# Importar librerías

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns



# Cargar el dataset

In [2]:
movies_file = "dataset/movies.csv"
ratings_file = "dataset/ratings.csv"
ratings_df = pd.read_csv(ratings_file)
movies_df = pd.read_csv(movies_file)

DataFrame de los registros de calificaciones("ratings.csv")

In [3]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


DataFrame de los registros de películas("movies.csv")

In [4]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


# Normalizar los DataFrames
Combinar el dataframe 'ratings_df' y 'movies_df' concatenados por el valor de la columna 'movieId'

In [5]:
df = ratings_df.merge(movies_df,on='movieId', how='left')
model_data = df

In [6]:
#Exportar DataFrame a [userId, movieId, rating ]
model_data=model_data.drop(['timestamp','title','genres'],axis =1) #Eliminar 
model_data.to_csv("movies_rating.csv")

# Procesamiento de datos

Convertir Géneros en fila a columnas. Se crea una nueva columna para cada género. Por cada género existente se crea una columna a la que inicialmente se le asignarán valores de 0

In [7]:
#Columna géneros de películas
x = df.genres 
a = list()
for i in x:
    #Dividir la cadena de texto en una lista
    a.append(i.split('|'))
#Crear columnas    
a = pd.DataFrame(a)   
b = a[0].unique()
for i in b:
    #Asignar valor 0 a cada columna
    df[i] = 0  
df.head(1)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Adventure,Comedy,Action,Mystery,Crime,Thriller,Drama,Animation,Children,Horror,Documentary,Sci-Fi,Fantasy,Film-Noir,Western,Musical,Romance,(no genres listed),War
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Se asigna 1 al valor de la columna del género que pertenece a cada película.

In [8]:
for i in b:
    df.loc[df['genres'].str.contains(i), i] = 1
df.head(1)

  


Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Adventure,Comedy,Action,Mystery,Crime,Thriller,Drama,Animation,Children,Horror,Documentary,Sci-Fi,Fantasy,Film-Noir,Western,Musical,Romance,(no genres listed),War
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0


# Verificar y Limpiar Datos
Comprobar si existen valores NaN en el DataFrame

In [9]:
df.isna().sum()

userId                0
movieId               0
rating                0
timestamp             0
title                 0
genres                0
Adventure             0
Comedy                0
Action                0
Mystery               0
Crime                 0
Thriller              0
Drama                 0
Animation             0
Children              0
Horror                0
Documentary           0
Sci-Fi                0
Fantasy               0
Film-Noir             0
Western               0
Musical               0
Romance               0
(no genres listed)    0
War                   0
dtype: int64

Limpiar la fila inconsisitente y verificar nuevamente

In [10]:
df.dropna(inplace = True)
df.isnull().sum()

userId                0
movieId               0
rating                0
timestamp             0
title                 0
genres                0
Adventure             0
Comedy                0
Action                0
Mystery               0
Crime                 0
Thriller              0
Drama                 0
Animation             0
Children              0
Horror                0
Documentary           0
Sci-Fi                0
Fantasy               0
Film-Noir             0
Western               0
Musical               0
Romance               0
(no genres listed)    0
War                   0
dtype: int64

In [11]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Adventure,Comedy,Action,Mystery,Crime,Thriller,Drama,Animation,Children,Horror,Documentary,Sci-Fi,Fantasy,Film-Noir,Western,Musical,Romance,(no genres listed),War
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0


# Eliminar caracteristicas que no influyen en el modelo
Se procede a eliminar 2 características que no se utilizarán.

**timestamp.-** Es el valor de fecha, que no se toma en cuenta para el sistema de recomendación.

**genres.-** Es el valor que indica los géneros a los cuales pertenecen las películas. No se utiliza ya que cada género se encuentra en columnas. 

**title.-** No se elimina del DataFrame, ya que tendrá utilidad para relacionar los resultados de la recomendación con el título de cada película.

In [12]:
df = df.drop(['timestamp','genres','(no genres listed)'],axis =1) #Eliminar características 

# Método de ingeniería de características 

In [13]:
min_rating = min(df["rating"])
max_rating = max(df["rating"])

#df = df.sample(frac=1, random_state=42)

# Conjunto original de los datos
# Crear el emparejamiento tradicional (x, y) 
X = df[["userId", "movieId"]]

#X = df[['userId', 'movieId', 'Adventure', 'Comedy', 'Action', 'Mystery', 'Crime', 'Thriller',   
#       'Drama', 'Animation', 'Children', 'Horror', 'Documentary', 'Sci-Fi', 'Fantasy', 'Film-Noir', 'Western', 'Musical', 'Romance']]

# Normalización Min-Max. 
y = df['rating'].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
y = y.astype('int')# Arregla el error continous



# Dividir el dataset en training and testing 

In [14]:
#Dividir los datos en conjuntos de datos de entrenamiento y prueba.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print("Datos de entrenamiento",X_train.shape, y_train.shape)
print("Datos de prueba",X_test.shape, y_test.shape)

Datos de entrenamiento (70585, 2) (70585,)
Datos de prueba (30251, 2) (30251,)


In [19]:
X_train

Unnamed: 0,userId,movieId
18199,115,1617
59075,385,999
28375,198,589
83847,534,58025
33675,229,34
...,...,...
31617,219,1210
45508,300,2858
37301,249,110297
35943,244,3683


# Selección de características
Mediante una técnica de selección de características, se puede obtener aquellas que son importantes para el modelo KNN. Para algoritmos de clustering se utilizacarácteristicas distintas, las cuales son los géneros de las películas que ya se encuentran en columnas. Como valor de salida para el modelo de aprendizaje supervisado se tomará el rating.


In [15]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(max_depth=7, n_estimators=100)
RF.fit(X_train, y_train)
y_pred_RF= RF.predict(X_test)

df_all = pd.DataFrame()

cols = ['Machine Learning Classification Method',
        'Train Accuracy', 
        'Test Accuracy', 
        "Top 1 Feature Predictor",
        "Top 2 Feature Predictor",
        "Top 3 Feature Predictor",
        "Top 4 Feature Predictor",
        "Top 5 Feature Predictor"]

df_all.loc['Machine Learning Classification Method', 
       'Details'] = 'Random Forest Classifier'
df_all.loc['Train Accuracy', 'Details'] = RF.score(X_train, y_train) 
df_all.loc['Test Accuracy', 'Details'] = RF.score(X_test, y_test)

feature_importances = X.columns[np.argsort(RF.feature_importances_)][-2:]

#df_all.loc['Top 1 Feature Predictor', 'Details'] = feature_importances[2]
df_all.loc['Top 2 Feature Predictor', 'Details'] = feature_importances[1]
df_all.loc['Top 3 Feature Predictor', 'Details'] = feature_importances[0]


display(df_all)

Unnamed: 0,Details
Machine Learning Classification Method,Random Forest Classifier
Train Accuracy,0.871786
Test Accuracy,0.865294
Top 2 Feature Predictor,movieId
Top 3 Feature Predictor,userId


# Generar un archivo plano con el dataset generado despúes haber aplicado el proceso de cleasing o limpiado de dato. 

In [16]:
#Exportar DataFrame a CSV
ruta = "movies_all.csv"
df.to_csv(ruta)
df.head()

Unnamed: 0,userId,movieId,rating,title,Adventure,Comedy,Action,Mystery,Crime,Thriller,Drama,Animation,Children,Horror,Documentary,Sci-Fi,Fantasy,Film-Noir,Western,Musical,Romance,War
0,1,1,4.0,Toy Story (1995),1,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0
1,1,3,4.0,Grumpier Old Men (1995),0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1,6,4.0,Heat (1995),0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,1,50,5.0,"Usual Suspects, The (1995)",0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0
