# Creating model's Dataset

In [1]:
# Standard library
import re

# Data / NLP / notebook helpers
import numpy as np
import pandas as pd
#import nltk 
from IPython.display import display
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Spark
import pyspark
import pyspark.sql.functions as F
from pyspark import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import Window as W
from pyspark.sql.dataframe import DataFrame
# NOTE(review): the two wildcard imports are kept (and kept last, in their
# original relative order) because later cells may rely on names they expose;
# prefer explicit imports once those usages are known.
from pyspark.sql.types import *
from pyspark.sql import *

# Single Spark session shared by the whole notebook; it is handed to the
# feature-engineering library later on.
spark = SparkSession.builder.getOrCreate()

# `sc` used to be bound to the SparkContext *class* rather than to a running
# context; bind it to the context that backs the session created above.
sc = spark.sparkContext

## Import datasets

In [3]:
# Tag-genome scores: one row per (movieId, tagId) pair with the relevance of
# that tag for that movie. Ids are read as strings so they are never treated
# as numbers.
genome_scores = pd.read_csv('data/genome_scores.csv', header=0, dtype={"tagId": "str", "movieId": "str"})
display(genome_scores.head())

# Catalogue mapping each tagId to its tag text.
genome_tags = pd.read_csv('data/genome_tags.csv', header=0, dtype={"tagId": "str", "tag": "str"})
display(genome_tags.head())

# Average relevance of all tags of each movie, broadcast back onto every row.
# `relevance` is selected explicitly: calling .mean() on the whole grouped
# frame would also try to average the string column `tagId`, which raises a
# TypeError on pandas >= 2.0.
genome_scores = genome_scores.assign(
    avg_relevance=genome_scores.groupby('movieId')['relevance'].transform('mean')
)

# Keep only the tags whose relevance reaches the movie's own average, then
# attach the tag text.
genome_scores_most_relevant = genome_scores[genome_scores.relevance >= genome_scores.avg_relevance]
genome_tags_relevance = pd.merge(genome_scores_most_relevant, genome_tags, how='left', on=['tagId'])
display(genome_tags_relevance)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


Unnamed: 0,movieId,tagId,relevance,avg_relevance,tag
0,1,6,0.21700,0.160223,1950s
1,1,8,0.26275,0.160223,1970s
2,1,9,0.26200,0.160223,1980s
3,1,11,0.57700,0.160223,3d
4,1,13,0.18800,0.160223,80s
...,...,...,...,...,...
3571640,131170,1109,0.20875,0.162884,wilderness
3571641,131170,1110,0.47125,0.162884,wine
3571642,131170,1116,0.31125,0.162884,women
3571643,131170,1123,0.64375,0.162884,writers


In [2]:
# User-generated tags: (userId, movieId, tag, timestamp). Ids are kept as strings.
tag = pd.read_csv('data/tag.csv', header=0, parse_dates=['timestamp'], dtype={"userId": "str", "movieId": "str"})
display(tag.head())

# User ratings: (userId, movieId, rating, timestamp).
rating = pd.read_csv('data/rating.csv', header=0, parse_dates=['timestamp'], dtype={"userId": "str", "movieId": "str"})
display(rating.head())

# Pre-computed join of the genome datasets.
# NOTE(review): no visible cell of this notebook writes this pickle, so a fresh
# checkout needs the file to be produced beforehand. Unpickling executes
# arbitrary code, so only load a file you created yourself.
BASE = "genome_tags_relevance"
genome_tags_relevance = pd.read_pickle("data/" + BASE + ".pkl")
display(genome_tags_relevance)

# Movie catalogue: (movieId, title, genres).
movie = pd.read_csv('data/movie.csv', header=0, dtype={"title": "str", "movieId": "str"})
display(movie.head())

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,2009-04-24 18:19:40
1,65,208,dark hero,2013-05-10 01:41:18
2,65,353,dark hero,2013-05-10 01:41:19
3,65,521,noir thriller,2013-05-10 01:39:43
4,65,592,dark hero,2013-05-10 01:41:18


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


Unnamed: 0,movieId,tagId,relevance,avg_relevance,tag
0,1,6,0.21700,0.160223,1950s
1,1,8,0.26275,0.160223,1970s
2,1,9,0.26200,0.160223,1980s
3,1,11,0.57700,0.160223,3d
4,1,13,0.18800,0.160223,80s
...,...,...,...,...,...
3571640,131170,1109,0.20875,0.162884,wilderness
3571641,131170,1110,0.47125,0.162884,wine
3571642,131170,1116,0.31125,0.162884,women
3571643,131170,1123,0.64375,0.162884,writers


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## Cleaning, filters & feature engineering

For these tasks we'll use my own library, developed for this project. Its name comes from the fact that 'tasting' sounds a little bit like 'testing', which is what we are 'mathematically' doing in this case, and from my love for movies, which compares only with the joy someone gets from their favorite food: when you have a fine steak or a couple of cochinita pibil tacos in front of you, you don't just eat it, you have been tasting its value long before the first bite... same with movies :)

<br> 

- The package documentation will be released soon. In the meantime, I hope the following explanation, the outputs of the method executions and the code itself are useful; of course, any comments, questions or issues sent to this repo or to my personal e-mail are welcome.

- Any contribution / pull request will be highly valued.

In [3]:
import tastingmovies

**Init the class `prepmovies` with the next parameters:**

- The original datasets given: `movie, rating, tag`

- `genome_tags_relevance`: It's just the join of the genome datasets.

- `minimun_wordfreq`: The minimum frequency required for the words in the tags. It depends on how many words we are willing to lose in order to discard words that do not appear often enough and would just be noise. In this case, to shorten the reaction time and avoid spending too much time waiting on processing (since I decided to focus on the nature of the features and on engineering the new ones), I chose a 'big' number. It costs an important part of the dataset, but it works perfectly well to show the concept of the solution.

- `minimun_to_hr`: The minimum rating from which we will consider a movie as highly rated (or, in more casual words, approved by the user).


**Assumptions:**

- The genome datasets are taken from historical, past times, hence they do not contain information from the future (the time when the ratings take place). For this project we will not use the knowledge gained from training to update the genome datasets; that could be left for future versions.

- With the EDA we identified that many rating timestamps of a user-movie pair do not match the tag timestamp for that same user-movie pair. In general, we'll assume that the moment the movie was watched, and therefore the moment everything needed to rate it was known, was the older one (the smaller timestamp). This was done in order to prevent the model from learning from future information.

- For a movie, we will consider as truly relevant the tags whose relevance is greater than or equal to the average relevance of all the tags associated with that movie.


**Philosophy**

- We want to *'squeeze the juice'* out of the variables as much as possible. Hence, this library will help us create many new features: averages, cumulative averages, indicators of the presence of a characteristic (for example, the relevance of a tag for a movie), interaction variables, etc.

- The objective is to build a kind of **ID** or, even better, a ***DNA*** chain for the user and another for the movie, in order to have the *'genetics'* of each (the user's and the movie's) and, with that, an idea of the tastes and characteristics of each one just before 'they meet' each other. With this we'll train an ML model to learn which user profile would *go out* (give a high rating) with which movie profile.

- The time-dependent variables were built respecting chronology (from past to future): at the moment of training and prediction, the only information known is what does not change over time and what has happened up to 'a second' before the user watches a new movie.





In [4]:
# Build the preparation object from the raw tables loaded above plus the Spark
# session. `minimun_wordfreq` is the minimum frequency for tag words and
# `minimun_to_hr` the minimum rating counted as "high"; both spellings are the
# keyword names used by this call.
pm = tastingmovies.prepmovies(
    tag_init=tag,
    rating=rating,
    movie=movie,
    genome_tags_relevance=genome_tags_relevance,
    minimun_wordfreq=2000,
    minimun_to_hr=4,
    spark=spark,
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jalfredomb/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Ratings table as held by the prep object; in the output below its columns
# appear as `rating_usr` and `timestamp_rating`.
pm.rating

Unnamed: 0,userId,movieId,rating_usr,timestamp_rating
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


In [9]:
# Inspect the text normalizer stored on the prep object (a WordNetLemmatizer,
# per the output); presumably the one behind the "Lemmatization" step logged
# by process_tags01 - TODO confirm.
pm.stemmer

<WordNetLemmatizer>

In [10]:
# Inspect the stop-word set stored on the prep object. The whole set is echoed,
# so the output is long.
pm.stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [5]:
# Join tags with genome relevance, ratings and movie titles. Per the notes
# below this cell: missing relevance is filled with 0.01, and the earliest of
# the tag/rating timestamps per user-movie pair becomes `timestamp_movie`.
pm.join_tags_and_genome_ratings()

join_tags_and_genome_ratings


Unnamed: 0,userId,movieId,timestamp_movie,tag,timestamp,tagId,relevance,avg_relevance,rating_usr,timestamp_rating,title,genres_list,tag_relevance_movie
0,100031,1250,2006-10-04 02:12:54,war,2006-10-04 02:14:24,1096,0.95800,0.169697,5.0,2006-10-04 02:12:54,"Bridge on the River Kwai, The (1957)","[Adventure, Drama, War]",0.95800
1,100074,1701,2012-01-17 02:46:38,Woody Allen,2012-01-17 03:04:49,,,,5.0,2012-01-17 02:46:38,Deconstructing Harry (1997),"[Comedy, Drama]",0.01000
2,100074,1701,2012-01-17 02:46:38,hilarious,2012-01-17 03:04:56,505,0.56175,0.119156,5.0,2012-01-17 02:46:38,Deconstructing Harry (1997),"[Comedy, Drama]",0.56175
3,100074,2338,2012-01-17 03:12:52,white guy with Jamaican/Caribbean accent,2012-01-17 03:12:52,,,,2.0,2012-01-17 03:13:14,I Still Know What You Did Last Summer (1998),"[Horror, Mystery, Thriller]",0.01000
4,100074,2338,2012-01-17 03:12:52,Jack Black,2012-01-17 03:12:57,,,,2.0,2012-01-17 03:13:14,I Still Know What You Did Last Summer (1998),"[Horror, Mystery, Thriller]",0.01000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
391440,99998,1921,2015-02-28 23:57:47,insanity,2015-02-28 23:58:18,547,0.98050,0.188004,4.5,2015-02-28 23:57:47,Pi (1998),"[Drama, Sci-Fi, Thriller]",0.98050
391441,99998,3504,2015-02-28 23:23:21,Sidney Lumet,2015-02-28 23:23:29,,,,5.0,2015-02-28 23:23:21,Network (1976),"[Comedy, Drama]",0.01000
391442,99998,3730,2015-02-28 23:26:12,psychological,2015-02-28 23:26:33,823,0.93900,0.181968,4.0,2015-02-28 23:26:12,"Conversation, The (1974)","[Drama, Mystery]",0.93900
391443,99998,3730,2015-02-28 23:26:12,character study,2015-02-28 23:26:37,194,0.98075,0.181968,4.0,2015-02-28 23:26:12,"Conversation, The (1974)","[Drama, Mystery]",0.98075


Pegando: Tags, Relevance, Rating & Movie

- `tag_relevance = pd.merge(self.tag_init,self.genome_tags_relevance,how='left',on=['tag','movieId'])`

- `tags_rating00 = pd.merge(tag_relevance, self.rating, how="left", on=['userId','movieId']).sort_values(['timestamp','userId','movieId'],ascending=[1,0,0])`

- `tags_rating = tags_rating00[tags_rating00.rating_usr.notnull()]`

- `pre_tag=pd.merge(tags_rating,self.movie.drop(columns=['genres']),how='left',on=['movieId'])`


Rellenar los NA con el valor de 1% bajo la lógica que si no tiene un valor de relevancia asociado es 
porque no es tan relevante para esa película.


- `pre_tag['tag_relevance_movie']  = pre_tag['relevance'].fillna(.01)`

Tomando el timestamp más antiguo para la combinación `movie - user` entre el momento en que se generó el `tag` y el `rating`, ya que no suelen suceder al mismo tiempo; se toma el mínimo para considerar ése como el momento en que la persona vio la película:

- `min_timestamps = pre_tag.groupby(['userId','movieId']).agg({'timestamp':'min','timestamp_rating':'min'}).reset_index()`

- `min_timestamps['timestamp_movie'] = min_timestamps[['timestamp','timestamp_rating']].min(axis=1)`



- `self.tag = pd.merge(min_timestamps[['userId','movieId','timestamp_movie']],pre_tag,on=['userId','movieId'],how='left')`


In [13]:
# Clean the tag text (the steps are logged below), count word frequencies and
# keep only words above `pm.minimun_wordfreq`; per the notes below, this also
# prepares the tables used for the user's and the movie's DNA.
pm.process_tags01()

---Remove all the special characters
------Remove all single characters
---------Remove single characters from the start
------------Substituting multiple spaces with single space
---------------Removing prefixed "b" 
---------------------Converting to Lowercase
------------------------Lemmatization


Contando la frecuencia de las palabras de los tags
- `counts_tags = self.tag.tag_clean.str.split(expand=True).stack().value_counts()`
- `df_counts_tags = counts_tags.to_frame()`
- `df_counts_tags.columns = ['frec_tag']`
- `df_counts_tags['word'] = df_counts_tags.index`
- `df_counts_tags.reset_index(drop=True, inplace=True)`
- `df_counts_tags['num_charac'] = [len(x) for x in df_counts_tags.word]`

Identificando las que sí cumplen con el mínimo de frecuencia estipulada en la creación del objeto (en el ejercicio original fue con 2,000)
- `df_counts_tags01 = df_counts_tags[(df_counts_tags.num_charac>1) & (df_counts_tags.frec_tag>self.minimun_wordfreq)]`

Habrá tags que se queden sin palabras porque ninguna de las suyas supera el umbral establecido. Se eliminan los casos que no tienen palabras válidas y nos quedamos sólo con los que tengan al menos una. Además, preparamos la base que servirá para el ADN del usuario y la que servirá para el ADN de la película

- `tag_sub = self.tag[self.tag.num_words>0][['userId','movieId','timestamp_movie','tags_2consider','rating_usr','relevance']]`
- `tag_sub_movies = self.tag[self.tag.num_words>0][['userId','movieId','title','timestamp_movie','rating_usr','genres_list']]`

In [22]:
# Re-run the tag cleaning with a minimum word frequency of 2,000 and report
# how much of the tag table survives the filter.
pm.minimun_wordfreq = 2000
pm.process_tags01()
# Plain prints instead of a list comprehension executed only for its side
# effects (which also echoed `[None, None, None, None]` as the cell result).
# NOTE(review): pm.tag.shape is printed twice, as in the original cell; the
# second one was probably meant to be a different table - confirm.
print(pm.tag.shape)
print(pm.tag.shape)
print(pm.tag_sub.shape)
# Share of tag rows that still have at least one valid word.
print(round(pm.tag_sub.shape[0] / pm.tag.shape[0], 2))


(391445, 17)
(391445, 17)
(75760, 7)
0.19


[None, None, None, None]

* Con el mínimo de frecuencia por palabra de 2,000 veces, nos queda una base que es apenas un 19% de la base original

* Veamos cómo quedaría si le cambiamos ese parámetro a 200 veces

In [23]:
# Re-run the tag cleaning with a minimum word frequency of 200 and report how
# much of the tag table survives the filter.
pm.minimun_wordfreq = 200
pm.process_tags01()
# Plain prints instead of a list comprehension executed only for its side
# effects (which also echoed `[None, None, None, None]` as the cell result).
# NOTE(review): pm.tag.shape is printed twice, as in the original cell; the
# second one was probably meant to be a different table - confirm.
print(pm.tag.shape)
print(pm.tag.shape)
print(pm.tag_sub.shape)
# Share of tag rows that still have at least one valid word.
print(round(pm.tag_sub.shape[0] / pm.tag.shape[0], 2))

---Remove all the special characters
------Remove all single characters
---------Remove single characters from the start
------------Substituting multiple spaces with single space
---------------Removing prefixed "b" 
---------------------Converting to Lowercase
------------------------Lemmatization
(391445, 17)
(391445, 17)
(281734, 7)
0.72


[None, None, None, None]

* Nos queda una base del 72% de la original

In [112]:
# Re-run the tag cleaning with a minimum word frequency of 500 and report how
# much of the tag table survives the filter.
pm.minimun_wordfreq = 500
pm.process_tags01()
# Plain prints instead of a list comprehension executed only for its side
# effects (which also echoed `[None, None, None, None]` as the cell result).
# NOTE(review): pm.tag.shape is printed twice, as in the original cell; the
# second one was probably meant to be a different table - confirm.
print(pm.tag.shape)
print(pm.tag.shape)
print(pm.tag_sub.shape)
# Share of tag rows that still have at least one valid word.
print(round(pm.tag_sub.shape[0] / pm.tag.shape[0], 2))

---Remove all the special characters
------Remove all single characters
---------Remove single characters from the start
------------Substituting multiple spaces with single space
---------------Removing prefixed "b" 
---------------------Converting to Lowercase
------------------------Lemmatization
(391445, 17)
(391445, 17)
(206574, 7)
0.53


[None, None, None, None]

In [7]:
# Re-run the tag cleaning with a minimum word frequency of 800 and report how
# much of the tag table survives the filter.
pm.minimun_wordfreq = 800
pm.process_tags01()
# Plain prints instead of a list comprehension executed only for its side
# effects (which also echoed `[None, None, None, None]` as the cell result).
# NOTE(review): pm.tag.shape is printed twice, as in the original cell; the
# second one was probably meant to be a different table - confirm.
print(pm.tag.shape)
print(pm.tag.shape)
print(pm.tag_sub.shape)
# Share of tag rows that still have at least one valid word.
print(round(pm.tag_sub.shape[0] / pm.tag.shape[0], 2))

---Remove all the special characters
------Remove all single characters
---------Remove single characters from the start
------------Substituting multiple spaces with single space
---------------Removing prefixed "b" 
---------------------Converting to Lowercase
------------------------Lemmatization
(391445, 17)
(391445, 17)
(159385, 7)
0.41


[None, None, None, None]

In [8]:
# Peek at the filtered tag table: `tags_2consider` holds the words of each tag
# that passed the frequency filter (see the output below).
pm.tag_sub.head(10)

Unnamed: 0,userId,movieId,timestamp_movie,tags_2consider,rating_usr,relevance,I
0,100031,1250,2006-10-04 02:12:54,[war],5.0,0.958,1
4,100074,2338,2012-01-17 03:12:52,[black],2.0,,1
5,100074,2338,2012-01-17 03:12:52,[love],2.0,,1
12,100074,3545,2012-01-17 02:46:50,"[book, based]",4.5,0.36625,1
13,100074,3545,2012-01-17 02:46:50,"[best, oscar, cinematography]",4.5,,1
14,100074,3545,2012-01-17 02:46:50,"[best, oscar]",4.5,,1
16,100074,3545,2012-01-17 02:46:50,[adapted],4.5,,1
17,100074,3545,2012-01-17 02:46:50,[history],4.5,0.225,1
18,100074,3545,2012-01-17 02:46:50,[classic],4.5,0.335,1
20,100074,4148,2012-01-17 02:42:44,[psychology],4.0,0.81075,1


In [26]:
# Tag and rating history of one example user, ordered by the moment each movie
# is considered to have been watched.
display(
    pm.tag
    .loc[
        pm.tag.userId == '100144',
        ['userId', 'movieId', 'genres_list', 'timestamp_movie', 'tag',
         'timestamp', 'timestamp_rating', 'rating_usr', 'title'],
    ]
    .sort_values(by=['userId', 'timestamp_movie'])
)

Unnamed: 0,userId,movieId,genres_list,timestamp_movie,tag,timestamp,timestamp_rating,rating_usr,title
84,100144,2502,"[Comedy, Crime]",2009-05-25 20:05:24,Jennifer Aniston,2009-05-26 19:43:43,2009-05-25 20:05:24,5.0,Office Space (1999)
85,100144,2502,"[Comedy, Crime]",2009-05-25 20:05:24,workplace,2009-05-26 19:43:45,2009-05-25 20:05:24,5.0,Office Space (1999)
86,100144,2502,"[Comedy, Crime]",2009-05-25 20:05:24,comedy,2009-05-26 19:43:47,2009-05-25 20:05:24,5.0,Office Space (1999)
87,100144,2502,"[Comedy, Crime]",2009-05-25 20:05:24,cult film,2009-05-26 19:43:50,2009-05-25 20:05:24,5.0,Office Space (1999)
88,100144,2502,"[Comedy, Crime]",2009-05-25 20:05:24,shenanigans,2009-05-26 19:43:58,2009-05-25 20:05:24,5.0,Office Space (1999)
...,...,...,...,...,...,...,...,...,...
224,100144,66203,"[Comedy, Drama, Romance]",2009-06-13 19:52:39,chick flick,2009-06-13 19:52:59,2009-06-13 19:52:39,1.0,He's Just Not That Into You (2009)
225,100144,66203,"[Comedy, Drama, Romance]",2009-06-13 19:52:39,romance,2009-06-13 19:53:07,2009-06-13 19:52:39,1.0,He's Just Not That Into You (2009)
226,100144,66203,"[Comedy, Drama, Romance]",2009-06-13 19:52:39,romantic comedy,2009-06-13 19:53:13,2009-06-13 19:52:39,1.0,He's Just Not That Into You (2009)
227,100144,66203,"[Comedy, Drama, Romance]",2009-06-13 19:52:39,Scarlett Johansson,2009-06-13 19:53:20,2009-06-13 19:52:39,1.0,He's Just Not That Into You (2009)


### User's DNA

**User's behaviour related with tags through time**

In [9]:
# Pivot the tag words per user-movie pair (indicator + rating per word) and
# accumulate them per user in chronological order, as described below.
pm.cumsum_tags_by_user()

pivoting table with tags + rating


- Lo primero que hace este método es llamar a otro: `pivoting_tag_sub`, el cual:
    - Crea, de entrada, dos tipos de variables por combinación `user-movie`: 
        1. Indicadora de que esa combinación tiene *x* palabra
        2. Rating de esa combinación imputada a esa palabra 

In [30]:
# Word indicators and the rating imputed to each word, for one example user.
display(
    pm.tag_sub_pivot.loc[
        pm.tag_sub_pivot.userId == '100144',
        ['userId', 'movieId', 'war', 'war_rtng', 'comedy', 'comedy_rtng',
         'romance', 'romance_rtng'],
    ]
)

Unnamed: 0,userId,movieId,war,war_rtng,comedy,comedy_rtng,romance,romance_rtng
18,100144,1222,1,4.0,0,0.0,0,0.0
19,100144,1240,0,0.0,0,0.0,0,0.0
20,100144,172,0,0.0,0,0.0,0,0.0
21,100144,2502,0,0.0,1,5.0,0,0.0
22,100144,2700,0,0.0,0,0.0,0,0.0
23,100144,30825,0,0.0,0,0.0,0,0.0
24,100144,3113,0,0.0,0,0.0,0,0.0
25,100144,316,0,0.0,0,0.0,0,0.0
26,100144,33166,0,0.0,0,0.0,0,0.0
27,100144,3527,0,0.0,0,0.0,0,0.0


- Posteriormente, esa tabla se ordena por usuario-timestamp (de la más antigua a la más reciente) y, para cada usuario, se van acumulando las películas en donde ha usado cada tag y sumando los ratings asociados a esas películas donde ha usado esos tags

    + Para un futuro join más simple, se genera el consecutivo de películas vistas por el usuario en orden cronológico que funcionará como un id: `id_movie_user`

<br>

- En estos ejemplos podemos observar cómo se van acumulando esa suma de ratings y el conteo de películas en donde la persona ha usado cada tag

In [32]:
# Running counts and running rating sums per tag word for one example user.
display(
    pm.df_cumsum_tags_by_user.loc[
        pm.df_cumsum_tags_by_user.userId == '100144',
        ['userId', 'movieId', 'id_movie_user', 'war', 'war_rtng', 'comedy',
         'comedy_rtng', 'sci', 'sci_rtng', 'romance', 'romance_rtng'],
    ]
)
# Alternative example user (kept disabled):
#display(pm.df_cumsum_tags_by_user[(pm.df_cumsum_tags_by_user.userId=='99706')][['userId','movieId','id_movie_user','comedy','comedy_rtng','war','war_rtng','hilarious','hilarious_rtng','allen','allen_rtng']])

Unnamed: 0,userId,movieId,id_movie_user,war,war_rtng,comedy,comedy_rtng,sci,sci_rtng,romance,romance_rtng
18,100144,2502,1,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0
19,100144,58162,2,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0
20,100144,52973,3,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0
21,100144,35836,4,0.0,0.0,2.0,9.0,0.0,0.0,0.0,0.0
22,100144,59725,5,0.0,0.0,2.0,9.0,0.0,0.0,1.0,0.5
23,100144,58559,6,0.0,0.0,2.0,9.0,0.0,0.0,1.0,0.5
24,100144,1240,7,0.0,0.0,2.0,9.0,0.0,0.0,1.0,0.5
25,100144,62394,8,0.0,0.0,2.0,9.0,0.0,0.0,1.0,0.5
26,100144,3527,9,0.0,0.0,2.0,9.0,0.0,0.0,1.0,0.5
27,100144,316,10,0.0,0.0,2.0,9.0,1.0,4.5,1.0,0.5


- Eventualmente, con estas cifras (el conteo acumulado y la suma acumulada de ratings) se procederá a generar los ratings promedio acumulados para cada usuario en donde haya utilizado cada tag. 

**User's behaviour related with movies' genre through time**

In [13]:
# Same idea as for the tags, but with movie genres: pivot genres per user-movie
# pair and accumulate them per user in chronological order.
pm.cumsum_genres_by_user()

pivoting ratings + movies´s genres


En general, la misma idea que con los tags:
    
- Lo primero que hace este método es llamar a otro: `pivoting_movies_genres`, el cual:
    - Crea, de entrada, dos tipos de variables por combinación `user-movie`: 
        1. Indicadora de que esa combinación tiene *x* género
        2. Rating de esa combinación imputada a ese género

In [34]:
# Genre indicators and the rating imputed to each genre for one example user,
# oldest movie first. Rows are sorted before the column selection because
# `timestamp_movie` is not among the displayed columns.
display(
    pm.movie_pivot01
    .loc[pm.movie_pivot01.userId == '100144']
    .sort_values(by=['timestamp_movie'])
    [['userId', 'movieId', 'Comedy', 'Comedy_rtng', 'Crime', 'Crime_rtng',
      'Drama', 'Drama_rtng']]
)

Unnamed: 0,userId,movieId,Comedy,Comedy_rtng,Crime,Crime_rtng,Drama,Drama_rtng
21,100144,2502,1,5.0,1,5.0,0,0.0
41,100144,58162,1,0.5,0,0.0,0,0.0
37,100144,52973,1,0.5,0,0.0,1,0.5
29,100144,35836,1,4.0,0,0.0,0,0.0
45,100144,59725,1,0.5,0,0.0,0,0.0
42,100144,58559,0,0.0,1,1.0,1,1.0
19,100144,1240,0,0.0,0,0.0,0,0.0
46,100144,62394,0,0.0,1,1.5,1,1.5
27,100144,3527,0,0.0,0,0.0,0,0.0
25,100144,316,0,0.0,0,0.0,0,0.0


- Posteriormente, esa tabla se ordena por usuario-timestamp (de la más antigua a la más reciente) y, para cada usuario, se van acumulando las películas que pertenecen a cada género y sumando los ratings asociados a esas películas que han tenido esos géneros.
   
    + Para un futuro join más simple, se genera el consecutivo de películas vistas por el usuario en orden cronológico que funcionará como un id: `id_movie_user`

<br>

- En estos ejemplos podemos observar cómo se van acumulando esa suma de ratings y el conteo de películas en donde la persona ha estado en "*contacto*" con ese género

In [36]:
# Running counts and running rating sums per genre for one example user.
display(
    pm.df_cumsum_genres_by_user.loc[
        pm.df_cumsum_genres_by_user.userId == '100144',
        ['userId', 'movieId', 'id_movie_user', 'Comedy', 'Comedy_rtng',
         'Crime', 'Crime_rtng', 'Drama', 'Drama_rtng'],
    ]
)

Unnamed: 0,userId,movieId,id_movie_user,Comedy,Comedy_rtng,Crime,Crime_rtng,Drama,Drama_rtng
18,100144,2502,1,1.0,5.0,1.0,5.0,0.0,0.0
19,100144,58162,2,2.0,5.5,1.0,5.0,0.0,0.0
20,100144,52973,3,3.0,6.0,1.0,5.0,1.0,0.5
21,100144,35836,4,4.0,10.0,1.0,5.0,1.0,0.5
22,100144,59725,5,5.0,10.5,1.0,5.0,1.0,0.5
23,100144,58559,6,5.0,10.5,2.0,6.0,2.0,1.5
24,100144,1240,7,5.0,10.5,2.0,6.0,2.0,1.5
25,100144,62394,8,5.0,10.5,3.0,7.5,3.0,3.0
26,100144,3527,9,5.0,10.5,3.0,7.5,3.0,3.0
27,100144,316,10,5.0,10.5,3.0,7.5,3.0,3.0


- Igualmente, una de las cosas que podemos observar en estos ejemplos es cómo se van acumulando los ratings por género y la cantidad de películas que el usuario ha calificado con *ese género*

In [16]:
# Size (rows, columns) of the per-user cumulative genre table.
pm.df_cumsum_genres_by_user.shape

(78061, 43)

**Final transformations to get the cumulative average ratings for the user**

In [17]:
# Turn the cumulative sums and counts into cumulative average ratings per user
# (one `*_avg_rtng_acum` column per tag word and per genre); per the notes
# below, the division is done in Spark and the result is brought back to pandas.
pm.cumulative_ratings_x_user()

Para esta parte, pasamos las bases a spark para poder generar de manera parametrizada todos los promedios de las variables con las que contemos (tags y géneros) sin tener que hacerlo de manera manual y, de esta forma, pueda ser un proceso que corra de manera automática porque es capaz de adaptarse a las variables con las que se cuente, dependiendo de los requerimientos de frecuencia mínima para considerar a una palabra/tag, válida

- Pasar a spark las bases: 
    - `spark = self.spark`
    - `self.df_cumsum_tags_by_user_sp = spark.createDataFrame(self.df_cumsum_tags_by_user)`
    - `self.df_cumsum_genres_by_user_sp = spark.createDataFrame(self.df_cumsum_genres_by_user)`
    
- Cambiando nombres de campos duplicados:
    - `cols_tags=self.df_cumsum_tags_by_user_sp.columns`
    - `cols_genres=self.df_cumsum_genres_by_user_sp.columns`
    - `cols_genres_in_tags=[x for x in cols_genres if x  in cols_tags or x.lower() in cols_tags]`
    - `self.df_cumsum_genres_by_user_sp = self.df_cumsum_genres_by_user_sp.select('*',*[F.col(a).alias(a+ '_genres') for a in cols_genres_in_tags]).drop(*cols_genres_in_tags)`
    
- Generando todas las variables promedio acumulados para el usuario, tan sencillo como darle y desplegar al data frame la lista de todos los cocientes acumulados del usuario:
    - `list_rtngacum = [F.round(F.col(x+'_rtng')/F.col(x),2).alias(x+'_avg_rtng_acum') for x in self.cols_nortng] + [F.round(F.col(x+'_rtng_genres')/F.col(x+'_genres'),2).alias(x+'_genres_avg_rtng_acum') for x in self.cols_nortng_genres]`
    - `self.df_avgrtng_acum_by_user_pd = df_cumsum_tags_genres_by_user_sp.select('userId','id_movie_user','movieId',*list_rtngacum).toPandas()`
   
<br>

- En el caso de los acumulados por *tag*, una interpretación sería: **en promedio, ¿qué rating ha dado *este usuario* a las películas en donde ha usado *determinado tag*?**, atreviéndonos a ponerlo en palabras aún más simples: **¿qué tanto le han gustado las películas con *esa descripción*?** `(descripción=tag)`

- En el caso de los acumulados por *género*, una interpretación sería: **en promedio, ¿qué rating ha dado *este usuario* a las películas que tienen *determinado género*?**, atreviéndonos a ponerlo en palabras aún más simples: **¿qué tanto le han gustado las películas de *ese género*?**

In [55]:
# Cumulative average ratings of one example user, in viewing order.
(
    pm.df_avgrtng_acum_by_user_pd
    .loc[
        pm.df_avgrtng_acum_by_user_pd.userId == '100144',
        ['userId', 'movieId', 'id_movie_user', 'war_avg_rtng_acum',
         'comedy_avg_rtng_acum', 'sci_avg_rtng_acum', 'romance_avg_rtng_acum',
         'Comedy_genres_avg_rtng_acum', 'Crime_genres_avg_rtng_acum',
         'Drama_genres_avg_rtng_acum'],
    ]
    .sort_values(by=['id_movie_user'])
)

Unnamed: 0,userId,movieId,id_movie_user,war_avg_rtng_acum,comedy_avg_rtng_acum,sci_avg_rtng_acum,romance_avg_rtng_acum,Comedy_genres_avg_rtng_acum,Crime_genres_avg_rtng_acum,Drama_genres_avg_rtng_acum
54540,100144,2502,1,,5.0,,,5.0,5.0,
2432,100144,58162,2,,5.0,,,2.75,5.0,
38223,100144,52973,3,,5.0,,,2.0,5.0,0.5
6338,100144,35836,4,,4.5,,,2.5,5.0,0.5
23950,100144,59725,5,,4.5,,0.5,2.1,5.0,0.5
46763,100144,58559,6,,4.5,,0.5,2.1,3.0,0.75
16873,100144,1240,7,,4.5,,0.5,2.1,3.0,0.75
11116,100144,62394,8,,4.5,,0.5,2.1,2.5,1.0
64614,100144,3527,9,,4.5,,0.5,2.1,2.5,1.0
52160,100144,316,10,,4.5,4.5,0.5,2.1,2.5,1.0


In [59]:
# Count the user-DNA features created so far (columns ending in `_avg_rtng_acum`).
print(
    'Variables nuevas creadas hasta el momento por ADN del usuario  ----->  '
    + str(sum(1 for name in pm.df_avgrtng_acum_by_user_pd.columns if name.endswith('_avg_rtng_acum')))
)

Variables nuevas creadas hasta el momento por ADN del usuario  ----->  144


### Movie's DNA

Here we calculate both types of features: the time-dependent ones and the time-independent ones

- Time dependent:

    - Cumulative average rating
    

- Time independent:

    - The indicators for the movie genre and for the tag's relevance for each particular movie

In [60]:
# Build the movie's DNA: cumulative average rating per movie over time, plus
# the static per-movie tag-relevance and genre catalogues (see the log below).
pm.cumulative_ratings_x_movie_and_rlvnc_genre()

Calculating the cumulative rating per movie through chronology-ratings
Calculating: catalogue/ dictionary by pivoting table with tags + relevance by movie, not userId hence, the relevance is static and does not change with the users neither with time
Calculating: catalogue/ dictionary by pivoting table with genres by movie, since not userId, the presence of a genre is static and does not change with the users neither with time


**El paso a paso**

1. Se crea primero una base ordenada por película y por fecha (de la más antigua a la más reciente). La idea es ir acumulando la historia de la película a través de las reproducciones que los usuarios van haciendo de ella de manera cronológica

     + Se crea el *'equivalente'* al `id_movie_user` pero donde, ahora, la referencia será la película a partir de la cuál se genera el consecutivo de usuarios que la han visto en orden cronológico y que funcionará como un id: `id_user_movie`



In [62]:
print('Calculating the cumulative rating per movie through chronology-ratings')
# One row per (user, movie) viewing, ordered oldest-first within each movie.
subset = (
    pm.tag_sub_movies[['userId', 'movieId', 'timestamp_movie', 'rating_usr']]
    .drop_duplicates(subset=['userId', 'movieId', 'timestamp_movie'])
    .sort_values(by=['movieId', 'timestamp_movie'])
)
# Constant helper column whose running sum numbers the viewers of each movie.
subset['I_for_id'] = 1
gb_subset = subset.groupby(['movieId'])
# The `axis=0` argument was dropped: 0 is the default, and the `axis` keyword
# of the groupby cumulative methods is deprecated in recent pandas releases.
subset['id_user_movie'] = gb_subset['I_for_id'].cumsum()
# Example: chronological viewers of one movie.
subset[subset.movieId == '2502']

Calculating the cumulative rating per movie through chronology-ratings


Unnamed: 0,userId,movieId,timestamp_movie,rating_usr,I_for_id,id_user_movie
229477,4450,2502,2000-01-11 06:46:09,5.0,1,1
372136,9197,2502,2001-01-21 07:07:45,4.0,1,2
355286,86768,2502,2002-09-01 17:21:07,5.0,1,3
361698,88738,2502,2003-04-08 15:38:40,5.0,1,4
65910,12128,2502,2004-02-20 10:44:03,4.5,1,5
...,...,...,...,...,...,...
63222,120937,2502,2014-06-11 11:01:11,3.5,1,79
87921,125206,2502,2014-06-21 06:44:09,4.5,1,80
3120,101767,2502,2014-11-07 02:57:26,4.5,1,81
150416,18642,2502,2014-11-14 07:56:19,4.0,1,82


2. A continuación, se calculan los promedios acumulados de los ratings recibidos en cada película a través del tiempo:

    - `self.df_cumsum_rating_by_movie= subset.drop(columns=['userId','timestamp_movie']).groupby('movieId').expanding().sum().drop(columns=['movieId','id_user_movie']).rename(columns={'I_for_id':'id_user_movie'}).reset_index().drop(columns=['level_1'])
        self.df_cumsum_rating_by_movie['rating_avg_acum_x_movie'] = self.df_cumsum_rating_by_movie['rating_usr'] / self.df_cumsum_rating_by_movie['id_user_movie']`
    - `self.df_avgrtng_acum_by_movie = pd.merge(subset.drop(columns=['I_for_id']),self.df_cumsum_rating_by_movie.rename(columns={'rating_usr':'cumsum_rating_usrs'}),on=['movieId','id_user_movie'],how='left')`
    - Y lo anterior nos arroja lo siguiente:

In [64]:
# Running rating sum, viewer count and cumulative average for one example movie.
pm.df_cumsum_rating_by_movie.loc[lambda frame: frame.movieId == '2502']

Unnamed: 0,movieId,rating_usr,id_user_movie,rating_avg_acum_x_movie
17689,2502,5.0,1.0,5.000000
17690,2502,9.0,2.0,4.500000
17691,2502,14.0,3.0,4.666667
17692,2502,19.0,4.0,4.750000
17693,2502,23.5,5.0,4.700000
...,...,...,...,...
17767,2502,340.5,79.0,4.310127
17768,2502,345.0,80.0,4.312500
17769,2502,349.5,81.0,4.314815
17770,2502,353.5,82.0,4.310976


In [74]:
# Count the features created from the movie's cumulative rating.
print(
    'Variables nuevas creadas hasta el momento por rating acumulado de la película  ----->  '
    + str(sum(1 for name in pm.df_cumsum_rating_by_movie.columns if name.endswith('rating_avg_acum_x_movie')))
)

Variables nuevas creadas hasta el momento por rating acumulado de la película  ----->  1


3. Ahora, para las palabras que resultaron válidas según nuestro mínimo de frecuencia establecido, hacemos un diccionario a nivel película de las relevancias promedio que aportan las palabras de las tags asociadas con *cada película*:

    - La idea con la base de relevancias es que se fueran actualizando conforme llegue un nuevo rating/tag de un usuario a una película y, con las palabras/tags utilizadas, recalcular su importancia. Esa importancia estará en función de algún criterio que, para fines prácticos, podríamos en este caso considerarlo como frecuencia (probablemente estandarizada) de uso de ese tag a lo largo de la historia. De no ser el criterio más atinado, podría buscarse otro que quede más *ad hoc*.
    
    - Como aquí no consideramos los ratings de cada caso, sino que es un diccionario a nivel película, no hay que ordenar nada respecto al tiempo, sino que se considera que la base de la cual se parte es histórica y su última actualización ha sido hasta un *instante* previo a la siguiente visualización de cada película
    
    - Al final, conseguimos la siguiente base:

In [70]:
# Relevance columns of a few tag words for movies '2502' and '1'.
pm.tag_relevance_pivot.loc[
    lambda rlvnc: rlvnc["movieId"].isin(["2502", "1"]),
    ["movieId", "adventure_rlvnc", "animation_rlvnc", "comedy_rlvnc", "crime_rlvnc"],
]

Unnamed: 0,movieId,adventure_rlvnc,animation_rlvnc,comedy_rlvnc,crime_rlvnc
0,1,0.892,0.989614,0.618,0.0
1195,2502,0.0,0.0,0.888411,0.629


In [72]:
# Report how many relevance columns (suffix '_rlvnc') the pivot holds.
print(
    'Variables nuevas creadas hasta el momento por relevancia de la película  ----->  '
    + str(sum(col.endswith('_rlvnc') for col in pm.tag_relevance_pivot.columns))
)

Variables nuevas creadas hasta el momento por relevancia de la película  ----->  116


4. Ahora generamos otro diccionario, el cual sirve para identificar qué géneros aplican a cada película:

    - Como aquí no consideramos los ratings de cada caso, sino que es un diccionario a nivel película, no hay que ordenar nada respecto al tiempo.
    
    - Al final, conseguimos la siguiente base:

In [77]:
# Genre indicator columns for movies '2502' and '1'.
pm.genre_by_movie_pivot.loc[
    lambda genres: genres["movieId"].isin(["2502", "1"]),
    ["movieId", "genre_Adventure", "genre_Animation", "genre_Comedy", "genre_Crime"],
]

Unnamed: 0,movieId,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime
0,1,1,1,1,0
2240,2502,0,0,1,1


In [78]:
# Report how many genre indicator columns (prefix 'genre_') the pivot holds.
print(
    'Variables nuevas creadas hasta el momento por género de la película  ----->  '
    + str(sum(col.startswith('genre_') for col in pm.genre_by_movie_pivot.columns))
)

Variables nuevas creadas hasta el momento por género de la película  ----->  20


5. Finalmente se pegan las 3 bases y se obtiene lo siguiente:

In [83]:
# Joined table (cumulative rating, tag relevances, genre flags) for movies '2502' and '1'.
pm.df_avgrtng_acum_by_movie_and_rlvnc_Igenre.loc[
    lambda merged: merged["movieId"].isin(["2502", "1"]),
    [
        "movieId", "userId", "id_user_movie", "rating_avg_acum_x_movie",
        "adventure_rlvnc", "animation_rlvnc", "comedy_rlvnc", "crime_rlvnc",
        "genre_Adventure", "genre_Animation", "genre_Comedy", "genre_Crime",
    ],
]

Unnamed: 0,movieId,userId,id_user_movie,rating_avg_acum_x_movie,adventure_rlvnc,animation_rlvnc,comedy_rlvnc,crime_rlvnc,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime
0,1,60159,1,5.000000,0.892,0.989614,0.618000,0.000,1,1,1,0
1,1,119367,2,5.000000,0.892,0.989614,0.618000,0.000,1,1,1,0
2,1,48838,3,5.000000,0.892,0.989614,0.618000,0.000,1,1,1,0
3,1,46880,4,4.750000,0.892,0.989614,0.618000,0.000,1,1,1,0
4,1,1741,5,4.600000,0.892,0.989614,0.618000,0.000,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
17767,2502,120937,79,4.310127,0.000,0.000000,0.888411,0.629,0,0,1,1
17768,2502,125206,80,4.312500,0.000,0.000000,0.888411,0.629,0,0,1,1
17769,2502,101767,81,4.314815,0.000,0.000000,0.888411,0.629,0,0,1,1
17770,2502,18642,82,4.310976,0.000,0.000000,0.888411,0.629,0,0,1,1


In [80]:
# Total of the three movie-level feature groups: cumulative rating,
# tag relevances and genre indicators.
print(
    'Variables creadas por ADN  de la película  ----->  '
    + str(
        sum(col.endswith('rating_avg_acum_x_movie') for col in pm.df_cumsum_rating_by_movie.columns)
        + sum(col.endswith('_rlvnc') for col in pm.tag_relevance_pivot.columns)
        + sum(col.startswith('genre_') for col in pm.genre_by_movie_pivot.columns)
    )
)

Variables creadas por ADN  de la película  ----->  137


### The final touch for the final dataset

This is where the method `dataset_for_model` joins all the users' variables and the movies' variables, applying the corresponding `lag` to prevent the model from learning from future information.

In [84]:
# Build the final modelling tables: joins the user-level and movie-level
# variables using lagged values, as described in the surrounding notes.
# Presumably populates `pm.tbl_usrmovie_t0` and `pm.ds_tastingmovies`, which
# the following cells read — confirm against the method's definition.
pm.dataset_for_model()

- Aquí es donde a cada nueva combinación de usuario/película se le pegan:
    
    - El diccionario de géneros (variables indicadoras por cada género) (no depende del tiempo)
    - El diccionario de relevancias por cada palabra/tag asociada a cada película (no depende del tiempo)
    
    <br>
    
    - A través de la variable `id_user_movie_prev = id_user_movie - 1`, se pega el rating promedio acumulado previo de cada película (hasta el usuario inmediato anterior que la vio)
    - Se elimina la primera visualización de cada película ya que no se tiene histórico previo para ese momento
    
    <br>
    
    - El rating promedio acumulado previo por tag de cada película (hasta el usuario inmediato anterior que la vio).
    
    - El rating promedio acumulado previo por género de cada película (hasta el usuario inmediato anterior que la vio).
    
    <br>
    
    - Para la nueva película que el usuario verá (o para el nuevo usuario que verá la película), se construye la variable `high_rating`, que es igual a `1` si el rating que el usuario da es mayor o igual al valor estipulado en el parámetro `minimun_to_hr`. En este caso `minimun_to_hr=4`, por lo que todos los ratings `>=4` dan `high_rating=1` y, en los demás casos, `high_rating=0`.
    
    - Finalmente, obtenemos la base completa y la base únicamente con los campos a utilizar para dividir en entrenamiento y validación:

In [95]:
# Alphabetical listing of every column in the final user/movie table.
sorted(pm.tbl_usrmovie_t0.columns.tolist())

['(no genres listed)_avg_rtng_acum_prev',
 '250_avg_rtng_acum_prev',
 '250_rlvnc',
 'Action_genres_avg_rtng_acum_prev',
 'Adventure_genres_avg_rtng_acum_prev',
 'Animation_genres_avg_rtng_acum_prev',
 'Children_avg_rtng_acum_prev',
 'Comedy_genres_avg_rtng_acum_prev',
 'Crime_genres_avg_rtng_acum_prev',
 'Documentary_avg_rtng_acum_prev',
 'Drama_genres_avg_rtng_acum_prev',
 'Fantasy_genres_avg_rtng_acum_prev',
 'Film-Noir_avg_rtng_acum_prev',
 'Horror_genres_avg_rtng_acum_prev',
 'IMAX_avg_rtng_acum_prev',
 'Musical_avg_rtng_acum_prev',
 'Mystery_avg_rtng_acum_prev',
 'Romance_genres_avg_rtng_acum_prev',
 'Sci-Fi_avg_rtng_acum_prev',
 'Thriller_genres_avg_rtng_acum_prev',
 'War_genres_avg_rtng_acum_prev',
 'Western_avg_rtng_acum_prev',
 'acting_avg_rtng_acum_prev',
 'acting_rlvnc',
 'action_avg_rtng_acum_prev',
 'action_rlvnc',
 'adapted_avg_rtng_acum_prev',
 'adapted_rlvnc',
 'adventure_avg_rtng_acum_prev',
 'adventure_rlvnc',
 'age_avg_rtng_acum_prev',
 'age_rlvnc',
 'alien_avg_rtng_

In [96]:
# Rows of user '100144' with a hand-picked set of columns, ordered by
# `id_movie_user_prev`.
pm.tbl_usrmovie_t0.loc[
    lambda tbl: tbl["userId"] == "100144",
    [
        "userId", "movieId", "id_movie_user_prev", "id_user_movie",
        "rating_usr", "high_rating", "rating_avg_acum_x_movie_prev",
        "adventure_rlvnc", "animation_rlvnc", "comedy_rlvnc", "crime_rlvnc",
        "genre_Adventure", "genre_Animation", "genre_Comedy", "genre_Crime",
        # lagged cumulative-average columns (suffix `_prev`)
        "war_avg_rtng_acum_prev", "comedy_avg_rtng_acum_prev",
        "sci_avg_rtng_acum_prev", "romance_avg_rtng_acum_prev",
        "Comedy_genres_avg_rtng_acum_prev", "Crime_genres_avg_rtng_acum_prev",
        "Drama_genres_avg_rtng_acum_prev",
    ],
].sort_values(by=["id_movie_user_prev"])

Unnamed: 0,userId,movieId,id_movie_user_prev,id_user_movie,rating_usr,high_rating,rating_avg_acum_x_movie_prev,adventure_rlvnc,animation_rlvnc,comedy_rlvnc,...,genre_Animation,genre_Comedy,genre_Crime,war_avg_rtng_acum_prev,comedy_avg_rtng_acum_prev,sci_avg_rtng_acum_prev,romance_avg_rtng_acum_prev,Comedy_genres_avg_rtng_acum_prev,Crime_genres_avg_rtng_acum_prev,Drama_genres_avg_rtng_acum_prev
61187,100144,8531,1,3,1.5,0,3.5,0.0,0.0,0.67605,...,0,1,1,,4.0,,,4.0,4.0,
39830,100144,541,2,73,4.5,1,4.277778,0.0,0.0,0.0,...,0,0,0,,,,,,,3.5
45252,100144,59725,4,6,0.5,0,3.3,0.0,0.0,0.0,...,0,1,0,,,,5.0,4.33,,4.5
15481,100144,2502,8,34,5.0,1,4.333333,0.0,0.0,0.888411,...,0,1,1,,5.0,,,5.0,4.5,3.9
21615,100144,3113,8,4,4.0,1,2.0,0.0,0.0,0.0,...,0,0,0,,5.0,,,5.0,4.25,4.38
25437,100144,3527,9,19,4.5,1,3.722222,0.0,0.0,0.0,...,0,0,0,4.5,,4.83,,4.5,4.63,4.75
51301,100144,6874,12,81,3.5,0,4.0,0.0,0.0,0.0,...,0,0,1,,4.25,4.5,,5.0,4.0,4.38
38043,100144,51662,14,36,0.5,0,3.528571,0.0,0.604,0.0,...,0,0,0,,5.0,,,5.0,4.5,4.0
26059,100144,35836,16,31,4.0,1,3.566667,0.0,0.0,0.86442,...,0,1,0,,4.75,,,4.78,4.81,4.8
49521,100144,66203,18,2,1.0,0,4.0,0.0,0.0,0.99575,...,0,1,0,,5.0,,,4.7,3.88,4.04


**Dataset with id's & timestamp**

In [99]:
# Pickle the full table (ids and timestamp included); the file name carries
# the `minimun_wordfreq` setting held by `pm`.
BASE = f"tbl_usrmovie_t0_{pm.minimun_wordfreq!s}"
pm.tbl_usrmovie_t0.to_pickle(f"data/{BASE}.pkl")
pm.tbl_usrmovie_t0

Unnamed: 0,movieId,id_user_movie,userId,timestamp_movie,rating_usr,genre_(no genres listed),genre_Action,genre_Adventure,genre_Animation,genre_Children,...,Animation_genres_avg_rtng_acum_prev,Comedy_genres_avg_rtng_acum_prev,Crime_genres_avg_rtng_acum_prev,Drama_genres_avg_rtng_acum_prev,Fantasy_genres_avg_rtng_acum_prev,Horror_genres_avg_rtng_acum_prev,Romance_genres_avg_rtng_acum_prev,Thriller_genres_avg_rtng_acum_prev,War_genres_avg_rtng_acum_prev,high_rating
0,1,2,119367,2000-11-21 01:14:51,5.0,0,0,1,1,1,...,5.00,5.00,4.00,4.50,5.00,3.00,5.00,4.25,,1
1,1,3,48838,2002-03-20 19:10:11,5.0,0,0,1,1,1,...,5.00,4.50,4.29,4.62,5.00,5.00,5.00,4.09,5.00,1
2,1,4,46880,2002-11-18 18:34:18,4.0,0,0,1,1,1,...,5.00,5.00,5.00,5.00,5.00,,,5.00,,1
3,1,5,1741,2002-11-27 18:31:23,4.0,0,0,1,1,1,...,4.33,4.21,4.13,4.35,4.33,4.00,4.50,4.17,4.00,1
4,1,6,20388,2003-01-15 06:33:48,5.0,0,0,1,1,1,...,3.29,3.32,3.16,3.41,3.54,2.14,3.33,2.90,3.80,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68076,99946,2,78564,2014-03-20 18:15:07,5.0,0,0,0,0,0,...,,,1.50,2.00,,2.50,,1.50,,1
68077,99996,2,42429,2013-08-06 03:59:59,2.5,0,0,0,0,0,...,3.75,3.64,3.78,3.53,3.94,3.65,3.45,3.73,3.29,0
68078,99996,3,36085,2013-11-26 23:24:02,4.0,0,0,0,0,0,...,3.00,3.41,3.59,3.42,2.94,1.75,3.19,3.49,3.67,1
68079,99996,4,102853,2014-12-09 19:24:36,3.5,0,0,0,0,0,...,4.00,4.17,4.19,4.11,4.38,4.50,4.06,4.32,4.00,0


In [13]:
# Pickle the full table (ids and timestamp included) under a name derived
# from `pm.minimun_wordfreq`.
# NOTE(review): this cell performs the same export as the cell directly above
# it but carries an out-of-order execution count (13 after 99) — confirm
# which of the two runs is current and whether both cells are needed.
BASE = "tbl_usrmovie_t0_{!s}".format(pm.minimun_wordfreq)
pm.tbl_usrmovie_t0.to_pickle("data/{}.pkl".format(BASE))
pm.tbl_usrmovie_t0

Unnamed: 0,movieId,id_user_movie,userId,timestamp_movie,rating_usr,genre_(no genres listed),genre_Action,genre_Adventure,genre_Animation,genre_Children,...,Mystery_avg_rtng_acum_prev,Sci-Fi_avg_rtng_acum_prev,Thriller_avg_rtng_acum_prev,Western_avg_rtng_acum_prev,Action_avg_rtng_acum_prev,Comedy_avg_rtng_acum_prev,Fantasy_avg_rtng_acum_prev,Romance_avg_rtng_acum_prev,War_avg_rtng_acum_prev,high_rating
0,1,2,1741,2002-11-27 18:31:23,4.0,0,0,1,1,1,...,5.00,4.35,4.71,3.00,4.60,4.79,5.00,5.00,5.00,1
1,1,3,20388,2003-01-15 06:33:48,5.0,0,0,1,1,1,...,3.38,3.07,3.00,3.00,2.96,3.42,3.53,3.47,3.80,1
2,1,4,123297,2004-03-30 18:59:56,5.0,0,0,1,1,1,...,,5.00,5.00,,5.00,5.00,5.00,,,1
3,1,5,72073,2005-09-30 12:30:36,5.0,0,0,1,1,1,...,4.33,4.54,4.63,4.50,4.50,4.63,4.69,4.56,4.38,1
4,1,6,86768,2006-01-13 01:55:17,4.0,0,0,1,1,1,...,,5.00,5.00,,5.00,5.00,5.00,5.00,5.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42727,99917,10,134864,2015-01-30 00:58:52,4.0,0,0,0,0,0,...,,5.00,5.00,,,,,5.00,,1
42728,99996,2,42429,2013-08-06 03:59:59,2.5,0,0,0,0,0,...,3.77,3.69,3.84,4.13,3.85,3.71,3.97,3.57,3.50,0
42729,99996,3,36085,2013-11-26 23:24:02,4.0,0,0,0,0,0,...,3.77,3.38,3.58,,3.50,3.50,3.25,3.23,3.50,1
42730,99996,4,102853,2014-12-09 19:24:36,3.5,0,0,0,0,0,...,4.20,4.00,4.43,,4.58,4.14,4.50,4.33,4.00,0


**Dataset without id's & timestamp**    

- Finally, we get our dataset. This one has 42,732 observations, 97 *(102-5)* features and the target variable `high_rating`.

*(this is the one we'll use to train/test the model)*

In [14]:
# Pickle the modelling dataset (no ids / timestamp); the file name carries
# the `minimun_wordfreq` setting held by `pm`.
BASE = f"ds_tastingmovies_{pm.minimun_wordfreq!s}"
pm.ds_tastingmovies.to_pickle(f"data/{BASE}.pkl")
pm.ds_tastingmovies

Unnamed: 0,genre_(no genres listed),genre_Action,genre_Adventure,genre_Animation,genre_Children,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Fantasy,...,Mystery_avg_rtng_acum_prev,Sci-Fi_avg_rtng_acum_prev,Thriller_avg_rtng_acum_prev,Western_avg_rtng_acum_prev,Action_avg_rtng_acum_prev,Comedy_avg_rtng_acum_prev,Fantasy_avg_rtng_acum_prev,Romance_avg_rtng_acum_prev,War_avg_rtng_acum_prev,high_rating
0,0,0,1,1,1,1,0,0,0,1,...,5.00,4.35,4.71,3.00,4.60,4.79,5.00,5.00,5.00,1
1,0,0,1,1,1,1,0,0,0,1,...,3.38,3.07,3.00,3.00,2.96,3.42,3.53,3.47,3.80,1
2,0,0,1,1,1,1,0,0,0,1,...,,5.00,5.00,,5.00,5.00,5.00,,,1
3,0,0,1,1,1,1,0,0,0,1,...,4.33,4.54,4.63,4.50,4.50,4.63,4.69,4.56,4.38,1
4,0,0,1,1,1,1,0,0,0,1,...,,5.00,5.00,,5.00,5.00,5.00,5.00,5.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42727,0,0,0,0,0,0,0,0,0,0,...,,5.00,5.00,,,,,5.00,,1
42728,0,0,0,0,0,1,0,0,1,0,...,3.77,3.69,3.84,4.13,3.85,3.71,3.97,3.57,3.50,0
42729,0,0,0,0,0,1,0,0,1,0,...,3.77,3.38,3.58,,3.50,3.50,3.25,3.23,3.50,1
42730,0,0,0,0,0,1,0,0,1,0,...,4.20,4.00,4.43,,4.58,4.14,4.50,4.33,4.00,0


In [100]:
# Pickle the modelling dataset (no ids / timestamp) under a name derived
# from `pm.minimun_wordfreq`.
# NOTE(review): this cell performs the same export as the cell directly above
# it, with a different execution count (100 vs. 14) and a different row count
# in its output — confirm which run is current and whether both are needed.
BASE = "ds_tastingmovies_{!s}".format(pm.minimun_wordfreq)
pm.ds_tastingmovies.to_pickle("data/{}.pkl".format(BASE))
pm.ds_tastingmovies

Unnamed: 0,genre_(no genres listed),genre_Action,genre_Adventure,genre_Animation,genre_Children,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Fantasy,...,Animation_genres_avg_rtng_acum_prev,Comedy_genres_avg_rtng_acum_prev,Crime_genres_avg_rtng_acum_prev,Drama_genres_avg_rtng_acum_prev,Fantasy_genres_avg_rtng_acum_prev,Horror_genres_avg_rtng_acum_prev,Romance_genres_avg_rtng_acum_prev,Thriller_genres_avg_rtng_acum_prev,War_genres_avg_rtng_acum_prev,high_rating
0,0,0,1,1,1,1,0,0,0,1,...,5.00,5.00,4.00,4.50,5.00,3.00,5.00,4.25,,1
1,0,0,1,1,1,1,0,0,0,1,...,5.00,4.50,4.29,4.62,5.00,5.00,5.00,4.09,5.00,1
2,0,0,1,1,1,1,0,0,0,1,...,5.00,5.00,5.00,5.00,5.00,,,5.00,,1
3,0,0,1,1,1,1,0,0,0,1,...,4.33,4.21,4.13,4.35,4.33,4.00,4.50,4.17,4.00,1
4,0,0,1,1,1,1,0,0,0,1,...,3.29,3.32,3.16,3.41,3.54,2.14,3.33,2.90,3.80,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68076,0,0,0,0,0,0,0,0,1,0,...,,,1.50,2.00,,2.50,,1.50,,1
68077,0,0,0,0,0,1,0,0,1,0,...,3.75,3.64,3.78,3.53,3.94,3.65,3.45,3.73,3.29,0
68078,0,0,0,0,0,1,0,0,1,0,...,3.00,3.41,3.59,3.42,2.94,1.75,3.19,3.49,3.67,1
68079,0,0,0,0,0,1,0,0,1,0,...,4.00,4.17,4.19,4.11,4.38,4.50,4.06,4.32,4.00,0
