# Read the films genres

In [1]:
import pandas as pd

films = pd.read_csv('./dataset/movies.csv')
films.set_index('movieId', inplace=True)
films.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


# Read the Ratings

In [2]:
ratings = pd.read_csv('./dataset/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


<br>
<br>
<br>

# 1. Recomending by Views

#### Get the movie titles with the mean rating and views

In [3]:
import time
from datetime import datetime, timedelta

def movieRatingViews(days_ago=0):
    if days_ago != 0:
        range_time = (datetime.now() - timedelta(days=days_ago)).timestamp()
        recent_ratings = ratings[ratings.timestamp >= range_time]
    else:
        recent_ratings = ratings

    view_rating = recent_ratings.groupby(by="movieId").agg({"rating": "mean", "timestamp": "count"})
    view_rating.rename(columns={"timestamp": "views"}, inplace=True)

    movies_views = films.merge(view_rating, on="movieId", how="inner")
    movies_views.sort_values(by=["views", "rating"], ascending=False, inplace=True)
    
    return movies_views

### 1.2. Top 10 movies according with the Views and Ratings

In [4]:
movies = movieRatingViews()
movies.sort_values(by=["views", "rating"], ascending=False, inplace=True)

movies.head(10)

Unnamed: 0_level_0,title,genres,rating,views
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,4.164134,329
318,"Shawshank Redemption, The (1994)",Crime|Drama,4.429022,317
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4.197068,307
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,4.16129,279
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,4.192446,278
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.231076,251
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,3.75,238
110,Braveheart (1995),Action|Drama|War,4.031646,237
589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,3.970982,224
527,Schindler's List (1993),Drama|War,4.225,220


### 1.1. Most viewed recently

In [5]:
TARGET_DAYS_AGO = 395

# Get and filter the previous days
recent_ratings_movies = movieRatingViews(days_ago=TARGET_DAYS_AGO)
recent_ratings_movies.head(10)

Unnamed: 0_level_0,title,genres,rating,views
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,5.0,2
112552,Whiplash (2014),Drama,5.0,2
109374,"Grand Budapest Hotel, The (2014)",Comedy|Drama,4.75,2
148626,"Big Short, The (2015)",Drama,4.75,2
122916,Thor: Ragnarok (2017),Action|Adventure|Sci-Fi,4.5,2
177765,Coco (2017),Adventure|Animation|Children,4.5,2
79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,4.25,2
106782,"Wolf of Wall Street, The (2013)",Comedy|Crime|Drama,4.25,2
106920,Her (2013),Drama|Romance|Sci-Fi,4.25,2
119145,Kingsman: The Secret Service (2015),Action|Adventure|Comedy|Crime,4.25,2


<br>
<br>
<br>

# 2. Find by the the same genres

    The variable bellow represents the movie that user already viewed

In [6]:
LAST_MOVIE = "Comedy|Crime"

    Filter according with the history from the user and returns the most visualized movies of the same genres

In [7]:
def moviesByGenres(last_genre):
    movies = movieRatingViews()
    
    movies.sort_values(by=["views", "rating"], ascending=False, inplace=True)
    movies.reset_index(inplace=True)

    return movies[movies.genres == last_genre]

In [8]:
moviesByGenres(LAST_MOVIE).head(20)

Unnamed: 0,movieId,title,genres,rating,views
116,1732,"Big Lebowski, The (1998)",Comedy|Crime,3.924528,106
150,2502,Office Space (1999),Comedy|Crime,4.090426,94
227,69122,"Hangover, The (2009)",Comedy|Crime,3.631579,76
247,1079,"Fish Called Wanda, A (1988)",Comedy|Crime,3.922535,71
292,1234,"Sting, The (1973)",Comedy|Crime,3.976562,64
300,4025,Miss Congeniality (2000),Comedy|Crime,3.054688,64
437,910,Some Like It Hot (1959),Comedy|Crime,4.01,50
580,3301,"Whole Nine Yards, The (2000)",Comedy|Crime,3.52381,42
615,762,Striptease (1996),Comedy|Crime,2.414634,41
641,2580,Go (1999),Comedy|Crime,3.961538,39


<br>
<br>
<br>

## 3. Find the most similar persons using rating

#### Create a method to get the near people based on the watched films

In [9]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

def findNearPersons(user, n_people=5):
    #####################
    # EXTRACT DUMMIES
    movies_rating = ratings.merge(pd.DataFrame(films["title"]), on="movieId", how="inner")
    movies_rating.drop(columns=["movieId", "timestamp"], inplace=True)
    users = movies_rating['userId'].drop_duplicates().values

    movies_rating = movies_rating.pivot_table(columns=['title'], index='userId', values='rating')

    movies_rating.replace({np.nan: 0}, inplace=True)
    #####################
    
    X = movies_rating.values
    
    #####################
    # Create a model KNN
    model = NearestNeighbors(n_neighbors=n_people, algorithm='ball_tree').fit(X)
    #####################
    
    # Get the near people
    distances, indices = model.kneighbors([X[movies_rating.index.get_loc(user)]])
    
    return (distances[0], indices[0],)
    

In [10]:
def moviesByNearPeople(user, n_people=5):
    # Find the people in commom
    distances, indices = findNearPersons(TARGET_USER, n_people=n_people)
    
    # Get the movies
    movies = movieRatingViews()
    movies.reset_index(inplace=True)

    # Filter the reviews by the near people
    mask = ratings.userId.isin(indices)
    moviesId = ratings[mask]

    # Remove the duplicates
    moviesId = moviesId.movieId.drop_duplicates()

    # Remove films already viewed by the user
    user_films = ratings[ratings.userId == TARGET_USER].movieId.values # Get the user movies id
    mask = moviesId.isin(user_films) == False # Check if not exists in user_films
    moviesId = moviesId[mask]


    # Filtering and sort the movies
    return movies[movies.movieId.isin(moviesId.values)].sort_values(by=["views", "rating"], ascending=False)

In [11]:
from IPython.display import display, HTML

# Define the actual user
TARGET_USER = 98
display(HTML('<h3>Target: <i>{}</i><h3/>'.format(TARGET_USER)))

# Near people
_, indices = findNearPersons(TARGET_USER)
display(HTML('<h3>Near people: <i>{}</i><h3/>'.format(indices)))

In [12]:
moviesByNearPeople(user=TARGET_USER).head(20)

Unnamed: 0,movieId,title,genres,rating,views
1,318,"Shawshank Redemption, The (1994)",Crime|Drama,4.429022,317
2,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4.197068,307
4,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,4.192446,278
5,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.231076,251
6,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,3.75,238
7,110,Braveheart (1995),Action|Drama|War,4.031646,237
8,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,3.970982,224
9,527,Schindler's List (1993),Drama|War,4.225,220
10,2959,Fight Club (1999),Action|Crime|Drama|Thriller,4.272936,218
12,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,4.21564,211


<br>
<br>


# 4. Recomending after the watch some movie

> To solve that problem i'm considering the rating dataset work like a history of watched movies

#### Configuring

In [13]:
from IPython.display import display, HTML

# Proposal film
TARGET_MOVIE = 296

movie_name = films[films.index == TARGET_MOVIE].title.iloc[0]
display(HTML('<h3>Target: <i>{}</i><h3/>'.format(films[films.index == TARGET_MOVIE].title.iloc[0])))

#### Search the next movie

In [14]:
import pandas as pd
from collections import Counter

def peopleWatchedAfter(target):
    ratings.sort_values(by="timestamp", inplace=True)

    # Create a counter
    movies = Counter()

    # Find the user witch already watched
    users = ratings[ratings.movieId == TARGET_MOVIE].userId.values

    # Pass by each user
    for user in users:
        # Find the movies
        user_movies = ratings[ratings.userId == user].movieId.values

        # Find the target movie and sum at counter
        for i in range(len(user_movies)):
            if user_movies[i] == target:
                try:
                    movies[user_movies[i + 1]] += 1
                except IndexError:
                    continue
    
    next_movies = pd.DataFrame(list(movies.items()), columns=["movieId", "views"])
    next_movies.sort_values(by="views", ascending=False, inplace=True)
    next_movies.reset_index(drop=True, inplace=True)

    movies = movieRatingViews()
    
    return next_movies.join(movies[["rating", "title"]], on="movieId", how="inner")

In [15]:
watch_next_movies = peopleWatchedAfter(TARGET_MOVIE)
watch_next_movies.sort_values(by=["views", "rating"], ascending=False, inplace=True)
watch_next_movies.head(20)

Unnamed: 0,movieId,views,rating,title
0,380,21,3.497191,True Lies (1994)
1,150,19,3.845771,Apollo 13 (1995)
2,356,17,4.164134,Forrest Gump (1994)
3,593,12,4.16129,"Silence of the Lambs, The (1991)"
4,592,9,3.428571,Batman (1989)
6,2959,6,4.272936,Fight Club (1999)
5,1213,6,4.25,Goodfellas (1990)
7,1089,6,4.20229,Reservoir Dogs (1992)
8,344,6,3.040373,Ace Ventura: Pet Detective (1994)
9,50,5,4.237745,"Usual Suspects, The (1995)"


<br>
<br>

# 5. Watch the movies again

#### Configuring

In [17]:
from IPython.display import display, HTML

# Proposal film
TARGET_USER = 98

display(HTML('<h3>User Target: <i>{}</i><h3/>'.format(TARGET_USER)))

In [104]:
def moviesWatchAgain(user, batch_size=10, selection_size=None):
    already_watched = ratings[ratings.userId == user][["movieId", "rating", "timestamp"]]
    already_watched.sort_values(by=["rating"], ascending=False, inplace=True) # Chorological sort
    
    movies = movieRatingViews()[["title"]]
    
    movies = already_watched.join(movies, on="movieId", how="inner")
    movies.reset_index(drop=True, inplace=True)
    
    selection = movies[movies.index < (batch_size * 2 if selection_size is None else selection_size)]
    
    batch_size = min(batch_size, selection.shape[0])
    
    return selection.sample(batch_size)

In [107]:
moviesWatchAgain(TARGET_USER, batch_size=20)

Unnamed: 0,movieId,rating,timestamp,title
15,74458,5.0,1532457910,Shutter Island (2010)
39,81847,4.5,1532457483,Tangled (2010)
6,1197,5.0,1532457921,"Princess Bride, The (1987)"
29,1214,4.5,1532457890,Alien (1979)
12,50872,5.0,1532458002,Ratatouille (2007)
5,1258,5.0,1532457919,"Shining, The (1980)"
35,168250,4.5,1532457987,Get Out (2017)
10,122916,5.0,1532457746,Thor: Ragnarok (2017)
22,97921,5.0,1532457833,Silver Linings Playbook (2012)
36,104841,4.5,1532457992,Gravity (2013)
