# Read the films genres

In [1]:
import pandas as pd

films = pd.read_csv('./dataset/movies.csv')
films.set_index('movieId', inplace=True)
films.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


# Read the Ratings

In [2]:
ratings = pd.read_csv('./dataset/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


<br>
<br>
<br>

# 1. Recomending by Views

### 1.1. Most viewed recently

In [3]:
import time
from datetime import datetime, timedelta

range_time = (datetime.now() - timedelta(days=395)).timestamp()
recent_ratings = ratings[ratings.timestamp >= range_time]

recent_ratings = recent_ratings.groupby(by="movieId").agg({"timestamp": "count"})
recent_ratings.rename(columns={"timestamp": "views"}, inplace=True)
recent_ratings.sort_values(by="views", inplace=True, ascending=False)

recent_ratings_movies = films.merge(recent_ratings, on="movieId", how="inner")
recent_ratings_movies.sort_values(by=["views"], ascending=False, inplace=True)

recent_ratings_movies.head(10)

Unnamed: 0_level_0,title,genres,views
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,3
33794,Batman Begins (2005),Action|Crime|IMAX,2
177765,Coco (2017),Adventure|Animation|Children,2
3793,X-Men (2000),Action|Adventure|Sci-Fi,2
116797,The Imitation Game (2014),Drama|Thriller|War,2
102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX,2
35836,"40-Year-Old Virgin, The (2005)",Comedy|Romance,2
119145,Kingsman: The Secret Service (2015),Action|Adventure|Comedy|Crime,2
91529,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX,2
112552,Whiplash (2014),Drama,2


### 1.2. Top 10 movies according with the Views and Ratings

In [4]:
view_mean = ratings.groupby(by="movieId").agg({"rating": "mean", "timestamp": "count"})
view_mean.rename(columns={"timestamp": "views"}, inplace=True)
view_mean.sort_values(by="views", inplace=True, ascending=False)

first_recomendation = films.merge(view_mean, on="movieId", how="inner")
first_recomendation.sort_values(by=["views", "rating"], ascending=False, inplace=True)

first_recomendation.head(10)

Unnamed: 0_level_0,title,genres,rating,views
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,4.164134,329
318,"Shawshank Redemption, The (1994)",Crime|Drama,4.429022,317
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4.197068,307
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,4.16129,279
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,4.192446,278
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.231076,251
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,3.75,238
110,Braveheart (1995),Action|Drama|War,4.031646,237
589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,3.970982,224
527,Schindler's List (1993),Drama|War,4.225,220


<br>
<br>
<br>

# 2. Same genres watched previously

    The variable bellow represents the movie that user already viewed

In [5]:
USER_WATCHING = "Comedy|Crime"

    Filter according with the history from the user and returns the most visualized movies of the same genres

In [6]:
top_same_genre = first_recomendation[[USER_WATCHING in genre for genre in first_recomendation.genres]]
top_same_genre.head(10)

Unnamed: 0_level_0,title,genres,rating,views
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4.197068,307
608,Fargo (1996),Comedy|Crime|Drama|Thriller,4.116022,181
367,"Mask, The (1994)",Action|Comedy|Crime|Fantasy,3.184713,157
153,Batman Forever (1995),Action|Adventure|Comedy|Crime,2.916058,137
1732,"Big Lebowski, The (1998)",Comedy|Crime,3.924528,106
778,Trainspotting (1996),Comedy|Crime|Drama,4.039216,102
2987,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...,3.572165,97
2502,Office Space (1999),Comedy|Crime,4.090426,94
4027,"O Brother, Where Art Thou? (2000)",Adventure|Comedy|Crime,3.808511,94
4011,Snatch (2000),Comedy|Crime|Thriller,4.155914,93


<br>
<br>
<br>

## 3. Find the most similar persons using rating

In [7]:
import numpy as np

movies_rating = ratings.merge(pd.DataFrame(films["title"]), on="movieId", how="inner")
movies_rating.drop(columns=["movieId", "timestamp"], inplace=True)
users = movies_rating['userId'].drop_duplicates().values

movies_rating = movies_rating.pivot_table(columns=['title'], index='userId', values='rating')

movies_rating.replace({np.nan: 0}, inplace=True)

#### Convert in train objects

In [8]:
X = movies_rating.values
y = movies_rating.index.values

<br>
<br>

#### Modeling

In [9]:
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(X)

<br>
<br>

#### Find near peaple

In [10]:
from IPython.display import HTML, display

target = 98

# Find the most similar
distances, indices = model.kneighbors([X[target]])

display(HTML("<h3>Similar people id:</h3> <h4>{}</h4>".format(str(indices[0].tolist())[1:-1])))

<br>
<br>

### Recomending based in what similar people already watched

In [11]:
mask = ratings.userId.isin(indices[0][1:])

# Filter the reviews by the near people
near_reviews = ratings[mask]

# Group by films
near_reviews = near_reviews.groupby("movieId").agg({"rating": "mean"})

# Remove films already viewed by the user
user_films = ratings[ratings.userId == target].movieId.values

# Check if the user not already watched the movies
mask = near_reviews.index.isin(user_films) == False
near_reviews = near_reviews[mask]
near_reviews.reset_index(inplace=True)

# Merge the movies
movies_views = films.merge(view_mean, on="movieId", how="inner")[["title", "genres", "views"]]
movies_views = movies_views.merge(near_reviews, on="movieId", how="inner")
recommended_movies = movies_views

# Sort descending by rating
recommended_movies.sort_values(["rating", "views"], inplace=True, ascending=False)

recommended_movies.head(20)

Unnamed: 0,movieId,title,genres,views,rating
11,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,204,5.0
90,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,200,5.0
59,590,Dances with Wolves (1990),Adventure|Drama|Western,164,5.0
358,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,149,5.0
18,165,Die Hard: With a Vengeance (1995),Action|Crime|Thriller,144,5.0
29,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,133,5.0
146,2329,American History X (1998),Crime|Drama,129,5.0
237,5349,Spider-Man (2002),Action|Adventure|Sci-Fi|Thriller,122,5.0
171,2918,Ferris Bueller's Day Off (1986),Comedy,109,5.0
338,48516,"Departed, The (2006)",Crime|Drama|Thriller,107,5.0


<br>
<br>


# 4. Recomending after the watch some movie

> To solve that problem i'm considering the rating dataset work like a history of watched movies

#### Configuring

In [130]:
# Proposal film
TARGET_MOVIE = 296

movie_name = films[films.index == TARGET_MOVIE].title.iloc[0]
display(HTML('<h3>Target: <i>{}</i><h3/>'.format(films[films.index == TARGET_MOVIE].title.iloc[0])))

#### Search the next movie

In [122]:
from collections import Counter

ratings.sort_values(by="timestamp", inplace=True)

# Create a counter
movies = Counter()

# Find the user witch already watched
users = ratings[ratings.movieId == TARGET_MOVIE].userId.values

# Pass by each user
for user in users:
    # Find the movies
    user_movies = ratings[ratings.userId == user].movieId.values
    
    # Find the target movie and sum at counter
    for i in range(len(user_movies)):
        if user_movies[i] == TARGET_MOVIE:
            try:
                movies[user_movies[i + 1]] += 1
            except IndexError:
                continue

In [131]:
import pandas as pd

next_movies = pd.DataFrame(list(movies.items()), columns=["movieId", "views"])
next_movies.sort_values(by="views", ascending=False, inplace=True)
next_movies.reset_index(drop=True, inplace=True)

next_movies_titles = next_movies.join(films, on="movieId", how="inner")

next_movies_titles.head(20)

Unnamed: 0,movieId,views,title,genres
0,380,18,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller
1,356,17,Forrest Gump (1994),Comedy|Drama|Romance|War
2,150,16,Apollo 13 (1995),Adventure|Drama|IMAX
3,593,11,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
4,592,10,Batman (1989),Action|Crime|Thriller
5,590,7,Dances with Wolves (1990),Adventure|Drama|Western
6,527,6,Schindler's List (1993),Drama|War
7,2959,6,Fight Club (1999),Action|Crime|Drama|Thriller
8,1089,6,Reservoir Dogs (1992),Crime|Mystery|Thriller
9,1213,6,Goodfellas (1990),Crime|Drama
