In [None]:
import pandas as pd
import numpy as np        
# Console display settings: show every column, use a 500-character wide
# layout and never wrap a frame's repr across several blocks.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 500),
    ('display.expand_frame_repr', False),
):
    pd.set_option(option_name, option_value)

In [None]:
# Raw inputs, read from the current working directory.
movie_df = pd.read_csv(filepath_or_buffer="movie.csv")
rating_df = pd.read_csv(filepath_or_buffer="rating.csv")

In [None]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of ``dataframe`` to stdout.

    Sections appear in this order: shape, dtypes, first ``head`` rows,
    last ``head`` rows, per-column null counts, and the transposed
    ``describe`` table at the 0/5/50/95/99/100 percent quantiles.

    Args:
        dataframe: pandas DataFrame to summarise.
        head: number of rows shown in both the head and the tail section.

    Returns:
        None.
    """
    quantile_levels = [0, 0.05, 0.50, 0.95, 0.99, 1]
    # Values are wrapped in callables so each banner is printed before its
    # section is evaluated, exactly like a sequence of print statements.
    sections = (
        ("##################### Shape #####################", lambda: dataframe.shape),
        ("##################### Types #####################", lambda: dataframe.dtypes),
        ("##################### Head #####################", lambda: dataframe.head(head)),
        ("##################### Tail #####################", lambda: dataframe.tail(head)),
        ("##################### NA #####################", lambda: dataframe.isnull().sum()),
        ("##################### Quantiles #####################",
         lambda: dataframe.describe(quantile_levels).T),
    )
    for banner, compute in sections:
        print(banner)
        print(compute())

# Print the overview (shape, dtypes, head/tail, null counts, quantiles) of the movie table.
check_df(movie_df)

##################### Shape #####################
(27278, 3)
##################### Types #####################
movieId     int64
title      object
genres     object
dtype: object
##################### Head #####################
   movieId                               title                                       genres
0        1                    Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy
1        2                      Jumanji (1995)                   Adventure|Children|Fantasy
2        3             Grumpier Old Men (1995)                               Comedy|Romance
3        4            Waiting to Exhale (1995)                         Comedy|Drama|Romance
4        5  Father of the Bride Part II (1995)                                       Comedy
##################### Tail #####################
       movieId                          title                    genres
27273   131254   Kein Bund für's Leben (2007)                    Comedy
27274   131256  Feuer, Eis 

In [None]:
# Print the same overview for the rating table.
check_df(rating_df)

##################### Shape #####################
(319682, 4)
##################### Types #####################
userId         int64
movieId        int64
rating       float64
timestamp     object
dtype: object
##################### Head #####################
   userId  movieId  rating            timestamp
0       1        2     3.5  2005-04-02 23:53:47
1       1       29     3.5  2005-04-02 23:31:16
2       1       32     3.5  2005-04-02 23:33:39
3       1       47     3.5  2005-04-02 23:32:07
4       1       50     3.5  2005-04-02 23:29:40
##################### Tail #####################
        userId  movieId  rating            timestamp
319677    2184     1732     4.0  2004-07-25 04:42:16
319678    2184     1747     4.0  2004-07-25 04:45:18
319679    2184     1784     4.0  2004-07-25 04:33:15
319680    2184     1805     1.5  2004-07-25 16:37:52
319681    2184     1889     4.5                  NaN
##################### NA #####################
userId       0
movieId      0
rating   

In [None]:
# Attach movie metadata to every rating; the left join keeps all rating rows.
df = pd.merge(rating_df, movie_df, on="movieId", how="left")

In [None]:
# (rows, columns) of the merged ratings + movies frame.
df.shape

(319682, 6)

In [None]:
def create_user_movie_df(movie_path='movie.csv', rating_path='rating.csv', rare_threshold=100):
    """Build the user x title rating matrix from the raw CSV files.

    The rating file is left-joined to the movie file on ``movieId``; titles
    with ``rare_threshold`` ratings or fewer are discarded, and the remaining
    rows are pivoted into a matrix.

    Args:
        movie_path: CSV with at least ``movieId`` and ``title`` columns.
        rating_path: CSV with at least ``userId``, ``movieId`` and ``rating``
            columns.
        rare_threshold: titles rated this many times or fewer are dropped.

    Returns:
        DataFrame indexed by ``userId`` with one column per remaining title.
        Cells hold the rating (``pivot_table`` averages repeated ratings of
        the same title by the same user) and NaN where the user has not
        rated the title.
    """
    movie = pd.read_csv(movie_path)
    rating = pd.read_csv(rating_path)
    # Use the frames loaded above instead of whatever happens to be bound to
    # notebook-level globals, so the function is self-contained.
    df = rating.merge(movie, how="left", on="movieId")
    # Work on the counts Series directly: wrapping it in a DataFrame and
    # selecting a "title" column breaks on pandas >= 2.0, where the
    # value_counts result is named "count".
    rating_counts = df["title"].value_counts()
    rare_movies = rating_counts[rating_counts <= rare_threshold].index
    common_movies = df[~df["title"].isin(rare_movies)]
    user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")
    return user_movie_df

In [None]:
# User x title rating matrix (rows: userId, columns: title, values: rating).
user_movie_df = create_user_movie_df()

In [None]:
# Draw one user id from the matrix index. The fixed random_state makes the
# draw repeatable across kernel restarts, and .iloc[0] extracts the scalar
# directly (calling int() on a 1-element array is deprecated in NumPy >= 1.25).
# NOTE: the following cells hard-code user 321 rather than reading this value.
int(pd.Series(user_movie_df.index).sample(1, random_state=45).iloc[0])

321

In [None]:
# Single-row slice of the matrix holding the ratings of user 321.
random_user_df = user_movie_df.loc[user_movie_df.index == 321]

In [None]:
# Titles for which the selected user has at least one non-null rating:
# dropping the all-NaN columns leaves exactly those.
movies_watched = random_user_df.dropna(axis=1, how="all").columns.tolist()
movies_watched

['American Beauty (1999)',
 'American History X (1998)',
 'Apocalypse Now (1979)',
 'Braveheart (1995)',
 'City of God (Cidade de Deus) (2002)',
 'Desperado (1995)',
 'Die Hard 2 (1990)',
 'Donnie Darko (2001)',
 'Eternal Sunshine of the Spotless Mind (2004)',
 'Fight Club (1999)',
 'Game, The (1997)',
 'Goodfellas (1990)',
 'Groundhog Day (1993)',
 'Halloween (1978)',
 'Jungle Book, The (1967)',
 'L.A. Story (1991)',
 'Little Miss Sunshine (2006)',
 'Memento (2000)',
 'Men in Black (a.k.a. MIB) (1997)',
 'Outbreak (1995)',
 'Pulp Fiction (1994)',
 'Requiem for a Dream (2000)',
 'Reservoir Dogs (1992)',
 'Risky Business (1983)',
 'Scarface (1983)',
 'Silence of the Lambs, The (1991)',
 'Star Wars: Episode I - The Phantom Menace (1999)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Taxi Driver (1976)',
 'Terminator, The (1984)',
 'Untouchables, The (1987)',
 'Usual Suspects, The (1995)']

In [None]:
# Every user's ratings, restricted to the titles the selected user has rated.
movies_watched_df = user_movie_df.loc[:, movies_watched]

In [None]:
# Per user, how many of the selected user's titles they have rated
# (row-wise count of non-null cells), returned as a two-column frame.
user_movie_count = movies_watched_df.notna().sum(axis=1).reset_index()

In [None]:
user_movie_count = user_movie_count.set_axis(["userId", "movie_count"], axis=1)
# Overlap threshold: 60 percent of the selected user's titles.
perc = (len(movies_watched) * 60) / 100
# Keep the ids of users who rated strictly more than that many of those titles.
users_same_movies = user_movie_count.loc[user_movie_count["movie_count"].gt(perc), "userId"]

In [None]:
# Ids of the users whose overlap with the selected user's titles exceeds the threshold.
users_same_movies

23        24
53        54
57        58
90        91
115      116
        ... 
2105    2107
2111    2113
2149    2151
2156    2158
2169    2171
Name: userId, Length: 126, dtype: int64

In [None]:
# Ratings of the overlapping users plus the selected user, restricted to the
# selected user's titles.
final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies)],
                      random_user_df[movies_watched]])
# The selected user has rated all of their own titles, so they normally pass
# the overlap filter too and would otherwise appear twice. Keep one row per
# user so the correlation matrix has unique labels.
final_df = final_df[~final_df.index.duplicated(keep="first")]

In [None]:
# Pairwise correlation between users (each row of final_df is one user),
# flattened to one row per ordered (user_id_1, user_id_2) pair.
corr_df = final_df.T.corr().unstack().sort_values().to_frame("corr")
corr_df.index.names = ['user_id_1', 'user_id_2']
corr_df = corr_df.reset_index()
# De-duplicate on the user pair, never on the correlation value: dropping
# equal *values* discards distinct pairs that happen to share a correlation
# and keeps an arbitrary orientation of each symmetric pair, so a later
# filter on user_id_1 would silently miss neighbours.
corr_df = corr_df.drop_duplicates(subset=["user_id_1", "user_id_2"])
# A user's correlation with themselves carries no information.
corr_df = corr_df[corr_df["user_id_1"] != corr_df["user_id_2"]].reset_index(drop=True)

In [None]:
# Neighbours of user 321: users whose correlation with 321 is at least 0.65.
# User 321 is excluded explicitly; otherwise the self-correlation of 1.0
# makes the user their own best neighbour and their own ratings dominate
# the recommendations.
top_users = (
    corr_df[(corr_df["user_id_1"] == 321)
            & (corr_df["user_id_2"] != 321)
            & (corr_df["corr"] >= 0.65)][["user_id_2", "corr"]]
    .reset_index(drop=True)
    .sort_values(by='corr', ascending=False)
    # Rename so the frame can be joined to the rating table on userId.
    .rename(columns={"user_id_2": "userId"})
)

In [None]:
# All ratings given by the neighbour users, with each neighbour's correlation attached.
top_users_ratings = pd.merge(
    top_users,
    rating_df.loc[:, ["userId", "movieId", "rating"]],
    on="userId",
    how="inner",
)

In [None]:
# Scale each rating by how strongly its author correlates with the selected user.
top_users_ratings = top_users_ratings.assign(
    weighted_rating=lambda frame: frame['corr'] * frame['rating']
)

In [None]:
# Mean weighted rating per movie, with movieId kept as a regular column.
recommendation_df = top_users_ratings.groupby('movieId', as_index=False)["weighted_rating"].mean()

In [None]:
# Keep movies whose mean weighted rating is at least 3.5, best first.
movies_to_rec = (
    recommendation_df[recommendation_df["weighted_rating"].ge(3.5)]
    .sort_values(by="weighted_rating", ascending=False)
)

In [None]:
# Movies user 321 has already rated must not be recommended back to them.
# The neighbours were selected for having rated those same movies, so
# without this filter they tend to top the list.
already_rated = rating_df.loc[rating_df["userId"] == 321, "movieId"]
(
    movies_to_rec[~movies_to_rec["movieId"].isin(already_rated)]
    .merge(movie_df[["movieId", "title"]], on="movieId")
    .head()
)

Unnamed: 0,movieId,weighted_rating,title
0,1208,5.0,Apocalypse Now (1979)
1,1209,5.0,Once Upon a Time in the West (C'era una volta ...
2,111,5.0,Taxi Driver (1976)
3,2078,4.5,"Jungle Book, The (1967)"
4,2959,4.5,Fight Club (1999)
