# Imorting Files

In [1]:
import pandas as pd

In [2]:
links = pd.read_csv('csv_files/links.csv')

In [3]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
movies = pd.read_csv('csv_files/movies.csv')

In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [7]:
ratings = pd.read_csv('csv_files/ratings.csv')

In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [10]:
ratings['rating'].value_counts(normalize=True)

4.0    0.265957
3.0    0.198808
5.0    0.131015
3.5    0.130271
4.5    0.084801
2.0    0.074884
2.5    0.055040
1.0    0.027877
1.5    0.017762
0.5    0.013586
Name: rating, dtype: float64

In [11]:
tags =  pd.read_csv('csv_files/tags.csv')

In [12]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [13]:
new_df = ratings.drop(columns='timestamp')

# Recommendation System

In [14]:
from surprise import Reader, Dataset
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [15]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(new_df[['userId', 'movieId', 'rating']], reader)

## Chosing Prediction Algorithms

In [16]:

benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')  

KeyboardInterrupt: 

For time constraints the model will be using BaselineOnly

## Fine Tunning Parameters

In [17]:
# Fine tuning paramaters
param_grid = {'bsl_options': {'method': ['als', 'sgd'],
                             'reg': [0.2, 0.4, 0.6],
                             'reg_i': [5,10,15],
                             'reg_u' : [10,15,20],
                             'learning_rate': [0.003, 0.005, 0.008],
                             'n_epochs': [75,100,125]}}

In [18]:
gs = GridSearchCV(BaselineOnly, param_grid = param_grid , measures=['rmse'], cv=3)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


KeyboardInterrupt: 

In [19]:
print('Using ALS')
bsl_options = {'method': 'als',
               'reg': 0.2,
               'n_epochs': 100,
               'reg_u': 10,
               'reg_i': 5
               }
algo = BaselineOnly(bsl_options=bsl_options)
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

Using ALS
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_rmse': array([0.86999141, 0.86495343, 0.87325869]),
 'fit_time': (1.7239274978637695, 1.7334814071655273, 1.8169958591461182),
 'test_time': (0.5612306594848633, 0.39148998260498047, 0.4154846668243408)}

In [20]:
trainset, testset = train_test_split(data, test_size=0.25)
algo = BaselineOnly(bsl_options=bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 0.8764


0.876361393188517

## Top 5 Recommendations

In [21]:
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['est'] =  round(df['est'] * 2) / 2
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)

In [22]:
df = pd.merge(df, movies, left_on = 'iid', right_on = 'movieId')

In [23]:
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [24]:
 df.loc[df['uid'] == 1].sort_values(by='err').head()

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err,movieId,title,genres
162,1,592,4.0,4.0,{'was_impossible': False},169,130,0.0,592,Batman (1989),Action|Crime|Thriller
13839,1,2273,4.0,4.0,{'was_impossible': False},169,29,0.0,2273,Rush Hour (1998),Action|Comedy|Crime|Thriller
10301,1,2012,4.0,4.0,{'was_impossible': False},169,68,0.0,2012,Back to the Future Part III (1990),Adventure|Comedy|Sci-Fi|Western
8942,1,1222,5.0,5.0,{'was_impossible': False},169,74,0.0,1222,Full Metal Jacket (1987),Drama|War
8854,1,2542,5.0,5.0,{'was_impossible': False},169,50,0.0,2542,"Lock, Stock & Two Smoking Barrels (1998)",Comedy|Crime|Thriller


# Cold Start #1

User rates list of movies and is given recommendations based on input

In [25]:
def movie_rater(movie_df,num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            movie = movie_df[movie_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movie_df.sample(1)
        print(movie)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return rating_list     

In [26]:
user_rating = movie_rater(movies, 4, 'Action')

      movieId            title            genres
2573     3441  Red Dawn (1984)  Action|Drama|War
How do you rate this movie on a scale of 1-5, press n if you have not seen :

      movieId                                 title         genres
2113     2807  Universal Soldier: The Return (1999)  Action|Sci-Fi
How do you rate this movie on a scale of 1-5, press n if you have not seen :

      movieId          title                       genres
6087    41997  Munich (2005)  Action|Crime|Drama|Thriller
How do you rate this movie on a scale of 1-5, press n if you have not seen :

      movieId               title         genres
1987     2641  Superman II (1980)  Action|Sci-Fi
How do you rate this movie on a scale of 1-5, press n if you have not seen :



In [27]:
new_ratings_df = new_df.append(user_rating,ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df,reader)

ValueError: could not convert string to float: 

In [None]:
# train a model using the new combined DataFrame
baseline = BaselineOnly(bsl_options=bsl_options)
baseline.fit(new_data.build_full_trainset())

In [None]:
# make predictions for the user
# you'll probably want to create a list of tuples in the format (movie_id, predicted_score)
list_of_movies = []
for m_id in new_df['movieId'].unique():
    list_of_movies.append( (m_id,baseline.predict(1000,m_id)[3]))

In [None]:
# order the predictions from highest to lowest rated
ranked_movies = sorted(list_of_movies, key=lambda x:x[1], reverse=True)

In [None]:
# return the top n recommendations using the 
def recommended_movies(user_ratings,movie_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
            print('Recommendation # ', idx+1, ': ', title, '\n')
            n-= 1
            if n == 0:
                break
            
recommended_movies(ranked_movies,movies,5)

# Cold Start #2

Display a list of the most watched movies based on the amount of times the movie has been rated and the average rating of the moive

In [28]:
top = df.groupby('title').mean()

In [29]:
top.drop(['uid', 'est', 'Iu', 'err', 'iid'], axis = 1, inplace = True)

In [30]:
top.rename(columns = {'rui':'average_rating', 'Ui':'number_of_ratings'}, inplace = True) 
top.sort_values(by = ['number_of_ratings', 'average_rating'], ascending = False).head()

Unnamed: 0_level_0,average_rating,number_of_ratings,movieId
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Forrest Gump (1994),4.25,257.0,356.0
"Shawshank Redemption, The (1994)",4.395833,245.0,318.0
Pulp Fiction (1994),4.157534,234.0,296.0
"Silence of the Lambs, The (1991)",4.007143,209.0,593.0
"Matrix, The (1999)",4.328571,208.0,2571.0


In [31]:
top.reset_index(inplace=True)

In [32]:
top['title'] = [top['title'][i].strip() for i in range(len(top))]

In [33]:
top['release_year'] = [top['title'][i][-5:-1] for i in range(len(top))]

In [34]:
top.drop([1180, 3679, 3871, 4160, 5043], inplace=True)

In [35]:
top['release_year']= top['release_year'].astype(float)

ValueError: could not convert string to float: 'ron '

In [None]:
top['decade'] = top['release_year']//10*10

In [None]:
full = pd.merge(top, movies, on = 'title')
full.drop('movieId_y', axis=1, inplace=True)

In [None]:
def user_data(gender, age):
    u_age = 2020-age
    if gender == 'male':
        ud = full[(full['genres'].str.contains('Action' or 'Adventure' or 'Thriller')) & (full['release_year'] >= age)].head()
    if gender == 'female':
        ud = full[(full['genres'].str.contains('Drama' or 'Romance' or 'Musical')) & (full['release_year'] >= age)].head()
    ud = ud.sort_values(by = ['number_of_ratings', 'average_rating'], ascending = False).head()
    display(ud)

In [None]:
user_data('male', 23)

# EDA

In [36]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

In [37]:
chart = sns.barplot(x='decade', y='average_rating', data = top, palette = 'Set1')
chart.set_xticklabels(chart.get_xticklabels(), rotation=45)
chart.set_title('Average Movie Rating Per Decade');

ValueError: Could not interpret input 'decade'

In [None]:
c = top[(top['release_year'] >= 1990)& (top['release_year'] <= 1999)]

In [None]:
chart = sns.barplot(x='release_year', y='average_rating', data = c, palette = 'Set1')
chart.set_xticklabels(chart.get_xticklabels(), rotation=45)
chart.set_title('Average Movie Rating in the 90\'s');

In [None]:
sns.distplot(top['number_of_ratings'])
plt.title('Number of Ratings Distribution');

In [None]:
print('% unrated movies with 0 reviews:')
print(round((len(top[top['number_of_ratings'] == 0]) / len(top) * 100), 2))
print('-----')
print('% unrated movies with less than 10 reviews:')
print(round((len(top[top['number_of_ratings'] <= 10]) / len(top) * 100), 2))


In [None]:
sns.distplot(top['average_rating'])
plt.title('Rating Distribution');

In [None]:
sns.distplot(df['rui']);

In [None]:
sns.scatterplot(x='number_of_ratings', y='average_rating', data = top)
plt.title('Average Rating per Number of Reviews');

In [38]:
s = top[top['average_rating'] == 5]
s.sort_values(by='number_of_ratings', ascending=False)

Unnamed: 0,title,average_rating,number_of_ratings,movieId,release_year
2211,Happiness (1998),5.0,15.0,2318.0,1998
3398,Monsters University (2013),5.0,15.0,103141.0,2013
2750,Kelly's Heroes (1970),5.0,14.0,3836.0,1970
261,American Sniper (2014),5.0,13.0,114662.0,2014
3183,Manhattan Murder Mystery (1993),5.0,13.0,492.0,1993
...,...,...,...,...,...
2643,Jane Eyre (1944),5.0,0.0,6983.0,1944
3819,Particle Fever (2013),5.0,0.0,109687.0,2013
2718,Jump In! (2007),5.0,0.0,143031.0,2007
3748,Oscar (1967),5.0,0.0,142020.0,1967
