# Collaborative Filtering
* MovieLens Dataset
* Recompy Library ( https://github.com/CanBul/recompy )
* Matrix Factorisation
* Model Based (FunkSVD)

## User-Item Matrix

In [None]:
%%capture
!pip install recompy

In [None]:
from recompy import load_movie_data
import pandas as pd

data = load_movie_data()

In [None]:
print(data.shape)
data

(100000, 3)


array([[1.96e+02, 2.42e+02, 3.00e+00],
       [1.86e+02, 3.02e+02, 3.00e+00],
       [2.20e+01, 3.77e+02, 1.00e+00],
       ...,
       [2.76e+02, 1.09e+03, 1.00e+00],
       [1.30e+01, 2.25e+02, 2.00e+00],
       [1.20e+01, 2.03e+02, 3.00e+00]])

In [None]:
df_ratings = pd.DataFrame(data, columns = ['userId', 'itemId', 'rating'])

In [None]:
df_ratings.head()

Unnamed: 0,userId,itemId,rating
0,196.0,242.0,3.0
1,186.0,302.0,3.0
2,22.0,377.0,1.0
3,244.0,51.0,2.0
4,166.0,346.0,1.0


In [None]:
df_ratings = df_ratings.astype(int)

In [None]:
df_ratings_pivot = df_ratings.pivot(index='itemId', columns='userId', values='rating')

User-Item Matrix is very sparse as you can see. What can we do?
* We can do nothing, keep original matrix and just take intersections while computing similarities.
* We can fill NaN cells with global mean.
* We can fill NaN cells with user's mean.

In [None]:
df_ratings_pivot

userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,904,905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,930,931,932,933,934,935,936,937,938,939,940,941,942,943
itemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,5.0,4.0,,,4.0,4.0,,,,4.0,,,3.0,,1.0,5.0,4.0,5.0,,3.0,5.0,,5.0,,5.0,3.0,,,,,,,,,,,,5.0,,,...,,,,5.0,,,4.0,,,2.0,,,4.0,3.0,3.0,4.0,,3.0,5.0,3.0,5.0,,,5.0,,3.0,3.0,,4.0,3.0,2.0,3.0,4.0,,4.0,,,5.0,,
2,3.0,,,,3.0,,,,,,,,3.0,,,,,,,,,2.0,,,,,,,,3.0,,,,,,,,,,,...,,,,,,,,,,,,,3.0,,,,,,,,3.0,,,,,,,,,,4.0,,,,,,,,,5.0
3,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,2.0,,,,,,3.0,1.0,,,,,,4.0,,,,,,,,,,,,,4.0,,,,,,,
4,3.0,,,,,,5.0,,,4.0,,5.0,5.0,,,5.0,,3.0,4.0,,,5.0,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,4.0,,,4.0,,,1.0,,,,,,,,,,,,,,3.0,5.0,,,,,,2.0,,,
5,3.0,,,,,,,,,,,,1.0,,,,,,,,2.0,,,,,,,3.0,,,,,,,,,,,,,...,,,,5.0,,,,,,,,,3.0,,,4.0,,,,,,4.0,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1679,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1680,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1681,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


* Null Interaction Rate

In [None]:
# Share of (item, user) cells that hold no rating.
df_ratings_pivot.isna().sum().sum() / df_ratings_pivot.size

0.9369533063577546

* How many movies has the average user watched and rated?

In [None]:
# Count the rated (non-NaN) cells in every user column and average them.
# Counting directly avoids hard-coding the number of items (1682).
df_ratings_pivot.notna().sum().mean()

106.04453870625663

* How many people watched and rated an average movie?

In [None]:
# Count the rated (non-NaN) cells in every item row and average them.
# Counting directly avoids hard-coding the number of users (943).
df_ratings_pivot.notna().sum(axis=1).mean()

59.45303210463734

## FunkSVD Model

* **S**ingular **V**alue **D**ecomposition :
    * Data Reduction
    * Basis PCA (Key Correlations)
* **FunkSVD :** Funk SVD decomposes a matrix (with missing values) into two components \(U\) and \(V\). The singular values are folded into these matrices. The approximation for the original matrix can be obtained by **\(R = UV'\)**.

In [None]:
class FunkSVD():
    """Matrix-factorisation recommender trained with stochastic gradient descent.

    Every user and every item gets an ``n_latent``-dimensional feature vector;
    a rating is predicted as the dot product of the two vectors.

    NOTE(review): the class uses ``np`` and ``copy`` plus the helpers
    ``initializer``, ``Test`` and ``Similarities``. None of them is defined or
    imported in this cell, so they must already exist in the enclosing
    namespace -- confirm before running the cell on its own.
    """

    def __init__(self):
        # Initialize default hyperparameters
        self.set_hyperparameters()

    def set_hyperparameters(self, initialization_method='random', max_epoch=5, n_latent=10, learning_rate=0.01, regularization=0.1, early_stopping=False, init_mean=0, init_std=1):
        """Store the training hyperparameters and reset the best-error trackers.

        Args:
            initialization_method: forwarded to ``initializer``.
            max_epoch: maximum number of passes over the training data.
            n_latent: number of latent features per user / item.
            learning_rate: SGD step size.
            regularization: L2 penalty applied in every update.
            early_stopping: falsy disables early stopping; otherwise it is the
                number of consecutive epochs without a better test error after
                which training stops (``True`` compares equal to 1).
            init_mean, init_std: forwarded to ``initializer``.
        """
        self.initialization_method = initialization_method
        self.max_epoch = max_epoch
        self.n_latent = n_latent
        self.learning_rate = learning_rate
        self.regularization = regularization
        self.early_stopping = early_stopping
        self.init_mean = init_mean
        self.init_std = init_std

        self.min_train_error = np.inf
        self.min_test_error = np.inf

    @staticmethod
    def _normalise_id(raw_id):
        """Return ``raw_id`` as a string, dropping a float suffix when possible.

        ``196.0`` becomes ``'196'``; values that cannot be converted to int
        (e.g. ``'abc'``) are stringified unchanged.
        """
        try:
            raw_id = int(raw_id)
        except (TypeError, ValueError, OverflowError):
            pass
        return str(raw_id)

    # unique users, unique items, train-test split, user informed etc.
    def __set_data(self, data, test_portion):
        """Index ``data`` and split it into ``train_data`` / ``test_data``.

        Args:
            data: iterable of ``(user, item, score)`` rows. It is shuffled
                IN PLACE with ``np.random.shuffle``.
            test_portion: target ratio of test rows to train rows. A row can
                only go to the test set when both its user and its item
                already have enough rows in the training set.

        Reads ``self.test_split`` and may reset it (and ``early_stopping``) to
        False when no test row could be produced.
        """
        # get distinct users, items and user_existing_ratings, items_existing_users
        self.user_existing_ratings = {}
        self.items_rated_by_users = {}
        self.user_ids = []
        self.item_ids = []

        np.random.shuffle(data)

        # variables for train and test split
        user_dictionary = {}
        item_dictionary = {}
        self.train_data = []
        self.test_data = []

        self.train_data_user_ids = []
        self.train_data_item_ids = []
        self.test_data_user_ids = []
        self.test_data_item_ids = []

        # Sets mirror the four id lists so membership checks stay O(1).
        seen_train_users, seen_train_items = set(), set()
        seen_test_users, seen_test_items = set(), set()

        def remember(value, seen, ordered):
            # Append ``value`` to ``ordered`` the first time it is seen.
            if value not in seen:
                seen.add(value)
                ordered.append(value)

        for user, item, score in data:
            # Unique users and items
            user = self._normalise_id(user)
            item = self._normalise_id(item)
            score = float(score)

            if user not in self.user_existing_ratings:
                self.user_ids.append(user)
            if item not in self.items_rated_by_users:
                self.item_ids.append(item)

            self.items_rated_by_users.setdefault(item, []).append(user)
            self.user_existing_ratings.setdefault(user, []).append(item)

            # 0.001 avoids a division by zero before the first train row exists.
            ratio = len(self.test_data) / (len(self.train_data)+0.001)

            goes_to_test = False
            if self.test_split:
                # train and test set
                user_dictionary.setdefault(user, 0)
                item_dictionary.setdefault(item, 0)
                goes_to_test = (user_dictionary[user] * test_portion >= 1
                                and item_dictionary[item] * test_portion >= 1
                                and ratio <= test_portion+0.02)

            if goes_to_test:
                self.test_data.append([user, item, score])
                remember(user, seen_test_users, self.test_data_user_ids)
                # Fixed: the original checked train_data_item_ids here, which
                # always contains the item, so the test item list stayed empty.
                remember(item, seen_test_items, self.test_data_item_ids)

                user_dictionary[user] -= 1
                item_dictionary[item] -= 1
            else:
                self.train_data.append([user, item, score])
                remember(user, seen_train_users, self.train_data_user_ids)
                remember(item, seen_train_items, self.train_data_item_ids)

                if self.test_split:
                    user_dictionary[user] += 1
                    item_dictionary[item] += 1

        print('Your data has {} distinct users and {} distinct items.'.format(
            len(self.user_ids), len(self.item_ids)))

        if len(self.test_data) < 1 and self.test_split:
            self.test_split = False
            self.early_stopping = False
            print("Training set doesn't have enough data for given test portion.")

        if self.test_split:

            print('Your data has been split into train and test set.')
            print('Length of training set is {}. Length of Test set is {}'.format(
                len(self.train_data), len(self.test_data)))
        else:

            print('Your data has no test set.')
            print('Length of training set is {}'.format(len(self.train_data)))

    def fit(self, data, test_split=True, test_portion=0.1, search_parameter_space=False):
        """Train user and item features with SGD and keep the best epoch.

        Args:
            data: iterable of ``(user, item, score)`` rows (shuffled in place).
            test_split: hold out part of the data for evaluation.
            test_portion: target test/train ratio.
            search_parameter_space: when True, reuse the split produced by a
                previous ``fit`` call instead of re-reading ``data``.
        """
        # Set train_data, test_data, user_ids etc. if search parameter is False
        # If True, this lets us search parameter space with the same train-test split
        if not search_parameter_space:

            self.test_split = test_split
            self.__set_data(data, test_portion)

        # Initialization
        print('Initializing features for Users and Items...')
        initial = initializer(self.user_ids, self.item_ids, self.initialization_method,
                              self.n_latent, self.init_mean, self.init_std)

        self.user_features, self.item_features = initial.initialize_latent_vectors()

        # Features are re-initialised on every fit, so the best-so-far state
        # must restart as well. Seeding the best features also guarantees they
        # exist when no epoch improves (e.g. max_epoch=0 or a NaN error).
        self.min_train_error = np.inf
        self.min_test_error = np.inf
        self.best_user_features = copy.deepcopy(self.user_features)
        self.best_item_features = copy.deepcopy(self.item_features)

        # Training
        print('Starting training...')
        error_counter = 0
        for epoch in range(self.max_epoch):

            # updating user and item features
            for user, item, rating in self.train_data:

                error = rating - \
                    np.dot(self.user_features[user], self.item_features[item])
                # Snapshot the user vector: ``+=`` below mutates it in place, and
                # the item update must use the value from before this step.
                temp = np.array(self.user_features[user], copy=True)

                # Update user and item feature for each user, item and rating pair
                self.user_features[user] += self.learning_rate * \
                    (error * self.item_features[item] -
                     self.regularization * self.user_features[user])
                self.item_features[item] += self.learning_rate * \
                    (error * temp - self.regularization *
                     self.item_features[item])

            # Calculate errors
            error_counter += 1
            train_error = Test.rmse_error(
                self.train_data, self.user_features, self.item_features)

            # Show error to Client
            if self.test_split:
                test_error = Test.rmse_error(
                    self.test_data, self.user_features, self.item_features)
                print('Epoch Number: {}/{} Training RMSE: {:.2f} Test RMSE: {}'.format(epoch+1, self.max_epoch,
                                                                                       train_error, test_error))

            else:
                print('Epoch Number: {}/{} Training RMSE: {:.2f}'.format(epoch+1, self.max_epoch,
                                                                         train_error))

            # Save best features depending on test_error
            if self.test_split and test_error < self.min_test_error:
                self.min_test_error = test_error
                self.best_user_features = copy.deepcopy(self.user_features)
                self.best_item_features = copy.deepcopy(self.item_features)

                error_counter = 0
            # Save best features if test data is False
            elif not self.test_split and train_error < self.min_train_error:
                self.min_train_error = train_error
                self.best_user_features = copy.deepcopy(self.user_features)
                self.best_item_features = copy.deepcopy(self.item_features)

            # Break if test_error didn't improve for the last n rounds and early stopping is true.
            # Early stopping is defined on the test error, so it only applies
            # when a test split exists (the counter is never reset otherwise).
            if self.test_split and self.early_stopping and error_counter >= self.early_stopping:

                print("Test error didn't get lower for the last {} epochs. Training is stopped.".format(
                    error_counter))
                print('Best test error is: {:.2f}. Best features are saved.'.format(
                    self.min_test_error))
                break

        print('Training has ended...')
        self.user_features = copy.deepcopy(self.best_user_features)
        self.item_features = copy.deepcopy(self.best_item_features)

    def get_recommendation_for_existing_user(self, user_id, howMany=10):
        """Return up to ``howMany`` unrated item ids, best prediction first.

        Raises:
            KeyError: if ``user_id`` was not part of the fitted data.
        """
        already_rated = set(self.user_existing_ratings[user_id])
        user_vector = self.user_features[user_id]

        # this might be more effective using matrix multiplication
        scored = [[np.dot(user_vector, self.item_features[item]), item]
                  for item in self.item_ids
                  if item not in already_rated]
        scored.sort(reverse=True)

        return [item for _, item in scored[:howMany]]

    def get_recommendation_for_new_user(self, user_ratings,
                                        similarity_measure='mean_squared_difference', howManyUsers=3, howManyItems=5):
        """Recommend items to an unseen user via the most similar known users.

        Args:
            user_ratings: mapping of item id -> rating given by the new user.
            similarity_measure: forwarded to ``Similarities.get_most_similar_users``.
            howManyUsers: number of similar users to borrow predictions from.
            howManyItems: maximum number of distinct item ids returned.

        Returns:
            Distinct item ids ordered from highest to lowest predicted rating.
        """
        # Get user predictions on same movies
        user_predictions = self.__user_prediction_for_same_movies(user_ratings)
        # Find the most similar user_ids
        user_ids = Similarities.get_most_similar_users(
            user_ratings, user_predictions, similarity_measure, howManyUsers)

        scored = []
        # get user features for users who are most similar to given new user
        for user in user_ids:
            for item, item_feature in self.item_features.items():
                # predict ratings for most similar users
                prediction = np.dot(
                    self.user_features[user], item_feature)
                scored.append([prediction, item])

        # Highest predictions first. The original walked the ascending list and
        # therefore returned the items with the LOWEST predicted ratings.
        scored.sort(reverse=True)

        # remove duplicates
        return_list = []
        for _, item in scored:
            if len(return_list) >= howManyItems:
                break
            if item in return_list:
                continue

            return_list.append(item)

        return return_list

    def get_similar_products(self, item_id, howMany=10):
        """Return up to ``howMany`` item ids most cosine-similar to ``item_id``.

        Raises:
            KeyError: if ``item_id`` has no learned feature vector.
        """
        product_features = self.item_features[item_id]

        scored = [[Similarities.cosine_similarity(self.item_features[item],
                                                  product_features), item]
                  for item in self.item_ids
                  if item != item_id]
        scored.sort(reverse=True)

        return [item for _, item in scored[:howMany]]

    def __user_prediction_for_same_movies(self, user_ratings):
        """Predict, for every known user, the items present in ``user_ratings``.

        Items without a learned feature vector are skipped.

        Returns:
            dict mapping user id -> list of predictions, in the iteration
            order of ``user_ratings``.
        """
        result = {}
        for key in user_ratings:
            if key not in self.item_features:
                continue

            for user in self.user_features:
                result.setdefault(user, []).append(
                    np.dot(self.user_features[user], self.item_features[key]))

        return result

## Training

In [None]:
# NOTE: this import rebinds the name FunkSVD, so the model trained below is
# recompy's implementation, not a FunkSVD class defined earlier in the notebook.
from recompy import load_movie_data, FunkSVD

# get MovieLens data
data = load_movie_data()
# initialization of FunkSVD model
myFunk = FunkSVD()
# early_stopping=True: presumably stops once the test error fails to improve
# for a single epoch (True compares equal to 1) -- verify against recompy.
myFunk.set_hyperparameters(max_epoch=200, n_latent=15,early_stopping=True, initialization_method='random', regularization=0.1)
# training of the model
myFunk.fit(data)

Your data has 943 distinct users and 1682 distinct items.
Your data has been split into train and test set.
Length of training set is 89285. Length of Test set is 10715
Initializing features for Users and Items...
Starting training...
Epoch Number: 1/200 Training RMSE: 0.96 Test RMSE: 0.9670916963503041
Epoch Number: 2/200 Training RMSE: 0.93 Test RMSE: 0.9522431850824344
Epoch Number: 3/200 Training RMSE: 0.92 Test RMSE: 0.9478483827565287
Epoch Number: 4/200 Training RMSE: 0.91 Test RMSE: 0.9458730577563561
Epoch Number: 5/200 Training RMSE: 0.91 Test RMSE: 0.9446572171923738
Epoch Number: 6/200 Training RMSE: 0.91 Test RMSE: 0.9436758029458507
Epoch Number: 7/200 Training RMSE: 0.90 Test RMSE: 0.9427027755865681
Epoch Number: 8/200 Training RMSE: 0.90 Test RMSE: 0.9416069334010283
Epoch Number: 9/200 Training RMSE: 0.90 Test RMSE: 0.9402940836122284
Epoch Number: 10/200 Training RMSE: 0.89 Test RMSE: 0.9386958780769502
Epoch Number: 11/200 Training RMSE: 0.89 Test RMSE: 0.9367785368

## Testing

In [None]:
new_user = {'1':5,
            '2':4,
            '4':3}

# To find the most similar user resulting from cosine similarity. Recommend 5 items using the most similar user 
myFunk.get_recommendation_for_new_user(new_user, 
                                       similarity_measure = 'cosine_similarity', 
                                       howManyUsers = 1, howManyItems = 5)

['1304', '1618', '906', '1254', '987']

# Content Based Filtering

* **Used Dataset :** https://www.kaggle.com/rounakbanik/the-movies-dataset/data
* **Latest MovieLens Dataset :** https://grouplens.org/datasets/movielens/latest/



**Files :**
  * _movies_metadata_ : Features belong to movies (~45k)
  * _keywords_ : Keywords extracted from plot of the movie
  * _credits_ : Cast and crew information
  * _links_ : TMDB and IMDB IDs of all movies
  * _ratings_ : User-Movie interactions
  

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from numpy import dot
from numpy.linalg import norm
import numpy as np

metadata = pd.read_csv('/content/drive/MyDrive/applied_ai_enes_safak/recommender_systems/MovieLens/movies_metadata.csv', low_memory=False)

metadata['overview'] = metadata['overview'].fillna('')

metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


## Scoring with Weighted Ratings

---
\begin{equation}
\text{Weighted Rating } (\mathbf{WR}) = \left(\frac{v}{v + m} \cdot R\right) + \left(\frac{m}{v + m} \cdot C\right)
\end{equation}

---

* v is the number of votes for the movie;

* m is the minimum votes required to be listed in the chart;

* R is the average rating of the movie;

* C is the mean vote across the whole report.



---



* Average rating of a movie on IMDB is around 5.6 on a scale of 10

In [None]:
C = metadata['vote_average'].mean()
C

5.618207215133889

* Only 10% of all movies have been rated at least 160 times

In [None]:
m = metadata['vote_count'].quantile(0.90)
m

160.0

In [None]:
# Keep only movies with at least m votes. Filter first, then copy: this copies
# just the qualifying rows and gives q_movies its own data, so adding columns
# to it later neither touches metadata nor triggers a chained-assignment warning.
q_movies = metadata.loc[metadata['vote_count'] >= m].copy()
print(q_movies.shape)
print(metadata.shape)

(4555, 24)
(45466, 24)


In [None]:
def weighted_rating(x, m=m, C=C):
    """Return the IMDB-style weighted rating of one movie row.

    Args:
        x: row-like object exposing 'vote_count' and 'vote_average'.
        m: minimum number of votes required to be listed.
        C: mean vote across all movies.
    """
    votes = x['vote_count']
    average = x['vote_average']
    total = votes + m
    # Blend the movie's own average with the global mean, weighted by votes.
    own_part = votes / total * average
    prior_part = m / total * C
    return own_part + prior_part

In [None]:
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)

## Sort by Score

In [None]:
q_movies = q_movies.sort_values('score', ascending=False)

q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20).reset_index(drop=True)

Unnamed: 0,title,vote_count,vote_average,score
0,The Shawshank Redemption,8358.0,8.5,8.445869
1,The Godfather,6024.0,8.5,8.425439
2,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
3,The Dark Knight,12269.0,8.3,8.265477
4,Fight Club,9678.0,8.3,8.256385
5,Pulp Fiction,8670.0,8.3,8.251406
6,Schindler's List,4436.0,8.3,8.206639
7,Whiplash,4376.0,8.3,8.205404
8,Spirited Away,3968.0,8.3,8.196055
9,Life Is Beautiful,3643.0,8.3,8.187171


## **Overview** is the Content

* With **Term Frequency-Inverse Document Frequency (TF-IDF)** Analysis

In [None]:
pd.DataFrame(metadata[['original_title','overview']].head())

Unnamed: 0,original_title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [None]:
# Similarity matrix can be very large, so keep only the first 20,000 movies.
# .copy() detaches the slice so later column assignments on it are safe.
metadata = metadata.iloc[:20000, :].copy()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')

In [None]:
metadata['overview'] = metadata['overview'].fillna('')

* 47487 different words in overview for 20k Movies

In [None]:
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

tfidf_matrix.shape

(20000, 47487)

* **Similarity Score :** Cosine Similarity will be used. 

In [None]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
cosine_sim.shape

(20000, 20000)

In [None]:
cosine_sim[0]

array([1.        , 0.01575748, 0.        , ..., 0.        , 0.        ,
       0.        ])

* Construct a Series that maps movie titles to row indices

In [None]:
# Map title -> row index. drop_duplicates() would compare the VALUES (row
# indices, which are already unique), so repeated titles are removed on the
# index instead, keeping the first movie for each title.
indices = pd.Series(metadata.index, index=metadata['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
indices[:10]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

* **get_recommendations :** 
  * Takes title and similarity matrix
  * Returns 10 most similar movies as recommendations

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the 10 movies whose overview is most similar to ``title``.

    Args:
        title: movie title to look up in ``indices``.
        cosine_sim: square similarity matrix aligned with ``metadata`` rows.

    Returns:
        A one-column DataFrame of titles, most similar first.

    Raises:
        KeyError: if ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title shared by several movies yields a Series; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    scores = np.asarray(cosine_sim[idx])
    # Stable descending order keeps ties in their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query movie explicitly instead of assuming it is ranked first
    # (another movie with an identical overview would also score 1.0).
    movie_indices = ranked[ranked != idx][:10]

    return pd.DataFrame(metadata['title'].iloc[movie_indices]).reset_index(drop=True)

In [None]:
get_recommendations("Inglourious Basterds")

Unnamed: 0,title
0,The Last Metro
1,Black Book
2,The Dirty Dozen: The Fatal Mission
3,"Goodbye, Columbus"
4,The Diary of Anne Frank
5,Don't Look Now: We're Being Shot At
6,A Letter from Death Row
7,Kelly's Heroes
8,Defiance
9,Schindler's List


In [None]:
get_recommendations('There Will Be Blood') # definitely watch this one...

Unnamed: 0,title
0,The Stars Fell on Henrietta
1,Dallas - War of The Ewings
2,Tulsa
3,On Deadly Ground
4,Written on the Wind
5,The Formula
6,Hellfighters
7,Simon Magus
8,Terror in a Texas Town
9,Local Hero


## Recommendation with Overview Vectors

In [None]:
%%capture
!pip3 install spacy
!python3 -m spacy download en_core_web_sm

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
metadata = pd.read_csv('/content/drive/MyDrive/applied_ai_enes_safak/recommender_systems/MovieLens/movies_metadata.csv', low_memory=False)

metadata['overview'] = metadata['overview'].fillna('')

In [None]:
pd.DataFrame(metadata[['original_title','overview']].head())

Unnamed: 0,original_title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [None]:
movie_docs = pd.DataFrame(metadata[['original_title','overview']])

In [None]:
movie_docs.shape

(45466, 2)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')

In [None]:
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

tfidf_matrix.shape

(45466, 75827)

In [None]:
# spacy
from tqdm.notebook import tqdm

# nlp.pipe processes the overviews as a batched stream, which is much faster
# than calling nlp() once per row through DataFrame.iterrows().
movie_docs_vectors = [
    doc.vector
    for doc in tqdm(nlp.pipe(metadata["overview"]), total=len(metadata))
]
movie_docs["overview vectors"] = movie_docs_vectors

HBox(children=(FloatProgress(value=0.0, max=45466.0), HTML(value='')))




In [None]:
with open('/content/drive/MyDrive/applied_ai_enes_safak/recommender_systems/MovieLens/word_vecs.npy', 'wb') as f:
    np.save(f, np.array(movie_docs['overview vectors']), allow_pickle=True)

In [None]:
with open('/content/drive/MyDrive/applied_ai_enes_safak/recommender_systems/MovieLens/word_vecs.npy', 'rb') as f:
    word_vecs = np.load(f, allow_pickle=True)

In [None]:
movie_docs["overview vectors"] = word_vecs

In [None]:
movie_docs.head()

Unnamed: 0,original_title,overview,overview vectors
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[0.29593316, 0.1624922, -0.0122375265, 0.31845..."
1,Jumanji,When siblings Judy and Peter discover an encha...,"[-0.06933068, 0.53941554, -0.22379777, -0.2023..."
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[0.40886644, 0.47620142, -0.14130907, 0.581963..."
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[0.12919456, 0.6737056, -0.3598231, -0.0721177..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[0.45354685, 0.5896248, 0.1983986, -0.37084275..."


In [None]:
def get_recommendations(title, recommend_k, df):
    """Return the titles of the ``recommend_k`` movies most similar to ``title``.

    Similarity is the cosine between the "overview vectors" of the query movie
    and every other row of ``df``.

    Args:
        title: value to look up in the "original_title" column (first match).
        recommend_k: maximum number of recommendations.
        df: DataFrame with "original_title" and "overview vectors" columns.
            Values of the first column are returned.

    Returns:
        List of first-column values, most similar first. The query row and
        rows with an all-zero vector (undefined cosine) are never returned.

    Raises:
        IndexError: if no row has the given title.
    """
    matches = np.flatnonzero((df["original_title"] == title).to_numpy())
    query_pos = matches[0]

    matrix = np.vstack(df["overview vectors"].tolist()).astype(float)
    query = matrix[query_pos]

    # cos_sim = dot(a, b) / (norm(a) * norm(b)), computed for all rows at once.
    denominators = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    similarities = np.full(len(matrix), -np.inf)
    defined = denominators > 0
    similarities[defined] = (matrix[defined] @ query) / denominators[defined]
    similarities[query_pos] = -np.inf  # never recommend the query itself

    # Descending order; the original took the first k of an ASCENDING sort and
    # therefore returned the least similar movies.
    ranked = np.argsort(-similarities, kind="stable")[:recommend_k]
    best = [index for index in ranked if np.isfinite(similarities[index])]

    return [df.iloc[index, 0] for index in best]

In [None]:
get_recommendations("The Departed", 5, movie_docs)

['The Long and the Short and the Tall',
 'Tower Heist',
 'Eu Não Faço a Menor Ideia do que eu Tô Fazendo Com a Minha Vida',
 'Pioneer Woman',
 'Los cronocrímenes']

In [None]:
get_recommendations("Happy Feet", 5, movie_docs)

['Tower Heist',
 'Youngblood',
 'Eu Não Faço a Menor Ideia do que eu Tô Fazendo Com a Minha Vida',
 'Pioneer Woman',
 'Los cronocrímenes']

In [None]:
get_recommendations("Whiplash", 5, movie_docs)

['티끌모아 로맨스',
 'Gli occhi freddi della paura',
 'Under the Boardwalk: The Monopoly Story',
 'Pioneer Woman',
 'Los cronocrímenes']

In [None]:
get_recommendations("Kill Bill: Vol. 1", 5, movie_docs)

['Gli occhi freddi della paura',
 'Under the Boardwalk: The Monopoly Story',
 '티끌모아 로맨스',
 'Pioneer Woman',
 'Los cronocrímenes']