## Popularity-Based Filtering

### Getting Data.

In [None]:
import pandas as pd
# Load the movie metadata that the popularity ranking is computed from.
movies = pd.read_csv(filepath_or_buffer="movies.csv")

### Calculating Weighted Rating.

Formula:

$$WR = \frac{V}{V + M} \cdot R + \frac{M}{V + M} \cdot C$$

- $V$ – number of votes for the movie.
- $M$ – minimum number of votes required.
- $R$ – average rating of the movie.
- $C$ – average rating across all movies.

In [None]:
# C: mean rating over the whole catalogue.
c = movies["vote_average"].mean()
# M: vote-count cut-off, taken as the 90th percentile of `vote_count`.
m = movies["vote_count"].quantile(q=0.9)

In [None]:
# Keep only the movies with at least `m` votes. Filtering first and copying
# second duplicates just the surviving rows (instead of the whole frame) and
# gives an explicit, independent frame that can safely receive new columns.
movies_filtered = movies.loc[movies["vote_count"] >= m].copy()

In [None]:
def weighted_rating(row, m=m, c=c):
    """Compute the weighted rating WR for a single movie.

    WR = (V / (V + M)) * R + (M / (V + M)) * C

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            ``"vote_count"`` (V) and ``"vote_average"`` (R).
        m: Minimum number of votes required (M). Defaults to the module-level
            ``m`` as it was when this function was defined.
        c: Average rating across all movies (C). Defaults to the module-level
            ``c`` as it was when this function was defined.

    Returns:
        The weighted rating of the movie.
    """
    votes = row["vote_count"]
    rating = row["vote_average"]
    # Both weights share the same denominator V + M.
    total = votes + m
    return (votes / total) * rating + (m / total) * c

In [None]:
# Score only the movies that passed the vote-count filter. Applying over the
# full `movies` frame would compute a rating for every row and then rely on
# index alignment to silently discard most of them.
movies_filtered["weighted_rating"] = movies_filtered.apply(weighted_rating, axis=1)
# Show the ten best-rated titles.
movies_filtered.sort_values("weighted_rating", ascending=False)[["title", "weighted_rating"]].head(10)

Unnamed: 0,title,weighted_rating
1881,The Shawshank Redemption,8.059258
662,Fight Club,7.939256
65,The Dark Knight,7.92002
3232,Pulp Fiction,7.904645
96,Inception,7.863239
3337,The Godfather,7.851236
95,Interstellar,7.809479
809,Forrest Gump,7.803188
329,The Lord of the Rings: The Return of the King,7.727243
1990,The Empire Strikes Back,7.697884


## Content-Based Filtering

### Getting Data.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Read the catalogue and replace missing overviews with an empty string so
# that every movie has a text value to vectorise.
movies = pd.read_csv("movies.csv").assign(
    overview=lambda frame: frame["overview"].fillna("")
)

### TF-IDF (Term Frequency–Inverse Document Frequency) Matrix

In [None]:
# Vectorise the overviews: one row per movie, one column per vocabulary term,
# with English stop words removed.
tf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tf.fit_transform(raw_documents=movies["overview"])

In [None]:
# Preview the document-term matrix without densifying it: `.toarray()` would
# allocate a full (movies x vocabulary) float array just to print a truncated
# table, whereas a sparse-backed frame stores only the non-zero entries.
pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=tf.get_feature_names_out())

Unnamed: 0,00,000,007,07am,10,100,1000,101,108,10th,...,zuckerberg,zula,zuzu,zyklon,æon,éloigne,émigré,été,única,über
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Similarity Matrix

In [None]:
# Pairwise dot products between every pair of movie TF-IDF vectors.
# TfidfVectorizer L2-normalises rows by default, so these dot products are
# cosine similarities (hence the 1.0 values on the diagonal).
similarity_matrix = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)
similarity_matrix

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.02160533, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.01488159, 0.        ,
        0.        ],
       ...,
       [0.        , 0.02160533, 0.01488159, ..., 1.        , 0.01609091,
        0.00701914],
       [0.        , 0.        , 0.        , ..., 0.01609091, 1.        ,
        0.01171696],
       [0.        , 0.        , 0.        , ..., 0.00701914, 0.01171696,
        1.        ]])

In [None]:
def similar_movies(movie_title, nr_movies):
    """Return the titles of the movies most similar to ``movie_title``.

    Similarity is read from the module-level ``similarity_matrix``, whose rows
    and columns are in the same order as the rows of the module-level
    ``movies`` frame.

    Args:
        movie_title: Exact title to look up in ``movies["title"]``. If several
            rows share the title, the first one is used.
        nr_movies: Maximum number of similar titles to return.

    Returns:
        A list of up to ``nr_movies`` titles ordered from most to least
        similar (ties keep row order), never including the query movie
        itself. If the title is not found, the string ``"incorrect Output"``
        is returned instead (kept verbatim for backward compatibility).
    """
    # Work with row *positions*, not index labels: both `similarity_matrix`
    # and `movies.iloc` are positional, so this stays correct even when
    # `movies` does not have a default 0..n-1 index.
    matches = (movies["title"] == movie_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        return "incorrect Output"
    movie_pos = int(matches[0])

    scores = similarity_matrix[movie_pos]
    # Stable descending sort, so equally scored movies keep ascending row order.
    ranked = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
    # Exclude the query movie explicitly instead of assuming it is ranked
    # first: a movie with an empty overview scores 0 against everything
    # (itself included), and exact duplicates tie at the top.
    neighbours = [pos for pos in ranked if pos != movie_pos][:max(nr_movies, 0)]
    return list(movies.iloc[neighbours]["title"])

### Testing Our Function to Get Similar Movies

In [None]:
# Ask for the three titles whose overviews are closest to "Kung Fu Panda 3".
similar_movies(movie_title="Kung Fu Panda 3", nr_movies=3)

['Kung Fu Panda 2',
 'My Big Fat Greek Wedding 2',
 'Once Upon a Time in the West']

## Collaborative Filtering

### Getting Data.

In [None]:
import pandas as pd
movies = pd.read_csv("movies.csv")
# Keep only the user, item and rating columns, in exactly this order.
ratings = pd.read_csv("ratings.csv").loc[:, ["userId", "movieId", "rating"]]
ratings.head(n=4)

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0


### Creating Dataset

In [None]:
from surprise import Dataset
from surprise import Reader

# Take the rating scale from the data instead of hard-coding (1.0, 5.0).
# Surprise clips predictions to this range, and the ratings include half-star
# values (e.g. 2.5), so the real minimum may be below 1.0.
reader = Reader(rating_scale=(ratings["rating"].min(), ratings["rating"].max()))
# Columns must be ordered user id, item id, rating.
dataset = Dataset.load_from_df(ratings, reader)

### Creating Trainset

In [None]:
# Build a trainset from the whole dataset (no hold-out split).
trainset = dataset.build_full_trainset()

### Training the ML Model

In [None]:
from surprise import SVD
# SVD starts from random factors and trains with SGD; fix the seed so the
# fitted model and its predictions are reproducible across runs.
svd = SVD(random_state=42)

In [None]:
# Train on the full trainset; `fit` returns the algorithm object, which is
# what this cell displays.
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f8abd17fc10>

In [None]:
# Estimated rating of user 15 for movie 1956 (raw ids from the ratings file).
svd.predict(uid=15, iid=1956)

Prediction(uid=15, iid=1956, r_ui=None, est=3.3706366339928264, details={'was_impossible': False})

### Validation

In [None]:
from surprise import model_selection

# Cross-validate a fresh SVD instance: cross_validate refits the estimator it
# is given on every fold, so passing `svd` would leave the model trained above
# fitted on the last fold's training split instead of the full trainset.
# cv=5 makes the default 5-fold split explicit.
model_selection.cross_validate(SVD(), dataset, measures=["RMSE", "MAE"], cv=5)

{'test_rmse': array([0.893818  , 0.90419728, 0.90142166, 0.89069038, 0.89079341]),
 'test_mae': array([0.6873145 , 0.69407223, 0.6947108 , 0.68620154, 0.68666567]),
 'fit_time': (0.7769606113433838,
  0.7872774600982666,
  0.7007441520690918,
  0.7294678688049316,
  0.6844596862792969),
 'test_time': (0.06550407409667969,
  0.06833672523498535,
  0.0736539363861084,
  0.0635826587677002,
  0.16712474822998047)}

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=bb7d452f-b909-4c72-912a-a220cf860c05' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>