In [57]:
#D:\DATA_SCIENCE_SELF\ml-25m\movies

In [58]:
import pandas as pd
# Raw string: the Windows backslashes are kept literally instead of being parsed
# as escape sequences ("\D" and "\m" are invalid escapes in a normal string literal).
movies = pd.read_csv(r"D:\DATA_SCIENCE_SELF\ml-25m\movies.csv")

In [59]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [60]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


### 1. Cleaning Movies Titles With Regex

In [61]:
import re
# learn more about regex -- https://www.w3schools.com/python/python_regex.asp

In [62]:
def clean_title(title):
    """Return ``title`` with every character removed that is not an ASCII
    letter, a digit, or a space."""
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [63]:
# Store a punctuation-free copy of every title; the TF-IDF search works on this column.
movies["clean_title"] = movies["title"].map(clean_title)

In [64]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


### 2. Creating a TFIDF Matrix

##### TF (term frequency) tells us how often a term appears in a document, and IDF (inverse document frequency) tells us how rare that term is across the whole collection of documents. Multiplying the two values gives the final TF-IDF value.

#### TF(Term Frequency) 
 * TF = (Frequency of a word in the document) / (Total number of words in the document)

#### IDF(Inverse Document Frequency)
 * IDF = log(total number of documents / number of documents including the word)

In [65]:
# Learn more about the TF-IDF matrix:
# https://www.capitalone.com/tech/machine-learning/understanding-tf-idf/

In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [67]:
# ngram_range=(1, 2): build TF-IDF features from single words and from pairs of adjacent words.
Vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Learn the vocabulary from the cleaned titles and encode every title as one TF-IDF row.
tfidf = Vectorizer.fit_transform(movies["clean_title"])

### 3. Creating a Search Function

In [68]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

### Cosine Similarity
* We calculate the cosine similarity between two non-zero vectors. Here a vector is a one-dimensional NumPy array. Cosine similarity is a measure of similarity that is often used to compare documents in text analysis. We use the formula below to compute it.

* Similarity = (A.B) / (||A||.||B||)

In [69]:
# Learn More on Cosine Similarity -- https://www.geeksforgeeks.org/how-to-calculate-cosine-similarity-in-python/

In [70]:
def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is normalised with ``clean_title``, encoded with the fitted
    ``Vectorizer`` and compared against every row of ``tfidf`` using cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered by decreasing similarity.
    """
    title = clean_title(title)
    query_vec = Vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # np.argpartition raises when kth is out of bounds, so never ask for more
    # rows than there are movies.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[:0]

    # argpartition only guarantees that the last `top_n` positions hold the
    # largest scores; it does not order them. Sort that small slice explicitly
    # so that row 0 really is the best match.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    indices = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[indices]

In [71]:
# for np.argpartition -- https://www.geeksforgeeks.org/numpy-argpartition-in-python/

### 4. Building Interactive Search Box

In [72]:
import ipywidgets as widgets
from IPython.display import display

In [73]:
# Text box for the title query; matching movies are rendered into `movie_list`.
movie_input = widgets.Text(value="Toy Story", description="Movies Title:", disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Re-run the title search each time the text box value changes.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry is read as the current text.
    """
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        # Inputs of five characters or fewer are ignored.
        if len(title) <= 5:
            return
        display(search(title))


movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

Text(value='Toy Story', description='Movies Title:')

Output()

### 5. Reading in movie ratings data

In [74]:
# Load the user ratings table (pandas is already imported as `pd` in the cell that loads movies.csv).
ratings = pd.read_csv("D:/DATA_SCIENCE_SELF/ml-25m/ratings.csv")

In [75]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [76]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [77]:
# Movie to base the recommendations on; movieId 1 is "Toy Story (1995)" in the movies table.
movie_id = 1

In [78]:
# Users who rated the selected movie above 4 stars.
similar_users = ratings.loc[
    ratings["movieId"].eq(movie_id) & ratings["rating"].gt(4), "userId"
].unique()

In [79]:
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533], dtype=int64)

In [80]:
# movieId of every rating above 4 stars given by those similar users (one entry per rating row).
similar_user_rec = ratings.loc[
    ratings["userId"].isin(similar_users) & ratings["rating"].gt(4), "movieId"
]

In [81]:
similar_user_rec

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [82]:
# Percentage of the similar users who liked each movie; keep only movies liked
# by more than 10% of them.
# NOTE: this cell overwrites its own input, so re-running it without first
# re-running the previous cell operates on the already-aggregated percentages.
similar_user_rec = similar_user_rec.value_counts().div(len(similar_users)).mul(100)
similar_user_rec = similar_user_rec.loc[similar_user_rec.gt(10)]

In [83]:
similar_user_rec

1        100.000000
318       44.560658
260       40.376958
356       37.021503
296       36.729493
            ...    
953       10.305283
551       10.119458
1222      10.087603
745       10.034510
48780     10.018582
Name: movieId, Length: 113, dtype: float64

### 6. Finding how much all users like movies

In [85]:
# Ratings above 4 stars, from any user, for the candidate movies found above.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_rec.index) & ratings["rating"].gt(4)
]

In [86]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [95]:
# Percentage of the users present in `all_users` who liked each candidate movie.
all_users_rec = (
    all_users["movieId"]
    .value_counts()
    .div(len(all_users["userId"].unique()))
    .mul(100)
)

In [96]:
all_users_rec 

318      34.222028
296      28.467366
2571     24.403343
356      23.526568
593      22.590856
           ...    
551       4.091836
50872     3.911051
745       3.703115
78499     3.513059
2355      2.509139
Name: movieId, Length: 113, dtype: float64

### 7. Creating a recommendation score

In [97]:
# Put both percentages side by side, aligned on movieId; `keys` names the two columns.
rec_percentages = pd.concat(
    [similar_user_rec, all_users_rec],
    axis=1,
    keys=["similar", "all"],
)

In [98]:
rec_percentages

Unnamed: 0,similar,all
1,100.000000,12.472849
32,16.071144,10.029270
34,13.055482,5.222902
47,22.590921,14.446917
50,27.560393,20.051256
...,...,...
59315,10.459251,5.426865
60069,17.063977,7.630722
68954,15.917175,6.494358
78499,15.295992,3.513059


In [99]:
# Score = how much more often similar users liked a movie than users in general did.
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])

In [101]:
# Highest score first.
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)

In [102]:
rec_percentages # Higher the score the better the recommendation is.

Unnamed: 0,similar,all,score
1,100.000000,12.472849,8.017414
3114,28.064773,5.370576,5.225654
2355,11.053889,2.509139,4.405452
78499,15.295992,3.513059,4.354038
4886,23.514733,7.081082,3.320783
...,...,...,...
2858,21.672418,16.763350,1.292845
296,36.729493,28.467366,1.290232
79132,16.681710,13.138377,1.269693
4973,14.250066,11.240464,1.267747


In [104]:
# Attach title and genres to the ten best-scoring movies (rec_percentages is indexed by movieId).
pd.merge(rec_percentages.head(10), movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,100.0,12.472849,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,28.064773,5.370576,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,11.053889,2.509139,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,15.295992,3.513059,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,23.514733,7.081082,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,21.6618,6.751298,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,22.81391,7.226769,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,17.940005,5.997695,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,20.350411,6.845333,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,25.34112,8.576367,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


### 8. Building a recommendation function

In [105]:
def find_similar_movies(movie_id, min_rating=4, min_share=10, top_n=10):
    """Recommend movies liked by the users who also liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``min_rating``. For every movie liked by more than ``min_share`` percent
    of the users who liked ``movie_id``, the score is the ratio between that
    percentage and the percentage of likes among all users who liked at least
    one of those candidate movies. A higher score means the movie is more
    specific to the similar users.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        Ratings must be strictly above this value to count as a like.
    min_share : float, default 10
        Minimum percentage (exclusive) of similar users that must have liked
        a movie for it to be considered.
    top_n : int, default 10
        Number of best-scoring movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres`` for the ``top_n``
        best-scoring movies, highest score first.
    """
    # Filter the ratings table once and reuse the result for every step below
    # instead of re-evaluating the same rating condition on the full table.
    liked = ratings[ratings["rating"] > min_rating]

    # Users who liked the reference movie, and everything else they liked.
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    similar_user_rec = liked.loc[liked["userId"].isin(similar_users), "movieId"]

    # Percentage of similar users who liked each movie; drop rarely liked ones.
    similar_user_rec = (similar_user_rec.value_counts() / len(similar_users)) * 100
    similar_user_rec = similar_user_rec[similar_user_rec > min_share]

    # Percentage of likes for the same candidate movies among all users who
    # liked at least one of them.
    all_users = liked[liked["movieId"].isin(similar_user_rec.index)]
    all_users_rec = all_users["movieId"].value_counts() / len(all_users["userId"].unique()) * 100

    rec_percentages = pd.concat([similar_user_rec, all_users_rec], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # rec_percentages is indexed by movieId, hence left_index / right_on.
    top = rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]
    

### 9. Creating an interactive recommendation widget

In [108]:
# Text box for the title query; recommendations are rendered into `recommendation_list`.
movie_name_input = widgets.Text(value="Toy Story", description="Movies Title:", disabled=False)
recommendation_list = widgets.Output()


# NOTE: this reuses the name `on_type` from the search-box cell. That earlier
# function was already handed to `movie_input.observe`, so rebinding the name
# here does not replace the handler registered there.
def on_type(data):
    """Show recommendations for the best title match each time the text changes.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry is read as the current text.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        # Inputs of five characters or fewer are ignored.
        if len(title) <= 5:
            return
        # The first row returned by `search` is taken as the movie the user means.
        best_match = search(title).iloc[0]
        movie_id = best_match["movieId"]
        display(find_similar_movies(movie_id))


movie_name_input.observe(on_type, names="value")
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movies Title:')

Output()