In [1]:
import os 

# The notebook is run from a folder that sits directly under the project
# root, so the root is the parent of the current working directory.
notebook_path = os.path.abspath(os.getcwd())
proj_root = os.path.dirname(notebook_path)

# Location of the preprocessed dataset, relative to the project root.
csv_relative_path = "data/processed/processed_data.csv"
csv_path = os.path.join(proj_root, csv_relative_path)

In [2]:
import pandas as pd

# Load the processed dataset and discard the "Unnamed: 0" column that the
# CSV carries alongside the real data columns.
df = pd.read_csv(csv_path).drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import PorterStemmer
import pickle

In [4]:
ps = PorterStemmer()


def stemmer(text):
    """Stem each whitespace-separated token of ``text`` with the shared ``ps``.

    Returns the stemmed tokens joined back together with single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [5]:
# Replace each movie's tags with their stemmed form, then preview the result.
stemmed_tags = df["tags"].apply(stemmer)
df["tags"] = stemmed_tags
df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


In [6]:
# Bag-of-words over the tags: English stop words are dropped and the
# vocabulary is capped at 5000 terms.
cv = CountVectorizer(stop_words="english", max_features=5000)
tag_term_counts = cv.fit_transform(df["tags"])
# Densify the sparse count matrix into a plain array.
vector = tag_term_counts.toarray()

In [7]:
# One row per entry of df["tags"], one column per vocabulary term kept by `cv`.
vector.shape

(4806, 5000)

In [8]:
# Number of vocabulary terms the vectorizer kept (the most frequent terms in the corpus)
len(cv.get_feature_names_out())

5000

In [9]:
# Cosine similarity between every pair of rows of `vector`.
similarity = cosine_similarity(vector)
similarity[0] # similarity scores of the first movie against all movies (including itself)

array([1.        , 0.13957263, 0.08206099, ..., 0.        , 0.        ,
       0.        ])

In [10]:
def recommend_movie(movie: str, top_n: int = 5) -> None:
    """Print the titles of the movies most similar to ``movie``.

    The row of ``movie`` is located in the module-level ``df`` and every
    other row is ranked by its score in the module-level ``similarity``
    matrix, highest first.

    Args:
        movie: Exact title to look up in ``df["title"]``. If several rows
            share the title, the first one is used.
        top_n: Maximum number of recommendations to print (default 5).

    Raises:
        IndexError: If no row of ``df`` has the given title.
    """
    # Use positional row numbers throughout: ``similarity`` and ``iloc`` are
    # positional, so index labels must not be mixed in.
    match_positions = (df["title"] == movie).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        raise IndexError(f"Movie {movie!r} not found in df['title']")
    movie_pos = int(match_positions[0])

    scores = similarity[movie_pos]
    # Exclude the query movie explicitly rather than assuming it sorts first;
    # tied scores (e.g. identical tags) or an all-zero tag vector would break
    # that assumption and make the movie recommend itself.
    candidates = [
        (pos, score) for pos, score in enumerate(scores) if pos != movie_pos
    ]
    # sorted() is stable, so rows with equal scores keep their original order.
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
    for pos, _score in ranked[:max(top_n, 0)]:
        print(df["title"].iloc[pos])

In [30]:
# Spot-check: print the recommendations for "Iron Man".
recommend_movie("Iron Man")

Iron Man 3
Iron Man 2
Avengers: Age of Ultron
The Helix... Loaded
Teenage Mutant Ninja Turtles II: The Secret of the Ooze


In [14]:
# Persist the movie DataFrame so it can be reloaded without re-running the notebook.
movies_pkl_path = os.path.join(proj_root, "models/movies.pkl")
# Create the target directory if it is missing instead of failing on open().
os.makedirs(os.path.dirname(movies_pkl_path), exist_ok=True)
# The context manager guarantees the file is flushed and closed after the dump.
with open(movies_pkl_path, "wb") as movies_file:
    pickle.dump(df, movies_file)

In [26]:
# Spot-check: print the recommendations for "Kung Fu Panda 3".
recommend_movie("Kung Fu Panda 3")

The Book of Life
Khumba
The Peanuts Movie
The Legend of Tarzan
Stripes


In [16]:
# Persist the similarity matrix so it can be reloaded without recomputing it.
similarity_pkl_path = os.path.join(proj_root, "models/cosine_similarity.pkl")
# Create the target directory if it is missing instead of failing on open().
os.makedirs(os.path.dirname(similarity_pkl_path), exist_ok=True)
# The context manager guarantees the file is flushed and closed after the dump.
with open(similarity_pkl_path, "wb") as similarity_file:
    pickle.dump(similarity, similarity_file)

In [28]:
# Index label of the first row whose title is exactly "Avatar".
df[df["title"] == "Avatar"].index[0]

np.int64(0)