# Project Portfolio - Build a Movie Recommendation System in Python

### Reading in Our Movie Data in Pandas

In [1]:
import pandas as pd

In [2]:
# Load the movie table from movies.csv (relative path: the file must sit in the
# notebook's working directory) into a DataFrame.
movies = pd.read_csv("movies.csv")

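You can download the dataset from [here](https://files.grouplens.org/datasets/movielens/ml-25m.zip). The link is a `.zip` archive; extract `movies.csv` from it and place the file in the same directory as this notebook before running the cell above.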

In [4]:
# Preview the first rows to check the columns that were loaded.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Cleaning Movie Titles Using Regex

In [5]:
import re

In [56]:
def cleaning_title(title):
    """Strip every character that is not an ASCII letter, a digit, or whitespace.

    Parameters
    ----------
    title : str
        Raw movie title, e.g. ``"Toy Story (1995)"``.

    Returns
    -------
    str
        The title with punctuation and other symbols removed, e.g.
        ``"Toy Story 1995"``. Whitespace is left untouched, so removing a
        symbol that sat between two spaces leaves both spaces in place.

    Raises
    ------
    TypeError
        If ``title`` is not a string (for example a missing value / NaN).

    Notes
    -----
    Only the ASCII ranges a-z, A-Z and 0-9 are kept, so accented letters are
    removed as well. All whitespace characters (not only the space character)
    are preserved.
    """
    # Use `+` (one or more) rather than `*` (zero or more): with `*` the pattern
    # also matches the empty string at every position, so re.sub performs a
    # pointless no-op substitution between all kept characters. The result is
    # the same; the pattern now only matches real runs of unwanted characters.
    return re.sub(r"[^a-zA-Z0-9\s]+", "", title)

In [57]:
# Store a punctuation-free version of each title in a new column; the original
# "title" column is kept as-is for display.
movies["clean_title"] = movies["title"].map(cleaning_title)

In [58]:
# Preview the first rows again to compare "title" with the new "clean_title" column.
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


### Creating a TF-IDF Matrix

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [62]:
# ngram_range=(1, 2): build features from single words (unigrams) and from
# pairs of adjacent words (bigrams).
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

In [63]:
# Fit the vectorizer on the cleaned titles and transform them in one step;
# the result has one row per entry of movies["clean_title"].
tfidf_matrix = vectorizer.fit_transform(movies["clean_title"])

In [72]:
# Inspect the vocabulary learned by the vectorizer.
# NOTE(review): `[10:]` skips the first ten entries and returns everything after
# them; if the goal was a short preview of the first ten features, `[:10]` is
# the slice to use — confirm intent.
vectorizer.get_feature_names_out()[10:]

array(['009', '009 re', '0091', ..., 'zyzzyx rd', 'zzero', 'zzero 1974'],
      dtype=object)