In [1]:
import pandas as pd
import tqdm
from tqdm import tqdm

import spacy
# Load the large English spaCy pipeline used for lemmatization below.
# The model ships separately from spaCy; install it once with
#   python -m spacy download en_core_web_lg
try:
    nlp = spacy.load("en_core_web_lg")
except OSError as err:
    # spaCy raises OSError (E050) when the model package is missing; re-raise
    # with the exact remedy so the failure is actionable from the notebook.
    raise OSError(
        "spaCy model 'en_core_web_lg' is not installed. "
        "Run `python -m spacy download en_core_web_lg` and restart the kernel."
    ) from err

OSError: [E050] Can't find model 'en_core_web_lg'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [None]:
# Read the Netflix titles catalogue (CSV expected in the working directory)
# and preview the first rows.
csv_path = 'netflix_titles.csv'
df = pd.read_csv(csv_path)
df.head()

### Testing out spaCy on 1 Description

In [None]:
# Peek at the description stored under row label 0.
df.loc[0, 'description']

In [None]:
# Run the first description through the spaCy pipeline and display the result.
first_description = df['description'][0]
doc = nlp(first_description)
doc

In [None]:
# Keep only the lemmas of content tokens (drop stop words and punctuation).
# The result is stored under its own name so `doc` stays a spaCy Doc and this
# cell can be re-run; overwriting `doc` with a list of strings made a second
# run fail because plain strings have no `is_stop` / `is_punct` attributes.
lemmas = [
    token.lemma_
    for token in doc
    if not token.is_stop and not token.is_punct
]
lemmas

### Tokenize, Remove Stop Words and Punctuation

In [None]:
# Extract lemmatized tokens from each title's description and genre list using spaCy.
#
# The two fields are joined with a space: plain concatenation glued the last
# word of the description to the first genre (e.g. "school." + "Dramas" became
# "school.Dramas"), which can end up as a single bogus token.
texts = df['description'] + ' ' + df['listed_in']

tokens = []

# nlp.pipe processes the texts in batches instead of one nlp() call per row;
# total= gives tqdm the length it needs to draw an actual progress bar.
for doc in tqdm(nlp.pipe(texts), total=len(texts)):
    # Keep lemmas of content tokens only (no stop words, no punctuation).
    tokens.append([
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_punct
    ])

In [None]:
# Lemmas extracted for the first row.
tokens[0]

In [None]:
# Attach the per-row lemma lists as a new 'tokens' column (mutates df in place)
# and preview the frame.
# NOTE(review): the TF-IDF step further down vectorizes 'full_description',
# not this column — confirm whether 'tokens' is meant to feed the model.
df['tokens'] = tokens
df.head()

### Vectorize Tokens (TFIDF)

In [None]:
# Build the text to vectorize as "<listed_in>. <description>" for every row,
# then show the first combined string.
df['full_description'] = df['listed_in'].str.cat(df['description'], sep='. ')
df['full_description'][0]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams and bigrams; drop terms that appear in more than 97% of the
# documents or in fewer than 2 documents, and cap the vocabulary at 5000 terms.
tfidf = TfidfVectorizer(ngram_range=(1,2),
                      max_df=0.97,
                      min_df=2,
                      max_features = 5000)

# Sparse document-term matrix: one row per title, one column per term.
sparse_dtm = tfidf.fit_transform(df['full_description'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back only on old releases that do not provide it yet.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
dtm = pd.DataFrame(sparse_dtm.toarray(), columns=feature_names)

dtm.head()

### K-Nearest Neighbors

In [None]:
from sklearn.neighbors import NearestNeighbors

# Index every row of the TF-IDF matrix; each query returns its 5 nearest rows.
# fit() returns the estimator itself, so it can be chained and then displayed.
nn = NearestNeighbors(algorithm='kd_tree', n_neighbors=5).fit(dtm)
nn

In [None]:
# Sanity check: query with the first title's own vector, passed as a
# single-row 2-D ndarray.
nn.kneighbors(dtm.iloc[[0]].values)

In [None]:
# Same query as above, this time handing over the pandas Series itself
# rather than its underlying ndarray.
first_row = dtm.iloc[0]
nn.kneighbors([first_row])

### Query Movie Recommender

In [None]:
# Free-text query describing the kind of title to look for; kept in a
# one-element list because it is passed to tfidf.transform, which takes an
# iterable of documents.
movie = ["action kids movie high school"]

In [None]:
# Vectorize the query with the already-fitted TF-IDF vocabulary.
new = tfidf.transform(movie)

In [None]:
# Display the vectorized query.
new

In [None]:
# Distances and row positions of the titles nearest to the query.
# toarray() is used instead of todense(): todense() returns an np.matrix,
# which scikit-learn >= 1.2 rejects with a TypeError.
nn.kneighbors(new.toarray())

In [None]:
# Row positions of the nearest titles for the single query, as a plain list.
# toarray() is used instead of todense(): todense() returns an np.matrix,
# which scikit-learn >= 1.2 rejects with a TypeError.
# return_distance=False makes kneighbors return only the index array.
neighbor_rows = nn.kneighbors(new.toarray(), return_distance=False)
recommendations = neighbor_rows[0].tolist()
recommendations

In [None]:
# Show the combined genre + description text for the row labelled 3501.
# NOTE(review): 3501 is hard-coded — presumably taken from an earlier
# `recommendations` output; confirm it still matches after a re-run.
df['full_description'][3501]

In [None]:
# Look up the recommended rows by position and show the columns that
# identify each title.
display_cols = ['type', 'title', 'cast', 'description']
df.iloc[recommendations][display_cols]