## **Import data**

In [35]:
import json
import pandas as pd

In [36]:
# Read the scraped results. JSON text is UTF-8, so decode it explicitly rather
# than relying on the platform's default encoding (which is not UTF-8 everywhere
# and would garble or reject non-ASCII titles/overviews).
with open('scrape/result.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# The movie records sit under the first entry of "results".
movies_info = data["results"][0]["content"]["movies"]
df = pd.DataFrame(movies_info)
df.head()

Unnamed: 0,title,overview
0,1. Inside the Yellow Cocoon Shell,"A man returns to his hometown, where he's haun..."
1,2. The Last Wife,"In the midst of the Nguyen Dynasty, a reluctan..."
2,3. Furies,Prequel to Furie (2019). Tells the story about...
3,4. Wolfoo and Friends,Wolfoo is a cute wolf living on a hill in a sm...
4,5. Cyclo,When a poor bicycle-taxi driver has his cyclo ...


In [37]:
# Quick sanity check: (rows, columns) of the freshly loaded frame.
df.shape

(50, 2)

## **Preprocessing**

In [38]:
# Scraped titles carry their list rank in front ("1. ", "2. ", ...).
# Remove that leading "<digits>. " so only the movie name remains.
RANK_PREFIX = r'^\d+\.\s'
cleaned_titles = df['title'].str.replace(RANK_PREFIX, '', regex=True)
df['title'] = cleaned_titles
df.head()

Unnamed: 0,title,overview
0,Inside the Yellow Cocoon Shell,"A man returns to his hometown, where he's haun..."
1,The Last Wife,"In the midst of the Nguyen Dynasty, a reluctan..."
2,Furies,Prequel to Furie (2019). Tells the story about...
3,Wolfoo and Friends,Wolfoo is a cute wolf living on a hill in a sm...
4,Cyclo,When a poor bicycle-taxi driver has his cyclo ...


In [39]:
# Drop movies that are missing a title or an overview, then renumber the rows
# 0..n-1. Without the reset, the surviving rows keep their old index labels
# (with gaps), while the vectorized/similarity arrays built later are purely
# positional -- so any lookup through `.index` would point at the wrong row
# (or past the end of the similarity matrix).
df = df.dropna().reset_index(drop=True)
df.shape

(47, 2)

In [40]:
# Preview the cleaned frame; keep the display bounded instead of dumping every row.
df.head(10)

Unnamed: 0,title,overview
0,Inside the Yellow Cocoon Shell,"A man returns to his hometown, where he's haun..."
1,The Last Wife,"In the midst of the Nguyen Dynasty, a reluctan..."
2,Furies,Prequel to Furie (2019). Tells the story about...
3,Wolfoo and Friends,Wolfoo is a cute wolf living on a hill in a sm...
4,Cyclo,When a poor bicycle-taxi driver has his cyclo ...
5,Bridge of Destiny,Cuong is an admirable man. He has a successful...
6,Song of the South,Song of the South (Dat Rung Phuong Nam) is bas...
7,Furie,"When her daughter is kidnapped, a desperate ye..."
8,Ngay Xua Ngay Xua,Once Upon a Time is the Vietnamese children's ...
9,Sister Sister,A late night radio host offers a room at her o...


## **Vectorization**

In [41]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counter: English stop words are ignored and the vocabulary is
# capped at 5000 terms.
cv = CountVectorizer(
    stop_words='english',
    max_features=5000,
)

In [42]:
# Learn the vocabulary from the overviews and count terms per movie,
# then convert the result into a dense array (one row per movie).
overview_counts = cv.fit_transform(df['overview'])
vector = overview_counts.toarray()

In [43]:
# (number of movies, number of vocabulary terms kept by the vectorizer)
vector.shape

(47, 780)

## **Main Function**

In [44]:
from sklearn.metrics.pairwise import cosine_similarity

In [45]:
# Pairwise cosine similarity between every pair of rows of `vector`:
# similarity[i][j] compares movie i with movie j by *row position*.
similarity = cosine_similarity(vector)

In [46]:
# Inspect the matrix: the diagonal is 1.0 (each movie compared with itself).
similarity

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.08908708, ..., 0.        , 0.        ,
        0.03779645],
       [0.        , 0.08908708, 1.        , ..., 0.        , 0.06299408,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.06299408, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.03779645, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [47]:
# Look up the row for a given title. NOTE: `.index[0]` is the row's index
# *label*; it equals the row's position in `similarity` only when df has a
# gap-free 0..n-1 index.
df[df['title'] == "Inside the Yellow Cocoon Shell"].index[0]

0

In [48]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``df['title']``. If the title occurs more
        than once, the first occurrence is used.
    top_n : int, default 5
        How many recommendations to print.

    Raises
    ------
    IndexError
        If ``movie`` is not present in ``df['title']``.

    Notes
    -----
    Reads the notebook-level ``df`` and ``similarity``. Rows of ``similarity``
    are addressed by *position*, so the movie is located by its row position in
    ``df`` rather than by its index label; this stays correct even when the
    index has gaps (e.g. after ``dropna()`` without a reset).
    """
    positions = [pos for pos, title in enumerate(df['title']) if title == movie]
    if not positions:
        raise IndexError(f"movie {movie!r} not found in df['title']")
    position = positions[0]

    # Exclude the queried movie explicitly instead of assuming it sorts first:
    # another movie tying for the top score could otherwise push it into the
    # recommendations.
    candidates = [
        (other, score)
        for other, score in enumerate(similarity[position])
        if other != position
    ]
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    for other, _score in ranked[:top_n]:
        print(df['title'].iloc[other])

In [50]:
# Smoke test: print the recommendations for one known title.
recommend("Inside the Yellow Cocoon Shell")

Goodbye Mother
Bridge of Destiny
Three Seasons
Taste
Dreamy Eyes


## **Export**

In [51]:
import pickle

In [52]:
# Persist the movie table and the similarity matrix for later use.
# `with` guarantees each file is flushed and closed once its dump finishes,
# instead of leaving the handles open until garbage collection.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(df, movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)