
# Data
- https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset

In [7]:
from IPython.display import display, HTML
# Inject a small stylesheet that widens the notebook, menubar and toolbar containers.
notebook_css = """
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 95%; }
    div#maintoolbar-container { width: 99%; }

</style>
"""
display(HTML(data=notebook_css))

In [None]:
import requests

import pandas as pd
import numpy as np
import copy
import json

from ast import literal_eval

import torch
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel
from transformers import OwlViTProcessor, OwlViTForObjectDetection
from transformers import pipeline
from transformers import GPT2TokenizerFast
from PIL import Image

import pickle

In [1]:
import matplotlib.pyplot as plt
from typing import List, Tuple, Dict

import sklearn.datasets as datasets
import sklearn.manifold as manifold

In [3]:
import openai
import os
import sys
from dotenv import load_dotenv

# Run dotenv first, then read the key from the process environment so that the
# secret never has to be written into the notebook itself.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [4]:
# Platform identifier of the running interpreter; later cells branch on it
# to decide where the embedding model is loaded from.
cur_os = sys.platform

In [5]:
# Directory that holds a local copy of the embedding model; only set on Windows.
# The MODEL_PATH environment variable overrides the default, so the notebook does
# not have to be edited on machines that keep the model elsewhere.
model_path = os.getenv("MODEL_PATH", "D:/github") if cur_os.startswith('win') else None

## 데이터셋

In [1]:
# Load the raw metadata; dtype=str reads every column as text so nothing is
# type-coerced on the way in.
movies_metadata = pd.read_csv('./movie_data/movies_metadata.csv', sep=",", dtype=str)
print(movies_metadata.shape)
movies_metadata.head()

NameError: name 'pd' is not defined

In [2]:
# Keep only the columns used downstream. .copy() makes the result an independent
# frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning.
movies_metadata = movies_metadata[['id', 'genres', 'title', 'overview', 'release_date']].copy()
movies_metadata.head()

NameError: name 'movies_metadata' is not defined

In [10]:
# The genres column holds the text form of a Python list of dicts; turn every
# cell into the real Python object with ast.literal_eval.
parsed_genres = movies_metadata['genres'].map(literal_eval)
movies_metadata['genres'] = parsed_genres
movies_metadata.head()

Unnamed: 0,id,genres,title,overview,release_date
0,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"Led by Woody, Andy's toys live happily in his ...",1995-10-30
1,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,When siblings Judy and Peter discover an encha...,1995-12-15
2,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",Grumpier Old Men,A family wedding reignites the ancient feud be...,1995-12-22
3,31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",1995-12-22
4,11862,"[{'id': 35, 'name': 'Comedy'}]",Father of the Bride Part II,Just when George Banks has recovered from his ...,1995-02-10


## 사용할 컬럼 설정

In [11]:
def get_genre(x, max_genres=3):
    """Join the names of the first ``max_genres`` genre entries into one string.

    Args:
        x: Iterable of dicts, each carrying a ``'name'`` key (a parsed
            ``genres`` cell).
        max_genres: Maximum number of genre names to keep. Defaults to 3.

    Returns:
        The kept names separated by single spaces; ``""`` when ``x`` is empty.
    """
    names = [genre['name'] for genre in x]
    # Slicing copes with short lists on its own, so no length check is needed.
    return " ".join(names[:max_genres])

In [12]:
# Collapse each parsed genre list into a short space-separated string.
genre_text = movies_metadata['genres'].apply(get_genre)
movies_metadata['genres'] = genre_text

In [13]:
# Preview the frame after the genres column was rewritten.
movies_metadata.head()

Unnamed: 0,id,genres,title,overview,release_date
0,862,Animation Comedy Family,Toy Story,"Led by Woody, Andy's toys live happily in his ...",1995-10-30
1,8844,Adventure Fantasy Family,Jumanji,When siblings Judy and Peter discover an encha...,1995-12-15
2,15602,Romance Comedy,Grumpier Old Men,A family wedding reignites the ancient feud be...,1995-12-22
3,31357,Comedy Drama Romance,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",1995-12-22
4,11862,Comedy,Father of the Bride Part II,Just when George Banks has recovered from his ...,1995-02-10


In [14]:
# Drop rows without an overview or a title, since both feed the text that gets
# embedded. A single combined mask plus .copy() yields an independent frame, so
# adding columns in later cells does not trigger SettingWithCopyWarning. The
# original index labels are kept as they are.
has_text = movies_metadata['overview'].notnull() & movies_metadata['title'].notnull()
movies_metadata = movies_metadata[has_text].copy()
movies_metadata.isna().sum()

id               0
genres           0
title            0
overview         0
release_date    71
dtype: int64

In [15]:
# Text that gets embedded: genres, title and overview joined with " / ".
movies_metadata['feature'] = movies_metadata['genres'] + " / " + movies_metadata['title'] + " / " + movies_metadata['overview']

# HuggingFace embedding

In [2]:
# Resolve where the encoder comes from first (a directory under model_path on
# Windows, the "sentence-transformers/..." name elsewhere), then build it once.
if cur_os.startswith('win'):
    model_name = f'{model_path}/distiluse-base-multilingual-cased-v2'
else:
    model_name = "sentence-transformers/distiluse-base-multilingual-cased-v2"
model = SentenceTransformer(model_name)

model

NameError: name 'cur_os' is not defined

In [None]:
# Encode all feature strings in one batched call instead of one model.encode()
# call per row. Given a list, encode returns one embedding per input in the
# same order, so list(...) gives one vector per DataFrame row.
feature_embeddings = model.encode(movies_metadata['feature'].tolist(), show_progress_bar=True)
movies_metadata['hf_embeddings'] = list(feature_embeddings)
movies_metadata.shape

In [1]:
# Sanity check: the frame should now carry the extra hf_embeddings column.
print(movies_metadata.shape)
movies_metadata.head()

NameError: name 'movies_metadata' is not defined

In [49]:
# Persist the frame, embeddings included.
# NOTE(review): CSV presumably stores each embedding array as its text form, so a
# reload would need to parse the column back into arrays - confirm this is intended.
movies_metadata.to_csv('./movie_data/movies_metadata_em.csv')

# OpenAI Embedding

In [26]:
# Name of the OpenAI embedding model used by get_doc_embedding.
openai_embedding_model = "text-embedding-ada-002"

In [27]:
def get_doc_embedding(text: str) -> List[float]:
    """Embed ``text`` with the notebook-level ``openai_embedding_model``.

    Thin convenience wrapper around ``get_embedding``.
    """
    embedding = get_embedding(text=text, model=openai_embedding_model)
    return embedding

In [28]:
def get_embedding(text: str, model: str) -> List[float]:
    """Request an embedding for ``text`` from the OpenAI embedding endpoint.

    Args:
        text: Input string to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The ``embedding`` field of the first item in the response's ``data`` list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

In [29]:
# movies_metadata['openai_embeddings'] = movies_metadata['feature'].apply(lambda x : get_embedding(x, openai_embedding_model))

# 1. 코사인 유사도로 사용자 query 처리하기

In [30]:
# Number of most-similar movies to return for the query examples below.
top_k = 5

In [671]:
def get_query_sim_top_k(query, model, df, top_k=5):
    """Return the ``top_k`` rows of ``df`` most similar to ``query``.

    Args:
        query: Text to search for.
        model: Encoder exposing ``encode(text)``; its vectors are compared with
            the ones stored in ``df['hf_embeddings']``.
        df: DataFrame with an ``hf_embeddings`` column holding one vector per row.
        top_k: Number of results to return. Defaults to 5 and is capped at the
            number of rows in ``df``.

    Returns:
        The ``torch.topk`` result ``(values, indices)``. Indices are positions
        within ``df`` (use them with ``iloc``), not index labels.
    """
    query_vec = torch.as_tensor(model.encode(query), dtype=torch.float32)
    # Stack the per-row vectors into one 2-D array before handing them to torch;
    # passing the pandas Series itself triggers torch's slow-conversion warning.
    corpus = torch.as_tensor(np.stack(df['hf_embeddings'].to_numpy()), dtype=torch.float32)
    cos_scores = util.pytorch_cos_sim(query_vec, corpus)[0]
    # torch.topk raises when k exceeds the number of scores, so clamp it.
    k = min(top_k, cos_scores.shape[0])
    return torch.topk(cos_scores, k=k)

query를 입력받으면 먼저 query의 embedding을 구하고, 위에서 구한 HuggingFace embedding과의 코사인 유사도를 계산함.
마지막으로 torch.topk를 활용해 유사도가 가장 높은 top_k개의 결과를 가져옴.

In [32]:
query = "Are there any documentary films?"
# get_query_sim_top_k takes the number of results as an argument; pass the
# notebook-level top_k explicitly.
top_result = get_query_sim_top_k(query, model, movies_metadata, top_k=top_k)

  b = torch.tensor(b)


In [33]:
# Show the raw topk result: similarity values and the matching row positions.
top_result

torch.return_types.topk(
values=tensor([0.5390, 0.5117, 0.5093, 0.5067, 0.4992]),
indices=tensor([24020, 10124, 22428, 35263, 22273]))

In [34]:
# top_result[1] holds row positions, so iloc is used to look the movies up.
movies_metadata.iloc[top_result[1].numpy(), :][['title', 'overview', 'genres']]

Unnamed: 0,title,overview,genres
24020,The 50 Worst Movies Ever Made,There are some movies that are so bad they're ...,Documentary
10124,Trekkies 2,sequel to the 1997 documentary film Trekkies.,Documentary
22428,The Spanish Earth,A propaganda film made during the Spanish Civi...,War Documentary
35263,Tomorrow,Documentary film about global warming.,Documentary
22273,I Know That Voice,A documentary about voice-over actors.,Documentary


In [35]:
query = "Are there any movies about natural disasters?"
# Pass the result count explicitly; the search helper takes it as an argument.
top_result = get_query_sim_top_k(query, model, movies_metadata, top_k=top_k)
movies_metadata.iloc[top_result[1].numpy(), :][['title', 'overview', 'genres']]

Unnamed: 0,title,overview,genres
38573,Catastrophe,A film cataloguing some of the world's largest...,Thriller Documentary
11441,When the Levees Broke: A Requiem in Four Acts,"In August 2005, the American city of New Orlea...",Documentary
2404,Earthquake,Earthquake is a 1974 American disaster film th...,Action Drama Thriller
35263,Tomorrow,Documentary film about global warming.,Documentary
41943,Disaster!,"A spoof of disaster films, an asteroid is comi...",Action Animation Comedy


In [36]:
query = "Are there any movies about heros?"
# Pass the result count explicitly; the search helper takes it as an argument.
top_result = get_query_sim_top_k(query, model, movies_metadata, top_k=top_k)
movies_metadata.iloc[top_result[1].numpy(), :][['title', 'overview', 'genres']]

Unnamed: 0,title,overview,genres
36936,The Flying Man,"A new superhero is coming, only this time it's...",Action Mystery Science Fiction
30101,"Up, Up, and Away",A boy is the only family member without superp...,Action Family TV Movie
24646,The Four Feathers,They made him a hero by branding him a coward ...,TV Movie Adventure Drama
12672,Hancock,Hancock is a down-and-out superhero who's forc...,Fantasy Action
4216,Too Late the Hero,A WWII film set on a Pacific island. Japanese ...,Drama Action War


https://www.imdb.com/title/tt0211174/?ref_=fn_al_tt_1

# 2. ChatGPT 이용

- 2개의 chatgpt 이용
    - 1개는 이 질문의 의도를 파악하는 것. 설명을 원하는 것인지, 추천을 해달라는 것인지
    - 각 분류에 따라 문구가 달라짐
        - 설명이면, 가장 유사한 텍스트를 가져와서 설명해주는 것
        - 추천이면, cossim topk를 가져와서 출력하도록

In [1]:
# 기본적인 chatGPT python example 입니다.
# import openai
#
# completion = openai.ChatCompletion.create(
#     model="gpt-3.5-turbo",
#     messages=[{"role": "user", "content": "Tell the world about the ChatGPT API in the style of a pirate."}]
# )
#
# print(completion)

In [2]:
# Response format
# https://platform.openai.com/docs/guides/chat/introduction

# {
#     'id': 'chatcmpl-6p9XYPYSTTRi0xEviKjjilqrWU2Ve',
#     'object': 'chat.completion',
#     'created': 1677649420,
#     'model': 'gpt-3.5-turbo',
#     'usage': {'prompt_tokens': 56, 'completion_tokens': 31, 'total_tokens': 87},
#     'choices': [
#         {
#             'message': {
#                 'role': 'assistant',
#                 'content': 'The 2020 World Series was played in Arlington, Texas at the Globe Life Field, which was the new home stadium for the Texas Rangers.'},
#             'finish_reason': 'stop',
#             'index': 0
#         }
#     ]
# }

In [37]:
def print_msg(msg):
    """Send ``msg`` to the gpt-3.5-turbo chat endpoint and return the reply text.

    Despite the name, nothing is printed; the content of the first choice is
    returned to the caller.

    Args:
        msg: List of chat messages (dicts with ``role`` and ``content``).
    """
    response = openai.ChatCompletion.create(messages=msg, model="gpt-3.5-turbo")
    first_choice = response['choices'][0]
    # ChatGPT's answer
    return first_choice['message']['content']

## Prompt test

아래 prompt는 사용자의 의도(intent)를 파악하기 위한 것이다. 별도의 분류 모델을 만들지 않고도 ChatGPT만으로 zero-shot 형태의 의도 분류기를 구축할 수 있다!

In [38]:
# Intent-classification probe: a "tell me about ..." request.
messages = [
    {"role": "system", "content": "You are a helpful assistant who understands the intent of the user's question."},
    {"role": "user", "content": "Which category does the sentence below belong to: 'description', 'recommended'? Show only categories. \n context: tell me about instagram \n A:"}
]


print_msg(messages)

'description'

In [39]:
# Intent-classification probe: a question asking for similar apps.
messages = [
    {"role": "system", "content": "You are a helpful assistant who understands the intent of the user's question."},
    {"role": "user", "content": "Which category does the sentence below belong to: 'description', 'recommend'? Show only categories. \n context: What apps are similar to Instagram? \n A:"}
]


print_msg(messages)



'recommend'

In [40]:
# Intent-classification probe: an explicit "recommend ..." request.
messages = [
    {"role": "system", "content": "You are a helpful assistant who understands the intent of the user's question."},
    {"role": "user", "content": "Which category does the sentence below belong to: 'description', 'recommend'? Show only categories. \n context: Recommend apps similar to Instagram. \n A:"}
]


print_msg(messages)

'recommend'

In [41]:
# Lead-in experiment: ask the model to restate the context with a sentence in front.
messages = [
    {"role": "system", "content": "You are a helpful assistant who recommend contents."},
    {"role": "user", "content": "Simply repeat the provided context and put a sentence in front of the context. \n context: Recommend apps similar to Instagram."}
]


print_msg(messages)

'Here are some apps similar to Instagram that you might want to check out:'

In [42]:
# Lead-in experiment: ask the model to simplify a recommendation sentence.
messages = [
    {"role": "system", "content": "You are a helpful assistant who recommend contents."},
    {"role": "user", "content": "Simplify the sentences for recommending services \n context: Recommend apps similar to Instagram."}
]


print_msg(messages)

'Here are some apps like Instagram that you might find helpful!'

In [43]:
# Lead-in experiment: greeting used before explaining an item.
messages = [
    {"role": "system", "content": "You are a helpful assistant who kindly answers."},
    {"role": "user", "content": "Please write a simple greeting starting with 'of course' to explain the item to the user."}
]


print_msg(messages)

"Of course! I'd be happy to explain the item to you."

In [44]:
# Lead-in experiment: greeting used before recommending items.
messages = [
    {"role": "system", "content": "You are a helpful assistant who recommend contents based on user question."},
    {"role": "user", "content": "Write 1 sentence of a simple greeting that starts with 'Of course!' to recommend items to users."}
]


print_msg(messages)

"Of course! I'd be happy to recommend some great items for you."

In [103]:
# Re-display the rows selected by the most recent top_result.
movies_metadata.iloc[top_result[1].numpy(), :][['title', 'overview', 'genres']]

Unnamed: 0,title,overview,genres
36936,The Flying Man,"A new superhero is coming, only this time it's...",Action Mystery Science Fiction
30101,"Up, Up, and Away",A boy is the only family member without superp...,Action Family TV Movie
24646,The Four Feathers,They made him a hero by branding him a coward ...,TV Movie Adventure Drama
12672,Hancock,Hancock is a down-and-out superhero who's forc...,Fantasy Action
4216,Too Late the Hero,A WWII film set on a Pacific island. Japanese ...,Drama Action War


## 3. 필요한 Prompt 설정

- 사용자의 의도(추천 / 설명 / 검색)를 먼저 파악한 뒤, 의도에 맞는 답변을 내놓도록 분기 처리!

In [815]:
# Prompt templates handed to ChatGPT, keyed by purpose, so the flow can branch on
# the intent of the user's query:
#   'recom'  - lead-in sentence for recommendations
#   'desc'   - lead-in sentence for descriptions
#   'intent' - classify the intent of the query (the query text is appended later)

msg_prompt = {
    'recom' : {
                'system' : "You are a helpful assistant who recommend movie based on user question.", 
                'user' : "Write 1 sentence of a simple greeting that starts with 'Of course!' to recommend movie items to users.", 
              },
    'desc' : {
                'system' : "You are a helpful assistant who kindly answers.", 
                'user' : "Please write a simple greeting starting with 'of course' to explain the item to the user.", 
              },
    'intent' : {
                'system' : "You are a helpful assistant who understands the intent of the user's question.",
                'user' : "Which category does the sentence below belong to: 'description', 'recommended', 'search'? Show only categories. \n context:"
                }
}

In [870]:
# Conversation memory: user_interact appends one assistant entry per answered query.
user_msg_history = []

In [871]:
# Helper that returns ChatGPT's answer

def get_chatgpt_msg(msg):
    """Send the chat messages in ``msg`` to gpt-3.5-turbo and return the reply text.

    Args:
        msg: List of chat messages (dicts with ``role`` and ``content``).

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    response = openai.ChatCompletion.create(messages=msg, model="gpt-3.5-turbo")
    reply = response['choices'][0]['message']
    return reply['content']

In [872]:
# Builds the ChatGPT message list for a detected intent (search / recommend /
# describe), or the intent-classification prompt when no intent is known yet.

def set_prompt(intent, query, msg_prompt_init, model):
    """Build the list of chat messages for the given intent.

    Args:
        intent: Intent text. Anything containing ``'recom'`` or ``'search'``
            selects the recommendation template, anything containing ``'desc'``
            the description template; every other value selects the
            intent-classification template.
        query: The user's question; only used for the intent-classification
            prompt, where it is appended to the template's user text.
        msg_prompt_init: Mapping with ``'recom'``, ``'desc'`` and ``'intent'``
            entries, each a ``{role: content}`` dict. It is not modified.
        model: Unused; kept so existing callers keep working.

    Returns:
        One ``{'role': ..., 'content': ...}`` dict per entry of the selected
        template, in template order.
    """
    # Work on a shallow copy so the caller's templates are never changed in place.
    if ('recom' in intent) or ('search' in intent):
        msg = dict(msg_prompt_init['recom'])
    elif 'desc' in intent:
        msg = dict(msg_prompt_init['desc'])
    else:
        msg = dict(msg_prompt_init['intent'])
        # Append the user's question to the classification instruction.
        msg['user'] = f"{msg['user']} {query} \n A:"
    # One message per role. Reusing a single dict here would keep only the last
    # role and silently drop the system message.
    return [{'role': role, 'content': content} for role, content in msg.items()]

In [872]:
def _history_text(content):
    """Return a history entry's content as plain text (entries are str or {'feature': str})."""
    if isinstance(content, dict):
        return str(content.get('feature', ''))
    return str(content)


def user_interact(query, model, msg_prompt_init):
    """Answer one user query: detect its intent, then recommend/search or describe.

    Reads the notebook-level ``movies_metadata`` frame and appends the answer to
    the notebook-level ``user_msg_history`` list. Progress and the answer are
    printed; nothing is returned.

    Args:
        query: The user's question.
        model: Sentence encoder passed on to ``get_query_sim_top_k``.
        msg_prompt_init: Prompt templates passed on to ``set_prompt``.
    """
    # 1. Ask ChatGPT to classify the intent of the query.
    intent_prompt = set_prompt('intent', query, msg_prompt_init, None)
    user_intent = get_chatgpt_msg(intent_prompt).lower()
    print("user_intent : ", user_intent)

    # 2. Build the prompt for that intent and fetch the matching lead-in sentence.
    intent_data = set_prompt(user_intent, query, msg_prompt_init, model)
    intent_data_msg = get_chatgpt_msg(intent_data).replace("\n", "").strip()
    print("intent_data_msg : ", intent_data_msg)

    # 3-1. Recommendation or search.
    if ('recom' in user_intent) or ('search' in user_intent):
        # When the previous turn was an assistant answer, search with that answer
        # instead of the raw query. History entries are either plain strings
        # (recommendations) or {'feature': ...} dicts (descriptions), so the text
        # is extracted for both shapes rather than always indexing ['feature'].
        if (len(user_msg_history) > 0) and (user_msg_history[-1]['role'] == 'assistant'):
            query = _history_text(user_msg_history[-1]['content']) or query
        # Fetch the most similar items.
        top_result = get_query_sim_top_k(query, model, movies_metadata, top_k=3)
        # For a search, drop the first hit (the item itself); indices are row positions.
        top_index = top_result[1].numpy() if 'recom' in user_intent else top_result[1].numpy()[1:]
        # Collect genres, title and overview of every hit for the printed answer.
        r_set_d = movies_metadata.iloc[top_index, :][['genres', 'title', 'overview']]
        r_set_d = json.loads(r_set_d.to_json(orient="records"))
        recom_msg = "".join(
            "".join(f"{v} \n" for v in r.values()) + "\n"
            for r in r_set_d
        )
        user_msg_history.append({'role' : 'assistant', 'content' : f"{intent_data_msg} {str(recom_msg)}"})
        print(f"\n recom data : {intent_data_msg} {str(recom_msg)}")
    # 3-2. Description.
    elif 'desc' in user_intent:
        # The description refers to the previous answer, so search with its text;
        # with no history yet, fall back to the query itself.
        if len(user_msg_history) > 0:
            context = _history_text(user_msg_history[-1]['content']) or query
        else:
            context = query
        top_result = get_query_sim_top_k(context, model, movies_metadata, top_k=1)
        # The 'feature' column is treated as the detailed description of the item.
        r_set_d = movies_metadata.iloc[top_result[1].numpy(), :][['feature']]
        r_set_d = json.loads(r_set_d.to_json(orient="records"))[0]
        user_msg_history.append({'role' : 'assistant', 'content' : r_set_d})
        print(f"\n describe : {intent_data_msg} {r_set_d}")

## 쿼리에 따른 추천 프로세스 실행

In [874]:
# First turn: a recommendation request. A deep copy of msg_prompt is passed so the
# shared templates cannot be changed by the call.
query = "Please recommend a movie similar to a marvel heros movie."
user_interact(query, model, copy.deepcopy(msg_prompt))

user_intent :  recommended
intent_data_msg :  Of course! Here are some top-rated movie items that you might enjoy.

 recom data : Of course! Here are some top-rated movie items that you might enjoy. 

X-Men 
Two mutants, Rogue and Wolverine, come to a private academy for their kind whose resident superhero team, the X-Men, must oppose a terrorist organization with similar powers. 




In [876]:
# Second turn: ask for a description of the previous answer.
query = "Can you describe on the above?"
user_interact(query, model, copy.deepcopy(msg_prompt))

user_intent :  description
intent_data_msg :  Of course! Let me explain what this item is and how it works.

 describe : Of course! Let me explain what this item is and how it works. {'feature': 'Adventure Action Science Fiction / X-Men / Two mutants, Rogue and Wolverine, come to a private academy for their kind whose resident superhero team, the X-Men, must oppose a terrorist organization with similar powers.'}


In [877]:
# Inspect what has been stored in the conversation history so far.
user_msg_history

[{'role': 'assistant',
  'content': 'Of course! Here are some top-rated movie items that you might enjoy. \n\nX-Men \nTwo mutants, Rogue and Wolverine, come to a private academy for their kind whose resident superhero team, the X-Men, must oppose a terrorist organization with similar powers. \n\n'},
 {'role': 'assistant',
  'content': {'feature': 'Adventure Action Science Fiction / X-Men / Two mutants, Rogue and Wolverine, come to a private academy for their kind whose resident superhero team, the X-Men, must oppose a terrorist organization with similar powers.'}}]

In [878]:
# Third turn: ask for more movies similar to the previous answer.
query = "Are there other movies that are similar to the ones above?"
user_interact(query, model, copy.deepcopy(msg_prompt))

user_intent :  'search'
intent_data_msg :  Of course! We have a great selection of movie items that will fit your every need.

 recom data : Of course! We have a great selection of movie items that will fit your every need. 
X-Men: Days of Future Past 
The ultimate X-Men ensemble fights a war for the survival of the species across two time periods as they join forces with their younger selves in an epic battle that must change the past – to save our future. 


