# doc2vec 추천

# ---------------------------------------------

## 라이브러리 호출

In [None]:
import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import time
import nltk

from PIL import Image
from io import BytesIO

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from gensim.models import Word2Vec # word2vec의 알고리즘 호출
from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec

from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK stop-word corpus; remove_stop_words() reads the English list from it
nltk.download('stopwords')


# 전체 데이터 개수

In [None]:
# Load the game dataset from the Excel workbook and report its row count
df = pd.read_excel("gamedata_eng.xlsx")
print('Total number of documents:', df.shape[0])

In [None]:
# Preview the first 10 rows
df.head(10)

# 날짜순 정렬은 어렵다 => 최신순 정렬은 생략
- `df['release_date'] = pd.to_datetime(df['release_date'])` 로 변환을 시도했지만, 날짜 표기 형식이 행마다 달라 실패한다.
- 오류 예 1: `Unknown string format: 1/ago./2019`
- 오류 예 2: `time data 'Feb 2021' does not match format '%d %b, %Y' (match)`
- 결과적으로 날짜가 이상하게 나온다.


# 데이터 살펴보기

In [None]:
# Report the dataset size, then how many rows lack a metacritic score / a recommendation count
total_rows = len(df)
print('데이터의 총 개수: ', total_rows)
for label, column in (
    ('metacritic이 없는 데이터의 개수 : ', 'metacritic'),
    ('추천이 없는 데이터의 개수 : ', 'recommendations'),
):
    print(label, df[column].isna().sum())

# metacritic 없는 데이터를 제거한 dataset 생성

In [None]:
# Build a dataset that keeps only the rows that have a metacritic score
df_metacritic = df.dropna(subset=['metacritic'])
print('metacritic 개수 : ', df_metacritic.shape[0])

In [None]:
# List the column names of the metacritic-only dataset
df_metacritic.columns

# metacritic이 있는 데이터에서 상위 200개를 추출 (recommendation 고려)

In [None]:
# Cast both ranking columns to numeric so the sort compares numbers,
# then order by metacritic score and recommendation count, highest first
ranking_columns = ['metacritic', 'recommendations']
df_metacritic = df_metacritic.assign(
    metacritic=pd.to_numeric(df_metacritic['metacritic']),
    recommendations=pd.to_numeric(df_metacritic['recommendations']),
).sort_values(by=ranking_columns, ascending=False)

In [None]:
# Show the 10 top-ranked rows after sorting
df_metacritic.head(10)

In [None]:
# Put the 200 best-ranked games that have a metacritic score into game_list.
# head(200) takes exactly 200 rows (the previous [:201] slice took 201), and .copy()
# makes game_list independent of df_metacritic.
game_list = df_metacritic.head(200).copy()
game_list


# 데이터 정제

In [None]:
# Data cleaning functions

def _removeNonAscii(s):
    return "".join(i for i in str(s) if  ord(i) < 128)

def make_lower_case(text):
    """Return *text* converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_stop_words(text):
    """Return *text* without NLTK English stop words.

    The text is split on whitespace, tokens found in the NLTK English
    stop-word list are dropped, and the rest are re-joined with single spaces.
    """
    tokens = text.split()
    english_stops = frozenset(stopwords.words("english"))
    return " ".join(token for token in tokens if token not in english_stops)

def remove_html(text):
    """Return *text* with every ``<...>`` span (shortest match, no newline inside) removed."""
    return re.sub('<.*?>', r'', text)

def remove_punctuation(text):
    """Return only the runs of ASCII letters in *text*, joined by single spaces.

    Digits, punctuation and every other character act as separators and are dropped.
    """
    letters_only = RegexpTokenizer(r'[a-zA-Z]+')
    return " ".join(letters_only.tokenize(text))


In [None]:
# Data cleaning
#
# The order of the steps matters:
#   1. missing descriptions become '' first; otherwise str(NaN) turns them into the token "nan"
#   2. HTML tags are removed while '<' and '>' are still present; after punctuation stripping
#      the tag pattern can no longer match and the tag names would stay in the text
#   3. punctuation is stripped before stop-word filtering, so "the," is recognised as "the"
cleaning_steps = (
    _removeNonAscii,
    remove_html,
    make_lower_case,
    remove_punctuation,
    remove_stop_words,
)

cleaned = df['short_description'].fillna('')
for step in cleaning_steps:
    cleaned = cleaned.apply(step)
df['cleaned'] = cleaned


# 빈 행 제거

In [None]:
# Remove rows whose cleaned description is empty.
# Assign the result back instead of calling replace(..., inplace=True) on a column selection:
# the in-place form acts on a temporary object and is unreliable under pandas copy-on-write.
df['cleaned'] = df['cleaned'].replace('', np.nan)
# Reset the index so that row labels equal row positions again; the similarity matrix
# built later is addressed by position.
df = df[df['cleaned'].notna()].reset_index(drop=True)
print('Total number of documents after cleaning:', len(df))

In [None]:
# Show the first 2 rows, now including the 'cleaned' column
df.head(2)

# word2Vec 모델 만들기

1천억 단어 규모의 구글 뉴스 데이터로 300만개의 단어의 임베딩을 
미리 학습시킨 Word2Vec 임베딩을 다운 받을 수 있다. 
"GoogleNews-vectors-negative300.bin.gz" 다운받아 압축을 풀면 약 3GB 크기의 파일이 생긴다.

In [None]:
import gensim
# Print the installed gensim version before the install cell below changes it
print(gensim.__version__)

In [None]:
pip install --upgrade gensim==3.8.3

# import IPython
# IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
# Print the gensim version again, after the install cell above
print(gensim.__version__)

In [None]:
# Build Word2Vec model
# One token list per cleaned description
corpus = [words.split() for words in df['cleaned']]
# 300-dimensional vectors, context window of 5, words seen fewer than 2 times are ignored.
# workers must be a positive thread count: with workers=-1 gensim starts no worker
# threads, so a later train() call processes nothing.
word2vec_model = Word2Vec(vector_size=300, window=5, min_count=2, workers=4)
word2vec_model.build_vocab(corpus)


In [None]:
# Display the model's current vectors_lockf array (it is overwritten with ones in the next cell)
word2vec_model.wv.vectors_lockf

In [None]:
# Build the vocabulary from the corpus again (repeats the call made right after constructing the model)
word2vec_model.build_vocab(corpus)
# Replace vectors_lockf with one float32 1.0 per vocabulary entry.
# NOTE(review): presumably required so intersect_word2vec_format(lockf=1.0) can address every
# vector — confirm against the gensim documentation.
word2vec_model.wv.vectors_lockf = np.ones(len(word2vec_model.wv),dtype=np.float32)

In [None]:
# Merge the pre-trained GoogleNews vectors (binary word2vec format) into the model's vectors.
# NOTE(review): per the gensim docs only words already in the vocabulary should be affected,
# and lockf=1.0 should leave them trainable — confirm.
word2vec_model.wv.intersect_word2vec_format('GoogleNews-vectors-negative300.bin.gz', lockf=1.0, binary=True) # pre-trained data
# Train on the game-description corpus for 15 epochs
word2vec_model.train(corpus, total_examples=word2vec_model.corpus_count, epochs=15) 

In [None]:
# Check that the embedding worked: round-trip the model through disk, then look up one vector
word2vec_model.save('word2vec.model')
word2vec_model = Word2Vec.load('word2vec.model')
# 'cleaned' is a DataFrame column name and not necessarily a token of the corpus; fall back
# to the first vocabulary entry so the check cannot fail with KeyError.
probe_word = 'cleaned' if 'cleaned' in word2vec_model.wv else word2vec_model.wv.index_to_key[0]
word2vec_model.wv[probe_word]

# 각 문서의 벡터를 추출, 이를 이용하여 유사도 행렬 계산

In [None]:
# Get document vectors
def get_document_vectors(document_list, model):
    """Return one averaged word vector per document.

    Args:
        document_list: Iterable of whitespace-separated token strings.
        model: Object exposing ``model.wv``, which must support ``word in wv``,
            ``wv[word]`` and ``wv.vector_size`` (a gensim ``KeyedVectors`` does,
            in both gensim 3.x and 4.x).

    Returns:
        A list with exactly one vector per input document, in input order. A
        document's vector is the mean of the vectors of its in-vocabulary words;
        a document without any in-vocabulary word gets an all-zero vector, so
        that position ``i`` of the result always belongs to document ``i``.
    """
    # `word in wv` / `wv[word]` replace `model.wv.vocab` / `model[word]`, which gensim 4 removed
    word_vectors = model.wv
    document_embedding_list = []

    for line in document_list:
        known_vectors = [word_vectors[word] for word in line.split() if word in word_vectors]
        if known_vectors:
            doc2vec = np.sum(known_vectors, axis=0) / len(known_vectors)
        else:
            # Keep a placeholder instead of skipping the document: skipping would shift every
            # later vector away from its row in the source table.
            doc2vec = np.zeros(word_vectors.vector_size, dtype=np.float32)
        document_embedding_list.append(doc2vec)

    return document_embedding_list

# Embed every cleaned description
document_embedding_list = get_document_vectors(df['cleaned'], word2vec_model)
n_vectors = len(document_embedding_list)
print('Number of document vectors:', n_vectors)
print(document_embedding_list)

# Pairwise cosine similarity of all document vectors against each other
cosine_similarities = cosine_similarity(X=document_embedding_list, Y=document_embedding_list)
print('Size of cosine similarity matrix:', cosine_similarities.shape)




# 선택한 게임에 대해 1개당 추천 20개씩 추출

In [None]:
# 게임에 대한 appid를 추출하는 함수
import pandas as pd

def get_game_indices(name, df, cosine_similarities, top_n=20):
    """Return the row positions of the games most similar to *name*.

    Args:
        name: Title to look up in ``df['name']``. If several rows share the
            title, the first one is used.
        df: Game table. Row position ``i`` is assumed to correspond to row ``i``
            of ``cosine_similarities``.
        cosine_similarities: Square matrix of pairwise similarities.
        top_n: Maximum number of similar games to return.

    Returns:
        A list of positional row indices (usable with ``df.iloc``), ordered from
        most to least similar and never containing the queried game itself.

    Raises:
        KeyError: If no row of ``df`` has the given name.
    """
    # Work with positions, not index labels: the matrix and the callers' .iloc are positional
    positions = np.flatnonzero(df['name'].to_numpy() == name)
    if positions.size == 0:
        raise KeyError(name)
    idx = int(positions[0])
    print(idx)

    # Rank by similarity score, highest first. Sorting (index, score) tuples directly would
    # order by index; the stable sort keeps ties in row order.
    scores = np.asarray(cosine_similarities[idx])
    ranked = np.argsort(-scores, kind="stable")

    # Drop the game itself explicitly rather than assuming it is ranked first
    game_indices = [int(i) for i in ranked if i != idx][:top_n]

    return game_indices


In [None]:
# 선택한 게임에 대한 정보를 추출하는 함수
def info_recommendations(name, df, cosine_similarities):
    """Return name, price, genres and image of the games recommended for *name*.

    The rows are picked by position using the indices from ``get_game_indices``
    and the result is re-indexed from 0.
    """
    wanted_columns = ['name', 'price', 'genres', 'image']
    catalogue = df[wanted_columns]

    # Positions of the recommended games
    picks = get_game_indices(name, df, cosine_similarities)

    return catalogue.iloc[picks].reset_index(drop=True)


# 팀원들 데이터로 유의미한 값인지 확인

In [None]:
import os

import requests

# Steam IDs of the team members used to sanity-check the recommendations
users = {"sua":"76561198797386305","ong":"76561198099903362","yeon":"76561198135409603"}
steam_id = users["sua"]

# The Steam Web API key is a secret: read it from the environment instead of committing it.
# NOTE: the key that used to be hard-coded here must be treated as leaked and revoked.
steam_api_key = os.environ["STEAM_API_KEY"]

response = requests.get(
    'https://api.steampowered.com/IPlayerService/GetOwnedGames/v1/',
    params={'key': steam_api_key, 'steamid': steam_id},
    timeout=10,
)
# Fail loudly on HTTP errors (bad key, rate limit) instead of on a later KeyError
response.raise_for_status()
res = response.json()

# Tolerate a reply without "game_count"/"games" by treating it as an empty library.
# NOTE(review): this presumably happens for private profiles — confirm against the Steam Web API docs.
user_games = res.get('response', {}).get('games', [])
game_cnt = len(user_games)

print(game_cnt)
print(user_games)

# appids of every game the user owns
game_lst = [game["appid"] for game in user_games]

print(game_lst)
    

In [None]:
# Look up the titles of the selected user's games in the dataset and report how many were found
name_list = [
    title
    for position in range(game_cnt)
    for title in df.loc[df['appid'] == game_lst[position], 'name'].tolist()
]

print('해당 유저의 게임 개수 : ', len(name_list))
print('해당 유저의 게임 리스트 : ', name_list)

# 추천해준 게임에 대한 정보 반환

In [None]:
# Collect the recommendations for every game the user owns and append them to game_list.
recommend_frames = []

for game_name in name_list:
    try:
        recommend_frames.append(info_recommendations(game_name, df, cosine_similarities))
    except Exception as err:
        # Best effort: one game without recommendations must not abort the loop, but report
        # it instead of hiding the failure (a bare except would also swallow KeyboardInterrupt).
        print(f'skipped {game_name!r}: {err!r}')

# Concatenate once at the end; concatenating inside the loop copies game_list every iteration
recommend_df = pd.concat(recommend_frames) if recommend_frames else pd.DataFrame()
game_list = pd.concat([game_list, recommend_df])

print('추천 게임 개수 : ', len(game_list))

# 랜덤으로 60개를 비복원 추출한다.

In [None]:
# Draw 60 rows at random without replacement; pass replace=True to sample with replacement
game_list.sample(n=60, replace=False)

# 시각화로 확인

In [None]:
def show_recommendations(name, df, cosine_similarities, n_cols=5):
    """Plot the cover images of the games recommended for *name*.

    Args:
        name: Title of the game to get recommendations for.
        df: Game table with at least the columns ``name`` and ``image`` (image URL).
        cosine_similarities: Similarity matrix passed on to ``get_game_indices``.
        n_cols: Number of images per row of the figure.

    The recommended indices and the recommended rows are printed. An image that
    cannot be downloaded or decoded is reported and its cell is left empty.
    """
    games = df[['name', 'image']]

    # Get the indices of the recommended games
    game_indices = get_game_indices(name, df, cosine_similarities)
    print(game_indices)

    # Get the recommended games based on their indices
    recommend = games.iloc[game_indices].reset_index(drop=True)
    print(recommend)

    if recommend.empty:
        return

    # Size the grid to the number of recommendations: a fixed 1x5 grid only had room for
    # the first five, the rest were downloaded and then discarded.
    n_rows = (len(recommend) + n_cols - 1) // n_cols
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 3 * n_rows), squeeze=False)
    flat_axes = axs.ravel()
    for ax in flat_axes:
        ax.axis('off')

    # Display the images of the recommended games
    for ax, (_, row) in zip(flat_axes, recommend.iterrows()):
        ax.set_title(row['name'])
        try:
            response = requests.get(row['image'], timeout=10)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
        except (requests.RequestException, OSError) as err:
            # Network failure, invalid URL or undecodable image: keep the title, skip the picture
            print(f"could not load image for {row['name']!r}: {err!r}")
            continue
        ax.imshow(img)

    fig.tight_layout()
#     plt.show()
    

In [None]:
# Plot the recommendations for every game the user owns.
recommend_df = pd.DataFrame()

for game_name in name_list:
    try:
        show_recommendations(game_name, df, cosine_similarities)
    except Exception as err:
        # Best effort: keep plotting the remaining games, but report the failure instead of
        # hiding it (a bare except would also swallow KeyboardInterrupt).
        print(f'skipped {game_name!r}: {err!r}')

print('추천 게임 개수 : ', len(game_list))

In [None]:
# Plot the recommendations for a single, hand-picked title
show_recommendations('Hades', df, cosine_similarities)