# 문서 임베딩 유사도 기반 추천 리스트를 DB에 저장

In [31]:
# Basic
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
import requests
import bs4
import csv

# NLP
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.corpus import stopwords
import re

# Download
from requests import get

# Crawling
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
options = webdriver.ChromeOptions()
options.add_argument("headless")

In [32]:
# 간단한 NLP 전처리
def data_text_cleaning(data):
 
    # 영문자 이외 문자는 공백으로 변환
    only_english = re.sub('[^a-zA-Z]', ' ', data)
 
    # 소문자 변환
    no_capitals = only_english.lower().split()
 
    # 불용어 제거
    stops = set(stopwords.words('english'))
    no_stops = [word for word in no_capitals if not word in stops]
 
    # 어간 추출
    stemmer = nltk.stem.SnowballStemmer('english')
    stemmer_words = [stemmer.stem(word) for word in no_stops]
 
    # 공백으로 구분된 문자열로 결합하여 결과 반환
    return stemmer_words

In [None]:
# DB 접속
client = MongoClient("###")
db = client['final_project']
similarity = db['similarity']

In [33]:
# 데이터 확보
result = list(db.repository.find())

In [35]:
# repo url 확보
base = 'https://github.com/'
links = []
link2rid = {}
for r in tqdm(result):
    link = base + r['login'] + '/' + r['repo_name']
    links.append(link)
    link2rid[link] = r['rid']

100%|██████████| 2698/2698 [00:00<00:00, 467825.55it/s]


In [13]:
# Repo마다 Readme, about, tag 크롤링
readme_dic = {}
about_dic = {}
tag_dic = {}
doc_dic = {}
for link in tqdm(links):
    try:
        driver_path = '/Users/pilkyu/Driver/chromedriver'
        url = link
        driver = webdriver.Chrome(executable_path=driver_path,chrome_options=options)
        driver.get(url)
        source = driver.page_source
        bs = bs4.BeautifulSoup(source,'lxml')
        readme = bs.find('article','markdown-body entry-content container-lg')
        readme = readme.get_text().replace('\n','').replace('\xa0','')
        about = bs.find('p','f4 my-3')
        about = about.get_text().replace('\n','')   
        tag = bs.find_all('a','topic-tag topic-tag-link')
        tag = ''.join([t.get_text() for t in tag]).replace('\n','')
        readme_dic[link2rid[link]] = readme
        about_dic[link2rid[link]] = about
        tag_dic[link2rid[link]] = tag
        doc_dic[link2rid[link]] = tag_dic[link2rid[link]] + ' ' + about_dic[link2rid[link]] + ' ' + readme_dic[link2rid[link]]
        doc_dic[link2rid[link]] = data_text_cleaning(doc_dic[link2rid[link]])
        driver.quit()
    except:
        pass

  driver = webdriver.Chrome(executable_path=driver_path,chrome_options=options)
  driver = webdriver.Chrome(executable_path=driver_path,chrome_options=options)
100%|██████████| 2698/2698 [2:14:01<00:00,  2.98s/it]  


In [15]:
# 피클로 저장
pd.to_pickle(readme_dic, 'readme_dic.pkl')
pd.to_pickle(about_dic, 'about_dic.pkl')
pd.to_pickle(tag_dic, 'tag_dic.pkl')
pd.to_pickle(doc_dic, 'doc_dic.pkl')

In [16]:
doc_dic = pd.read_pickle('doc_dic.pkl')

##### Doc2Vec

In [17]:
tagged_corpus_list = [TaggedDocument(tags=[str(key)], words=value) for key, value in zip(doc_dic.keys(),doc_dic.values())]

In [20]:
model = Doc2Vec(min_count=1,vector_size=100)

# Vocabulary 빌드
model.build_vocab(tagged_corpus_list)

# Doc2Vec 학습
model.train(tagged_corpus_list, total_examples=model.corpus_count, epochs=50)

In [21]:
doc_sim = {}
for rid in tqdm(doc_dic.keys()):
    doc_sim[rid] = [int(rid[0]) for rid in model.docvecs.most_similar(str(rid))]

100%|██████████| 2658/2658 [00:00<00:00, 2794.19it/s]


In [22]:
sim_list = []
for rid, doc_sim_list in tqdm(doc_sim.items()):
    temp_dic = {}
    temp_dic['rid'] = rid
    temp_dic['doc_sim'] = doc_sim_list
    sim_list.append(temp_dic)

100%|██████████| 2658/2658 [00:00<00:00, 254525.24it/s]


In [None]:
similarity.insert_many(sim_list)