# Doc2Vec
## 2. 코퍼스 생성
- 리뷰 데이터로 코퍼스를 생성하고, 그 결과를 DB에 저장한다.

#### 2.1 DB에서 데이터 출력하기

In [None]:
import mysql.connector
import pandas as pd

In [None]:
# Keyword arguments unpacked into mysql.connector.connect() by the cells below.
# NOTE(review): the credentials are hard-coded here; consider loading them
# from the environment before sharing this notebook.
table_config = dict(
    user='root',
    password='1234',
    host='localhost',
    port=3306,
    database='db_test',
    raise_on_warnings=True,
    charset='utf8',
)

In [None]:
# Read every (idx, user_review) row of naver_movie_info into the DataFrame `df`.
# `conn` starts as None so the cleanup below still works when connect() itself
# fails; otherwise the `finally` would raise NameError and hide the real error.
conn = None
try:
    conn = mysql.connector.connect(**table_config)
    sql_select_data = "select idx, user_review from naver_movie_info"
    # `columns` only applies when reading a whole table by name and expects a
    # list, so it is not passed for this SQL query.
    df = pd.read_sql(sql_select_data, con=conn)

except Exception as e:
    # Notebook-style best effort: report the problem instead of aborting the cell.
    print(e)

finally:
    if conn is not None:
        conn.close()

In [None]:
# Preview the first five loaded rows.
print(df.head(n=5))

#### 2.2 코퍼스 생성하기
- 속도가 빠른 Twitter 한글 형태소 분석기를 사용했다.

In [None]:
from konlpy.tag import Twitter

In [None]:
# Turn a document into a plain list of words (the POS tags are dropped).
def make_corpus(doc):
    """Return the nouns, verbs and adjectives of *doc* as a list of strings.

    The text is tagged by the module-level ``nlp`` analyzer, called with
    ``norm=True`` and ``stem=True``. Each tagged item is indexed as
    ``(word, tag)``; only the word of items tagged 'Noun', 'Verb' or
    'Adjective' is kept, in the order the tagger produced them.
    """
    wanted_tags = ['Noun', 'Verb', 'Adjective']
    tagged = nlp.pos(doc, norm=True, stem=True)
    return [pair[0] for pair in tagged if pair[1] in wanted_tags]

In [None]:
# Analyzer instance that make_corpus() reads as a module-level name.
nlp = Twitter()

# One ', '-joined token string per review, in the same order as df's rows.
corpus_list = [', '.join(make_corpus(review)) for review in df['user_review']]

df['tokenized_user_review'] = corpus_list

In [None]:
# Preview the first ten rows, now including the tokenized column.
print(df.head(n=10))

#### 2.3 생성된 코퍼스 저장하기

In [None]:
# Add the two columns of naver_movie_info that the UPDATE step fills in.
# `conn` starts as None so the cleanup below still works when connect() itself
# fails; otherwise the `finally` would raise NameError and hide the real error.
conn = None
try:
    conn = mysql.connector.connect(**table_config)
    curs = conn.cursor()

    # 1) Create the column that will receive the generated corpus.
    sql_create_col = "ALTER TABLE naver_movie_info ADD tokenized_user_review mediumtext AFTER user_review"
    curs.execute(sql_create_col)
    conn.commit()

    # 2) Create the column that will receive the date the review was processed.
    sql_create_col = "ALTER TABLE naver_movie_info ADD review_defined_date mediumtext AFTER tokenized_user_review"
    curs.execute(sql_create_col)
    conn.commit()

except Exception as e:
    # Notebook-style best effort: report the problem instead of aborting the cell.
    # If step 1 fails, step 2 is skipped.
    print(e)

finally:
    if conn is not None:
        conn.close()

In [None]:
import datetime 

# Write each row's tokenized review and today's date back to naver_movie_info,
# committing in batches of 50 rows.
sql_update_data = "UPDATE naver_movie_info SET tokenized_user_review = %s, review_defined_date = %s where idx = %s"
cnt = 0  # rows executed since the last commit

# `conn` starts as None so the cleanup below still works when connect() itself
# fails; otherwise the `finally` would raise NameError and hide the real error.
conn = None
try:
    conn = mysql.connector.connect(**table_config)
    curs = conn.cursor()
    nowDate = datetime.datetime.now().strftime('%Y-%m-%d')
    # The same date string is stored for every row, so it is set once here.
    review_defined_date = nowDate

    for idx, tokenized_user_review in zip(df['idx'], df['tokenized_user_review']):
        values = (tokenized_user_review, review_defined_date, idx)
        curs.execute(sql_update_data, values)
        cnt = cnt + 1
        print(cnt)

        if cnt == 50:
            conn.commit()
            cnt = 0

    # Commit whatever is left over from the last, partial batch. The counter is
    # reset to 0 whenever it reaches 50, so it can never equal 50 here; testing
    # for a non-zero remainder is the meaningful condition.
    if cnt:
        print(cnt)
        conn.commit()

except Exception as e:
    # Notebook-style best effort: report the problem instead of aborting the cell.
    print(e)

finally:
    if conn is not None:
        conn.close()