In [1]:
# n-gram
# n-gram 은 n개 어절/음절을 연쇄적으로 분류해 그 빈도를 따진다
# n=1 일 때는 unigram, 2일 때는 bigram, 3 - trigram 
# 긴 텍스트를 분석하게 된다면 같은 n-gram 이 여러개 나온다 
# n-gram 들의 수를 세서 리스트로 만든 것을 빈도 리스트 (frequency list) 라고 한다 
# 이상의 날개에서 가장 많이 나온 n-gram 이 뭘까 궁금해서 Python으로 n-gram 분석을 해봤습니다. 

In [2]:
with open('nalgae.txt', 'r', encoding='utf-8', newline='\n') as f:
    file = f.read()
sample_file = ''.join(file)[6:]


In [3]:
sample_file[0:300]

"박제(剝製)가 되어 버린 천재'를 아시오? 나는 유쾌하오. 이런 때 연애까지가 유쾌하오.\r\n육신이 흐느적흐느적하도록 피로했을 때만 정신이 은화처럼 맑소. 니코틴이 내 횟배 앓는 뱃속\r\n으로 스미면 머릿속에 으레 백지가 준비되는 법이오. 그 위에다 나는 위트와 파라독스를 바둑 포\r\n석처럼 늘어 놓소. 가공할 상식의 병이오.\r\n나는 또 여인과 생활을 설계하오. 연애기법에마저 서먹서먹해진 지성의 극치를 흘깃 좀 들여다\r\n본 일이 있는, 말하자면 일종의 정신분일자(정신이 제멋대로 노는 사람)말이오. 이런 여인의 반\r\n----그것은 온갖 것"

In [4]:
#어절 n-gram 분석
#sentence: 분석할 문장, num_gram: n-gram 단위
def word_ngram(sentence, num_gram):
    # in the case a file is given, remove escape characters
    sentence = sentence.replace('\n', ' ').replace('\r', ' ').replace('  ',' ')
    text = tuple(sentence.split(' '))
    ngrams = [text[x:x+num_gram] for x in range(0, len(text))]
    return tuple(ngrams)

In [5]:
word_ngram(sample_file,2)[0:20]

(('박제(剝製)가', '되어'),
 ('되어', '버린'),
 ('버린', "천재'를"),
 ("천재'를", '아시오?'),
 ('아시오?', '나는'),
 ('나는', '유쾌하오.'),
 ('유쾌하오.', '이런'),
 ('이런', '때'),
 ('때', '연애까지가'),
 ('연애까지가', '유쾌하오.'),
 ('유쾌하오.', '육신이'),
 ('육신이', '흐느적흐느적하도록'),
 ('흐느적흐느적하도록', '피로했을'),
 ('피로했을', '때만'),
 ('때만', '정신이'),
 ('정신이', '은화처럼'),
 ('은화처럼', '맑소.'),
 ('맑소.', '니코틴이'),
 ('니코틴이', '내'),
 ('내', '횟배'))

In [6]:
#음절 n-gram 분석
#sentence: 분석할 문장, num_gram: n-gram 단위
def phoneme_ngram(sentence, num_gram):
    text = tuple(sentence) # split the sentence into an array of characters
    ngrams = [text[x:x+num_gram] for x in range(0, len(text))]
    return ngrams


In [7]:
phoneme_ngram(sample_file,3)[0:10]

[('박', '제', '('),
 ('제', '(', '剝'),
 ('(', '剝', '製'),
 ('剝', '製', ')'),
 ('製', ')', '가'),
 (')', '가', ' '),
 ('가', ' ', '되'),
 (' ', '되', '어'),
 ('되', '어', ' '),
 ('어', ' ', '버')]

In [8]:
#n-gram 빈도 리스트 생성
def make_freqlist(ngrams):
    freqlist = {}
 
    for ngram in ngrams:
        if (ngram in freqlist):
            freqlist[ngram] += 1
        else:
            freqlist[ngram] = 1
    return freqlist


In [9]:
ngrams = word_ngram(sample_file, 3)
freqlist = make_freqlist(ngrams)

In [10]:
#sorted_freqlist = sorted(freqlist.items(), key=operator.itemgetter(1))
sorted_freqlist = sorted(freqlist.items(), key=lambda elem: elem[1],reverse=True)
print(sorted_freqlist[0:10]) #freq 내림차순정렬

[(('알', '수', '없다.'), 4), (('그', '돈', '오'), 4), (('돈을', '놓고', '가는'), 3), (('것', '같았다.', '나는'), 3), (('오', '원', '돈을'), 3), (('해가', '드는', '것을'), 2), (('수', '없다.', '나는'), 2), (('나는', '내', '아내'), 2), (('내', '아내', '외의'), 2), (('아내', '외의', '다른'), 2)]


# 확률론적 언어 모형
확률론적 언어 모형(Probabilistic Language Model)은  mm 개의 단어  w1,w2,…,wmw1,w2,…,wm열(word sequence)이 
주어졌을 때 문장으로써 성립될 확률  P(w1,w2,…,wm)P(w1,w2,…,wm)  을 출력함으로써 이 단어 열이 실제로 현실에서 
사용될 수 있는 문장(sentence)인지를 판별하는 모형이다.


__유니그램 모형 (Unigram Model)¶__

만약 모든 단어의 활용이 완전히 서로 독립이라면 단어 열의 확률은 다음과 같이 각 단어의 확률의 곱이 된다. 이러한 모형을 유니그램 모형 (Unigram Model)이라고 한다.

$P(w_1,w_2,…,w_m)=\prod_{i=1}^m P(w_i)$
 
__바이그램 모형 (Bigram Model)¶__

만약 단어의 활용이 바로 전 단어에만 의존한다면 단어 열의 확률은 다음과 같다. 이러한 모형을 Bigram 모형 또는 마코프 모형(Markov Model)이라고 한다.

$P(w_1,w_2,…,w_m)=P(w_1)\prod_{i=2}^m P(w_i|w_{i−1})$
 
__N-그램 모형 (N-gram Model)¶__

만약 단어의 활용이 바로 전  n 개의 단어에만 의존한다면 단어 열의 확률은 다음과 같다. 이러한 모형을 N-gram 모형이라고 한다.
$P(w_1,w_2,…,w_m)=P(w_1)\prod_{i=n}^m P(w_i|w_{i−1},…,w_{i−n})$

# 확률 추정 방법
실제 텍스트 코퍼스(corpus)에서 확률을 추정하는 방법은 다음과 같다. 여기에서는 바이그램의 경우를 살펴본다.

일단 모든 문장에 문장의 시작과 끝을 나타내는 특별 토큰을 추가한다. 
예를 들어 문장의 시작은 SS, 문장의 끝은 SE 이라는 토큰을 사용할 수 있다.

바이그램 모형에서는 전체 문장의 확률은 다음과 같이 조건부 확률의 곱으로 나타난다.

P(SS I am a boy SE)=P(I|SS)⋅P(am|I)⋅P(a|am)⋅P(boy|a)⋅P(SE|boy)


# 예제
다음은 nltk 패키지의 샘플 코퍼스인 movie_reviews의 텍스트를 기반으로 N-그램 모형을 추정하고 모형 확률로부터 
랜덤하게 문장을 생성하는 예제이다.

In [12]:
from nltk.corpus import movie_reviews

# 문서를 문장으로 분리
sentences = list(movie_reviews.sents())

import random
# 섞는다.
random.seed(180607)
random.shuffle(sentences)

In [14]:
sentences[0][0:10]

['wayne', "'", 's', 'greatest', 'ambition', 'is', 'to', 'become', 'a', 'night']

In [15]:
# 이제 이 입력으로부터 확률값을 추정한다.
import collections, math
from math import log
from collections import Counter
from konlpy.utils import pprint


def stringify_context(context):
    return(" ".join(context))


boundaryToken = ""
def ngrams(n, sentences, boundaryToken=boundaryToken, verbose=False):
    c = {}
    q = []
    for i in range(n-1):
        q.append(boundaryToken)
    for sentence in sentences:
        for w in sentence + [boundaryToken]:
            context_gram = stringify_context(q)
            if verbose:
                print(q)
                print(context_gram)
                print(w)
            if not context_gram in c:
                c[context_gram] = Counter() #Counter 클래스의 객체 c
            c[context_gram][w] += 1
            q.pop(0)
            q.append(w)
    return(c)

In [16]:
ngrams(2, sentences[:1000])["we"] #Counter 객체를 리턴 

Counter({"'": 7,
         'are': 4,
         'can': 3,
         'do': 1,
         'don': 1,
         'feel': 1,
         'focus': 1,
         'get': 2,
         'have': 2,
         'join': 1,
         'just': 1,
         'know': 2,
         'later': 1,
         'learn': 1,
         'like': 1,
         'might': 1,
         'never': 1,
         'should': 1,
         'were': 1})

In [17]:
import numpy as np
class BigramModel:
    
    def __init__(self, training_sentences, smoothing='none'):
        train = ngrams(2, training_sentences) #앞서 정의된 ngram함수
        self.probs = {}
        if smoothing == 'none':
            for context_gram in train.keys():
                N = sum(train[context_gram].values())
                self.probs[context_gram] = Counter({k:v/N for k,v in train[context_gram].items()})

    def prob(self, word, context):
        """takes a word string and a context which is a list of word strings, and returns the probability of the word"""
        c = stringify_context(context)
        return(self.probs[c][word])

    def scoreSentence(self, sentence, verbose=False):
        context = [boundaryToken]
        result = 0
        for w in sentence + [boundaryToken]:
            lp = log(self.prob(w, context))
            result = result + lp
            if verbose:
                pprint([context, w, lp])
            context = [w]
        return result

    def generateSentence(self, verbose=False, goryDetails=False):
        context = [boundaryToken]
        result = []
        w = None
        while not w == boundaryToken:
            r = random.random() # returns a random float between 0 and 1
            x = 0
            c = self.probs[stringify_context(context)] # this will be a Counter
            w = c.keys()[np.argmax(np.random.multinomial(1, c.values(), (1,))[0])]
            result.append(w)
            context = [w]
            if verbose:
                print(w)
        result.pop() # drop the boundary token
        return result

In [18]:
m = BigramModel(sentences)  #모델생성

In [19]:
# 트레이닝이 끝나면 조건부 확률의 값을 보거나 샘플 문장을 입력해서 문장의 로그 확률을 구할 수 있다.

# "i" 라는 단어가 나온 뒤에 "am"이라는 단어가 나올 확률을 계산하면
m.prob("am", ["i"]) #조건부확률의 형태

0.018562267971650354

In [20]:
m.prob("", ["."])  # .(마침표) 뒤에 문장이 끝날 확률

0.9624749529418908

In [21]:
print(m.prob("the", ["in"]))  # in 뒤에 the 가 올 확률
print(m.prob("in", ["the"]))  # the 뒤에 in 이 올 확률

0.26083768673815416
0.0001437363613793464


In [22]:
m.probs["apple"]

Counter({"'": 0.03571428571428571,
         ',': 0.17857142857142858,
         '-': 0.07142857142857142,
         '.': 0.10714285714285714,
         'cider': 0.03571428571428571,
         'computer': 0.03571428571428571,
         'hasn': 0.03571428571428571,
         'in': 0.03571428571428571,
         'orchard': 0.03571428571428571,
         'picker': 0.03571428571428571,
         'pickers': 0.07142857142857142,
         'pie': 0.2857142857142857,
         'that': 0.03571428571428571})

In [23]:
test_sentence = ['in', 'the', '1970s', '.']
m.scoreSentence(test_sentence, verbose=True) 
#각각 로그확률을 계산해 더한 score를 리턴

[[''], 'in', -3.7639298908174825]
[['in'], 'the', -1.343856955005301]
[['the'], '1970s', -9.45366556371934]
[['1970s'], '.', -1.413693335308005]
[['.'], '', -0.038247236076315826]


-16.013392980926444

In [24]:
m.scoreSentence(["i", "am", "a", "boy", "."], verbose=True)

[[''], 'i', -3.3655219750193166]
[['i'], 'am', -3.9866243623410944]
[['am'], 'a', -2.6441463991227296]
[['a'], 'boy', -7.370073198683084]
[['boy'], '.', -2.4904468301636156]
[['.'], '', -0.038247236076315826]


-19.895060001406158

# 확률론적 언어 모형의 활용
확률론적 언어 모형은 다음과 같은 분야에 광범위하게 활용할 수 있다.

철자 및 문법 교정(Spell Correction)
음성 인식(Speech Recognition)
자동 번역(Machine Translation)
자동 요약(Summarization)
챗봇(Question-Answering)

https://datascienceschool.net/view-notebook/a0c848e1e2d343d685e6077c35c4203b/

# Word2vec

In [27]:
##w word2vec을 쓰기 위해서 gensim을 다운받아야 한다
## terminal에 easy_install -U gensim 혹은 pip install --upgrade gensim을 친다

#설치오류시 참고
##https://blog.naver.com/sans223/221274010123 -> gensim 설치
##https://blog.naver.com/ddonae_/221190968528 -> gensim 설치
##https://blog.naver.com/vangarang/220934552201 ->nltk 설치

from glob import glob
from codecs import open as codecs_open
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import stopwords
from re import sub
import re
from gensim.models.word2vec import Word2Vec
from os import path, mkdir
from multiprocessing import cpu_count
from sklearn.manifold import TSNE
from pandas import DataFrame
from matplotlib import pyplot as plt



In [31]:
## 경로를 지정하고 파일을 읽어온다 바탕화면에 다운로드 할 경우 Users와 user를 본인 컴에 맞게 지정할 것

def read_books(location):
    if path.exists(location):
        return sorted(glob(path.join(location, "*.txt")))
    else:
        raise NotADirectoryError(location)
books = read_books(r'''C:\Users\Chankoo\Desktop\180607_텍마_Python\books''')
read_books(r'''C:\Users\Chankoo\Desktop\180607_텍마_Python\books''')

['C:\\Users\\Chankoo\\Desktop\\180607_텍마_Python\\books\\HPBook1.txt',
 'C:\\Users\\Chankoo\\Desktop\\180607_텍마_Python\\books\\HPBook2.txt',
 'C:\\Users\\Chankoo\\Desktop\\180607_텍마_Python\\books\\HPBook3.txt',
 'C:\\Users\\Chankoo\\Desktop\\180607_텍마_Python\\books\\HPBook4.txt',
 'C:\\Users\\Chankoo\\Desktop\\180607_텍마_Python\\books\\HPBook5.txt',
 'C:\\Users\\Chankoo\\Desktop\\180607_텍마_Python\\books\\HPBook6.txt',
 'C:\\Users\\Chankoo\\Desktop\\180607_텍마_Python\\books\\HPBook7.txt',
 'C:\\Users\\Chankoo\\Desktop\\180607_텍마_Python\\books\\HPBook8.txt']

In [32]:
## corpus 생성
def create_corpus(books):
    raw_corpus = u''
    for book in books:
        print("Reading {0}".format(book))
        with codecs_open(book, 'r', 'utf-8') as book_file:
            raw_corpus += book_file.read()
        print("Corpus is now {0} characters long".format(len(raw_corpus)))
    return raw_corpus
raw_corpus2 = create_corpus(books)

##뒤에 불용어 처리할때 (stop_word) 소문자 감안 하여 미리 소문자로 바꿔준다 (나중에 바꾸는 것을 추천, 대문자 필요할 때 있으니까)
raw_corpus = raw_corpus2.lower()

Reading C:\Users\Chankoo\Desktop\180607_텍마_Python\books\HPBook1.txt
Corpus is now 449988 characters long
Reading C:\Users\Chankoo\Desktop\180607_텍마_Python\books\HPBook2.txt
Corpus is now 949340 characters long
Reading C:\Users\Chankoo\Desktop\180607_텍마_Python\books\HPBook3.txt
Corpus is now 1575163 characters long
Reading C:\Users\Chankoo\Desktop\180607_텍마_Python\books\HPBook4.txt
Corpus is now 2688627 characters long
Reading C:\Users\Chankoo\Desktop\180607_텍마_Python\books\HPBook5.txt
Corpus is now 4204918 characters long
Reading C:\Users\Chankoo\Desktop\180607_텍마_Python\books\HPBook6.txt
Corpus is now 5209621 characters long
Reading C:\Users\Chankoo\Desktop\180607_텍마_Python\books\HPBook7.txt
Corpus is now 6404966 characters long
Reading C:\Users\Chankoo\Desktop\180607_텍마_Python\books\HPBook8.txt
Corpus is now 6681889 characters long


In [29]:
## 문장 단위의 token을 생성 
def tokenize_corpus(raw_corpus):
    tokenizer = PunktSentenceTokenizer()
    return tokenizer.tokenize(raw_corpus)

tokenize_corpus(raw_corpus)
token = tokenize_corpus(raw_corpus)



In [30]:
#문장 단위의 token을 word로 쪼갠다
def sentence_to_words_list(token):
    stop_words = stopwords.words('english')
    words = re.sub("[^a-zA-Z]", " ", str(token)).split()
    return [word for word in words if word not in stop_words]
tokens = sentence_to_words_list(token)
sentence_to_words_list(token)

['chapter',
 'one',
 'r',
 'n',
 'r',
 'n',
 'r',
 'nthe',
 'boy',
 'lived',
 'r',
 'n',
 'r',
 'n',
 'r',
 'nmr',
 'mrs',
 'dursley',
 'number',
 'four',
 'privet',
 'drive',
 'proud',
 'say',
 'perfectly',
 'normal',
 'thank',
 'much',
 'last',
 'people',
 'expect',
 'involved',
 'anything',
 'strange',
 'mysterious',
 'hold',
 'nonsense',
 'mr',
 'dursley',
 'director',
 'firm',
 'called',
 'grunnings',
 'made',
 'drills',
 'big',
 'beefy',
 'man',
 'hardly',
 'neck',
 'although',
 'large',
 'mustache',
 'mrs',
 'dursley',
 'thin',
 'blonde',
 'nearly',
 'twice',
 'usual',
 'amount',
 'neck',
 'came',
 'useful',
 'spent',
 'much',
 'time',
 'craning',
 'garden',
 'fences',
 'spying',
 'neighbors',
 'dursleys',
 'small',
 'son',
 'called',
 'dudley',
 'r',
 'n',
 'r',
 'nopinion',
 'finer',
 'boy',
 'anywhere',
 'dursleys',
 'everything',
 'wanted',
 'also',
 'secret',
 'greatest',
 'fear',
 'somebody',
 'would',
 'discover',
 'think',
 'could',
 'bear',
 'anyone',
 'found',
 'potter

In [None]:
## 세글자 이상의 단어만 수집하기로 한다(r, n 이런 것 때문에!) 10분정도 걸린다 
def tokens_to_words(tokens):
    words = [sentence_to_words_list(token) for token in tokens if len(token) > 2]
    print("The corpus contains {0:,} tokens".format(sum([len(word) for word in words])))
    return words
words = tokens_to_words(tokens)

In [None]:
## workers = multiprocessing.cpu_count(), min_count = 50 등장횟수 50이하인 단어는 제외, size=100 100개의 차원으로 embedding
## sg=0 이면 CBOW sg=1이면 skip.gram  (sg=skip.gram)
def build_vocab(words, num_features, min_word_count, num_workers, context_size):
    word2vec = Word2Vec(sg=1, workers=num_workers, size=num_features, min_count=min_word_count,
                        window=context_size)
    print("Building Vocabulary")
    word2vec.build_vocab(words)
    return word2vec

model = build_vocab(words,100,50,4,10)


##모델을 저장하고 불러와서 다시 training시킬 수 있다
model.save(r'''C:\Users\100\Desktop\텍스트마이닝\6_word2vec\\model.w2v''')
model = Word2Vec.load(r'''C:\Users\100\Desktop\텍스트마이닝\6_word2vec\model.w2v''')

In [None]:
model.wv['voldemort']

In [None]:
## word_vector와 내장함수로 여러가지를 구해볼 수 있다
def word_correlation(word_vector, a, b, c):
    return word_vector.most_similar_cosmul(positive=[a, c], negative=[b])[0][0]


def word_find_most_similar(word_vector, word):
    return word_vector.most_similar(word)[0][0]


def word_odd_one(word_vector, phrase):
    return word_vector.doesnt_match(phrase.split())

def similarity(word_vector,a,b):
    return word_vector.similarity(a,b)

print(word_correlation(model.wv, 'harry', 'voldemort', 'ron'))
print(word_find_most_similar(model.wv, 'ron'))
print(word_odd_one(model.wv,'He had been hugged by a complete stranger'))
print(similarity(model.wv,'harry','ron'))

In [None]:
## 시각화하기 mincount 조절해서 selective하게 그리면 된다. 이것도 5분정도 걸린다
def tsne_plot(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    for word in model.wv.vocab:
        tokens.append(model[word])
        labels.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(16, 16)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()
tsne_plot(model)

# wine2vec 
https://www.kaggle.com/zynicide/word2vec

In [33]:
import pandas as pd
import numpy as np
import nltk
import re
import multiprocessing
import gensim.models.word2vec as w2v

In [34]:
data = pd.read_csv('winemag-data_first150k.csv')
#contains 10 columns and 150k rows of wine reviews.

In [37]:
data.head()


Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [60]:
labels = data['variety']
descriptions = data['description']
# 맛(oaky, tannic, acidic, berry, etc.)에 대한 description으로 wine의 type(Pinot Noir, Cabernet Sav., etc.) 예측하는 것이 목표

In [36]:
print('{}   :   {}'.format(labels.tolist()[0], descriptions.tolist()[0]))
print('{}   :   {}'.format(labels.tolist()[56], descriptions.tolist()[56]))
print('{}   :   {}'.format(labels.tolist()[93], descriptions.tolist()[93]))

Cabernet Sauvignon   :   This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030.
Sauvignon Blanc   :   Delicious while also young and textured, this wine comes from biodynamically grown grapes. It has a strong sense of minerality as well as intense citrus and green fruits. It's tight at the moment and needs to round out, so drink from 2018.
Chardonnay   :   A smoky scent and earthy, crisp-apple flavors make this medium-bodied wine a change of pace from the average butterball Chardonnay. It has welcome acidity and a nicely smooth texture.


In [38]:
varietal_counts = labels.value_counts()
print(varietal_counts[:5])

Chardonnay                  14482
Pinot Noir                  14291
Cabernet Sauvignon          12800
Red Blend                   10062
Bordeaux-style Red Blend     7347
Name: variety, dtype: int64


In [39]:
corpus_raw = ""
for description in descriptions[:10000]:
    corpus_raw += description

In [40]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [41]:
raw_sentences = tokenizer.tokenize(corpus_raw)

In [42]:
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]"," ", raw) #[^a-zA-Z] means any character that IS NOT a-z OR A-Z , [^a-zA-Z]를 공백으로 대체
    words = clean.split()
    return words

In [43]:
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [44]:
print(raw_sentences[234])
print(sentence_to_wordlist(raw_sentences[234]))

Tart cherry lingers on the finish.A deeper salmon color with elegantly lacy bubbles and a slight cloudy appearance, this sparkler by Norm Yost offers dessicated watermelon, dried orange blossoms, yeast, citrus rinds and fresher strawberry notes on the nose.
['Tart', 'cherry', 'lingers', 'on', 'the', 'finish', 'A', 'deeper', 'salmon', 'color', 'with', 'elegantly', 'lacy', 'bubbles', 'and', 'a', 'slight', 'cloudy', 'appearance', 'this', 'sparkler', 'by', 'Norm', 'Yost', 'offers', 'dessicated', 'watermelon', 'dried', 'orange', 'blossoms', 'yeast', 'citrus', 'rinds', 'and', 'fresher', 'strawberry', 'notes', 'on', 'the', 'nose']


In [45]:
token_count = sum([len(sentence) for sentence in sentences])
print('The wine corpus contains {0:,} tokens'.format(token_count))

The wine corpus contains 408,741 tokens


In [46]:
num_features = 300
min_word_count = 10
num_workers = multiprocessing.cpu_count()
context_size = 10
downsampling = 1e-3
seed=1993

In [47]:
wine2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [48]:
wine2vec.build_vocab(sentences)

In [49]:
print('Word2Vec vocabulary length:', len(wine2vec.wv.vocab))

Word2Vec vocabulary length: 2612


In [50]:
print(wine2vec.corpus_count)

17323


In [58]:
wine2vec.train(sentences, total_examples=wine2vec.corpus_count, epochs=wine2vec.epochs )

(1354033, 2043705)

__Playing with the Model__

Now that we have a trained model we can get to the fun part and start playing around with the results. As you can tell from the outputs below, there is definitely still some noise in the data that could be worked out by tuning the parameters further, but overall we are getting pretty good results.

Words closest to a given word
"melon," "berry," and "oak" are words that someone might use to describe the taste/smell of a wine.

In [59]:
wine2vec.wv.most_similar('melon')

[('papaya', 0.7407190203666687),
 ('honeydew', 0.7113039493560791),
 ('banana', 0.6920315027236938),
 ('cantaloupe', 0.6835112571716309),
 ('Melon', 0.681281328201294),
 ('pit', 0.6758646965026855),
 ('mango', 0.6728077530860901),
 ('kiwi', 0.6680706739425659),
 ('bath', 0.6440451145172119),
 ('mealy', 0.6407906413078308)]

In [56]:
wine2vec.wv.most_similar('acidic') #1. 매우 신   2. 산성의

[('tartness', 0.8361387252807617),
 ('cloying', 0.8280996084213257),
 ('watery', 0.8243080377578735),
 ('tad', 0.8226309418678284),
 ('punchy', 0.8133155703544617),
 ('snap', 0.8117392063140869),
 ('sticky', 0.8109685182571411),
 ('lacking', 0.8065575361251831),
 ('flat', 0.803278386592865),
 ('angular', 0.7976535558700562)]

In [55]:
wine2vec.wv.most_similar('Chardonnay') #백포도주의 일종

[('Gris', 0.8386585116386414),
 ('Chenin', 0.8004210591316223),
 ('Blanc', 0.7875776886940002),
 ('Grigio', 0.7813867926597595),
 ('Marsanne', 0.7705585360527039),
 ('Roussanne', 0.7692981958389282),
 ('Viognier', 0.7582399249076843),
 ('Muscat', 0.753264307975769),
 ('Champagne', 0.7360697984695435),
 ('Verdejo', 0.7329103946685791)]