# Gensim style cleaning + flow

In [1]:
import pickle
with open('df_cleaned.pickle','rb') as read_file:
    df = pickle.load(read_file)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# NLP
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from gensim import corpora, models, similarities, matutils
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text 

# Clean text- remove punc, make lower case, remove numbers

In [3]:
# # Helper lambda fx to remove numbers and make lowercase
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

In [4]:
df['description'] = df['description'].apply(alphanumeric).apply(punc_lower)

In [5]:
df['description'].sample(10)

76770     this blend of    cabernet franc     malbec    ...
32685     aged only in stainless steel  this pretty soav...
23589     pure cabernet sauvignon from the yellow jacket...
9983      winemaker nick goldschmidt plies his trade mai...
60407     clean  crisp and fresh  which is enough to sat...
71827     this is pure malbec from the gamache vineyard ...
63012     bright pink in color  this is fresh and light ...
127693    it shows the classic aromas of alto adige sauv...
54308     the concerted impact of bergamot  grapefruit  ...
73942     dry in style and exuberantly fizzy  this fresh...
Name: description, dtype: object

# Customize stop words
- For now only adding Designation. Region + Varietal might provide info because of regional difs in tastes

In [6]:
# # Make columns lowercase so we can add them to stop words and they catch the already lowercase tokenized desc.
# df.winery = df.winery.str.lower()
# df.variety = df.variety.str.lower()
# df.region_1 = df.region_1.str.lower()
# df.region_2 = df.region_2.str.lower()
df.designation = df.designation.str.lower()

# # Need to make this a tuple because lists are mutable so can't add it to the set
# # Tuple is adding it as one element of all the wineries...not single words

# winery_list = set(df.winery.to_list())
# varietal_list = set(df.variety.to_list())
# region1_list = set(df.region_1.to_list())
# region2_list = set(df.region_2.to_list())
designation_list = set(df.designation.to_list())

In [7]:
designation_list = [str(i) for i in designation_list]

In [8]:
designation_list

['nan',
 'rhinefarm vineyard',
 'dineen vineyard',
 'fighting bulls',
 'la grande cuvée',
 'morgado da canita',
 'el fanio',
 'selleck vineyard',
 "verna's vineyard",
 'dolce bianco',
 "chevalier d'anthelme",
 'sceales vineyard',
 'satrapezo',
 'estate stonessence in the rocks vineyard',
 'francis reserve white',
 'monte da peceguina tinto',
 'domaine de grand croix',
 'domaine du manay',
 'spellonu red',
 'evolution big time',
 'gilt',
 'russian camp',
 'the tri-centenary',
 'vendimia seleccionada blanco',
 'select',
 "matthew's block",
 'la piazza alta',
 'louvau vineyard old vines',
 'baranoff vineyard estate grown',
 'made with organic grapes',
 'quinta do encontro q do e',
 'punto final mlb',
 'les complices',
 'les bartavelles',
 'clifton vineyard',
 'dauntless',
 'rieflé bihl',
 'audaz branco',
 'scharzhoferger spätlese',
 'arió extra dry',
 'grapes organically grown',
 'vigna della madonnina riserva',
 'légende des toques',
 'bishop creek vineyard',
 'the cigar menzies vineyard

In [10]:
x = [d.split(' ') for d in designation_list]

In [23]:
des_lis = []
for i in x:
    des_lis.append(i)
    

In [24]:
des_lis

[['nan'],
 ['rhinefarm', 'vineyard'],
 ['dineen', 'vineyard'],
 ['fighting', 'bulls'],
 ['la', 'grande', 'cuvée'],
 ['morgado', 'da', 'canita'],
 ['el', 'fanio'],
 ['selleck', 'vineyard'],
 ["verna's", 'vineyard'],
 ['dolce', 'bianco'],
 ['chevalier', "d'anthelme"],
 ['sceales', 'vineyard'],
 ['satrapezo'],
 ['estate', 'stonessence', 'in', 'the', 'rocks', 'vineyard'],
 ['francis', 'reserve', 'white'],
 ['monte', 'da', 'peceguina', 'tinto'],
 ['domaine', 'de', 'grand', 'croix'],
 ['domaine', 'du', 'manay'],
 ['spellonu', 'red'],
 ['evolution', 'big', 'time'],
 ['gilt'],
 ['russian', 'camp'],
 ['the', 'tri-centenary'],
 ['vendimia', 'seleccionada', 'blanco'],
 ['select'],
 ["matthew's", 'block'],
 ['la', 'piazza', 'alta'],
 ['louvau', 'vineyard', 'old', 'vines'],
 ['baranoff', 'vineyard', 'estate', 'grown'],
 ['made', 'with', 'organic', 'grapes'],
 ['quinta', 'do', 'encontro', 'q', 'do', 'e'],
 ['punto', 'final', 'mlb'],
 ['les', 'complices'],
 ['les', 'bartavelles'],
 ['clifton', 'viney

In [8]:
# # Add stop words to sklearn stopwrods text
# stop_words_sk = text.ENGLISH_STOP_WORDS.union(winery_list).union(varietal_list).union(region1_list)\
# .union(region2_list).union(designation_list)

In [18]:
stop_words_sk = text.ENGLISH_STOP_WORDS.union(designation_list)

In [19]:
new_stop_words = []
new_stop_words.append(text.ENGLISH_STOP_WORDS)

In [21]:
stop_words_sk

frozenset({'rhinefarm vineyard',
           'dineen vineyard',
           'outer limits',
           'light horse',
           'fighting bulls',
           'la grande cuvée',
           'paso doble malbec-corvina',
           'flò',
           'morgado da canita',
           'les galets single vineyard',
           'petit monts premier cru',
           'quartilho branco',
           'cepas centenarias',
           'bacchus',
           'el fanio',
           'selleck vineyard',
           'clone 115 indigène estate',
           'kremser wachtberg reserve',
           'fortulla',
           'monte foscarino',
           "verna's vineyard",
           'dolce bianco',
           'vz van zellers',
           'california',
           "chevalier d'anthelme",
           'altar',
           'the cynic',
           'sceales vineyard',
           'satrapezo',
           'estate stonessence in the rocks vineyard',
           'francis reserve white',
           'monte da peceguina tinto',
        

In [11]:
#stop_words_sk

# CVectorizer to gensim LDA

In [12]:
# CountVectorizer
cv1 = CountVectorizer(ngram_range=(1, 2),  
                                   stop_words=stop_words_sk, token_pattern="\\b[a-z][a-z]+\\b")

In [13]:
cv1.fit(df.description)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None,
                stop_words=frozenset({nan, '#19 phantom limb belmont vineyard',
                                      '#50 mon chou', '#socialsecret', '%@#$!',
                                      '&', "'61 rosé", "'a rina",
                                      "'blend 105' red wi...
                                      "'na vota", "'p'", "'s'", "'unfiltered'",
                                      "'vie!'", '(+)', '(steen)', '*%#&@!',
                                      '+ brut metodo classico',
                                      '+ summa  varietals syrah-cabernet '
                                      'sauvignon-petit verdot',
                                      '+7', '0 degree dry',
                

In [14]:
# Create the term-document matrix
# Transpose it so the terms are the rows
doc_word = cv1.transform(df.description).transpose()

In [15]:
doc_word

<699388x119988 sparse matrix of type '<class 'numpy.int64'>'
	with 4689911 stored elements in Compressed Sparse Column format>

In [16]:
doc_word.shape

(699388, 119988)

In [31]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(doc_word)

In [32]:
corpus

<gensim.matutils.Sparse2Corpus at 0x1a2ab6a850>

In [33]:
id2word = dict((v, k) for k, v in cv1.vocabulary_.items())
# order is actually word to id

In [34]:
len(id2word)

699388

In [35]:
lda = models.LdaModel(corpus=corpus, num_topics=10, id2word=id2word, passes=10)

KeyboardInterrupt: 

In [None]:
lda.print_topics()