In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# NLP
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.probability import FreqDist
import gensim
from gensim import corpora, models, similarities, matutils
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [145]:
!ls

ExploreAmazonDatasets.ipynb df_cleaned.pickle
ExploreReviewDataset.ipynb  df_test_cleaned.pickle
WineNLP_2.ipynb             lda_cv_test_lemmat_2.pickle
WineNLP_Cleaning+LDA.ipynb  low_occuring_words.pickle
WineNLP_LSA_NMF.ipynb       stop_words.pickle
Wine_Explore.ipynb


In [146]:
# with open('df_cleaned.pickle', 'rb') as read_file1:
#     df = pickle.load(read_file1)

In [2]:
with open('df_cleaned_final.pickle', 'rb') as read_file1:
    df = pickle.load(read_file1)

In [3]:
df.shape

(119988, 17)

In [4]:
# Load in stop_words to use here too
with open('stop_words.pickle','rb') as read_file:
    stop_words = pickle.load(read_file)

# Load in TEST DF (smaller). Begin testing tfidf/cv on LSA NMF
import pickle
with open('df_test_cleaned.pickle','rb') as read_file:
    df_test = pickle.load(read_file)

# LSA - on smaller test df. Full df after this section
## Testing with lemmat col + CV first
- Basically SVD applied to language
- Can use CV or TFIDF

In [5]:
## 1,2 gram LSA test WITH LEMMAT
cv_test_lemmat = CountVectorizer(ngram_range=(2,3))
cv_test_lemmat_desc = cv_test_lemmat.fit_transform(df_test['lemmat_test'])

In [6]:
cv_test_lemmat_desc.shape

(40000, 860172)

In [7]:
# DF with docs as cols, and words as rows. Just to peak at structure. Pretty useless.
#pd.DataFrame(cv_test_lemmat_desc.toarray(), index=df_test['lemmat_test'], columns=cv_test_lemmat.get_feature_names()).head(10)


In [8]:
lsa = TruncatedSVD(20, random_state=44) # truncated because we are telling it to only keep the first n. Basically a cut-off
doc_topic = lsa.fit_transform(cv_test_lemmat_desc)
lsa.explained_variance_ratio_ 
# How much do these new PCs account for the variance... should sum to 1.

array([0.00220864, 0.00199208, 0.00154285, 0.00100567, 0.00095834,
       0.0008569 , 0.000832  , 0.00076034, 0.00071326, 0.00067895,
       0.00060982, 0.00062165, 0.00060997, 0.00058301, 0.00055834,
       0.00054789, 0.00050285, 0.00048793, 0.00048281, 0.00047519])

In [9]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
                            # Use argsort to see hightest features in order

In [10]:
display_topics(lsa, cv_test_lemmat.get_feature_names(), 5)


Topic  0
black cherry, cabernet sauvignon, full bodied, cabernet franc, blend cabernet

Topic  1
cabernet sauvignon, cabernet franc, blend cabernet, blend cabernet sauvignon, merlot cabernet

Topic  2
full bodied, dry full, ripe full, dry full bodied, red cherry

Topic  3
red berry, red cherry, grained tannin, fine grained, fine grained tannin

Topic  4
black currant, blackberry black, blackberry black currant, firm tannin, black plum

Topic  5
grained tannin, fine grained, fine grained tannin, black currant, firm fine

Topic  6
pinot noir, red cherry, raspberry cherry, red currant, crisp acidity

Topic  7
red cherry, black plum, baking spice, firm tannin, cabernet franc

Topic  8
cabernet franc, merlot cabernet, merlot cabernet sauvignon, blend merlot, blend merlot cabernet

Topic  9
black plum, firm tannin, ripe black, baking spice, plum berry

Topic  10
crisp acidity, sauvignon blanc, firm tannin, baking spice, bright acidity

Topic  11
baking spice, bright acidity, pinot noir, lea

# SVD LSA with TDIDF using SVD
### NOT LSA WITHIN GENSIM-- See other doc

In [19]:
# TFIDF
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,4), min_df=2, max_df = .5) 
# max_df = ignore terms that appear in more than 50% of docs (doesn't change anything)
# min_df = ignore terms that appear in less than 2 docs (REDUCES NUMB OF VECTS DRASTICALLY)
tfidf_fit_1 = tfidf_vectorizer.fit_transform(df['lemmat_desc'])

In [20]:
tfidf_fit_1.shape # Number of features tfidf made

(119988, 396585)

In [21]:
feat_names = tfidf_vectorizer

In [22]:
lsa_tfidf_1 = TruncatedSVD(25, random_state=44, n_iter=20) # truncated because we are telling it to only keep the first n. Basically a cut-off
doc_topic_tfidf = lsa_tfidf_1.fit_transform(tfidf_fit_1)
print(lsa_tfidf_1.explained_variance_ratio_)

[0.00034618 0.00200525 0.00146398 0.00136427 0.00112893 0.00105568
 0.00101781 0.00096974 0.00087408 0.00080491 0.00076755 0.00074786
 0.00072578 0.00070727 0.00068538 0.00066503 0.00064244 0.00063204
 0.00062374 0.0006222  0.00060237 0.00059397 0.00058699 0.00058432
 0.00057141]


In [23]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
                            # Use argsort to see hightest features in order

In [24]:
#display_topics(lsa, vectorizer.get_feature_names(), 5)
display_topics(lsa_tfidf_1, tfidf_vectorizer.get_feature_names(),5)


Topic  0
acidity, cherry, finish, tannin, black

Topic  1
apple, crisp, citrus, lemon, acidity

Topic  2
cabernet, finish, sauvignon, cabernet sauvignon, blend

Topic  3
cabernet, sauvignon, cabernet sauvignon, blend, merlot

Topic  4
red, bright, cherry, acidity, crisp

Topic  5
black, black cherry, white, apple, alongside

Topic  6
cherry, pinot, noir, pinot noir, bodied

Topic  7
full, bodied, full bodied, red, ripe

Topic  8
dry, bodied, full, full bodied, pinot

Topic  9
wood, spice, aging, pinot, noir

Topic  10
dry, red, sweet, tannin, oak

Topic  11
crisp, oak, acidity, bright, vanilla

Topic  12
oak, apple, red, vanilla, currant

Topic  13
spice, show, well, bright, touch

Topic  14
sweet, lemon, lime, plum, black

Topic  15
white, currant, black, citrus, black currant

Topic  16
spice, dry, wood, note, baking spice

Topic  17
ripe, lemon, berry, white, blackberry

Topic  18
soft, well, lemon, note, oak

Topic  19
soft, wood, texture, aging, lemon

Topic  20
ripe, citrus, che

# SVD LSA: tfidf with 2,4 n grams

In [25]:
# TFIDF
tfidf_vectorizer_24 = TfidfVectorizer(ngram_range=(2,4), min_df=3, max_df = .5) 
# max_df = ignore terms that appear in more than 50% of docs (doesn't change anything)
# min_df = ignore terms that appear in less than 2 docs (REDUCES NUMB OF VECTS DRASTICALLY)
tfidf_fit_24 = tfidf_vectorizer_24.fit_transform(df['lemmat_desc'])

lsa_tfidf_24 = TruncatedSVD(25, random_state=44, n_iter=20) # truncated because we are telling it to only keep the first n. Basically a cut-off
doc_topic_tfidf_24 = lsa_tfidf_24.fit_transform(tfidf_fit_24)
print(lsa_tfidf_24.explained_variance_ratio_)

def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
                            # Use argsort to see hightest features in order

#display_topics(lsa, vectorizer.get_feature_names(), 5)
display_topics(lsa_tfidf_24, tfidf_vectorizer_24.get_feature_names(),5)

[0.00094724 0.00063741 0.00059277 0.00050964 0.0005163  0.00052005
 0.00048697 0.00041826 0.00042403 0.00040563 0.00040198 0.00038324
 0.00037488 0.00037074 0.00035796 0.0003345  0.00034719 0.00033382
 0.00032526 0.00032549 0.00031732 0.00030828 0.0002995  0.00027777
 0.00027881]

Topic  0
cabernet sauvignon, blend cabernet, cabernet franc, blend cabernet sauvignon, black cherry

Topic  1
cabernet sauvignon, blend cabernet, blend cabernet sauvignon, cabernet franc, cabernet sauvignon merlot

Topic  2
full bodied, ripe full, pinot noir, ripe full bodied, dry full

Topic  3
black currant, red berry, pinot noir, red cherry, firm tannin

Topic  4
black currant, blackberry black, full bodied, blackberry black currant, sauvignon cabernet

Topic  5
sauvignon cabernet, cabernet sauvignon cabernet, sauvignon cabernet franc, cabernet sauvignon cabernet franc, cabernet franc

Topic  6
pinot noir, raspberry cherry, black cherry, cherry cola, crisp acidity

Topic  7
sauvignon blanc, crisp acidity, 

In [26]:
# Get the words/combos tfidf came up with
feat_names = tfidf_vectorizer_24.get_feature_names()

In [27]:
doc_topic_tfidf.shape

(119988, 25)

In [28]:
lsa_tfidf_24.singular_values_ # topics

array([11.74799602,  9.9349905 ,  8.54921564,  8.13119161,  7.97800422,
        7.89573787,  7.64614881,  7.26755652,  7.1296583 ,  6.97516248,
        6.95201116,  6.82357851,  6.70844277,  6.67495545,  6.55078888,
        6.47263244,  6.45289971,  6.3307428 ,  6.25318919,  6.24667861,
        6.169946  ,  6.08055203,  5.99206658,  5.86040553,  5.80077007])

In [29]:
lsa_tfidf_24.explained_variance_ratio_

array([0.00094724, 0.00063741, 0.00059277, 0.00050964, 0.0005163 ,
       0.00052005, 0.00048697, 0.00041826, 0.00042403, 0.00040563,
       0.00040198, 0.00038324, 0.00037488, 0.00037074, 0.00035796,
       0.0003345 , 0.00034719, 0.00033382, 0.00032526, 0.00032549,
       0.00031732, 0.00030828, 0.0002995 , 0.00027777, 0.00027881])

In [30]:
lsa_tfidf_24.components_[0]

array([1.30516095e-03, 3.54106980e-04, 1.22988530e-04, ...,
       1.82685674e-04, 6.10127205e-05, 8.62560472e-05])

# VT cosine for rec sys

In [31]:
# Get a matrix with docs(review text) as rows, topics as columns
# Put in pandas to examine, but can do np if we wanted.
Vt_tfidf_24 = pd.DataFrame(doc_topic_tfidf.round(5),
             index = df['lemmat_desc'],
             )

In [32]:
Vt_tfidf_24

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
lemmat_desc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
include tropical broom brimstone dried herb overly expressive offering apple citrus dried alongside brisk acidity,0.06673,0.05707,0.03598,-0.00730,0.03266,0.07153,-0.00143,0.02057,-0.00615,0.02051,...,-0.00285,-0.00405,-0.04350,-0.04161,0.01689,0.00593,-0.10869,-0.05319,-0.04608,-0.01787
ripe smooth still structured firm tannin filled juicy red berry freshened acidity already drinkable although certainly better,0.08622,-0.01781,-0.09952,0.02318,0.04571,-0.01730,-0.05641,0.00731,-0.00902,0.03057,...,-0.00565,-0.02165,0.00452,0.00740,-0.01011,0.00223,-0.00583,0.01363,0.01082,0.02635
tart snappy lime flesh rind dominate pineapple poke crisp acidity underscoring stainless fermented,0.05072,0.08173,0.01092,0.01972,0.02504,0.00473,0.00599,-0.02535,0.01895,-0.01614,...,-0.02554,-0.00160,0.00577,0.00671,0.01476,0.01420,-0.00923,0.01401,-0.02648,0.00434
pineapple rind lemon pith blossom start bit opulent note honey drizzled guava mango giving slightly astringent finish,0.05942,0.06113,0.05453,-0.02031,-0.03392,0.01257,-0.01272,-0.02194,-0.00507,0.00799,...,0.00439,0.00821,0.00748,0.04115,0.01598,-0.00970,-0.00454,-0.01782,0.00479,0.03850
much regular come across rather rough tannic rustic earthy herbal characteristic nonetheless think pleasantly unfussy country hearty winter stew,0.03770,-0.00818,0.01314,-0.02071,-0.01689,-0.03286,-0.03503,-0.02121,0.02724,-0.00632,...,-0.00517,-0.00871,0.00258,-0.01423,0.01131,-0.00619,-0.01592,0.00988,0.00785,0.00475
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
note honeysuckle cantaloupe sweeten deliciously feather spätlese intensely juicy quenching stream tart tangerine grapefruit acidity yet wrap kiss honey,0.04834,0.04760,0.01035,0.00242,0.00721,0.01343,-0.00100,-0.00879,0.01112,0.00078,...,-0.00400,0.00540,0.00684,0.03162,-0.00339,0.00569,0.00958,-0.05225,0.00533,0.02558
given much decade prior mean pre cellared drinking baked cherry cocoa coconut combine gracefully soft secondary compote highlight,0.04870,-0.01360,0.00511,-0.01560,0.00489,-0.02132,0.02760,-0.00384,-0.03327,-0.01803,...,-0.00674,-0.00511,-0.02515,0.02605,0.02439,0.00009,0.00826,0.01206,-0.01144,-0.00958
well give crisp dry ripe although spice subdued favor serious structure couple,0.07979,0.03143,-0.06621,0.04017,-0.00263,-0.00756,0.00043,-0.03331,0.02257,0.01984,...,-0.03433,0.05165,0.01268,-0.00446,-0.05018,0.05243,-0.01980,0.03757,-0.00151,-0.02143
dry pinot crisp acidity also weight solid powerful spice baked apple structure still developing need,0.08170,0.05279,-0.04613,0.02721,0.00701,-0.00432,0.00136,-0.05330,0.04922,0.03433,...,-0.02843,0.05222,-0.02492,-0.04877,-0.02617,-0.02808,-0.01317,0.02879,-0.01034,-0.00183


# Testing on smaller trunc df
#### Function to loop through vt matrix and get cosine similarity for each combo


In [None]:
vt_trunc = Vt_tfidf_24[:100]

In [None]:
vt_trunc

In [None]:
[vt_trunc.iloc[0].values]

In [None]:
vt_trunc.T.shape

In [None]:
vt_trunc.index[0]

In [None]:
vt_trunc.iloc[0].index

# Working recommendation function below-- on trunc df. Need to apply to larger df for final
- Now need to get the top wines and get the names...so need to link back to original matrix.

In [None]:
def get_wine_rec(new_wine, vt, num_recom=5):
    recs = []
    for i in range(vt.shape[0]): # go through rows (rows = docs)
        cur_vec = [vt.iloc[i].values] # values for current row in loop
        cur_vec_index = i # maybe use this to go back to wine name later...?
        # Similarity
        sim = cosine_similarity(cur_vec, new_wine)
        recs.append(sim)
        final_recs = [j[0] for j in sorted(recs, key=lambda x : x[0], reverse=True)]
        final_recs.pop(0) # Take out 0th because it's the wine with itself 1.0 similarity
        top_recs = final_recs[: num_recom]
        exploration_recs = final_recs[10:15]
    print("Top {} recommendations are {}".format(num_recom, top_recs))
    print("Some wines that are a little different that you might like are: {}".format(exploration_recs))
    

In [None]:
# Use an existing wine to test as new input. Later it will be a raw desc that will be vectorized via model
new_input = [Vt_tfidf_24.iloc[44].values] # New input must be in [] to avoid reshape error in np

In [33]:
new_test = [vt_trunc.iloc[2].values]

NameError: name 'vt_trunc' is not defined

In [None]:
get_wine_rec(new_test, vt_trunc, num_recom=5) # works quickly on small df, but taking a while on full d

## Not running on full df....

In [757]:
############get_wine_rec(new_input, Vt_tfidf_24,num_recom=5)

KeyboardInterrupt: 

### Need to link these wines back to original df so we can then use price region name etc...


In [723]:
a = vt_trunc.copy()

In [726]:
index = [i for i in range(a.shape[0])]

In [733]:
a.index

Index(['include tropical broom brimstone dried herb overly expressive offering apple citrus dried alongside brisk acidity',
       'ripe smooth still structured firm tannin filled juicy red berry freshened acidity already drinkable although certainly better',
       'tart snappy lime flesh rind dominate pineapple poke crisp acidity underscoring stainless fermented',
       'pineapple rind lemon pith blossom start bit opulent note honey drizzled guava mango giving slightly astringent finish',
       'much regular come across rather rough tannic rustic earthy herbal characteristic nonetheless think pleasantly unfussy country hearty winter stew',
       'blackberry raspberry show typical navarran whiff herb horseradish mouth fairly full bodied tomatoey acidity spicy herbal complement plum finish grabby',
       'bright informal red open candied berry white savory herb carry balanced acidity soft tannin',
       'dry restrained spice profusion balanced acidity firm texture much food',
    