In [2]:
import pandas as pd

In [3]:
reviews = pd.read_csv('data/winemag-data-130k-v2.csv')

In [4]:
reviews.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [5]:
wines = reviews[['title','description']]

In [6]:
wines.head()

Unnamed: 0,title,description
0,Nicosia 2013 Vulkà Bianco (Etna),"Aromas include tropical fruit, broom, brimston..."
1,Quinta dos Avidagos 2011 Avidagos Red (Douro),"This is ripe and fruity, a wine that is smooth..."
2,Rainstorm 2013 Pinot Gris (Willamette Valley),"Tart and snappy, the flavors of lime flesh and..."
3,St. Julian 2013 Reserve Late Harvest Riesling ...,"Pineapple rind, lemon pith and orange blossom ..."
4,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,"Much like the regular bottling from 2012, this..."


In [7]:
# Clean descriptions - remove certain punctuation
wines = wines.assign(clean_desc=wines.description.str.replace('[^ ,;\.\?\-—!\w\d]', ''))

In [8]:
# Clean descriptions - remove multiple white spaces
wines = wines.assign(clean_desc=wines.clean_desc.str.replace('\s', ' '))

In [9]:
wines = wines.assign(words=wines.clean_desc.str.split(pat=' '))

In [10]:
wines.head()

Unnamed: 0,title,description,clean_desc,words
0,Nicosia 2013 Vulkà Bianco (Etna),"Aromas include tropical fruit, broom, brimston...","Aromas include tropical fruit, broom, brimston...","[Aromas, include, tropical, fruit,, broom,, br..."
1,Quinta dos Avidagos 2011 Avidagos Red (Douro),"This is ripe and fruity, a wine that is smooth...","This is ripe and fruity, a wine that is smooth...","[This, is, ripe, and, fruity,, a, wine, that, ..."
2,Rainstorm 2013 Pinot Gris (Willamette Valley),"Tart and snappy, the flavors of lime flesh and...","Tart and snappy, the flavors of lime flesh and...","[Tart, and, snappy,, the, flavors, of, lime, f..."
3,St. Julian 2013 Reserve Late Harvest Riesling ...,"Pineapple rind, lemon pith and orange blossom ...","Pineapple rind, lemon pith and orange blossom ...","[Pineapple, rind,, lemon, pith, and, orange, b..."
4,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,"Much like the regular bottling from 2012, this...","Much like the regular bottling from 2012, this...","[Much, like, the, regular, bottling, from, 201..."


In [11]:
def split_pairs(x):
    """Split a list into a list of 2-tuples."""
    output = []
    for i in range(0, len(x)):
        if i == 0:
            output.append(tuple(['_START', x[i]]))
        output.append(tuple(x[i:i+2]))
    return output

In [12]:
sample = wines.iloc[0,3]
[x for x in split_pairs(sample)]

[('_START', 'Aromas'),
 ('Aromas', 'include'),
 ('include', 'tropical'),
 ('tropical', 'fruit,'),
 ('fruit,', 'broom,'),
 ('broom,', 'brimstone'),
 ('brimstone', 'and'),
 ('and', 'dried'),
 ('dried', 'herb.'),
 ('herb.', 'The'),
 ('The', 'palate'),
 ('palate', 'isnt'),
 ('isnt', 'overly'),
 ('overly', 'expressive,'),
 ('expressive,', 'offering'),
 ('offering', 'unripened'),
 ('unripened', 'apple,'),
 ('apple,', 'citrus'),
 ('citrus', 'and'),
 ('and', 'dried'),
 ('dried', 'sage'),
 ('sage', 'alongside'),
 ('alongside', 'brisk'),
 ('brisk', 'acidity.'),
 ('acidity.',)]

In [13]:
wines = wines.assign(pairs=wines.apply(lambda row: split_pairs(row.words), axis=1))

In [14]:
words = wines['pairs'].apply(pd.Series).stack().to_frame(name='pairs')

In [15]:
punctuation = ['.', '?', '!']
words = words.assign(
    word_1=words['pairs'].apply(lambda x: '_START' if any([x[0].endswith(p) for p in punctuation]) else x[0]),
    word_2=words['pairs'].apply(lambda x: x[1] if len(x) > 1 else '_END')
)

In [16]:
words.head()

Unnamed: 0,Unnamed: 1,pairs,word_1,word_2
0,0,"(_START, Aromas)",_START,Aromas
0,1,"(Aromas, include)",Aromas,include
0,2,"(include, tropical)",include,tropical
0,3,"(tropical, fruit,)",tropical,"fruit,"
0,4,"(fruit,, broom,)","fruit,","broom,"


In [17]:
words = words.reset_index()
words = words[['pairs', 'word_1', 'word_2']]

In [33]:
wordcounts = words[['word_1', 'word_2']].groupby(['word_1', 'word_2']).size().reset_index(name='n')

In [34]:
# Remove any empty words, filter out pairs that occur infrequently to make it faster
# len(wordcounts.index)
wordcounts = wordcounts[(wordcounts.word_1 != '') & (wordcounts.word_2 != '') & (wordcounts.n >= 5)]

In [35]:
# TODO: something is not right. n should always be <= w1_freq
word_1_freqs = wordcounts.groupby('word_1').size().reset_index(name='w1_freq')

In [36]:
wordcounts = wordcounts.merge(word_1_freqs, how='inner', on='word_1')

In [37]:
wordcounts = wordcounts.assign(weight=wordcounts.n/wordcounts.w1_freq)

In [None]:
# Create a matrix of weights
M = wordcounts[['word_1', 'word_2', 'weight']].pivot(index='word_2', columns='word_1', values='weight')

In [None]:
M.index.sample(1, weights=M['_START'])

In [38]:
wordcounts.sort_values('n')

Unnamed: 0,word_1,word_2,n,w1_freq,weight
90242,"tag,",this,5,1,5.000000
30627,blueberry,jam.,5,42,0.119048
14110,_START,Proprietor,5,2391,0.002091
86390,some,roundness,5,329,0.015198
14108,_START,Proper,5,2391,0.002091
86395,some,sediment.,5,329,0.015198
30629,blueberry,"juice,",5,42,0.119048
86400,some,sharpness.,5,329,0.015198
60567,"juniper,",cola,5,2,2.500000
30632,blueberry,notes.,5,42,0.119048


In [None]:
# Everything below is not for the matrix approach

In [None]:
wordcounts = wordcounts.sort_values('count', ascending=False)

In [None]:
# Group by word_1, create a list of (word_2, count) tuples ordered by count
wordcounts = wordcounts.assign(next_words=wordcounts[['word_2', 'count']].values.tolist())

In [None]:
starting_words = wordcounts[['word_1', 'next_words']].groupby('word_1').agg(lambda x: list(x))

In [None]:
starting_words