In [None]:
import re
import numpy as np
import pandas as pd

In [None]:
# Load the raw beer reviews; later cells read the review text from the `review` column.
reviews = pd.read_csv('beer_reviews.csv')

In [None]:
# clean reviews: lower-case every review and split it into word tokens.
# The pattern is a raw string so that `\w` reaches the regex engine as written
# instead of being parsed as an (invalid) string escape sequence.
words_list = reviews.review.map(lambda x: re.findall(r'\w+', x.lower()))
# flatten the per-review token lists into a single array holding every token
words = np.concatenate(words_list)

In [None]:
# Preview the tokenized reviews: one list of lower-cased tokens per review.
words_list

0       [so, ok, on, the, real, i, gave, up, a, ton, f...
1       [poured, into, a, snifter, glass, a, dark, cho...
2       [if, there, s, any, beer, that, deserves, the,...
3       [this, brew, has, been, on, my, bucket, list, ...
4       [i, didnt, think, i, was, going, to, give, it,...
                              ...                        
5548                        [look, dark, not, too, thick]
5549                                        [great, quad]
5550    [look, dark, amber, little, head, short, lived...
5551    [i, ve, had, the, delight, of, both, the, 2014...
5552                   [2015, is, my, personal, favorite]
Name: review, Length: 5553, dtype: object

In [None]:
# perform count of all words
from collections import Counter
counter = Counter(words)
# convert counter to dataframe for calculations and manipulation:
# one row per distinct word, with columns 'Word' and 'Frequency'
word_counts_df = (
    pd.DataFrame.from_dict(counter, orient='index')
    .reset_index()
    .rename(columns={'index': 'Word', 0: 'Frequency'})
)
# persist the full table so candidate attributes can be picked by reviewing the csv
word_counts_df.to_csv('word_frequencies.csv')
# keep the preview as the last expression so the notebook actually renders it
word_counts_df.head()

In [None]:
# Candidate beer attributes, hand-picked by reviewing the word-frequency csv.
# The order of this list defines the row/column order of the matrices built from it.
attribute_candidates = [
    'dark', 'chocolate', 'bourbon', 'coffee', 'vanilla',
    'sweet', 'thick', 'orange', 'smooth', 'fruit',
    'coconut', 'light', 'maple', 'hazy', 'balanced',
    'creamy', 'rich', 'oak', 'citrus', 'sour',
    'sweetness', 'juice', 'cinnamon', 'caramel', 'tropical',
    'aged', 'peach', 'hop', 'golden', 'juicy',
    'hops',
]

In [None]:
# Square attribute-by-attribute matrix of co-occurrence counts.
# It is created directly as integer zeros rather than as an empty (object/NaN)
# frame patched with fillna, so the dtype does not depend on fillna's
# deprecated object-downcasting behaviour.
df_coc = pd.DataFrame(0, index=list(attribute_candidates), columns=list(attribute_candidates))

# copy of df_coc to ensure that an interaction is only counted once per message
df_coc_counts = df_coc.copy()

In [None]:
# Count, for every pair of candidate attributes, the number of reviews in which
# the two attributes appear within WINDOW_SIZE tokens of each other.
WINDOW_SIZE = 20
# set membership is O(1); the list is scanned linearly on every lookup
attribute_set = set(attribute_candidates)

for message in words_list:
  # reset the per-message tracker so each pair is counted at most once per review
  df_coc_counts.iloc[:] = 0
  for i, word in enumerate(message):
    if word not in attribute_set:
      continue
    # look ahead at the next WINDOW_SIZE tokens; slicing stops at the end of the
    # message on its own, so no explicit bounds check is needed
    for other in message[i + 1:i + 1 + WINDOW_SIZE]:
      # only a *different* attribute counts, and only if this pair has not
      # already been logged for the current message
      if other in attribute_set and other != word and df_coc_counts.loc[word, other] < 1:
        # single-step .loc[row, col] indexing: chained df.loc[row][col] assignment
        # writes to a temporary row object and is not guaranteed to update the frame
        df_coc.loc[word, other] += 1
        df_coc.loc[other, word] += 1
        df_coc_counts.loc[word, other] = 1
        df_coc_counts.loc[other, word] = 1

df_coc

In [None]:
# copy the attribute co-occurrence counts into a separate frame for the lift
# calculation; cast to float because lifts are ratios, and writing floats into
# integer columns relies on implicit upcasting (astype returns a new frame, so
# df_coc itself is left untouched)
lifts = df_coc.astype(float)

In [None]:
# calculate lifts
# lift(i, j) = number of reviews * co-occurrences of attributes i and j
#              OVER the product of the corpus frequencies of i and j
# NOTE(review): co-occurrences are counted once per review while the
# frequencies are total token counts -- confirm this mix is intended.
n_reviews = len(words_list)  # derived from the data instead of a hard-coded 5553

# one lookup Series (word -> frequency) instead of filtering word_counts_df
# inside a nested loop; .loc raises KeyError if a candidate never occurs
word_frequency = word_counts_df.set_index('Word')['Frequency']
row_frequency = word_frequency.loc[list(df_coc.index)].to_numpy()
col_frequency = word_frequency.loc[list(df_coc.columns)].to_numpy()

# computed from the raw counts in df_coc (not by rescaling `lifts` in place),
# so re-running this cell always yields the same result
lifts = df_coc.astype(float) * n_reviews / np.outer(row_frequency, col_frequency)

# display copy: keep only the cells strictly above the diagonal of the
# transposed matrix, so every attribute pair is shown exactly once
strict_upper = np.triu(np.ones(lifts.shape, dtype=bool), k=1)
lifts_display = lifts.T.where(strict_upper)
lifts_display

Unnamed: 0,dark,chocolate,bourbon,coffee,vanilla,sweet,thick,orange,smooth,fruit,...,juice,cinnamon,caramel,tropical,aged,peach,hop,golden,juicy,hops
dark,,1.734715,1.571665,1.261978,1.526905,1.188978,1.388227,0.319968,0.810978,0.954175,...,0.141493,1.238066,2.465538,0.037245,0.748861,0.03977,0.159984,0.203448,0.041165,0.124223
chocolate,,,2.299272,2.310375,2.48949,1.534747,1.463686,0.0,1.394251,0.589556,...,0.072398,1.990953,2.193999,0.038114,0.804661,0.040698,0.245578,0.0,0.042127,0.169497
bourbon,,,,1.407574,2.777128,1.592336,1.37427,0.0,1.649958,0.647694,...,0.043883,2.194147,3.058663,0.09241,1.765121,0.0,0.198471,0.0,0.051068,0.205475
coffee,,,,,1.847124,1.595534,0.964438,0.0,1.494414,0.233628,...,0.0,2.387001,1.669316,0.0,0.971795,0.0,0.155708,0.0,0.0,0.053735
vanilla,,,,,,1.9988,1.220266,0.069611,1.270314,0.783342,...,0.051304,2.770433,3.161335,0.108038,1.031817,0.230724,0.058009,0.0,0.059705,0.300281
sweet,,,,,,,1.295371,0.726837,1.350956,1.0633,...,0.964245,1.499937,2.759765,1.015264,0.453628,0.843185,0.363418,0.24648,0.748089,0.627075
thick,,,,,,,,0.955505,1.045193,0.481032,...,0.833951,0.889547,1.909479,0.70246,0.706196,0.250028,0.314311,0.255809,0.647003,0.585728
orange,,,,,,,,,0.698031,1.51731,...,4.757309,0.0,0.064075,3.940428,0.0,2.35339,1.075801,3.137441,1.919245,1.633529
smooth,,,,,,,,,,0.818238,...,0.643077,0.900308,1.624015,1.286495,1.021056,0.578406,1.090673,0.59178,1.421915,0.903334
fruit,,,,,,,,,,,...,2.426917,0.3569,1.658405,5.260964,0.453338,1.765544,1.372036,1.067398,2.824307,2.088912




In [None]:
from itertools import combinations
# find best combination of 3 lifts: for every unordered triple of candidate
# attributes, average the lifts of its three pairs.
# The iterator is consumed directly instead of being assigned to a variable
# named `combinations`, which would shadow the imported function and leave an
# exhausted iterator behind.
avgs = {}
for a1, a2, a3 in combinations(attribute_candidates, 3):
  l1 = lifts.loc[a1, a2]
  l2 = lifts.loc[a1, a3]
  l3 = lifts.loc[a2, a3]
  # key is the (a1, a2, a3) tuple, value is the mean pairwise lift
  avgs[(a1, a2, a3)] = (l1 + l2 + l3) / 3

In [None]:
# Rank every attribute triple by its average pairwise lift, best first.
# Column 0 holds the average; the index holds the (a1, a2, a3) triples.
lift_combos_df = (
    pd.DataFrame.from_dict(avgs, orient='index')
    .sort_values(by=0, ascending=False)
)
lift_combos_df

Unnamed: 0,0
"(citrus, tropical, hops)",4.903734
"(orange, hazy, golden)",4.845812
"(fruit, citrus, tropical)",4.489482
"(orange, hazy, juice)",4.378430
"(orange, juice, tropical)",4.228272
...,...
"(maple, sour, hop)",0.061126
"(coffee, sour, hops)",0.047050
"(vanilla, sour, hop)",0.035619
"(coconut, sour, hops)",0.031843


Based on the above analysis, we will use "citrus", "tropical", and "hops" as our three attributes, since this combination has the highest average pairwise lift (about 4.90).