In [7]:
import pandas as pd
import scipy
from nltk.sentiment.vader import SentimentIntensityAnalyzer;

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import nltk.data

In [3]:
data = pd.read_csv("/data/beer/train.csv")
test = pd.read_csv("/data/beer/test.csv")

In [6]:
sid = SentimentIntensityAnalyzer()
sentiment = [sid.polarity_scores(str(review))["compound"] for review in data["review/text"]]

In [26]:
sid = SentimentIntensityAnalyzer()
test_sentiment = [sid.polarity_scores(str(review))["compound"] for review in test["review/text"]]

In [134]:
# data["user/ageInSeconds"] = data["user/ageInSeconds"].fillna(data['user/ageInSeconds'].mean())
# X["age"] = data['user/ageInSeconds'] / (3600 * 24 * 365)

In [147]:
X["sentiment"] = sentiment
X["abv"] = data["beer/ABV"]

In [10]:
appearanceWords = ["appearance", "character", "image", "look", "presence", "presentation", "air", "attitude", "bearing", "blind",
              "carriage", "cast", "condition", "countenance", "demeanor", "dress", "expression", "face", "fashion", "feature",
              "figure", "form", "front", "guise", "looks", "manner", "mannerism", "mien", "mode", "outline", "pose", "screen",
              "semblance", "shape", "stamp",'amber','brown','caramel','chocolate','clean','clouded','coffee','creamy','crisp',
              'hazy','head','lacing','orange','orangeamber','pour','red','rich', 'ruby','snifterdeep','solid','tan','toast',
              'vanilla','white']

aromaWords = ["aroma", "bouquet", "odor", "perfume", "scent", "spice", "balm", "fragrance", "incense", "redolence", "smell",
         'burn','candy','coffee','floral','fruity','grapefruity','herbal','lingering','mild','peppery','rich','roasted',
         'sharp','skunky','spice','sweet','toast','vanilla']


palateWords = ["palate", "bias", "partiality", "penchant", "tendency", "attraction", "inclination", "preference", "appetite", 
          "disposition", "gusto", "zest", 'bodied','carbonated','crisp','dense','heavy','kick','lingering','rich','sharp',
          'smooth','soft']

tasteWords = ["taste", "aftertaste", "aroma", "bang", "bitter", "drive", "ginger", "jolt", "kick", "oomph", "palatableness",
         "piquancy", "punch", "relish", "salt", "sapidity", "savor", "savoriness", "smack", "sour", "sting", "suggestion",
         "sweet", "tang", "wallop", "zest", "zing", "zip", "sapor", 'absinthe','aftertaste','biscuity','bitterness','bland',
         'bold','bourbon','burn', 'candy','caramel','carbonated','coffee','creamy','crisp','dense','floral','hop','kick',
         'lingering','malt','mild','peppery','refreshing','rich','roasted','sharp','sour','spice', 'sugar','sweet','tart',
         'tangy','vanilla']

In [16]:
def sentenceSentiment(data):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = [tokenizer.tokenize(str(review)) for review in data['review/text']]
    
    aroma = [0 for _ in sentences]
    palate = [0 for _ in sentences]
    taste = [0 for _ in sentences]
    appearance = [0 for _ in sentences]

    i = 0
    for paragraph in sentences:
        aromaCount = 0
        palateCount = 0
        tasteCount = 0
        appearanceCount = 0

        for sentence in paragraph:
            alphas = ' ' + ''.join(x for x in sentence if x.isalpha() or x == ' ') + ' '
            if any(' ' + a + ' ' in alphas for a in aromaWords):
                aromaCount += 1
                aroma[i] += sid.polarity_scores(sentence)["compound"]
            if any(' ' + a + ' ' in alphas for a in palateWords):
                palateCount += 1
                palate[i] += sid.polarity_scores(sentence)["compound"]
            if any(' ' + a + ' ' in alphas for a in tasteWords):
                tasteCount += 1
                taste[i] += sid.polarity_scores(sentence)["compound"]
            if any(' ' + a + ' ' in alphas for a in appearanceWords):
                appearanceCount += 1
                appearance[i] += sid.polarity_scores(sentence)["compound"]

        if aromaCount > 0:
            aroma[i] /= aromaCount    
        if palateCount > 0:
            palate[i] /= palateCount
        if tasteCount > 0:
            taste[i] /= tasteCount
        if appearanceCount > 0:
            appearance[i] /= appearanceCount
        i += 1
        
    return {"appearance" : appearance, "aroma" : aroma, "palate" : palate, "taste" : taste}

In [17]:
sentences = sentenceSentiment(data)

In [19]:
test_sentences = sentenceSentiment(test)

review/palate:     0.266360520049
review/aroma:      0.27499270306
review/taste:      0.277464650465
review/appearance: 0.236480775284
review/overall:    0.310799823099

review/palate:     0.253683886305
review/aroma:      0.253140656648
review/taste:      0.259079936169
review/appearance: 0.2215648635
review/overall:    0.298516358874

review/palate:     0.253673400152
review/aroma:      0.253174497474
review/taste:      0.259068148975
review/appearance: 0.221632953296
review/overall:    0.298497046681

review/palate:     0.253498607764
review/aroma:      0.251732535731
review/taste:      0.257598991441
review/appearance: 0.219973970342
review/overall:    0.298497046681

appearance 0.217367350449
aroma 0.246278263103
overall 0.291123078719
palate 0.248465609238
taste 0.249787413555

appearance 0.217314205533
aroma 0.246408043701
overall 0.290893370928
palate 0.248406747248
taste 0.24962118813

In [54]:
reviews = ["appearance", "aroma", "overall", "palate", "taste"]
predictions = pd.DataFrame(index = test["index"])
mse = {}

diff = list(set(pd.get_dummies(data["beer/style"]).columns) - set(pd.get_dummies(test["beer/style"]).columns))

# vectorizer = CountVectorizer(ngram_range=(2,2), stop_words="english", max_features=1000)
vectorizer = CountVectorizer(stop_words="english", max_features=1000)
X_train_counts = vectorizer.fit_transform([str(text) for text in data['review/text']])

tf_transformer = TfidfTransformer().fit(X_train_counts)
X_tfidf = tf_transformer.transform(X_train_counts)

X = pd.DataFrame(X_tfidf.todense())

X["sentiment"] = sentiment
X["abv"] = data["beer/ABV"]

style = pd.get_dummies(data["beer/style"])
for col in style:
    X["style:" + str(col)] = style[col]

for col in diff:
    del X["style:" + col]

# X_test_counts = vectorizer.transform([str(text) for text in test['review/text']])
# X_test_tfidf = tf_transformer.transform(X_test_counts)

# X_test = pd.DataFrame(X_test_tfidf.todense())
    
# X_test["sentiment"] = test_sentiment
# X_test["abv"] = test["beer/ABV"]

# style = pd.get_dummies(test["beer/style"])
# for col in style:
#     X_test["style:" + str(col)] = style[col]

In [36]:
for review in reviews:
    model = LinearRegression()
        
    if review != "overall":            
        X[review] = sentences[review]

    X.columns = X.columns.astype(str)
    X.sort_index(axis=1, inplace=True)
        
#     model.fit(X, data["review/"+review])
    print(review.ljust(13), X.shape)
    mse[review] = -cross_val_score(model, X, data["review/" + review], cv=5, scoring="neg_mean_squared_error").mean()
    
    if review != "overall":            
        del X[review]
        
#     if review != "overall":            
#         X_test[review] = test_sentences[review]

#     X_test.columns = X_test.columns.astype(str)
#     X_test.sort_index(axis=1, inplace=True)
    
#     print(review)
#     predictions["review/"+review] = model.predict(X_test)

#     if review != "overall:
#         del X_test[review]

mse

appearance    (37500, 1095)
aroma         (37500, 1095)
overall       (37500, 1094)
palate        (37500, 1095)
taste         (37500, 1095)


{'appearance': 0.21998901557331241,
 'aroma': 0.24597798523763581,
 'overall': 0.2914406947853439,
 'palate': 0.24921256398604541,
 'taste': 0.24866960899616641}

In [29]:
predictions.to_csv("output.csv")

In [44]:
import operator

In [None]:
toAdd = X.columns.tolist()
modelCols = []

lastScore = 0

for _ in range(len(toAdd)):
    vals = dict()
    
    for add in toAdd:
        model = LinearRegression()
        
        X_temp = X[modelCols + [add]]
        y = data["review/taste"]
        
        vals[add] = -cross_val_score(model, X_temp, y, cv=5, scoring="neg_mean_squared_error").mean()

    sort = sorted(vals.items(), key=operator.itemgetter(1))[0:50]
    
    print("Best this round:", sort)

    if (lastScore != 0):
        improvement = sort[0][1] - lastScore
        print("Improvement =", improvement)
        if (improvement > 0):
            break

    for i in sort:
        toAdd.remove(i[0])
        modelCols.append(i[0])
    
    lastScore = sort[0][1]
    print()
    
print("\nFinal Model:", modelCols)